mirror of
https://github.com/fadden/6502bench.git
synced 2024-10-31 19:04:44 +00:00
Experiment on uncategorized data analysis
Tried something to speed it up. Didn't help. Cleaned up the code a bit though.
This commit is contained in:
parent
61d6cd597a
commit
8d0ce87ec7
@ -254,6 +254,38 @@ namespace CommonUtil {
|
|||||||
return (FindValue(val) >= 0);
|
return (FindValue(val) >= 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if false
|
||||||
|
/// <summary>
/// Looks up the range that holds searchVal.  If no range holds it, the range that
/// immediately follows is returned instead.  Callers tell the two outcomes apart by
/// testing whether range.Low is greater than searchVal.
/// </summary>
/// <param name="searchVal">Value to find.</param>
/// <param name="range">Result.  Set to a placeholder (-128,-128) range when the
///   method returns false.</param>
/// <returns>True if a valid range was returned.</returns>
public bool GetContainingOrSubsequentRange(int searchVal, out Range range) {
    int pos = FindValue(searchVal);
    if (pos < 0) {
        // Nothing contains searchVal, so FindValue handed back the insertion point
        // encoded as a negative number.  Decode it; the range at that position has
        // a "low" value greater than searchVal, unless the position is past the
        // end of the list.
        pos = -pos - 1;
        if (pos >= mRangeList.Count) {
            // Ran off the end: there is no subsequent range to report.
            range = new Range(-128, -128);
            return false;
        }
    }

    // Either the containing range or the one that follows searchVal.
    range = mRangeList[pos];
    return true;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Adds a value to the set. If the value is already present, nothing changes.
|
/// Adds a value to the set. If the value is already present, nothing changes.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@ -352,8 +384,18 @@ namespace CommonUtil {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
/// Writes the contents of the set to the debug output: a header line with the
/// range count, then one line per range with the low and high values in
/// six-digit hex.
/// </summary>
/// <param name="name">Label used in the header line.</param>
public void DebugDump(string name) {
    string header = name + " has " + DebugRangeCount + " ranges";
    Debug.WriteLine(header);

    for (IEnumerator<Range> cursor = RangeListIterator; cursor.MoveNext(); ) {
        Range cur = cursor.Current;
        Debug.WriteLine("[+{0:x6},+{1:x6}]", cur.Low, cur.High);
    }
}
|
||||||
|
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Internal test function.
|
/// Internal test helper function.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private static bool CheckRangeSet(RangeSet set, int expectedRanges, int[] expected) {
|
private static bool CheckRangeSet(RangeSet set, int expectedRanges, int[] expected) {
|
||||||
if (set.DebugRangeCount != expectedRanges) {
|
if (set.DebugRangeCount != expectedRanges) {
|
||||||
|
@ -271,6 +271,38 @@ namespace CommonUtil {
|
|||||||
return (FindValue(val) >= 0);
|
return (FindValue(val) >= 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if false
|
||||||
|
/// <summary>
/// Locates the typed range containing searchVal, or failing that, the typed range
/// that immediately follows it.  The caller can determine which was returned by
/// checking whether range.Low is greater than searchVal.
/// </summary>
/// <param name="searchVal">Value to find.</param>
/// <param name="range">Result.  Holds a placeholder (-128,-128,-128) entry when
///   the method returns false.</param>
/// <returns>True if a valid range was returned.</returns>
public bool GetContainingOrSubsequentRange(int searchVal, out TypedRange range) {
    int found = FindValue(searchVal);
    bool contained = (found >= 0);

    // On a miss, FindValue returns the insertion point encoded as (-index - 1).
    // The range at the insertion point starts above searchVal.
    int slot = contained ? found : -found - 1;

    if (!contained && slot >= mRangeList.Count) {
        // The insertion point is past the end of the list; nothing follows.
        range = new TypedRange(-128, -128, -128);
        return false;
    }

    range = mRangeList[slot];
    return true;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Gets the type of the specified value.
|
/// Gets the type of the specified value.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@ -356,9 +388,12 @@ namespace CommonUtil {
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="low">Lowest value (inclusive).</param>
|
/// <param name="low">Lowest value (inclusive).</param>
|
||||||
/// <param name="high">Highest value (inclusive).</param>
|
/// <param name="high">Highest value (inclusive).</param>
|
||||||
/// <param name="high">Value type.</param>
|
/// <param name="type">Value type.</param>
|
||||||
public void AddRange(int low, int high, int type) {
|
public void AddRange(int low, int high, int type) {
|
||||||
// There's probably some very efficient way to do this. Keeping it simple for now.
|
// There's probably some very efficient way to do this. Keeping it simple for now.
|
||||||
|
// (TODO: do a quick check to see if there's anything overlapping; if not, just
|
||||||
|
// create a new item and insert it into the list. Should handle the common case.)
|
||||||
|
Debug.Assert(low <= high); // adding an empty set is valid but weird
|
||||||
for (int i = low; i <= high; i++) {
|
for (int i = low; i <= high; i++) {
|
||||||
Add(i, type);
|
Add(i, type);
|
||||||
}
|
}
|
||||||
@ -399,6 +434,16 @@ namespace CommonUtil {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
/// Writes the contents of the set to the debug output: a header line with the
/// range count, then one line per range showing low and high in six-digit hex
/// and the type in two-digit hex.
/// </summary>
/// <param name="name">Label used in the header line.</param>
public void DebugDump(string name) {
    Debug.WriteLine(name + " has " + RangeCount + " ranges");

    IEnumerator<TypedRange> walker = RangeListIterator;
    for (bool more = walker.MoveNext(); more; more = walker.MoveNext()) {
        TypedRange entry = walker.Current;
        Debug.WriteLine("[+{0:x6},+{1:x6}] ({2:x2})", entry.Low, entry.High, entry.Type);
    }
}
|
||||||
|
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Internal test function.
|
/// Internal test function.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
@ -29,14 +29,21 @@ namespace SourceGen {
|
|||||||
public class DataAnalysis {
|
public class DataAnalysis {
|
||||||
// Minimum number of consecutive identical bytes for something to be called a "run".
|
// Minimum number of consecutive identical bytes for something to be called a "run".
|
||||||
private const int MIN_RUN_LENGTH = 5;
|
private const int MIN_RUN_LENGTH = 5;
|
||||||
|
|
||||||
// Minimum length for treating data as a run if the byte is a valid ASCII value.
|
// Minimum length for treating data as a run if the byte is a valid ASCII value.
|
||||||
// (Alternatively, the maximum length of an ASCII string composed of single characters.)
|
// (Alternatively, the maximum length of an ASCII string composed of single characters.)
|
||||||
// Anything shorter than this is handled with a string directive, anything this long or
|
// Anything shorter than this is handled with a string directive, anything this long or
|
||||||
// longer becomes FILL. This should be larger than the MinCharsForString parameter.
|
// longer becomes FILL. This should be larger than the MinCharsForString parameter.
|
||||||
private const int MIN_RUN_LENGTH_ASCII = 62;
|
private const int MIN_RUN_LENGTH_ASCII = 62;
|
||||||
|
|
||||||
|
// Absolute minimum string length for auto-detection. This is used when generating the
|
||||||
|
// data tables.
|
||||||
|
public const int MIN_STRING_LENGTH = 3;
|
||||||
|
|
||||||
// Minimum length for an ASCII string. Anything shorter is just output as bytes.
|
// Minimum length for an ASCII string. Anything shorter is just output as bytes.
|
||||||
|
// This is the default value; the actual value is configured as a project preference.
|
||||||
public const int DEFAULT_MIN_STRING_LENGTH = 4;
|
public const int DEFAULT_MIN_STRING_LENGTH = 4;
|
||||||
|
|
||||||
// Set min chars to this to disable string detection.
|
// Set min chars to this to disable string detection.
|
||||||
public const int MIN_CHARS_FOR_STRING_DISABLED = int.MaxValue;
|
public const int MIN_CHARS_FOR_STRING_DISABLED = int.MaxValue;
|
||||||
|
|
||||||
@ -454,15 +461,6 @@ namespace SourceGen {
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
/// <returns>True on success.</returns>
|
/// <returns>True on success.</returns>
|
||||||
public void AnalyzeUncategorized() {
|
public void AnalyzeUncategorized() {
|
||||||
// TODO(someday): we can make this faster. The data doesn't change, so we
|
|
||||||
// only need to do a full scan once, when the file is first loaded. We can
|
|
||||||
// create a TypedRangeSet for runs of identical bytes, using the byte value
|
|
||||||
// as the type. A second TypedRangeSet would identify runs of ASCII chars,
|
|
||||||
// with different types for high/low ASCII (and PETSCII?). AnalyzeRange() would
|
|
||||||
// then just need to find the intersection with the sets, which should be
|
|
||||||
// significantly faster. We would need to re-do the scan if the parameters
|
|
||||||
// for things like min match length change.
|
|
||||||
|
|
||||||
FormatDescriptor oneByteDefault = FormatDescriptor.Create(1,
|
FormatDescriptor oneByteDefault = FormatDescriptor.Create(1,
|
||||||
FormatDescriptor.Type.Default, FormatDescriptor.SubType.None);
|
FormatDescriptor.Type.Default, FormatDescriptor.SubType.None);
|
||||||
FormatDescriptor.DebugPrefabBump(-1);
|
FormatDescriptor.DebugPrefabBump(-1);
|
||||||
@ -503,6 +501,7 @@ namespace SourceGen {
|
|||||||
}
|
}
|
||||||
if (attr.IsInstruction) {
|
if (attr.IsInstruction) {
|
||||||
// Because of embedded instructions, we can't simply leap forward.
|
// Because of embedded instructions, we can't simply leap forward.
|
||||||
|
// [or can we?]
|
||||||
offset++;
|
offset++;
|
||||||
} else {
|
} else {
|
||||||
Debug.Assert(attr.Length > 0);
|
Debug.Assert(attr.Length > 0);
|
||||||
@ -549,81 +548,208 @@ namespace SourceGen {
|
|||||||
/// <param name="start">Offset of first byte in range.</param>
|
/// <param name="start">Offset of first byte in range.</param>
|
||||||
/// <param name="end">Offset of last byte in range.</param>
|
/// <param name="end">Offset of last byte in range.</param>
|
||||||
private void AnalyzeRange(int start, int end) {
|
private void AnalyzeRange(int start, int end) {
|
||||||
// TODO(someday): consider copying the buffer into a string and using Regex. This
|
// We want to identify runs of identical bytes, and runs of more than N human-
|
||||||
|
// readable characters (ASCII, high ASCII, PETSCII, whatever). There are a few
|
||||||
|
// ways to do this.
|
||||||
|
//
|
||||||
|
// The simple approach is to walk through the data from start to end, checking at
|
||||||
|
// each offset for runs of bytes matching the criteria. Because the data doesn't
|
||||||
|
// change, we can pre-analyze the data at project load time to speed things up.
|
||||||
|
//
|
||||||
|
// One approach is to put runs into TypedRangeSet (setting the type to the byte
|
||||||
|
// value so a run of 0x00 doesn't merge into an adjacent run of 0x01), and the
|
||||||
|
// various character encodings into individual RangeSets. Then, for any given
|
||||||
|
// byte address, you can query the length of a potential run directly. This could
|
||||||
|
// be made faster with a mergesort-like algorithm that walked through the various
|
||||||
|
// range sets, rather than iterating over every byte in the range. However, the
|
||||||
|
// ranges passed into this method tend to be small, so the initial setup time for
|
||||||
|
// each region can dominate the performance. (The optimized implementation of this
|
||||||
|
// approach is also fairly complicated.)
|
||||||
|
//
|
||||||
|
// A memory-hungry alternative is to create arrays of integers, one entry per byte
|
||||||
|
// in the file, and set each entry to the number of bytes in the run that would
|
||||||
|
// follow at that point.  So if a run of 20 zeroes began at offset 5, you would
|
||||||
|
// set run[5]=20, run[6]=19, and so on. That avoids searching in the sets, at the
|
||||||
|
// cost of potentially several megabytes for a large 65816 file.
|
||||||
|
//
|
||||||
|
// It's even possible that Regex would handle this faster and more easily. This
|
||||||
// can be done fairly quickly with "unsafe" code, e.g.:
|
// can be done fairly quickly with "unsafe" code, e.g.:
|
||||||
// https://stackoverflow.com/questions/3028768/net-regular-expressions-on-bytes-instead-of-chars
|
// https://stackoverflow.com/questions/3028768/net-regular-expressions-on-bytes-instead-of-chars
|
||||||
// Could be useful for ASCII stuff and the repeated-byte detector, e.g.:
|
|
||||||
// https://stackoverflow.com/questions/1660694/regular-expression-to-match-any-character-being-repeated-more-than-10-times
|
// https://stackoverflow.com/questions/1660694/regular-expression-to-match-any-character-being-repeated-more-than-10-times
|
||||||
|
//
|
||||||
|
// Ultimately we're just not spending that much time here. Setting
|
||||||
|
// AnalyzeUncategorizedData=false reveals that most of the time is spent in
|
||||||
|
// the caller, identifying the regions, so a significant improvement here won't
|
||||||
|
// have much impact on the user experience.
|
||||||
|
|
||||||
mDebugLog.LogI("Analyzing +" + start.ToString("x6") + " - +" + end.ToString("x6"));
|
mDebugLog.LogI("Analyzing [+" + start.ToString("x6") + ",+" + end.ToString("x6") +"]");
|
||||||
|
|
||||||
int minStringChars = mAnalysisParams.MinCharsForString;
|
|
||||||
bool doAnalysis = mAnalysisParams.AnalyzeUncategorizedData;
|
|
||||||
FormatDescriptor oneByteDefault = FormatDescriptor.Create(1,
|
FormatDescriptor oneByteDefault = FormatDescriptor.Create(1,
|
||||||
FormatDescriptor.Type.Default, FormatDescriptor.SubType.None);
|
FormatDescriptor.Type.Default, FormatDescriptor.SubType.None);
|
||||||
FormatDescriptor.DebugPrefabBump(-1);
|
FormatDescriptor.DebugPrefabBump(-1);
|
||||||
|
if (!mAnalysisParams.AnalyzeUncategorizedData) {
|
||||||
while (start <= end) {
|
|
||||||
if (!doAnalysis) {
|
|
||||||
// Analysis is disabled, so just mark everything as single-byte data.
|
// Analysis is disabled, so just mark everything as single-byte data.
|
||||||
|
while (start <= end) {
|
||||||
mAnattribs[start].DataDescriptor = oneByteDefault;
|
mAnattribs[start].DataDescriptor = oneByteDefault;
|
||||||
FormatDescriptor.DebugPrefabBump();
|
FormatDescriptor.DebugPrefabBump();
|
||||||
start++;
|
start++;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int minStringChars = mAnalysisParams.MinCharsForString;
|
||||||
|
|
||||||
|
#if false // this is actually slower (and uses more memory)
|
||||||
|
while (start <= end) {
|
||||||
|
// This is used to let us skip forward. It starts past the end of the block,
|
||||||
|
// and moves backward as we identify potential points of interest.
|
||||||
|
int minNextStart = end + 1;
|
||||||
|
|
||||||
|
bool found = mProject.RepeatedBytes.GetContainingOrSubsequentRange(start,
|
||||||
|
out TypedRangeSet.TypedRange tyRange);
|
||||||
|
if (found) {
|
||||||
|
if (tyRange.Low <= start) {
|
||||||
|
// found a matching range
|
||||||
|
Debug.Assert(tyRange.Low <= start && tyRange.High >= start);
|
||||||
|
int clampEnd = Math.Min(tyRange.High, end);
|
||||||
|
int repLen = clampEnd - start + 1;
|
||||||
|
if (repLen >= MIN_RUN_LENGTH) {
|
||||||
|
bool isAscii =
|
||||||
|
TextUtil.IsPrintableAscii((char)(mFileData[start] & 0x7f));
|
||||||
|
|
||||||
|
// IF the run isn't ASCII, OR it's so long that we don't want to
|
||||||
|
// encode it as a string, OR it's so short that we don't want to
|
||||||
|
// treat it as a string, THEN output it as a run. Otherwise, just
|
||||||
|
// let the ASCII-catcher handle it later.
|
||||||
|
if (!isAscii ||
|
||||||
|
repLen > MIN_RUN_LENGTH_ASCII || repLen < minStringChars) {
|
||||||
|
LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " +
|
||||||
|
repLen + " bytes");
|
||||||
|
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(
|
||||||
|
repLen, FormatDescriptor.Type.Fill,
|
||||||
|
FormatDescriptor.SubType.None);
|
||||||
|
start += repLen;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// We didn't like this range. We probably won't like it for any other
|
||||||
|
// point within the range, so start again past it. Ideally we'd use
|
||||||
|
// Range.Low of the range that followed the one that was returned, but
|
||||||
|
// we don't have that handy.
|
||||||
|
minNextStart = Math.Min(minNextStart, tyRange.High + 1);
|
||||||
|
} else {
|
||||||
|
// no match; try to advance to the start of the next range.
|
||||||
|
Debug.Assert(tyRange.Low > start);
|
||||||
|
minNextStart = Math.Min(minNextStart, tyRange.Low);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
found = mProject.StdAsciiBytes.GetContainingOrSubsequentRange(start,
|
||||||
|
out RangeSet.Range range);
|
||||||
|
if (found) {
|
||||||
|
if (range.Low <= start) {
|
||||||
|
// found a matching range
|
||||||
|
Debug.Assert(range.Low <= start && range.High >= start);
|
||||||
|
int clampEnd = Math.Min(range.High, end);
|
||||||
|
int repLen = clampEnd - start + 1;
|
||||||
|
if (repLen >= minStringChars) {
|
||||||
|
LogV(start, "Std ASCII string, len=" + repLen + " bytes");
|
||||||
|
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(repLen,
|
||||||
|
FormatDescriptor.Type.String, FormatDescriptor.SubType.None);
|
||||||
|
start += repLen;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
minNextStart = Math.Min(minNextStart, range.High + 1);
|
||||||
|
} else {
|
||||||
|
Debug.Assert(range.Low > start);
|
||||||
|
minNextStart = Math.Min(minNextStart, range.Low);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
found = mProject.HighAsciiBytes.GetContainingOrSubsequentRange(start,
|
||||||
|
out range);
|
||||||
|
if (found) {
|
||||||
|
if (range.Low <= start) {
|
||||||
|
// found a matching range
|
||||||
|
Debug.Assert(range.Low <= start && range.High >= start);
|
||||||
|
int clampEnd = Math.Min(range.High, end);
|
||||||
|
int repLen = clampEnd - start + 1;
|
||||||
|
if (repLen >= minStringChars) {
|
||||||
|
LogV(start, "High ASCII string, len=" + repLen + " bytes");
|
||||||
|
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(repLen,
|
||||||
|
FormatDescriptor.Type.String, FormatDescriptor.SubType.None);
|
||||||
|
start += repLen;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
minNextStart = Math.Min(minNextStart, range.High + 1);
|
||||||
|
} else {
|
||||||
|
Debug.Assert(range.Low > start);
|
||||||
|
minNextStart = Math.Min(minNextStart, range.Low);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advance to the next possible run location.
|
||||||
|
int nextStart = minNextStart > 0 ? minNextStart : start + 1;
|
||||||
|
Debug.Assert(nextStart > start);
|
||||||
|
|
||||||
|
// No runs found, output as single bytes. This is the easiest form for users
|
||||||
|
// to edit.
|
||||||
|
while (start < nextStart) {
|
||||||
|
mAnattribs[start].DataDescriptor = oneByteDefault;
|
||||||
|
FormatDescriptor.DebugPrefabBump();
|
||||||
|
start++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
while (start <= end) {
|
||||||
// Check for block of repeated values.
|
// Check for block of repeated values.
|
||||||
int length = RecognizeRun(mFileData, start, end);
|
int runLen = RecognizeRun(mFileData, start, end);
|
||||||
bool isAscii = TextUtil.IsPrintableAscii((char)(mFileData[start] & 0x7f));
|
bool isAscii = TextUtil.IsPrintableAscii((char)(mFileData[start] & 0x7f));
|
||||||
if (length >= MIN_RUN_LENGTH) {
|
if (runLen >= MIN_RUN_LENGTH) {
|
||||||
// Output as run or ASCII string. Prefer ASCII if the string is short
|
// Output as run or ASCII string. Prefer ASCII if the string is short
|
||||||
// enough to fit on one line (e.g. 64 chars including delimiters) and
|
// enough to fit on one line (e.g. 64 chars including delimiters) and
|
||||||
// meets the minimum string length threshold.
|
// meets the minimum string length threshold.
|
||||||
if (isAscii && length <= MIN_RUN_LENGTH_ASCII && length >= minStringChars) {
|
if (isAscii && runLen <= MIN_RUN_LENGTH_ASCII && runLen >= minStringChars) {
|
||||||
// string -- if we create the descriptor here, we save a little time,
|
// String -- if we create the descriptor here, we save a little time,
|
||||||
// but strings like "*****hello" turn into two separate strings.
|
// but strings like "*****hello" turn into two separate strings. So
|
||||||
//LogV(start, "String from run of '" + (char)(mFileData[start] & 0x7f) +
|
// just fall through and let the ASCII recognizer handle it.
|
||||||
// "': " + length + " bytes");
|
|
||||||
//mAnattribs[start].DataDescriptor = FormatDescriptor.CreateDescriptor(
|
|
||||||
// length, FormatDescriptor.Type.String,
|
|
||||||
// FormatDescriptor.SubType.None);
|
|
||||||
//start += length;
|
|
||||||
//continue;
|
|
||||||
} else {
|
} else {
|
||||||
// run
|
// run
|
||||||
LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " +
|
LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " +
|
||||||
length + " bytes");
|
runLen + " bytes");
|
||||||
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(
|
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(
|
||||||
length, FormatDescriptor.Type.Fill,
|
runLen, FormatDescriptor.Type.Fill,
|
||||||
FormatDescriptor.SubType.None);
|
FormatDescriptor.SubType.None);
|
||||||
start += length;
|
start += runLen;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
length = RecognizeAscii(mFileData, start, end);
|
int asciiLen = RecognizeAscii(mFileData, start, end);
|
||||||
if (length >= minStringChars) {
|
if (asciiLen >= minStringChars) {
|
||||||
LogV(start, "ASCII string, len=" + length + " bytes");
|
LogV(start, "ASCII string, len=" + asciiLen + " bytes");
|
||||||
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(length,
|
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(asciiLen,
|
||||||
FormatDescriptor.Type.String, FormatDescriptor.SubType.None);
|
FormatDescriptor.Type.String, FormatDescriptor.SubType.None);
|
||||||
start += length;
|
start += asciiLen;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Nothing found, output as single byte. This is the easiest form for users
|
// Nothing found, output as single byte. This is the easiest form for users
|
||||||
// to edit.
|
// to edit. If we found a run, but it was too short, we can go ahead and
|
||||||
mAnattribs[start].DataDescriptor = oneByteDefault;
|
// mark all bytes in the run because we know the later matches will also be
|
||||||
|
// too short.
|
||||||
|
Debug.Assert(runLen > 0);
|
||||||
|
while (runLen-- != 0) {
|
||||||
|
mAnattribs[start++].DataDescriptor = oneByteDefault;
|
||||||
FormatDescriptor.DebugPrefabBump();
|
FormatDescriptor.DebugPrefabBump();
|
||||||
|
|
||||||
// It's tempting to advance by the "length" result from RecognizeRun, and if
|
|
||||||
// we were just looking for runs of identical bytes we could. However, that
|
|
||||||
// would lose short ASCII strings that began with repeated bytes, e.g. "---%".
|
|
||||||
|
|
||||||
start++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
#region Static analyzer methods
|
#region Static analyzer methods
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Checks for a repeated run of the same byte.
|
/// Checks for a repeated run of the same byte.
|
||||||
@ -940,7 +1066,7 @@ namespace SourceGen {
|
|||||||
return stringCount;
|
return stringCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endregion // Static analyzers
|
#endregion // Static analyzers
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -37,7 +37,7 @@ namespace SourceGen {
|
|||||||
private const long MAGIC = 6982516645493599905;
|
private const long MAGIC = 6982516645493599905;
|
||||||
|
|
||||||
|
|
||||||
#region Data that is saved and restored
|
#region Data that is saved to the project file
|
||||||
// All data held by structures in this section are persistent, and will be
|
// All data held by structures in this section are persistent, and will be
|
||||||
// written to the project file. Anything not in this section may be discarded
|
// written to the project file. Anything not in this section may be discarded
|
||||||
// at any time. Smaller items are kept in arrays, with one entry per byte
|
// at any time. Smaller items are kept in arrays, with one entry per byte
|
||||||
@ -143,6 +143,13 @@ namespace SourceGen {
|
|||||||
// Project and platform symbols that are being referenced from code.
|
// Project and platform symbols that are being referenced from code.
|
||||||
public List<DefSymbol> ActiveDefSymbolList { get; private set; }
|
public List<DefSymbol> ActiveDefSymbolList { get; private set; }
|
||||||
|
|
||||||
|
#if false
|
||||||
|
// Data scan results.
|
||||||
|
public TypedRangeSet RepeatedBytes { get; private set; }
|
||||||
|
public RangeSet StdAsciiBytes { get; private set; }
|
||||||
|
public RangeSet HighAsciiBytes { get; private set; }
|
||||||
|
#endif
|
||||||
|
|
||||||
// List of changes for undo/redo.
|
// List of changes for undo/redo.
|
||||||
private List<ChangeSet> mUndoList = new List<ChangeSet>();
|
private List<ChangeSet> mUndoList = new List<ChangeSet>();
|
||||||
|
|
||||||
@ -226,6 +233,9 @@ namespace SourceGen {
|
|||||||
mFileData = fileData;
|
mFileData = fileData;
|
||||||
mDataFileName = dataFileName;
|
mDataFileName = dataFileName;
|
||||||
FileDataCrc32 = CommonUtil.CRC32.OnWholeBuffer(0, mFileData);
|
FileDataCrc32 = CommonUtil.CRC32.OnWholeBuffer(0, mFileData);
|
||||||
|
#if false
|
||||||
|
ScanFileData();
|
||||||
|
#endif
|
||||||
|
|
||||||
// Mark the first byte as code so we have something to do. This may get
|
// Mark the first byte as code so we have something to do. This may get
|
||||||
// overridden later.
|
// overridden later.
|
||||||
@ -303,8 +313,89 @@ namespace SourceGen {
|
|||||||
Debug.Assert(CRC32.OnWholeBuffer(0, fileData) == FileDataCrc32);
|
Debug.Assert(CRC32.OnWholeBuffer(0, fileData) == FileDataCrc32);
|
||||||
mFileData = fileData;
|
mFileData = fileData;
|
||||||
mDataFileName = dataFileName;
|
mDataFileName = dataFileName;
|
||||||
|
|
||||||
|
#if false
|
||||||
|
ScanFileData();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if false
|
||||||
|
private delegate bool ByteTest(byte val); // for ScanFileData()
|
||||||
|
|
||||||
|
/// <summary>
/// Scans the contents of the file data array, noting runs of identical bytes and
/// runs of standard-ASCII and high-ASCII bytes.  The results are stored in the
/// RepeatedBytes, StdAsciiBytes, and HighAsciiBytes properties.
///
/// The file data is guaranteed not to change, so doing a bit of work here can save
/// us time during data analysis.
/// </summary>
private void ScanFileData() {
    // Use a Stopwatch for the timing report.  (TimeSpan.Milliseconds is only the
    // 0-999 component of the interval, so it under-reports anything that takes a
    // second or longer.)
    Stopwatch timer = Stopwatch.StartNew();

    // Find runs of identical bytes.  The byte value is passed as the range type.
    // NOTE(review): the DataAnalysis declaration of MIN_RUN_LENGTH appears to be
    // private; confirm it is accessible from here before re-enabling this code.
    TypedRangeSet repeats = new TypedRangeSet();

    Debug.Assert(mFileData.Length > 0);
    byte matchByte = mFileData[0];
    int count = 1;
    for (int i = 1; i < mFileData.Length; i++) {
        if (mFileData[i] == matchByte) {
            count++;
            continue;
        }
        // Run ended at i-1; record it if it was long enough.
        if (count >= DataAnalysis.MIN_RUN_LENGTH) {
            repeats.AddRange(i - count, i - 1, matchByte);
        }
        matchByte = mFileData[i];
        count = 1;
    }
    // Handle a run that extends to the end of the file.
    if (count >= DataAnalysis.MIN_RUN_LENGTH) {
        repeats.AddRange(mFileData.Length - count, mFileData.Length - 1, matchByte);
    }

    // Runs of bytes in [0x20,0x7e].
    RangeSet ascii = new RangeSet();
    CreateByteRangeSet(ascii, mFileData, DataAnalysis.MIN_STRING_LENGTH,
        delegate (byte val) {
            return val >= 0x20 && val < 0x7f;
        }
    );
    // Runs of bytes in [0xa0,0xfe], i.e. the same values with the high bit set.
    RangeSet highAscii = new RangeSet();
    CreateByteRangeSet(highAscii, mFileData, DataAnalysis.MIN_STRING_LENGTH,
        delegate (byte val) {
            return val >= 0xa0 && val < 0xff;
        }
    );

    // Change to "true" to dump the full contents of each set to the debug output.
    if (false) {
        repeats.DebugDump("Repeated-Bytes (" + DataAnalysis.MIN_RUN_LENGTH + "+)");
        ascii.DebugDump("Standard-ASCII (" + DataAnalysis.MIN_STRING_LENGTH + "+)");
        highAscii.DebugDump("High-ASCII (" + DataAnalysis.MIN_STRING_LENGTH + "+)");
    }
    Debug.WriteLine("ScanFileData took " + timer.ElapsedMilliseconds + " ms");

    RepeatedBytes = repeats;
    StdAsciiBytes = ascii;
    HighAsciiBytes = highAscii;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Walks the data array and adds, to the supplied set, every maximal run of
/// consecutive bytes accepted by the tester whose length is at least minLen.
/// </summary>
/// <param name="set">Set that receives the ranges.</param>
/// <param name="data">Bytes to examine.</param>
/// <param name="minLen">Minimum run length worth recording.</param>
/// <param name="tester">Predicate that decides whether a byte belongs in a run.</param>
private void CreateByteRangeSet(RangeSet set, byte[] data, int minLen, ByteTest tester) {
    int runLen = 0;     // length of the run that ends just before "pos"
    int pos = 0;        // offset of the byte being examined

    foreach (byte val in data) {
        if (tester(val)) {
            runLen++;
        } else {
            // Run (possibly empty) ended at pos-1; keep it if it's long enough.
            if (runLen >= minLen) {
                set.AddRange(pos - runLen, pos - 1);
            }
            runLen = 0;
        }
        pos++;
    }

    // A run may extend all the way to the end of the data.
    if (runLen >= minLen) {
        set.AddRange(data.Length - runLen, data.Length - 1);
    }
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Loads platform symbol files and extension scripts.
|
/// Loads platform symbol files and extension scripts.
|
||||||
///
|
///
|
||||||
@ -486,11 +577,16 @@ namespace SourceGen {
|
|||||||
|
|
||||||
reanalysisTimer.StartTask("GenerateActiveDefSymbolList");
|
reanalysisTimer.StartTask("GenerateActiveDefSymbolList");
|
||||||
// Generate the list of project/platform symbols that are being used. This forms
|
// Generate the list of project/platform symbols that are being used. This forms
|
||||||
// the list of EQUates at the top of the file.
|
// the list of EQUates at the top of the file. The active set is identified from
|
||||||
|
// the cross-reference data.
|
||||||
GenerateActiveDefSymbolList();
|
GenerateActiveDefSymbolList();
|
||||||
reanalysisTimer.EndTask("GenerateActiveDefSymbolList");
|
reanalysisTimer.EndTask("GenerateActiveDefSymbolList");
|
||||||
|
|
||||||
|
#if DEBUG
|
||||||
|
reanalysisTimer.StartTask("Validate");
|
||||||
Validate();
|
Validate();
|
||||||
|
reanalysisTimer.EndTask("Validate");
|
||||||
|
#endif
|
||||||
|
|
||||||
reanalysisTimer.EndTask("DisasmProject.Analyze()");
|
reanalysisTimer.EndTask("DisasmProject.Analyze()");
|
||||||
//reanalysisTimer.DumpTimes("DisasmProject timers:", debugLog);
|
//reanalysisTimer.DumpTimes("DisasmProject timers:", debugLog);
|
||||||
|
@ -63,6 +63,9 @@ method in <code>DisasmProject.cs</code>):</p>
|
|||||||
the list that is displayed in .EQ directives.</li>
|
the list that is displayed in .EQ directives.</li>
|
||||||
<li>Generate cross-reference lists. This is done for file data and
|
<li>Generate cross-reference lists. This is done for file data and
|
||||||
for any platform/project symbols that are referenced.</li>
|
for any platform/project symbols that are referenced.</li>
|
||||||
|
<li>If annotated auto-labels are enabled, the simple labels are
|
||||||
|
replaced with the annotated versions here. (This can't be done earlier
|
||||||
|
because the annotations are generated from the cross-reference data.)</li>
|
||||||
<li>In a debug build, some validity checks are performed.</li>
|
<li>In a debug build, some validity checks are performed.</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
@ -84,7 +87,8 @@ determines the width of the numeric value or length of the string. For
|
|||||||
this reason, instructions do not need a format descriptor, but all
|
this reason, instructions do not need a format descriptor, but all
|
||||||
data items do.</p>
|
data items do.</p>
|
||||||
<p>Symbolic references are format descriptors with a symbol attached.
|
<p>Symbolic references are format descriptors with a symbol attached.
|
||||||
The symbol reference also specifies low/high/bank.</p>
|
The symbol reference also specifies low/high/bank, for partial symbol
|
||||||
|
references like <code>LDA #>symbol</code>.</p>
|
||||||
<p>Every offset marked as a start point gets its own line in the on-screen
|
<p>Every offset marked as a start point gets its own line in the on-screen
|
||||||
display list. Embedded instructions are identified internally by
|
display list. Embedded instructions are identified internally by
|
||||||
looking for instruction-start offsets inside instructions.</p>
|
looking for instruction-start offsets inside instructions.</p>
|
||||||
|
@ -44,9 +44,9 @@ just outputs raw hex bytes for MVN/MVP instructions. This yields the
|
|||||||
correct code for all versions of the assembler, but is ugly and
|
correct code for all versions of the assembler, but is ugly and
|
||||||
annoying. So we want to output actual MVN/MVP instructions when producing
|
annoying. So we want to output actual MVN/MVP instructions when producing
|
||||||
code for newer versions of the assembler.</p>
|
code for newer versions of the assembler.</p>
|
||||||
<p>When you configure a cross-assembler, SourceGen executes it and
|
<p>When you configure a cross-assembler, SourceGen runs the executable with
|
||||||
extracts the version information from the command-line output stream.
|
version query args, and extracts the version information from the output
|
||||||
This is used by the generator to ensure that the output will compile.
|
stream. This is used by the generator to ensure that the output will compile.
|
||||||
If no assembler is configured, SourceGen will produce code optimized
|
If no assembler is configured, SourceGen will produce code optimized
|
||||||
for the latest version of the assembler.</p>
|
for the latest version of the assembler.</p>
|
||||||
|
|
||||||
@ -58,7 +58,8 @@ generators may produce multiple source files, perhaps a link script or
|
|||||||
symbol definition header to go with the assembly source. To avoid
|
symbol definition header to go with the assembly source. To avoid
|
||||||
spreading files across the filesystem, SourceGen does all of its work
|
spreading files across the filesystem, SourceGen does all of its work
|
||||||
in the same directory where the project lives. Before you can generate
|
in the same directory where the project lives. Before you can generate
|
||||||
code, you have to have given your project a name by saving it.</p>
|
code, you have to have assigned your project a directory. This is why
|
||||||
|
you can't assemble a project until you've saved it for the first time.</p>
|
||||||
|
|
||||||
<p>The Generate and Assemble dialog has a drop-down list near the top
|
<p>The Generate and Assemble dialog has a drop-down list near the top
|
||||||
that lets you pick which assembler to target. The name of the assembler
|
that lets you pick which assembler to target. The name of the assembler
|
||||||
|
@ -6,6 +6,7 @@ NOTE: some tests may fail if you use a version of the assembler that is
|
|||||||
different from the one used to generate the expected output. The current
|
different from the one used to generate the expected output. The current
|
||||||
set was generated with:
|
set was generated with:
|
||||||
|
|
||||||
|
* 64tass v1.53.1515
|
||||||
* cc65 v2.17
|
* cc65 v2.17
|
||||||
* Merlin 32 v1.0
|
* Merlin 32 v1.0
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user