Experiment on uncategorized data analysis

Tried something to speed it up.  Didn't help.  Cleaned up the code
a bit though.
This commit is contained in:
Andy McFadden 2019-04-18 10:41:01 -07:00
parent 61d6cd597a
commit 8d0ce87ec7
7 changed files with 376 additions and 61 deletions

View File

@ -254,6 +254,38 @@ namespace CommonUtil {
return (FindValue(val) >= 0);
}
#if false
/// <summary>
/// Looks up the range that holds searchVal.  If no range holds it, the first range
/// that starts above searchVal is supplied instead.  Callers tell the two cases apart
/// by testing whether range.Low is greater than searchVal.
/// </summary>
/// <param name="searchVal">Value to look up.</param>
/// <param name="range">Receives the containing or following range.  Set to a
///   placeholder of (-128, -128) when the method returns false.</param>
/// <returns>False if searchVal lies past every range in the list, true otherwise.</returns>
public bool GetContainingOrSubsequentRange(int searchVal, out Range range) {
    int slot = FindValue(searchVal);
    if (slot < 0) {
        // A miss comes back as (-insertionPoint - 1).  Undo the encoding to get the
        // index of the first range whose "low" value is greater than searchVal.
        slot = ~slot;
        if (slot >= mRangeList.Count) {
            // searchVal is beyond the last range; nothing follows it
            range = new Range(-128, -128);
            return false;
        }
    }
    // Either a direct hit, or the range immediately after the miss.
    range = mRangeList[slot];
    return true;
}
#endif
/// <summary>
/// Adds a value to the set. If the value is already present, nothing changes.
/// </summary>
@ -352,8 +384,18 @@ namespace CommonUtil {
}
/// <summary>
/// Writes the set to the debug output: a header line with the range count, followed
/// by one line per range showing its low and high values in hex.
/// </summary>
/// <param name="name">Label shown at the start of the header line.</param>
public void DebugDump(string name) {
    Debug.WriteLine(name + " has " + DebugRangeCount + " ranges");
    // IEnumerator<T> is IDisposable, so release it when we're done walking the list.
    using (IEnumerator<Range> iter = RangeListIterator) {
        while (iter.MoveNext()) {
            Range rng = iter.Current;
            Debug.WriteLine("[+{0:x6},+{1:x6}]", rng.Low, rng.High);
        }
    }
}
/// <summary>
/// Internal test function.
/// Internal test helper function.
/// </summary>
private static bool CheckRangeSet(RangeSet set, int expectedRanges, int[] expected) {
if (set.DebugRangeCount != expectedRanges) {

View File

@ -271,6 +271,38 @@ namespace CommonUtil {
return (FindValue(val) >= 0);
}
#if false
/// <summary>
/// Retrieves the typed range in which searchVal falls.  When searchVal is not part of
/// any range, the next range up (the one with the smallest "low" value above
/// searchVal) is returned in its place; compare range.Low against searchVal to see
/// which of the two was returned.
/// </summary>
/// <param name="searchVal">Value to look up.</param>
/// <param name="range">Receives the containing or following range.  Set to a
///   placeholder of (-128, -128, -128) when the method returns false.</param>
/// <returns>False if there is no containing or following range, true otherwise.</returns>
public bool GetContainingOrSubsequentRange(int searchVal, out TypedRange range) {
    int found = FindValue(searchVal);
    bool isMiss = (found < 0);
    if (isMiss) {
        // FindValue reports a miss as (-insertionPoint - 1); recover the insertion
        // point, which indexes the first range that starts above searchVal.
        int insertAt = -(found + 1);
        if (insertAt >= mRangeList.Count) {
            // ran off the end of the list, so there is no subsequent range
            range = new TypedRange(-128, -128, -128);
            return false;
        }
        found = insertAt;
    }
    range = mRangeList[found];
    return true;
}
#endif
/// <summary>
/// Gets the type of the specified value.
/// </summary>
@ -356,9 +388,12 @@ namespace CommonUtil {
/// </summary>
/// <param name="low">Lowest value (inclusive).</param>
/// <param name="high">Highest value (inclusive).</param>
/// <param name="high">Value type.</param>
/// <param name="type">Value type.</param>
public void AddRange(int low, int high, int type) {
// There's probably some very efficient way to do this. Keeping it simple for now.
// (TODO: do a quick check to see if there's anything overlapping; if not, just
// create a new item and insert it into the list. Should handle the common case.)
Debug.Assert(low <= high); // adding an empty set is valid but weird
for (int i = low; i <= high; i++) {
Add(i, type);
}
@ -399,6 +434,16 @@ namespace CommonUtil {
}
/// <summary>
/// Writes the set to the debug output: a header line with the range count, followed
/// by one line per range showing its low value, high value, and type in hex.
/// </summary>
/// <param name="name">Label shown at the start of the header line.</param>
public void DebugDump(string name) {
    Debug.WriteLine(name + " has " + RangeCount + " ranges");
    // IEnumerator<T> is IDisposable, so release it when we're done walking the list.
    using (IEnumerator<TypedRange> iter = RangeListIterator) {
        while (iter.MoveNext()) {
            TypedRange rng = iter.Current;
            Debug.WriteLine("[+{0:x6},+{1:x6}] ({2:x2})", rng.Low, rng.High, rng.Type);
        }
    }
}
/// <summary>
/// Internal test function.
/// </summary>

View File

@ -29,14 +29,21 @@ namespace SourceGen {
public class DataAnalysis {
// Minimum number of consecutive identical bytes for something to be called a "run".
private const int MIN_RUN_LENGTH = 5;
// Maximum length of a run of a single printable ASCII value that will still be output
// as a string.  A run of up to this many bytes is left for the string directive
// (provided it meets the MinCharsForString parameter); anything longer becomes FILL.
// This should be larger than the MinCharsForString parameter.
private const int MIN_RUN_LENGTH_ASCII = 62;
// Absolute minimum string length for auto-detection. This is used when generating the
// data tables.
public const int MIN_STRING_LENGTH = 3;
// Minimum length for an ASCII string. Anything shorter is just output as bytes.
// This is the default value; the actual value is configured as a project preference.
public const int DEFAULT_MIN_STRING_LENGTH = 4;
// Set min chars to this to disable string detection.
public const int MIN_CHARS_FOR_STRING_DISABLED = int.MaxValue;
@ -454,15 +461,6 @@ namespace SourceGen {
/// </summary>
/// <returns>True on success.</returns>
public void AnalyzeUncategorized() {
// TODO(someday): we can make this faster. The data doesn't change, so we
// only need to do a full scan once, when the file is first loaded. We can
// create a TypedRangeSet for runs of identical bytes, using the byte value
// as the type. A second TypedRangeSet would identify runs of ASCII chars,
// with different types for high/low ASCII (and PETSCII?). AnalyzeRange() would
// then just need to find the intersection with the sets, which should be
// significantly faster. We would need to re-do the scan if the parameters
// for things like min match length change.
FormatDescriptor oneByteDefault = FormatDescriptor.Create(1,
FormatDescriptor.Type.Default, FormatDescriptor.SubType.None);
FormatDescriptor.DebugPrefabBump(-1);
@ -503,6 +501,7 @@ namespace SourceGen {
}
if (attr.IsInstruction) {
// Because of embedded instructions, we can't simply leap forward.
// [or can we?]
offset++;
} else {
Debug.Assert(attr.Length > 0);
@ -549,81 +548,208 @@ namespace SourceGen {
/// <param name="start">Offset of first byte in range.</param>
/// <param name="end">Offset of last byte in range.</param>
private void AnalyzeRange(int start, int end) {
// TODO(someday): consider copying the buffer into a string and using Regex. This
// can be done fairly quickly with "unsafe" code, e.g.:
// We want to identify runs of identical bytes, and runs of more than N human-
// readable characters (ASCII, high ASCII, PETSCII, whatever). There are a few
// ways to do this.
//
// The simple approach is to walk through the data from start to end, checking at
// each offset for runs of bytes matching the criteria. Because the data doesn't
// change, we can pre-analyze the data at project load time to speed things up.
//
// One approach is to put runs into TypedRangeSet (setting the type to the byte
// value so a run of 0x00 doesn't merge into an adjacent run of 0x01), and the
// various character encodings into individual RangeSets. Then, for any given
// byte address, you can query the length of a potential run directly. This could
// be made faster with a mergesort-like algorithm that walked through the various
// range sets, rather than iterating over every byte in the range. However, the
// ranges passed into this method tend to be small, so the initial setup time for
// each region can dominate the performance. (The optimized implementation of this
// approach is also fairly complicated.)
//
// A memory-hungry alternative is to create arrays of integers, one entry per byte
// in the file, and set each entry to the number of bytes in the run that would
// follow at that point. So if a run of 20 zeroes began at offset 5, you would
// set run[5]=20, run[6]=19, and so on. That avoids searching in the sets, at the
// cost of potentially several megabytes for a large 65816 file.
//
// It's even possible that Regex would handle this faster and more easily. This
// can be done fairly quickly with "unsafe" code, e.g.:
// https://stackoverflow.com/questions/3028768/net-regular-expressions-on-bytes-instead-of-chars
// Could be useful for ASCII stuff and the repeated-byte detector, e.g.:
// https://stackoverflow.com/questions/1660694/regular-expression-to-match-any-character-being-repeated-more-than-10-times
//
// Ultimately we're just not spending that much time here. Setting
// AnalyzeUncategorizedData=false reveals that most of the time is spent in
// the caller, identifying the regions, so a significant improvement here won't
// have much impact on the user experience.
mDebugLog.LogI("Analyzing +" + start.ToString("x6") + " - +" + end.ToString("x6"));
mDebugLog.LogI("Analyzing [+" + start.ToString("x6") + ",+" + end.ToString("x6") +"]");
int minStringChars = mAnalysisParams.MinCharsForString;
bool doAnalysis = mAnalysisParams.AnalyzeUncategorizedData;
FormatDescriptor oneByteDefault = FormatDescriptor.Create(1,
FormatDescriptor.Type.Default, FormatDescriptor.SubType.None);
FormatDescriptor.DebugPrefabBump(-1);
while (start <= end) {
if (!doAnalysis) {
// Analysis is disabled, so just mark everything as single-byte data.
if (!mAnalysisParams.AnalyzeUncategorizedData) {
// Analysis is disabled, so just mark everything as single-byte data.
while (start <= end) {
mAnattribs[start].DataDescriptor = oneByteDefault;
FormatDescriptor.DebugPrefabBump();
start++;
continue;
}
return;
}
int minStringChars = mAnalysisParams.MinCharsForString;
#if false // this is actually slower (and uses more memory)
while (start <= end) {
// This is used to let us skip forward. It starts past the end of the block,
// and moves backward as we identify potential points of interest.
int minNextStart = end + 1;
bool found = mProject.RepeatedBytes.GetContainingOrSubsequentRange(start,
out TypedRangeSet.TypedRange tyRange);
if (found) {
if (tyRange.Low <= start) {
// found a matching range
Debug.Assert(tyRange.Low <= start && tyRange.High >= start);
int clampEnd = Math.Min(tyRange.High, end);
int repLen = clampEnd - start + 1;
if (repLen >= MIN_RUN_LENGTH) {
bool isAscii =
TextUtil.IsPrintableAscii((char)(mFileData[start] & 0x7f));
// IF the run isn't ASCII, OR it's so long that we don't want to
// encode it as a string, OR it's so short that we don't want to
// treat it as a string, THEN output it as a run. Otherwise, just
// let the ASCII-catcher handle it later.
if (!isAscii ||
repLen > MIN_RUN_LENGTH_ASCII || repLen < minStringChars) {
LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " +
repLen + " bytes");
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(
repLen, FormatDescriptor.Type.Fill,
FormatDescriptor.SubType.None);
start += repLen;
continue;
}
}
// We didn't like this range. We probably won't like it for any other
// point within the range, so start again past it. Ideally we'd use
// Range.Low of the range that followed the one that was returned, but
// we don't have that handy.
minNextStart = Math.Min(minNextStart, tyRange.High + 1);
} else {
// no match; try to advance to the start of the next range.
Debug.Assert(tyRange.Low > start);
minNextStart = Math.Min(minNextStart, tyRange.Low);
}
}
found = mProject.StdAsciiBytes.GetContainingOrSubsequentRange(start,
out RangeSet.Range range);
if (found) {
if (range.Low <= start) {
// found a matching range
Debug.Assert(range.Low <= start && range.High >= start);
int clampEnd = Math.Min(range.High, end);
int repLen = clampEnd - start + 1;
if (repLen >= minStringChars) {
LogV(start, "Std ASCII string, len=" + repLen + " bytes");
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(repLen,
FormatDescriptor.Type.String, FormatDescriptor.SubType.None);
start += repLen;
continue;
}
minNextStart = Math.Min(minNextStart, range.High + 1);
} else {
Debug.Assert(range.Low > start);
minNextStart = Math.Min(minNextStart, range.Low);
}
}
found = mProject.HighAsciiBytes.GetContainingOrSubsequentRange(start,
out range);
if (found) {
if (range.Low <= start) {
// found a matching range
Debug.Assert(range.Low <= start && range.High >= start);
int clampEnd = Math.Min(range.High, end);
int repLen = clampEnd - start + 1;
if (repLen >= minStringChars) {
LogV(start, "High ASCII string, len=" + repLen + " bytes");
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(repLen,
FormatDescriptor.Type.String, FormatDescriptor.SubType.None);
start += repLen;
continue;
}
minNextStart = Math.Min(minNextStart, range.High + 1);
} else {
Debug.Assert(range.Low > start);
minNextStart = Math.Min(minNextStart, range.Low);
}
}
// Advance to the next possible run location.
int nextStart = minNextStart > 0 ? minNextStart : start + 1;
Debug.Assert(nextStart > start);
// No runs found, output as single bytes. This is the easiest form for users
// to edit.
while (start < nextStart) {
mAnattribs[start].DataDescriptor = oneByteDefault;
FormatDescriptor.DebugPrefabBump();
start++;
}
}
#else
while (start <= end) {
// Check for block of repeated values.
int length = RecognizeRun(mFileData, start, end);
int runLen = RecognizeRun(mFileData, start, end);
bool isAscii = TextUtil.IsPrintableAscii((char)(mFileData[start] & 0x7f));
if (length >= MIN_RUN_LENGTH) {
if (runLen >= MIN_RUN_LENGTH) {
// Output as run or ASCII string. Prefer ASCII if the string is short
// enough to fit on one line (e.g. 64 chars including delimiters) and
// meets the minimum string length threshold.
if (isAscii && length <= MIN_RUN_LENGTH_ASCII && length >= minStringChars) {
// string -- if we create the descriptor here, we save a little time,
// but strings like "*****hello" turn into two separate strings.
//LogV(start, "String from run of '" + (char)(mFileData[start] & 0x7f) +
// "': " + length + " bytes");
//mAnattribs[start].DataDescriptor = FormatDescriptor.CreateDescriptor(
// length, FormatDescriptor.Type.String,
// FormatDescriptor.SubType.None);
//start += length;
//continue;
if (isAscii && runLen <= MIN_RUN_LENGTH_ASCII && runLen >= minStringChars) {
// String -- if we create the descriptor here, we save a little time,
// but strings like "*****hello" turn into two separate strings. So
// just fall through and let the ASCII recognizer handle it.
} else {
// run
LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " +
length + " bytes");
runLen + " bytes");
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(
length, FormatDescriptor.Type.Fill,
runLen, FormatDescriptor.Type.Fill,
FormatDescriptor.SubType.None);
start += length;
start += runLen;
continue;
}
}
length = RecognizeAscii(mFileData, start, end);
if (length >= minStringChars) {
LogV(start, "ASCII string, len=" + length + " bytes");
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(length,
int asciiLen = RecognizeAscii(mFileData, start, end);
if (asciiLen >= minStringChars) {
LogV(start, "ASCII string, len=" + asciiLen + " bytes");
mAnattribs[start].DataDescriptor = FormatDescriptor.Create(asciiLen,
FormatDescriptor.Type.String, FormatDescriptor.SubType.None);
start += length;
start += asciiLen;
continue;
}
// Nothing found, output as single byte. This is the easiest form for users
// to edit.
mAnattribs[start].DataDescriptor = oneByteDefault;
FormatDescriptor.DebugPrefabBump();
// It's tempting to advance by the "length" result from RecognizeRun, and if
// we were just looking for runs of identical bytes we could. However, that
// would lose short ASCII strings that began with repeated bytes, e.g. "---%".
start++;
// to edit. If we found a run, but it was too short, we can go ahead and
// mark all bytes in the run because we know the later matches will also be
// too short.
Debug.Assert(runLen > 0);
while (runLen-- != 0) {
mAnattribs[start++].DataDescriptor = oneByteDefault;
FormatDescriptor.DebugPrefabBump();
}
}
#endif
}
}
#region Static analyzer methods
#region Static analyzer methods
/// <summary>
/// Checks for a repeated run of the same byte.
@ -940,7 +1066,7 @@ namespace SourceGen {
return stringCount;
}
#endregion // Static analyzers
#endregion // Static analyzers
}
}

View File

@ -37,7 +37,7 @@ namespace SourceGen {
private const long MAGIC = 6982516645493599905;
#region Data that is saved and restored
#region Data that is saved to the project file
// All data held by structures in this section are persistent, and will be
// written to the project file. Anything not in this section may be discarded
// at any time. Smaller items are kept in arrays, with one entry per byte
@ -143,6 +143,13 @@ namespace SourceGen {
// Project and platform symbols that are being referenced from code.
public List<DefSymbol> ActiveDefSymbolList { get; private set; }
#if false
// Data scan results.
public TypedRangeSet RepeatedBytes { get; private set; }
public RangeSet StdAsciiBytes { get; private set; }
public RangeSet HighAsciiBytes { get; private set; }
#endif
// List of changes for undo/redo.
private List<ChangeSet> mUndoList = new List<ChangeSet>();
@ -226,6 +233,9 @@ namespace SourceGen {
mFileData = fileData;
mDataFileName = dataFileName;
FileDataCrc32 = CommonUtil.CRC32.OnWholeBuffer(0, mFileData);
#if false
ScanFileData();
#endif
// Mark the first byte as code so we have something to do. This may get
// overridden later.
@ -303,8 +313,89 @@ namespace SourceGen {
Debug.Assert(CRC32.OnWholeBuffer(0, fileData) == FileDataCrc32);
mFileData = fileData;
mDataFileName = dataFileName;
#if false
ScanFileData();
#endif
}
#if false
private delegate bool ByteTest(byte val); // for ScanFileData()
/// <summary>
/// Scans the contents of the file data array, noting runs of identical bytes and
/// runs of bytes in the standard-ASCII (0x20-0x7e) and high-ASCII (0xa0-0xfe) value
/// ranges.  The results are stored in RepeatedBytes, StdAsciiBytes, and HighAsciiBytes.
///
/// The file data is guaranteed not to change, so doing a bit of work here can save
/// us time during data analysis.
/// </summary>
private void ScanFileData() {
    DateTime startWhen = DateTime.UtcNow;

    // Find runs of identical bytes.  The byte value is passed as the range type.
    // The loop runs one step past the end of the data so the final run is flushed by
    // the same code as every other run; an empty buffer simply never enters the loop.
    TypedRangeSet repeats = new TypedRangeSet();
    Debug.Assert(mFileData.Length > 0);
    int runStart = 0;
    for (int i = 1; i <= mFileData.Length; i++) {
        if (i < mFileData.Length && mFileData[i] == mFileData[runStart]) {
            // still inside the current run
            continue;
        }
        if (i - runStart >= DataAnalysis.MIN_RUN_LENGTH) {
            repeats.AddRange(runStart, i - 1, mFileData[runStart]);
        }
        runStart = i;
    }

    // Find runs of standard ASCII and high ASCII bytes.
    RangeSet ascii = new RangeSet();
    CreateByteRangeSet(ascii, mFileData, DataAnalysis.MIN_STRING_LENGTH,
        val => val >= 0x20 && val < 0x7f);

    RangeSet highAscii = new RangeSet();
    CreateByteRangeSet(highAscii, mFileData, DataAnalysis.MIN_STRING_LENGTH,
        val => val >= 0xa0 && val < 0xff);

    // Flip to true to dump the sets while debugging.
    if (false) {
        repeats.DebugDump("Repeated-Bytes (" + DataAnalysis.MIN_RUN_LENGTH + "+)");
        ascii.DebugDump("Standard-ASCII (" + DataAnalysis.MIN_STRING_LENGTH + "+)");
        highAscii.DebugDump("High-ASCII (" + DataAnalysis.MIN_STRING_LENGTH + "+)");
    }

    // Use TotalMilliseconds: TimeSpan.Milliseconds is only the 0-999 component of the
    // interval, which under-reports any scan that takes a second or longer.
    long elapsedMs = (long)((DateTime.UtcNow - startWhen).TotalMilliseconds);
    Debug.WriteLine("ScanFileData took " + elapsedMs + " ms");

    RepeatedBytes = repeats;
    StdAsciiBytes = ascii;
    HighAsciiBytes = highAscii;
}
/// <summary>
/// Finds each run of consecutive bytes accepted by the tester that is at least minLen
/// bytes long, and passes the offsets of the run's first and last bytes to
/// set.AddRange().
/// </summary>
/// <param name="set">Set that receives the runs.</param>
/// <param name="data">Bytes to scan.</param>
/// <param name="minLen">Shortest run that will be added.</param>
/// <param name="tester">Returns true for byte values that belong in a run.</param>
private void CreateByteRangeSet(RangeSet set, byte[] data, int minLen, ByteTest tester) {
    // Offset of the first byte of the run currently being measured.
    int runStart = 0;

    // Walk one position past the end so that a run touching the end of the data is
    // closed out by the same code path as a run ended by a non-matching byte.
    for (int pos = 0; pos <= data.Length; pos++) {
        if (pos < data.Length && tester(data[pos])) {
            continue;
        }

        int runLen = pos - runStart;
        if (runLen >= minLen) {
            set.AddRange(runStart, pos - 1);
        }
        // The next candidate run begins just after the byte that ended this one.
        runStart = pos + 1;
    }
}
#endif
/// <summary>
/// Loads platform symbol files and extension scripts.
///
@ -486,11 +577,16 @@ namespace SourceGen {
reanalysisTimer.StartTask("GenerateActiveDefSymbolList");
// Generate the list of project/platform symbols that are being used. This forms
// the list of EQUates at the top of the file.
// the list of EQUates at the top of the file. The active set is identified from
// the cross-reference data.
GenerateActiveDefSymbolList();
reanalysisTimer.EndTask("GenerateActiveDefSymbolList");
#if DEBUG
reanalysisTimer.StartTask("Validate");
Validate();
reanalysisTimer.EndTask("Validate");
#endif
reanalysisTimer.EndTask("DisasmProject.Analyze()");
//reanalysisTimer.DumpTimes("DisasmProject timers:", debugLog);

View File

@ -63,6 +63,9 @@ method in <code>DisasmProject.cs</code>):</p>
the list that is displayed in .EQ directives.</li>
<li>Generate cross-reference lists. This is done for file data and
for any platform/project symbols that are referenced.</li>
<li>If annotated auto-labels are enabled, the simple labels are
replaced with the annotated versions here. (This can't be done earlier
because the annotations are generated from the cross-reference data.)</li>
<li>In a debug build, some validity checks are performed.</li>
</ul>
@ -84,7 +87,8 @@ determines the width of the numeric value or length of the string. For
this reason, instructions do not need a format descriptor, but all
data items do.</p>
<p>Symbolic references are format descriptors with a symbol attached.
The symbol reference also specifies low/high/bank.</p>
The symbol reference also specifies low/high/bank, for partial symbol
references like <code>LDA #&gt;symbol</code>.</p>
<p>Every offset marked as a start point gets its own line in the on-screen
display list. Embedded instructions are identified internally by
looking for instruction-start offsets inside instructions.</p>

View File

@ -44,9 +44,9 @@ just outputs raw hex bytes for MVN/MVP instructions. This yields the
correct code for all versions of the assembler, but is ugly and
annoying. So we want to output actual MVN/MVP instructions when producing
code for newer versions of the assembler.</p>
<p>When you configure a cross-assembler, SourceGen executes it and
extracts the version information from the command-line output stream.
This is used by the generator to ensure that the output will compile.
<p>When you configure a cross-assembler, SourceGen runs the executable with
version query args, and extracts the version information from the output
stream. This is used by the generator to ensure that the output will compile.
If no assembler is configured, SourceGen will produce code optimized
for the latest version of the assembler.</p>
@ -58,7 +58,8 @@ generators may produce multiple source files, perhaps a link script or
symbol definition header to go with the assembly source. To avoid
spreading files across the filesystem, SourceGen does all of its work
in the same directory where the project lives. Before you can generate
code, you have to have given your project a name by saving it.</p>
code, you have to have assigned your project a directory. This is why
you can't assemble a project until you've saved it for the first time.</p>
<p>The Generate and Assemble dialog has a drop-down list near the top
that lets you pick which assembler to target. The name of the assembler

View File

@ -6,6 +6,7 @@ NOTE: some tests may fail if you use a version of the assembler that is
different from the one used to generate the expected output. The current
set was generated with:
* 64tass v1.53.1515
* cc65 v2.17
* Merlin 32 v1.0