Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data now recognizes the C64 encodings when selected in the project properties. The new code avoids some redundant comparisons when runs of printable characters are found. I suspect the new implementation loses on overall performance because we're now calling through delegates instead of testing characters directly, but I haven't tested for that.
2025-04-06 08:47:20 +00:00 · 2019-08-13 14:08:27 -07:00 · 2019-08-13 14:08:27 -07:00 · f3c28406a5
commit f3c28406a5
parent d5b53a0795
3 changed files with 176 additions and 75 deletions
--- a/Asm65/CharEncoding.cs
+++ b/Asm65/CharEncoding.cs
@ -35,11 +35,15 @@ namespace Asm65 {
        /// </summary>
        /// <remarks>
        /// Yes, I'm assuming it all fits in a UTF-16 char.  PETSCII has some glyphs that
-        /// aren't part of the BMP, but we're targeting a variety of cross-assemblers, so
-        /// anything non-ASCII is getting hexified anyway.
+        /// aren't part of the BMP, but we're targeting a variety of cross-assemblers with
+        /// potentially different notions of Unicode mappings, so anything non-ASCII is
+        /// getting hexified anyway.
        /// </remarks>
        public delegate char Convert(byte val);

+        /// <summary>
+        /// Character encoding.
+        /// </summary>
        public enum Encoding {
            Unknown = 0,
            Ascii,
@ -49,7 +53,17 @@ namespace Asm65 {
        }

        //
-        // Standard ASCII.
+        // Plain ASCII.
+        //
+        // We recognize BELL, LF, and CR as control characters that may be present in
+        // text strings.  This allows us to generate:
+        //
+        //  .str "hello",$0d
+        //
+        // instead of:
+        //
+        //  .str "hello"
+        //  .dd1  $0d
        //
        public static bool IsPrintableAscii(byte val) {
            return (val >= 0x20 && val < 0x7f);
@ -66,7 +80,7 @@ namespace Asm65 {
        }

        //
-        // Standard ASCII, but with the high bit set.
+        // High ASCII: plain ASCII with the high bit set.
        //
        public static bool IsPrintableHighAscii(byte val) {
            return (val >= 0xa0 && val < 0xff);
@ -83,8 +97,14 @@ namespace Asm65 {
        }

        //
-        // High *or* low ASCII.
+        // High and/or low ASCII.
        //
+        public static bool IsPrintableLowOrHighAscii(byte val) {
+            return IsPrintableAscii((byte)(val & 0x7f));
+        }
+        public static bool IsExtendedLowOrHighAscii(byte val) {
+            return IsExtendedAscii((byte)(val & 0x7f));
+        }
        public static char ConvertLowAndHighAscii(byte val) {
            if (IsPrintableAscii(val) || IsPrintableHighAscii(val)) {
                return (char)(val & 0x7f);
@ -102,6 +122,7 @@ namespace Asm65 {
        // Characters with the high bit set are shown with colors reversed.
        //

+
        //
        // PETSCII (C64 variant)
        //
@ -142,6 +163,35 @@ namespace Asm65 {
        //
        // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
        //
+        private static bool[] sPrintablePetscii = CreatePrintablePetsciiMap();
+        private static bool[] sExtendedPetscii = CreateExtendedPetsciiMap();
+        private static bool[] CreatePrintablePetsciiMap() {
+            bool[] map = new bool[256];
+            for (int i = 0x20; i <= 0x5b; i++) {
+                map[i] = true;
+            }
+            map[0x5d] = true;
+            for (int i = 0xc1; i <= 0xda; i++) {
+                map[i] = true;
+            }
+            return map;
+        }
+        private static bool[] CreateExtendedPetsciiMap() {
+            bool[] map = CreatePrintablePetsciiMap();
+            // control codes that we might expect to find in strings
+            map[0x05] = map[0x1c] = map[0x1e] = map[0x1f] = map[0x81] = map[0x90] = map[0x95] =
+                map[0x96] = map[0x97] = map[0x98] = map[0x99] = map[0x9a] = map[0x9b] =
+                map[0x9c] = map[0x9e] = map[0x9f] = true;
+            map[0x93] = map[0x12] = map[0x92] = true;
+            map[0x07] = map[0x0a] = map[0x0d] = true;
+            return map;
+        }
+        public static bool IsPrintablePetscii(byte val) {
+            return sPrintablePetscii[val];
+        }
+        public static bool IsExtendedPetscii(byte val) {
+            return sExtendedPetscii[val];
+        }

        //
        // C64 Screen Codes
@ -159,5 +209,26 @@ namespace Asm65 {
        //
        // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
        //
+        private static bool[] sPrintableScreenCode = CreatePrintableScreenCodeMap();
+        private static bool[] CreatePrintableScreenCodeMap() {
+            bool[] map = new bool[256];
+            for (int i = 0x00; i <= 0x1b; i++) {
+                map[i] = true;
+            }
+            map[0x1d] = true;
+            for (int i = 0x20; i <= 0x3f; i++) {
+                map[i] = true;
+            }
+            for (int i = 0x41; i <= 0x5a; i++) {
+                map[i] = true;
+            }
+            return map;
+        }
+        public static bool IsPrintableScreenCode(byte val) {
+            return sPrintableScreenCode[val];
+        }
+        public static bool IsExtendedScreenCode(byte val) {
+            return sPrintableScreenCode[val];
+        }
    }
 }
--- a/SourceGen/DataAnalysis.cs
+++ b/SourceGen/DataAnalysis.cs
@ -18,6 +18,7 @@ using System.Diagnostics;

 using Asm65;
 using CommonUtil;
+using TextScanMode = SourceGen.ProjectProperties.AnalysisParameters.TextScanMode;

 namespace SourceGen {
    /// <summary>
@ -30,11 +31,11 @@ namespace SourceGen {
        // Minimum number of consecutive identical bytes for something to be called a "run".
        private const int MIN_RUN_LENGTH = 5;

-        // Minimum length for treating data as a run if the byte is a valid ASCII value.
-        // (Alternatively, the maximum length of an ASCII string composed of single characters.)
+        // Minimum length for treating data as a run if the byte is a printable character.
+        // (Alternatively, the maximum length of a character string composed of a single value.)
        // Anything shorter than this is handled with a string directive, anything this long or
        // longer becomes FILL.  This should be larger than the MinCharsForString parameter.
-        private const int MIN_RUN_LENGTH_ASCII = 62;
+        private const int MAX_STRING_RUN_LENGTH = 62;

        // Absolute minimum string length for auto-detection.  This is used when generating the
        // data tables.
@ -605,7 +606,7 @@ namespace SourceGen {

            int minStringChars = mAnalysisParams.MinCharsForString;

-#if false   // this is actually slower (and uses more memory)
+#if DATA_PRESCAN   // this is actually slower (and uses more memory)
            while (start <= end) {
                // This is used to let us skip forward.  It starts past the end of the block,
                // and moves backward as we identify potential points of interest.
@ -709,53 +710,105 @@ namespace SourceGen {
                }
            }
 #else
+            // Select "is printable" test.  We use the extended version to include some
+            // control characters.
+            CharEncoding.InclusionTest testPrintable;
+            FormatDescriptor.SubType baseSubType;
+            switch (mAnalysisParams.DefaultTextScanMode) {
+                case TextScanMode.LowAscii:
+                    testPrintable = CharEncoding.IsExtendedAscii;
+                    baseSubType = FormatDescriptor.SubType.Ascii;
+                    break;
+                case TextScanMode.LowHighAscii:
+                    testPrintable = CharEncoding.IsExtendedLowOrHighAscii;
+                    baseSubType = FormatDescriptor.SubType.ASCII_GENERIC;
+                    break;
+                case TextScanMode.C64Petscii:
+                    testPrintable = CharEncoding.IsExtendedPetscii;
+                    baseSubType = FormatDescriptor.SubType.C64Petscii;
+                    break;
+                case TextScanMode.C64ScreenCode:
+                    testPrintable = CharEncoding.IsExtendedScreenCode;
+                    baseSubType = FormatDescriptor.SubType.C64Screen;
+                    break;
+                default:
+                    Debug.Assert(false);
+                    testPrintable = CharEncoding.IsExtendedLowOrHighAscii;
+                    baseSubType = FormatDescriptor.SubType.ASCII_GENERIC;
+                    break;
+            }
+
            while (start <= end) {
                // Check for block of repeated values.
                int runLen = RecognizeRun(mFileData, start, end);
-                bool isAscii = TextUtil.IsPrintableAscii((char)(mFileData[start] & 0x7f));
-                if (runLen >= MIN_RUN_LENGTH) {
-                    // Output as run or ASCII string.  Prefer ASCII if the string is short
-                    // enough to fit on one line (e.g. 64 chars including delimiters) and
-                    // meets the minimum string length threshold.
-                    if (isAscii && runLen <= MIN_RUN_LENGTH_ASCII && runLen >= minStringChars) {
-                        // String -- if we create the descriptor here, we save a little time,
-                        // but strings like "*****hello" turn into two separate strings.  So
-                        // just fall through and let the ASCII recognizer handle it.
-                    } else {
-                        // run
-                        LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " +
-                            runLen + " bytes");
-                        mAnattribs[start].DataDescriptor = FormatDescriptor.Create(
-                            runLen, FormatDescriptor.Type.Fill,
-                            FormatDescriptor.SubType.None);
-                        start += runLen;
-                        continue;
+                int printLen = 0;
+                FormatDescriptor.SubType subType = baseSubType;
+
+                if (testPrintable(mFileData[start])) {
+                    // The run byte is printable.  If the run also fits on one line, it may
+                    // be followed by additional printable characters, e.g. "*****hello".
+                    // Text is easier for humans to understand, so we prefer that unless
+                    // the run is longer than one line.
+                    if (runLen <= MAX_STRING_RUN_LENGTH) {
+                        // See if the run is followed by additional printable characters.
+                        printLen = runLen;
+
+                        // For LowHighAscii we allow a string to be either low or high, but it
+                        // must be entirely one thing.  Refine our test.
+                        CharEncoding.InclusionTest refinedTest = testPrintable;
+                        if (mAnalysisParams.DefaultTextScanMode == TextScanMode.LowHighAscii) {
+                            if (CharEncoding.IsExtendedAscii(mFileData[start])) {
+                                refinedTest = CharEncoding.IsExtendedAscii;
+                                subType = FormatDescriptor.SubType.Ascii;
+                            } else {
+                                refinedTest = CharEncoding.IsExtendedHighAscii;
+                                subType = FormatDescriptor.SubType.HighAscii;
+                            }
+                        }
+                        for (int i = start + runLen; i <= end; i++) {
+                            if (!refinedTest(mFileData[i])) {
+                                break;
+                            }
+                            printLen++;
+                        }
                    }
                }

-                int asciiLen = RecognizeAscii(mFileData, start, end);
-                if (asciiLen >= minStringChars) {
-                    LogV(start, "ASCII string, len=" + asciiLen + " bytes");
-                    bool isHigh = (mFileData[start] & 0x80) != 0;
-                    mAnattribs[start].DataDescriptor = FormatDescriptor.Create(asciiLen,
-                        FormatDescriptor.Type.StringGeneric, isHigh ?
-                        FormatDescriptor.SubType.HighAscii : FormatDescriptor.SubType.Ascii);
-                    start += asciiLen;
-                    continue;
-                }
-
-                // Nothing found, output as single byte.  This is the easiest form for users
-                // to edit.  If we found a run, but it was too short, we can go ahead and
-                // mark all bytes in the run because we know the later matches will also be
-                // too short.
-                Debug.Assert(runLen > 0);
-                while (runLen-- != 0) {
-                    mAnattribs[start++].DataDescriptor = oneByteDefault;
-                    FormatDescriptor.DebugPrefabBump();
+                if (printLen >= minStringChars) {
+                    // This is either a short run followed by printable characters, or just a
+                    // (possibly very large) bunch of printable characters.
+                    Debug.Assert(subType != FormatDescriptor.SubType.ASCII_GENERIC);
+                    LogD(start, "Character string (" + subType + "), len=" + printLen + " bytes");
+                    mAnattribs[start].DataDescriptor = FormatDescriptor.Create(printLen,
+                        FormatDescriptor.Type.StringGeneric, subType);
+                    start += printLen;
+                } else if (runLen >= MIN_RUN_LENGTH) {
+                    // Didn't qualify as a string, but it's long enough to be a run.
+                    //
+                    // TODO(someday): allow .fill pseudo-ops to have character encoding
+                    //   sub-types, so we can ".fill 64,'*'".  Easy to do here, but
+                    //   proper treatment requires tweaking data operand editor to allow
+                    //   char encoding to be specified.
+                    LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " +
+                        runLen + " bytes");
+                    mAnattribs[start].DataDescriptor = FormatDescriptor.Create(
+                        runLen, FormatDescriptor.Type.Fill,
+                        FormatDescriptor.SubType.None);
+                    start += runLen;
+                } else {
+                    // Nothing useful found, output 1+ values as single bytes.  This is the
+                    // easiest form for users to edit.  If we found a run, but it was too short,
+                    // we can go ahead and mark all bytes in the run because we know the later
+                    // matches will also be too short.
+                    Debug.Assert(runLen > 0);
+                    while (runLen-- != 0) {
+                        mAnattribs[start++].DataDescriptor = oneByteDefault;
+                        FormatDescriptor.DebugPrefabBump();
+                    }
                }
            }
 #endif
-            }
+        }

 #region Static analyzer methods

@ -777,29 +830,6 @@ namespace SourceGen {
            return index - start;
        }

-        /// <summary>
-        /// Checks for a run of ASCII values.  Both high and low ASCII are recognized,
-        /// but the entire run must be one or the other.
-        /// </summary>
-        /// <param name="fileData">Raw data.</param>
-        /// <param name="start">Offset of first byte in range.</param>
-        /// <param name="end">Offset of last byte in range.</param>
-        /// <returns>Length of run.</returns>
-        public static int RecognizeAscii(byte[] fileData, int start, int end) {
-            // This won't find a mix of Apple II high/inverse/flashing text.
-            byte firstHi = (byte)(fileData[start] & 0x80);
-
-            int index;
-            for (index = start; index <= end; index++) {
-                char ch = (char)fileData[index];
-                if (!TextUtil.IsPrintableAscii((char)(ch & 0x7f)) || (ch & 0x80) != firstHi) {
-                    break;
-                }
-            }
-
-            return index - start;
-        }
-
        /// <summary>
        /// Counts the number of low-ASCII, high-ASCII, and non-ASCII values in the
        /// specified region.
@ -1080,7 +1110,7 @@ namespace SourceGen {



-#if false
+#if DATA_PRESCAN
        /// <summary>
        /// Iterator that generates a list of offsets which are not known to hold code or data.
        /// 
--- a/SourceGen/DisasmProject.cs
+++ b/SourceGen/DisasmProject.cs
@ -143,7 +143,7 @@ namespace SourceGen {
        // Project and platform symbols that are being referenced from code.
        public List<DefSymbol> ActiveDefSymbolList { get; private set; }

-#if false
+#if DATA_PRESCAN
        // Data scan results.
        public TypedRangeSet RepeatedBytes { get; private set; }
        public RangeSet StdAsciiBytes { get; private set; }
@ -233,7 +233,7 @@ namespace SourceGen {
            mFileData = fileData;
            mDataFileName = dataFileName;
            FileDataCrc32 = CommonUtil.CRC32.OnWholeBuffer(0, mFileData);
-#if false
+#if DATA_PRESCAN
            ScanFileData();
 #endif

@ -317,12 +317,12 @@ namespace SourceGen {

            FixAndValidate(ref report);

-#if false
+#if DATA_PRESCAN
            ScanFileData();
 #endif
        }

-#if false
+#if DATA_PRESCAN
        private delegate bool ByteTest(byte val);   // for ScanFileData()

        /// <summary>