diff --git a/Asm65/CharEncoding.cs b/Asm65/CharEncoding.cs index 157c0fe..5a20953 100644 --- a/Asm65/CharEncoding.cs +++ b/Asm65/CharEncoding.cs @@ -35,11 +35,15 @@ namespace Asm65 { /// /// /// Yes, I'm assuming it all fits in a UTF-16 char. PETSCII has some glyphs that - /// aren't part of the BMP, but we're targeting a variety of cross-assemblers, so - /// anything non-ASCII is getting hexified anyway. + /// aren't part of the BMP, but we're targeting a variety of cross-assemblers with + /// potentially different notions of Unicode mappings, so anything non-ASCII is + /// getting hexified anyway. /// public delegate char Convert(byte val); + /// + /// Character encoding. + /// public enum Encoding { Unknown = 0, Ascii, @@ -49,7 +53,17 @@ namespace Asm65 { } // - // Standard ASCII. + // Plain ASCII. + // + // We recognize BELL, LF, and CR as control characters that may be present in + // text strings. This allows us to generate: + // + // .str "hello",$0d + // + // instead of: + // + // .str "hello" + // .dd1 $0d // public static bool IsPrintableAscii(byte val) { return (val >= 0x20 && val < 0x7f); @@ -66,7 +80,7 @@ namespace Asm65 { } // - // Standard ASCII, but with the high bit set. + // High ASCII: plain ASCII with the high bit set. // public static bool IsPrintableHighAscii(byte val) { return (val >= 0xa0 && val < 0xff); @@ -83,8 +97,14 @@ namespace Asm65 { } // - // High *or* low ASCII. + // High and/or low ASCII. // + public static bool IsPrintableLowOrHighAscii(byte val) { + return IsPrintableAscii((byte)(val & 0x7f)); + } + public static bool IsExtendedLowOrHighAscii(byte val) { + return IsExtendedAscii((byte)(val & 0x7f)); + } public static char ConvertLowAndHighAscii(byte val) { if (IsPrintableAscii(val) || IsPrintableHighAscii(val)) { return (char)(val & 0x7f); @@ -102,6 +122,7 @@ namespace Asm65 { // Characters with the high bit set are shown with colors reversed. 
// + // // PETSCII (C64 variant) // @@ -142,6 +163,35 @@ namespace Asm65 { // // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf // + private static bool[] sPrintablePetscii = CreatePrintablePetsciiMap(); + private static bool[] sExtendedPetscii = CreateExtendedPetsciiMap(); + private static bool[] CreatePrintablePetsciiMap() { + bool[] map = new bool[256]; + for (int i = 0x20; i <= 0x5b; i++) { + map[i] = true; + } + map[0x5d] = true; + for (int i = 0xc1; i <= 0xda; i++) { + map[i] = true; + } + return map; + } + private static bool[] CreateExtendedPetsciiMap() { + bool[] map = CreatePrintablePetsciiMap(); + // control codes that we might expect to find in strings + map[0x05] = map[0x1c] = map[0x1e] = map[0x1f] = map[0x81] = map[0x90] = map[0x95] = + map[0x96] = map[0x97] = map[0x98] = map[0x99] = map[0x9a] = map[0x9b] = + map[0x9c] = map[0x9e] = map[0x9f] = true; + map[0x93] = map[0x12] = map[0x92] = true; + map[0x07] = map[0x0a] = map[0x0d] = true; + return map; + } + public static bool IsPrintablePetscii(byte val) { + return sPrintablePetscii[val]; + } + public static bool IsExtendedPetscii(byte val) { + return sExtendedPetscii[val]; + } // // C64 Screen Codes @@ -159,5 +209,26 @@ namespace Asm65 { // // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf // + private static bool[] sPrintableScreenCode = CreatePrintableScreenCodeMap(); + private static bool[] CreatePrintableScreenCodeMap() { + bool[] map = new bool[256]; + for (int i = 0x00; i <= 0x1b; i++) { + map[i] = true; + } + map[0x1d] = true; + for (int i = 0x20; i <= 0x3f; i++) { + map[i] = true; + } + for (int i = 0x41; i <= 0x5a; i++) { + map[i] = true; + } + return map; + } + public static bool IsPrintableScreenCode(byte val) { + return sPrintableScreenCode[val]; + } + public static bool IsExtendedScreenCode(byte val) { + return sPrintableScreenCode[val]; + } } } diff --git a/SourceGen/DataAnalysis.cs b/SourceGen/DataAnalysis.cs index 
e5694e7..06477ba 100644 --- a/SourceGen/DataAnalysis.cs +++ b/SourceGen/DataAnalysis.cs @@ -18,6 +18,7 @@ using System.Diagnostics; using Asm65; using CommonUtil; +using TextScanMode = SourceGen.ProjectProperties.AnalysisParameters.TextScanMode; namespace SourceGen { /// @@ -30,11 +31,11 @@ namespace SourceGen { // Minimum number of consecutive identical bytes for something to be called a "run". private const int MIN_RUN_LENGTH = 5; - // Minimum length for treating data as a run if the byte is a valid ASCII value. - // (Alternatively, the maximum length of an ASCII string composed of single characters.) + // Minimum length for treating data as a run if the byte is a printable character. + // (Alternatively, the maximum length of a character string composed of a single value.) // Anything shorter than this is handled with a string directive, anything this long or // longer becomes FILL. This should be larger than the MinCharsForString parameter. - private const int MIN_RUN_LENGTH_ASCII = 62; + private const int MAX_STRING_RUN_LENGTH = 62; // Absolute minimum string length for auto-detection. This is used when generating the // data tables. @@ -605,7 +606,7 @@ namespace SourceGen { int minStringChars = mAnalysisParams.MinCharsForString; -#if false // this is actually slower (and uses more memory) +#if DATA_PRESCAN // this is actually slower (and uses more memory) while (start <= end) { // This is used to let us skip forward. It starts past the end of the block, // and moves backward as we identify potential points of interest. @@ -709,53 +710,105 @@ namespace SourceGen { } } #else + // Select "is printable" test. We use the extended version to include some + // control characters. 
+ CharEncoding.InclusionTest testPrintable; + FormatDescriptor.SubType baseSubType; + switch (mAnalysisParams.DefaultTextScanMode) { + case TextScanMode.LowAscii: + testPrintable = CharEncoding.IsExtendedAscii; + baseSubType = FormatDescriptor.SubType.Ascii; + break; + case TextScanMode.LowHighAscii: + testPrintable = CharEncoding.IsExtendedLowOrHighAscii; + baseSubType = FormatDescriptor.SubType.ASCII_GENERIC; + break; + case TextScanMode.C64Petscii: + testPrintable = CharEncoding.IsExtendedPetscii; + baseSubType = FormatDescriptor.SubType.C64Petscii; + break; + case TextScanMode.C64ScreenCode: + testPrintable = CharEncoding.IsExtendedScreenCode; + baseSubType = FormatDescriptor.SubType.C64Screen; + break; + default: + Debug.Assert(false); + testPrintable = CharEncoding.IsExtendedLowOrHighAscii; + baseSubType = FormatDescriptor.SubType.ASCII_GENERIC; + break; + } + while (start <= end) { // Check for block of repeated values. int runLen = RecognizeRun(mFileData, start, end); - bool isAscii = TextUtil.IsPrintableAscii((char)(mFileData[start] & 0x7f)); - if (runLen >= MIN_RUN_LENGTH) { - // Output as run or ASCII string. Prefer ASCII if the string is short - // enough to fit on one line (e.g. 64 chars including delimiters) and - // meets the minimum string length threshold. - if (isAscii && runLen <= MIN_RUN_LENGTH_ASCII && runLen >= minStringChars) { - // String -- if we create the descriptor here, we save a little time, - // but strings like "*****hello" turn into two separate strings. So - // just fall through and let the ASCII recognizer handle it. 
- } else { - // run - LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " + - runLen + " bytes"); - mAnattribs[start].DataDescriptor = FormatDescriptor.Create( - runLen, FormatDescriptor.Type.Fill, - FormatDescriptor.SubType.None); - start += runLen; - continue; + int printLen = 0; + FormatDescriptor.SubType subType = baseSubType; + + if (testPrintable(mFileData[start])) { + // The run byte is printable, and the run is shorter than a line. It's + // possible the run is followed by additional printable characters, e.g. + // "*****hello". Text is easier for humans to understand, so we prefer + // that unless the run is longer than one line. + if (runLen <= MAX_STRING_RUN_LENGTH) { + // See if the run is followed by additional printable characters. + printLen = runLen; + + // For LowHighAscii we allow a string to be either low or high, but it + // must be entirely one thing. Refine our test. + CharEncoding.InclusionTest refinedTest = testPrintable; + if (mAnalysisParams.DefaultTextScanMode == TextScanMode.LowHighAscii) { + if (CharEncoding.IsExtendedAscii(mFileData[start])) { + refinedTest = CharEncoding.IsExtendedAscii; + subType = FormatDescriptor.SubType.Ascii; + } else { + refinedTest = CharEncoding.IsExtendedHighAscii; + subType = FormatDescriptor.SubType.HighAscii; + } + } + for (int i = start + runLen; i <= end; i++) { + if (!refinedTest(mFileData[i])) { + break; + } + printLen++; + } } } - int asciiLen = RecognizeAscii(mFileData, start, end); - if (asciiLen >= minStringChars) { - LogV(start, "ASCII string, len=" + asciiLen + " bytes"); - bool isHigh = (mFileData[start] & 0x80) != 0; - mAnattribs[start].DataDescriptor = FormatDescriptor.Create(asciiLen, - FormatDescriptor.Type.StringGeneric, isHigh ? - FormatDescriptor.SubType.HighAscii : FormatDescriptor.SubType.Ascii); - start += asciiLen; - continue; - } - - // Nothing found, output as single byte. This is the easiest form for users - // to edit. 
If we found a run, but it was too short, we can go ahead and - // mark all bytes in the run because we know the later matches will also be - // too short. - Debug.Assert(runLen > 0); - while (runLen-- != 0) { - mAnattribs[start++].DataDescriptor = oneByteDefault; - FormatDescriptor.DebugPrefabBump(); + if (printLen >= minStringChars) { + // This is either a short run followed by printable characters, or just a + // (possibly very large) bunch of printable characters. + Debug.Assert(subType != FormatDescriptor.SubType.ASCII_GENERIC); + LogD(start, "Character string (" + subType + "), len=" + printLen + " bytes"); + mAnattribs[start].DataDescriptor = FormatDescriptor.Create(printLen, + FormatDescriptor.Type.StringGeneric, subType); + start += printLen; + } else if (runLen >= MIN_RUN_LENGTH) { + // Didn't qualify as a string, but it's long enough to be a run. + // + // TODO(someday): allow .fill pseudo-ops to have character encoding + // sub-types, so we can ".fill 64,'*'". Easy to do here, but + // proper treatment requires tweaking data operand editor to allow + // char encoding to be specified. + LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " + + runLen + " bytes"); + mAnattribs[start].DataDescriptor = FormatDescriptor.Create( + runLen, FormatDescriptor.Type.Fill, + FormatDescriptor.SubType.None); + start += runLen; + } else { + // Nothing useful found, output 1+ values as single bytes. This is the + // easiest form for users to edit. If we found a run, but it was too short, + // we can go ahead and mark all bytes in the run because we know the later + // matches will also be too short. + Debug.Assert(runLen > 0); + while (runLen-- != 0) { + mAnattribs[start++].DataDescriptor = oneByteDefault; + FormatDescriptor.DebugPrefabBump(); + } } } #endif - } + } #region Static analyzer methods @@ -777,29 +830,6 @@ namespace SourceGen { return index - start; } - /// - /// Checks for a run of ASCII values. 
Both high and low ASCII are recognized, - /// but the entire run must be one or the other. - /// - /// Raw data. - /// Offset of first byte in range. - /// Offset of last byte in range. - /// Length of run. - public static int RecognizeAscii(byte[] fileData, int start, int end) { - // This won't find a mix of Apple II high/inverse/flashing text. - byte firstHi = (byte)(fileData[start] & 0x80); - - int index; - for (index = start; index <= end; index++) { - char ch = (char)fileData[index]; - if (!TextUtil.IsPrintableAscii((char)(ch & 0x7f)) || (ch & 0x80) != firstHi) { - break; - } - } - - return index - start; - } - /// /// Counts the number of low-ASCII, high-ASCII, and non-ASCII values in the /// specified region. @@ -1080,7 +1110,7 @@ namespace SourceGen { -#if false +#if DATA_PRESCAN /// /// Iterator that generates a list of offsets which are not known to hold code or data. /// diff --git a/SourceGen/DisasmProject.cs b/SourceGen/DisasmProject.cs index c72c239..5b02417 100644 --- a/SourceGen/DisasmProject.cs +++ b/SourceGen/DisasmProject.cs @@ -143,7 +143,7 @@ namespace SourceGen { // Project and platform symbols that are being referenced from code. public List ActiveDefSymbolList { get; private set; } -#if false +#if DATA_PRESCAN // Data scan results. public TypedRangeSet RepeatedBytes { get; private set; } public RangeSet StdAsciiBytes { get; private set; } @@ -233,7 +233,7 @@ namespace SourceGen { mFileData = fileData; mDataFileName = dataFileName; FileDataCrc32 = CommonUtil.CRC32.OnWholeBuffer(0, mFileData); -#if false +#if DATA_PRESCAN ScanFileData(); #endif @@ -317,12 +317,12 @@ namespace SourceGen { FixAndValidate(ref report); -#if false +#if DATA_PRESCAN ScanFileData(); #endif } -#if false +#if DATA_PRESCAN private delegate bool ByteTest(byte val); // for ScanFileData() ///