diff --git a/Asm65/CharEncoding.cs b/Asm65/CharEncoding.cs
index 157c0fe..5a20953 100644
--- a/Asm65/CharEncoding.cs
+++ b/Asm65/CharEncoding.cs
@@ -35,11 +35,15 @@ namespace Asm65 {
///
///
/// Yes, I'm assuming it all fits in a UTF-16 char. PETSCII has some glyphs that
- /// aren't part of the BMP, but we're targeting a variety of cross-assemblers, so
- /// anything non-ASCII is getting hexified anyway.
+ /// aren't part of the BMP, but we're targeting a variety of cross-assemblers with
+ /// potentially different notions of Unicode mappings, so anything non-ASCII is
+ /// getting hexified anyway.
///
public delegate char Convert(byte val);
+ ///
+ /// Character encoding.
+ ///
public enum Encoding {
Unknown = 0,
Ascii,
@@ -49,7 +53,17 @@ namespace Asm65 {
}
//
- // Standard ASCII.
+ // Plain ASCII.
+ //
+ // We recognize BELL, LF, and CR as control characters that may be present in
+ // text strings. This allows us to generate:
+ //
+ // .str "hello",$0d
+ //
+ // instead of:
+ //
+ // .str "hello"
+ // .dd1 $0d
//
public static bool IsPrintableAscii(byte val) {
return (val >= 0x20 && val < 0x7f);
@@ -66,7 +80,7 @@ namespace Asm65 {
}
//
- // Standard ASCII, but with the high bit set.
+ // High ASCII: plain ASCII with the high bit set.
//
public static bool IsPrintableHighAscii(byte val) {
return (val >= 0xa0 && val < 0xff);
@@ -83,8 +97,14 @@ namespace Asm65 {
}
//
- // High *or* low ASCII.
+ // High and/or low ASCII.
//
+ public static bool IsPrintableLowOrHighAscii(byte val) {
+ return IsPrintableAscii((byte)(val & 0x7f));
+ }
+ public static bool IsExtendedLowOrHighAscii(byte val) {
+ return IsExtendedAscii((byte)(val & 0x7f));
+ }
public static char ConvertLowAndHighAscii(byte val) {
if (IsPrintableAscii(val) || IsPrintableHighAscii(val)) {
return (char)(val & 0x7f);
@@ -102,6 +122,7 @@ namespace Asm65 {
// Characters with the high bit set are shown with colors reversed.
//
+
//
// PETSCII (C64 variant)
//
@@ -142,6 +163,35 @@ namespace Asm65 {
//
// For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
//
+ private static bool[] sPrintablePetscii = CreatePrintablePetsciiMap();
+ private static bool[] sExtendedPetscii = CreateExtendedPetsciiMap();
+ private static bool[] CreatePrintablePetsciiMap() {
+ bool[] map = new bool[256];
+ for (int i = 0x20; i <= 0x5b; i++) {
+ map[i] = true;
+ }
+ map[0x5d] = true;
+ for (int i = 0xc1; i <= 0xda; i++) {
+ map[i] = true;
+ }
+ return map;
+ }
+ private static bool[] CreateExtendedPetsciiMap() {
+ bool[] map = CreatePrintablePetsciiMap();
+ // control codes that we might expect to find in strings
+ map[0x05] = map[0x1c] = map[0x1e] = map[0x1f] = map[0x81] = map[0x90] = map[0x95] =
+ map[0x96] = map[0x97] = map[0x98] = map[0x99] = map[0x9a] = map[0x9b] =
+ map[0x9c] = map[0x9e] = map[0x9f] = true;
+ map[0x93] = map[0x12] = map[0x92] = true;
+ map[0x07] = map[0x0a] = map[0x0d] = true;
+ return map;
+ }
+ public static bool IsPrintablePetscii(byte val) {
+ return sPrintablePetscii[val];
+ }
+ public static bool IsExtendedPetscii(byte val) {
+ return sExtendedPetscii[val];
+ }
//
// C64 Screen Codes
@@ -159,5 +209,26 @@ namespace Asm65 {
//
// For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
//
+ private static bool[] sPrintableScreenCode = CreatePrintableScreenCodeMap();
+ private static bool[] CreatePrintableScreenCodeMap() {
+ bool[] map = new bool[256];
+ for (int i = 0x00; i <= 0x1b; i++) {
+ map[i] = true;
+ }
+ map[0x1d] = true;
+ for (int i = 0x20; i <= 0x3f; i++) {
+ map[i] = true;
+ }
+ for (int i = 0x41; i <= 0x5a; i++) {
+ map[i] = true;
+ }
+ return map;
+ }
+ public static bool IsPrintableScreenCode(byte val) {
+ return sPrintableScreenCode[val];
+ }
+ public static bool IsExtendedScreenCode(byte val) {
+ return sPrintableScreenCode[val];
+ }
}
}
diff --git a/SourceGen/DataAnalysis.cs b/SourceGen/DataAnalysis.cs
index e5694e7..06477ba 100644
--- a/SourceGen/DataAnalysis.cs
+++ b/SourceGen/DataAnalysis.cs
@@ -18,6 +18,7 @@ using System.Diagnostics;
using Asm65;
using CommonUtil;
+using TextScanMode = SourceGen.ProjectProperties.AnalysisParameters.TextScanMode;
namespace SourceGen {
///
@@ -30,11 +31,11 @@ namespace SourceGen {
// Minimum number of consecutive identical bytes for something to be called a "run".
private const int MIN_RUN_LENGTH = 5;
- // Minimum length for treating data as a run if the byte is a valid ASCII value.
- // (Alternatively, the maximum length of an ASCII string composed of single characters.)
+ // Minimum length for treating data as a run if the byte is a printable character.
+ // (Alternatively, the maximum length of a character string composed of a single value.)
// Anything shorter than this is handled with a string directive, anything this long or
// longer becomes FILL. This should be larger than the MinCharsForString parameter.
- private const int MIN_RUN_LENGTH_ASCII = 62;
+ private const int MAX_STRING_RUN_LENGTH = 62;
// Absolute minimum string length for auto-detection. This is used when generating the
// data tables.
@@ -605,7 +606,7 @@ namespace SourceGen {
int minStringChars = mAnalysisParams.MinCharsForString;
-#if false // this is actually slower (and uses more memory)
+#if DATA_PRESCAN // this is actually slower (and uses more memory)
while (start <= end) {
// This is used to let us skip forward. It starts past the end of the block,
// and moves backward as we identify potential points of interest.
@@ -709,53 +710,105 @@ namespace SourceGen {
}
}
#else
+ // Select "is printable" test. We use the extended version to include some
+ // control characters.
+ CharEncoding.InclusionTest testPrintable;
+ FormatDescriptor.SubType baseSubType;
+ switch (mAnalysisParams.DefaultTextScanMode) {
+ case TextScanMode.LowAscii:
+ testPrintable = CharEncoding.IsExtendedAscii;
+ baseSubType = FormatDescriptor.SubType.Ascii;
+ break;
+ case TextScanMode.LowHighAscii:
+ testPrintable = CharEncoding.IsExtendedLowOrHighAscii;
+ baseSubType = FormatDescriptor.SubType.ASCII_GENERIC;
+ break;
+ case TextScanMode.C64Petscii:
+ testPrintable = CharEncoding.IsExtendedPetscii;
+ baseSubType = FormatDescriptor.SubType.C64Petscii;
+ break;
+ case TextScanMode.C64ScreenCode:
+ testPrintable = CharEncoding.IsExtendedScreenCode;
+ baseSubType = FormatDescriptor.SubType.C64Screen;
+ break;
+ default:
+ Debug.Assert(false);
+ testPrintable = CharEncoding.IsExtendedLowOrHighAscii;
+ baseSubType = FormatDescriptor.SubType.ASCII_GENERIC;
+ break;
+ }
+
while (start <= end) {
// Check for block of repeated values.
int runLen = RecognizeRun(mFileData, start, end);
- bool isAscii = TextUtil.IsPrintableAscii((char)(mFileData[start] & 0x7f));
- if (runLen >= MIN_RUN_LENGTH) {
- // Output as run or ASCII string. Prefer ASCII if the string is short
- // enough to fit on one line (e.g. 64 chars including delimiters) and
- // meets the minimum string length threshold.
- if (isAscii && runLen <= MIN_RUN_LENGTH_ASCII && runLen >= minStringChars) {
- // String -- if we create the descriptor here, we save a little time,
- // but strings like "*****hello" turn into two separate strings. So
- // just fall through and let the ASCII recognizer handle it.
- } else {
- // run
- LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " +
- runLen + " bytes");
- mAnattribs[start].DataDescriptor = FormatDescriptor.Create(
- runLen, FormatDescriptor.Type.Fill,
- FormatDescriptor.SubType.None);
- start += runLen;
- continue;
+ int printLen = 0;
+ FormatDescriptor.SubType subType = baseSubType;
+
+ if (testPrintable(mFileData[start])) {
+ // The run byte is printable. If the run is no longer than one line, it
+ // may be followed by additional printable characters, e.g. "*****hello".
+ // Text is easier for humans to understand, so we prefer a string
+ // unless the run is longer than one line.
+ if (runLen <= MAX_STRING_RUN_LENGTH) {
+ // See if the run is followed by additional printable characters.
+ printLen = runLen;
+
+ // For LowHighAscii we allow a string to be either low or high, but it
+ // must be entirely one thing. Refine our test.
+ CharEncoding.InclusionTest refinedTest = testPrintable;
+ if (mAnalysisParams.DefaultTextScanMode == TextScanMode.LowHighAscii) {
+ if (CharEncoding.IsExtendedAscii(mFileData[start])) {
+ refinedTest = CharEncoding.IsExtendedAscii;
+ subType = FormatDescriptor.SubType.Ascii;
+ } else {
+ refinedTest = CharEncoding.IsExtendedHighAscii;
+ subType = FormatDescriptor.SubType.HighAscii;
+ }
+ }
+ for (int i = start + runLen; i <= end; i++) {
+ if (!refinedTest(mFileData[i])) {
+ break;
+ }
+ printLen++;
+ }
}
}
- int asciiLen = RecognizeAscii(mFileData, start, end);
- if (asciiLen >= minStringChars) {
- LogV(start, "ASCII string, len=" + asciiLen + " bytes");
- bool isHigh = (mFileData[start] & 0x80) != 0;
- mAnattribs[start].DataDescriptor = FormatDescriptor.Create(asciiLen,
- FormatDescriptor.Type.StringGeneric, isHigh ?
- FormatDescriptor.SubType.HighAscii : FormatDescriptor.SubType.Ascii);
- start += asciiLen;
- continue;
- }
-
- // Nothing found, output as single byte. This is the easiest form for users
- // to edit. If we found a run, but it was too short, we can go ahead and
- // mark all bytes in the run because we know the later matches will also be
- // too short.
- Debug.Assert(runLen > 0);
- while (runLen-- != 0) {
- mAnattribs[start++].DataDescriptor = oneByteDefault;
- FormatDescriptor.DebugPrefabBump();
+ if (printLen >= minStringChars) {
+ // This is either a short run followed by printable characters, or just a
+ // (possibly very large) bunch of printable characters.
+ Debug.Assert(subType != FormatDescriptor.SubType.ASCII_GENERIC);
+ LogD(start, "Character string (" + subType + "), len=" + printLen + " bytes");
+ mAnattribs[start].DataDescriptor = FormatDescriptor.Create(printLen,
+ FormatDescriptor.Type.StringGeneric, subType);
+ start += printLen;
+ } else if (runLen >= MIN_RUN_LENGTH) {
+ // Didn't qualify as a string, but it's long enough to be a run.
+ //
+ // TODO(someday): allow .fill pseudo-ops to have character encoding
+ // sub-types, so we can ".fill 64,'*'". Easy to do here, but
+ // proper treatment requires tweaking data operand editor to allow
+ // char encoding to be specified.
+ LogV(start, "Run of 0x" + mFileData[start].ToString("x2") + ": " +
+ runLen + " bytes");
+ mAnattribs[start].DataDescriptor = FormatDescriptor.Create(
+ runLen, FormatDescriptor.Type.Fill,
+ FormatDescriptor.SubType.None);
+ start += runLen;
+ } else {
+ // Nothing useful found, output 1+ values as single bytes. This is the
+ // easiest form for users to edit. If we found a run, but it was too short,
+ // we can go ahead and mark all bytes in the run because we know the later
+ // matches will also be too short.
+ Debug.Assert(runLen > 0);
+ while (runLen-- != 0) {
+ mAnattribs[start++].DataDescriptor = oneByteDefault;
+ FormatDescriptor.DebugPrefabBump();
+ }
}
}
#endif
- }
+ }
#region Static analyzer methods
@@ -777,29 +830,6 @@ namespace SourceGen {
return index - start;
}
- ///
- /// Checks for a run of ASCII values. Both high and low ASCII are recognized,
- /// but the entire run must be one or the other.
- ///
- /// Raw data.
- /// Offset of first byte in range.
- /// Offset of last byte in range.
- /// Length of run.
- public static int RecognizeAscii(byte[] fileData, int start, int end) {
- // This won't find a mix of Apple II high/inverse/flashing text.
- byte firstHi = (byte)(fileData[start] & 0x80);
-
- int index;
- for (index = start; index <= end; index++) {
- char ch = (char)fileData[index];
- if (!TextUtil.IsPrintableAscii((char)(ch & 0x7f)) || (ch & 0x80) != firstHi) {
- break;
- }
- }
-
- return index - start;
- }
-
///
/// Counts the number of low-ASCII, high-ASCII, and non-ASCII values in the
/// specified region.
@@ -1080,7 +1110,7 @@ namespace SourceGen {
-#if false
+#if DATA_PRESCAN
///
/// Iterator that generates a list of offsets which are not known to hold code or data.
///
diff --git a/SourceGen/DisasmProject.cs b/SourceGen/DisasmProject.cs
index c72c239..5b02417 100644
--- a/SourceGen/DisasmProject.cs
+++ b/SourceGen/DisasmProject.cs
@@ -143,7 +143,7 @@ namespace SourceGen {
// Project and platform symbols that are being referenced from code.
public List ActiveDefSymbolList { get; private set; }
-#if false
+#if DATA_PRESCAN
// Data scan results.
public TypedRangeSet RepeatedBytes { get; private set; }
public RangeSet StdAsciiBytes { get; private set; }
@@ -233,7 +233,7 @@ namespace SourceGen {
mFileData = fileData;
mDataFileName = dataFileName;
FileDataCrc32 = CommonUtil.CRC32.OnWholeBuffer(0, mFileData);
-#if false
+#if DATA_PRESCAN
ScanFileData();
#endif
@@ -317,12 +317,12 @@ namespace SourceGen {
FixAndValidate(ref report);
-#if false
+#if DATA_PRESCAN
ScanFileData();
#endif
}
-#if false
+#if DATA_PRESCAN
private delegate bool ByteTest(byte val); // for ScanFileData()
///