Treat low and high ASCII as two distinct formats

We've been treating ASCII strings and instruction/data operands as ambiguous, resolving low vs. high when generating output for the display or assembler. This change splits it into two separate formats, simplifying output generation. The UI will continue to treat low/high ASCII as a single thing, selecting the format appropriately based on the data. There's no reason to have two radio buttons that are never both enabled. The data operand string functions need some additional work, but that overlaps substantially with the upcoming PETSCII changes, so for now all strings set by the data operand editor are low ASCII. The file format has changed again, but since there hasn't been a release since the previous change, I'm leaving the file format at v2. Code has been added to resolve the ASCII mode when loading a v1 project file. This removes some complexity from the assembly code generators.
2025-04-13 09:37:10 +00:00 · 2019-08-10 14:24:19 -07:00 · 2019-08-10 14:24:19 -07:00 · 975b62db6b
commit 975b62db6b
parent a4f5d19295
13 changed files with 202 additions and 106 deletions
--- a/SourceGen/AsmGen/AsmAcme.cs
+++ b/SourceGen/AsmGen/AsmAcme.cs
@ -540,7 +540,11 @@ namespace SourceGen.AsmGen {
            Debug.Assert(dfd.IsString);
            Debug.Assert(dfd.Length > 0);

-            bool highAscii = false;
+            if (dfd.FormatSubType == FormatDescriptor.SubType.HighAscii) {
+                OutputNoJoy(offset, dfd.Length, labelStr, commentStr);
+                return;
+            }
+
            int leadingBytes = 0;

            switch (dfd.FormatType) {
@ -548,18 +552,11 @@ namespace SourceGen.AsmGen {
                case FormatDescriptor.Type.StringReverse:
                case FormatDescriptor.Type.StringNullTerm:
                case FormatDescriptor.Type.StringDci:
-                    highAscii = (data[offset] & 0x80) != 0;
                    break;
                case FormatDescriptor.Type.StringL8:
-                    if (dfd.Length > 1) {
-                        highAscii = (data[offset + 1] & 0x80) != 0;
-                    }
                    leadingBytes = 1;
                    break;
                case FormatDescriptor.Type.StringL16:
-                    if (dfd.Length > 2) {
-                        highAscii = (data[offset + 2] & 0x80) != 0;
-                    }
                    leadingBytes = 2;
                    break;
                default:
@ -567,11 +564,6 @@ namespace SourceGen.AsmGen {
                    return;
            }

-            if (highAscii) {
-                OutputNoJoy(offset, dfd.Length, labelStr, commentStr);
-                return;
-            }
-
            StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, '"',
                StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN,
                CharEncoding.ConvertLowAscii);
--- a/SourceGen/AsmGen/AsmCc65.cs
+++ b/SourceGen/AsmGen/AsmCc65.cs
@ -595,7 +595,6 @@ namespace SourceGen.AsmGen {
            Debug.Assert(dfd.IsString);
            Debug.Assert(dfd.Length > 0);

-            bool highAscii = false;
            int leadingBytes = 0;
            int trailingBytes = 0;

@ -603,22 +602,14 @@ namespace SourceGen.AsmGen {
                case FormatDescriptor.Type.StringGeneric:
                case FormatDescriptor.Type.StringReverse:
                case FormatDescriptor.Type.StringDci:
-                    highAscii = (data[offset] & 0x80) != 0;
                    break;
                case FormatDescriptor.Type.StringNullTerm:
-                    highAscii = (data[offset] & 0x80) != 0;
                    trailingBytes = 1;
                    break;
                case FormatDescriptor.Type.StringL8:
-                    if (dfd.Length > 1) {
-                        highAscii = (data[offset + 1] & 0x80) != 0;
-                    }
                    leadingBytes = 1;
                    break;
                case FormatDescriptor.Type.StringL16:
-                    if (dfd.Length > 2) {
-                        highAscii = (data[offset + 2] & 0x80) != 0;
-                    }
                    leadingBytes = 2;
                    break;
                default:
@ -626,6 +617,7 @@ namespace SourceGen.AsmGen {
                    return;
            }

+            bool highAscii = (dfd.FormatSubType == FormatDescriptor.SubType.HighAscii);
            if (highAscii && dfd.FormatType != FormatDescriptor.Type.StringGeneric) {
                OutputNoJoy(offset, dfd.Length, labelStr, commentStr);
                return;
@ -639,8 +631,7 @@ namespace SourceGen.AsmGen {
            }

            StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, '"',
-                StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN,
-                charConv);
+                StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN, charConv);
            stropf.FeedBytes(data, offset, dfd.Length - trailingBytes, leadingBytes, false);

            string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric);
--- a/SourceGen/AsmGen/AsmMerlin32.cs
+++ b/SourceGen/AsmGen/AsmMerlin32.cs
@ -479,7 +479,6 @@ namespace SourceGen.AsmGen {
            Debug.Assert(dfd.IsString);
            Debug.Assert(dfd.Length > 0);

-            bool highAscii = false;
            bool reverse = false;
            int leadingBytes = 0;
            string opcodeStr;
@ -487,16 +486,13 @@ namespace SourceGen.AsmGen {
            switch (dfd.FormatType) {
                case FormatDescriptor.Type.StringGeneric:
                    opcodeStr = sDataOpNames.StrGeneric;
-                    highAscii = (data[offset] & 0x80) != 0;
                    break;
                case FormatDescriptor.Type.StringReverse:
                    opcodeStr = sDataOpNames.StrReverse;
-                    highAscii = (data[offset] & 0x80) != 0;
                    reverse = true;
                    break;
                case FormatDescriptor.Type.StringNullTerm:
                    opcodeStr = sDataOpNames.StrGeneric;        // no pseudo-op for this
-                    highAscii = (data[offset] & 0x80) != 0;
                    if (dfd.Length == 1) {
                        // Empty string.  Just output the length byte(s) or null terminator.
                        GenerateShortSequence(offset, 1, out string opcode, out string operand);
@ -506,37 +502,29 @@ namespace SourceGen.AsmGen {
                    break;
                case FormatDescriptor.Type.StringL8:
                    opcodeStr = sDataOpNames.StrLen8;
-                    if (dfd.Length > 1) {
-                        highAscii = (data[offset + 1] & 0x80) != 0;
-                    }
                    leadingBytes = 1;
                    break;
                case FormatDescriptor.Type.StringL16:
                    opcodeStr = sDataOpNames.StrLen16;
-                    if (dfd.Length > 2) {
-                        highAscii = (data[offset + 2] & 0x80) != 0;
-                    }
                    leadingBytes = 2;
                    break;
                case FormatDescriptor.Type.StringDci:
                    opcodeStr = sDataOpNames.StrDci;
-                    highAscii = (data[offset] & 0x80) != 0;
                    break;
                default:
                    Debug.Assert(false);
                    return;
            }

-            // Merlin 32 uses single-quote for low ASCII, double-quote for high ASCII.  When
-            // quoting the delimiter we use a hexadecimal value.  We need to bear in mind that
-            // we're forcing the characters to low ASCII, but the actual character being
-            // escaped might be in high ASCII.  Hence delim vs. delimReplace.
-            char delim = highAscii ? '"' : '\'';
+            // Merlin 32 uses single-quote for low ASCII, double-quote for high ASCII.
            CharEncoding.Convert charConv;
-            if (highAscii) {
+            char delim;
+            if (dfd.FormatSubType == FormatDescriptor.SubType.HighAscii) {
                charConv = CharEncoding.ConvertHighAscii;
+                delim = '"';
            } else {
                charConv = CharEncoding.ConvertLowAscii;
+                delim = '\'';
            }

            StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, delim,
--- a/SourceGen/AsmGen/AsmTass64.cs
+++ b/SourceGen/AsmGen/AsmTass64.cs
@ -532,51 +532,41 @@ namespace SourceGen.AsmGen {
            Debug.Assert(dfd.IsString);
            Debug.Assert(dfd.Length > 0);

-            bool highAscii = false;
            int hiddenLeadingBytes = 0;
            int shownLeadingBytes = 0;
            int trailingBytes = 0;
            string opcodeStr;

+            if (dfd.FormatSubType == FormatDescriptor.SubType.HighAscii) {
+                OutputNoJoy(offset, dfd.Length, labelStr, commentStr);
+                return;
+            }
+
            switch (dfd.FormatType) {
                case FormatDescriptor.Type.StringGeneric:
                case FormatDescriptor.Type.StringReverse:
                    opcodeStr = sDataOpNames.StrGeneric;
-                    highAscii = (data[offset] & 0x80) != 0;
                    break;
                case FormatDescriptor.Type.StringNullTerm:
                    opcodeStr = sDataOpNames.StrNullTerm;
-                    highAscii = (data[offset] & 0x80) != 0;
                    trailingBytes = 1;
                    break;
                case FormatDescriptor.Type.StringL8:
                    opcodeStr = sDataOpNames.StrLen8;
-                    if (dfd.Length > 1) {
-                        highAscii = (data[offset + 1] & 0x80) != 0;
-                    }
                    hiddenLeadingBytes = 1;
                    break;
                case FormatDescriptor.Type.StringL16:
                    opcodeStr = sDataOpNames.StrGeneric;
-                    if (dfd.Length > 2) {
-                        highAscii = (data[offset + 2] & 0x80) != 0;
-                    }
                    shownLeadingBytes = 2;
                    break;
                case FormatDescriptor.Type.StringDci:
                    opcodeStr = sDataOpNames.StrDci;
-                    highAscii = (data[offset] & 0x80) != 0;
                    break;
                default:
                    Debug.Assert(false);
                    return;
            }

-            if (highAscii) {
-                OutputNoJoy(offset, dfd.Length, labelStr, commentStr);
-                return;
-            }
-
            StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, '"',
                StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN,
                CharEncoding.ConvertLowAscii);
--- a/SourceGen/DataAnalysis.cs
+++ b/SourceGen/DataAnalysis.cs
@ -736,8 +736,10 @@ namespace SourceGen {
                int asciiLen = RecognizeAscii(mFileData, start, end);
                if (asciiLen >= minStringChars) {
                    LogV(start, "ASCII string, len=" + asciiLen + " bytes");
+                    bool isHigh = (mFileData[start] & 0x80) != 0;
                    mAnattribs[start].DataDescriptor = FormatDescriptor.Create(asciiLen,
-                        FormatDescriptor.Type.StringGeneric, FormatDescriptor.SubType.Ascii);
+                        FormatDescriptor.Type.StringGeneric, isHigh ?
+                        FormatDescriptor.SubType.HighAscii : FormatDescriptor.SubType.LowAscii);
                    start += asciiLen;
                    continue;
                }
--- a/SourceGen/DisasmProject.cs
+++ b/SourceGen/DisasmProject.cs
@ -308,12 +308,15 @@ namespace SourceGen {
        /// </summary>
        /// <param name="fileData">65xx data file contents.</param>
        /// <param name="dataFileName">Data file's filename (not pathname).</param>
-        public void SetFileData(byte[] fileData, string dataFileName) {
+        /// <param name="report">Reporting object for validation errors.</param>
+        public void SetFileData(byte[] fileData, string dataFileName, ref FileLoadReport report) {
            Debug.Assert(fileData.Length == FileDataLength);
            Debug.Assert(CRC32.OnWholeBuffer(0, fileData) == FileDataCrc32);
            mFileData = fileData;
            mDataFileName = dataFileName;

+            FixAndValidate(ref report);
+
 #if false
            ScanFileData();
 #endif
@ -396,6 +399,98 @@ namespace SourceGen {
        }
 #endif

+        /// <summary>
+        /// Walks the list of format descriptors, fixing places where the data doesn't match.
+        /// </summary>
+        private void FixAndValidate(ref FileLoadReport report) {
+            Dictionary<int, FormatDescriptor> changes = new Dictionary<int, FormatDescriptor>();
+
+            foreach (KeyValuePair<int, FormatDescriptor> kvp in OperandFormats) {
+                FormatDescriptor dfd = kvp.Value;
+
+                // v1 project files specified string layouts as sub-types, and assumed they
+                // were high or low ASCII.  Numeric values could use the ASCII sub-type, which
+                // included both high and low.
+                //
+                // v2 project files changed this to make string layouts types, with the
+                // character encoding specified in the sub-type.  High and low ASCII became
+                // separate, explicitly specified items.
+                //
+                // When loading a v1 file, the old "Ascii" sub-type is deserialized to
+                // ASCII_GENERIC.  Now that we have access to the file data, we need to refine
+                // the sub-type to high or low.
+                if (dfd.FormatSubType == FormatDescriptor.SubType.ASCII_GENERIC) {
+                    FormatDescriptor newDfd;
+                    if (dfd.IsString) {
+                        // Determine the string encoding by looking at the first character.
+                        // For some strings (StringL8, StringL16) we need to skip forward a
+                        // byte or two.  Empty strings with lengths or null-termination will
+                        // be treated as low ASCII.
+                        int checkOffset = kvp.Key;
+                        if (dfd.FormatType == FormatDescriptor.Type.StringL8 && dfd.Length > 1) {
+                            checkOffset++;
+                        } else if (dfd.FormatType == FormatDescriptor.Type.StringL16 && dfd.Length > 2) {
+                            checkOffset += 2;
+                        }
+                        bool isHigh = (FileData[checkOffset] & 0x80) != 0;
+                        newDfd = FormatDescriptor.Create(dfd.Length, dfd.FormatType,
+                            isHigh ? FormatDescriptor.SubType.HighAscii :
+                                FormatDescriptor.SubType.LowAscii);
+                    } else if (dfd.IsNumeric) {
+                        // This is a character constant in an instruction or data operand, such
+                        // as ".dd1 'f'" or "LDA #'f'".  Could be multi-byte (even instructions
+                        // can be 16-bit).  This is a little awkward, because at this point we
+                        // can't tell the difference between instructions and data.
+                        //
+                        // However, we do know that instructions are always little-endian, that
+                        // opcodes are one byte, that data values > $ff can't be ASCII encoded,
+                        // and that $00 isn't a valid ASCII character.  So we can apply the
+                        // following test:
+                        // - if the length is 1, it's data; grab the first byte
+                        // - if it's NumericBE, it's data; grab the last byte
+                        // - if the second byte is $00, it's data; grab the first byte
+                        // - otherwise, it's an instruction; grab the second byte
+                        int checkOffset;
+                        if (dfd.FormatType == FormatDescriptor.Type.NumericBE) {
+                            Debug.Assert(dfd.Length <= FormatDescriptor.MAX_NUMERIC_LEN);
+                            checkOffset = kvp.Key + dfd.Length - 1;
+                        } else if (dfd.Length < 2 || FileData[kvp.Key + 1] == 0x00) {
+                            checkOffset = kvp.Key;
+                        } else {
+                            Debug.Assert(dfd.FormatType == FormatDescriptor.Type.NumericLE);
+                            checkOffset = kvp.Key + 1;
+                        }
+                        bool isHigh = (FileData[checkOffset] & 0x80) != 0;
+                        newDfd = FormatDescriptor.Create(dfd.Length, dfd.FormatType,
+                            isHigh ? FormatDescriptor.SubType.HighAscii :
+                                FormatDescriptor.SubType.LowAscii);
+                    } else {
+                        Debug.Assert(false);
+                        newDfd = dfd;
+                    }
+                    changes[kvp.Key] = newDfd;
+                    Debug.WriteLine("Fix +" + kvp.Key.ToString("x6") + ": " +
+                        dfd + " -> " + newDfd);
+                }
+            }
+
+            // apply changes to main list
+            foreach (KeyValuePair<int, FormatDescriptor> kvp in changes) {
+                OperandFormats[kvp.Key] = kvp.Value;
+                //report.Add(FileLoadItem.Type.Notice,
+                //    "Fixed format at +" + kvp.Key.ToString("x6"));
+            }
+
+            // TODO: validate strings
+            // - null-terminated strings must not have 0x00 bytes, except for the last byte,
+            //   which must be 0x00
+            // - the length stored in L8/L16 strings must match the format descriptor length
+            // - DCI strings must have the appropriate pattern for the high bit
+            //
+            // Note it is not required that string data match the encoding, since you're allowed
+            // to have random gunk mixed in.  It just can't violate the above rules.
+        }
+
        /// <summary>
        /// Loads platform symbol files and extension scripts.
        /// 
--- a/SourceGen/FormatDescriptor.cs
+++ b/SourceGen/FormatDescriptor.cs
@ -66,6 +66,7 @@ namespace SourceGen {
        /// </summary>
        public enum SubType : byte {
            None = 0,
+            ASCII_GENERIC,      // internal place-holder, used when loading older projects

            // NumericLE/BE; default is "raw", which can have a context-specific display format
            Hex,
@ -75,8 +76,9 @@ namespace SourceGen {
            Symbol,             // symbolic ref; replace with Expression, someday?

            // Strings and NumericLE/BE (single character)
-            Ascii,              // ASCII (with or without the high bit set)
-            C64Petscii,         // C64 PETSCII
+            LowAscii,           // ASCII (high bit clear)
+            HighAscii,          // ASCII (high bit set)
+            C64Petscii,         // C64 PETSCII (lower case $41-5a, upper case $c1-da)
            C64Screen,          // C64 screen code

            // Dense; no sub-types
@ -85,7 +87,8 @@ namespace SourceGen {
            Ignore              // TODO(someday): use this for "don't care" sections
        }

-        private const int MAX_NUMERIC_LEN = 4;
+        // Maximum length of a NumericLE/BE item (32-bit value or 4-byte instruction).
+        public const int MAX_NUMERIC_LEN = 4;

        // Create some "stock" descriptors.  For simple cases we return one of these
        // instead of allocating a new object.
@ -99,8 +102,8 @@ namespace SourceGen {
            Type.NumericLE, SubType.Decimal);
        private static FormatDescriptor ONE_BINARY = new FormatDescriptor(1,
            Type.NumericLE, SubType.Binary);
-        private static FormatDescriptor ONE_ASCII = new FormatDescriptor(1,
-            Type.NumericLE, SubType.Ascii);
+        private static FormatDescriptor ONE_LOW_ASCII = new FormatDescriptor(1,
+            Type.NumericLE, SubType.LowAscii);

        /// <summary>
        /// Length, in bytes, of the data to be formatted.
@ -210,8 +213,8 @@ namespace SourceGen {
                            return ONE_DECIMAL;
                        case SubType.Binary:
                            return ONE_BINARY;
-                        case SubType.Ascii:
-                            return ONE_ASCII;
+                        case SubType.LowAscii:
+                            return ONE_LOW_ASCII;
                    }
                }
            }
@ -347,9 +350,12 @@ namespace SourceGen {
            if (IsString) {
                string descr;
                switch (FormatSubType) {
-                    case SubType.Ascii:
+                    case SubType.LowAscii:
                        descr = "ASCII";
                        break;
+                    case SubType.HighAscii:
+                        descr = "ASCII (high)";
+                        break;
                    case SubType.C64Petscii:
                        descr = "C64 PETSCII";
                        break;
@ -411,12 +417,14 @@ namespace SourceGen {
                    return "Address";
                case SubType.Symbol:
                    return "Symbol \"" + SymbolRef.Label + "\"";
-                case SubType.Ascii:
-                    return "ASCII";
+                case SubType.LowAscii:
+                    return "Numeric, ASCII";
+                case SubType.HighAscii:
+                    return "Numeric, ASCII (high)";
                case SubType.C64Petscii:
-                    return "C64 PETSCII";
+                    return "Numeric, C64 PETSCII";
                case SubType.C64Screen:
-                    return "C64 Screen";
+                    return "Numeric, C64 Screen";

                default:
                    return "???";
--- a/SourceGen/MainController.cs
+++ b/SourceGen/MainController.cs
@ -991,6 +991,8 @@ namespace SourceGen {
                }
            }

+            newProject.SetFileData(fileData, Path.GetFileName(dataPathName), ref report);
+
            // If there were warnings, notify the user and give the a chance to cancel.
            if (report.Count != 0) {
                ProjectLoadIssues dlg = new ProjectLoadIssues(mMainWin, report.Format(),
@ -1004,7 +1006,6 @@ namespace SourceGen {

            mProject = newProject;
            mProjectPathName = mProject.ProjectPathName = projPathName;
-            mProject.SetFileData(fileData, Path.GetFileName(dataPathName));
            FinishPrep();
        }

--- a/SourceGen/ProjectFile.cs
+++ b/SourceGen/ProjectFile.cs
@ -107,6 +107,9 @@ namespace SourceGen {

        /// <summary>
        /// Reads the specified file and deserializes it into the project.
+        ///
+        /// The deserialized form may include place-holder entries that can't be resolved
+        /// until the data file is available (see the ASCII_GENERIC string sub-type).
        /// </summary>
        /// <param name="pathName">Input path name.</param>
        /// <param name="proj">Project to deserialize into.</param>
@ -670,10 +673,10 @@ namespace SourceGen {
            FormatDescriptor.Type format;
            FormatDescriptor.SubType subFormat;

-            // File version 1 used a different set of enumerated values for defining strings.
-            // Parse it out here.
            if ("String".Equals(sfd.Format)) {
-                subFormat = FormatDescriptor.SubType.Ascii;
+                // File version 1 used a different set of enumerated values for defining strings.
+                // Parse it out here.
+                subFormat = FormatDescriptor.SubType.ASCII_GENERIC;
                if ("None".Equals(sfd.SubFormat)) {
                    format = FormatDescriptor.Type.StringGeneric;
                } else if ("Reverse".Equals(sfd.SubFormat)) {
@ -687,12 +690,8 @@ namespace SourceGen {
                } else if ("Dci".Equals(sfd.SubFormat)) {
                    format = FormatDescriptor.Type.StringDci;
                } else if ("DciReverse".Equals(sfd.SubFormat)) {
-                    // No longer supported.  Treating it as a generic string works poorly,
-                    // because the first byte will appear to be (say) high ASCII, but the rest
-                    // of the string will be low ASCII and get output as hex data.  If we
-                    // explicitly differentiated high/low ASCII we could make this work right.
-                    // We could also split the descriptor into two parts.  Nobody ever used
-                    // this but the regression tests, though, so we don't really care.
+                    // No longer supported.  Nobody ever used this but the regression tests,
+                    // though, so there's no reason to handle this nicely.
                    format = FormatDescriptor.Type.Dense;
                    subFormat = FormatDescriptor.SubType.None;
                } else {
@ -708,8 +707,15 @@ namespace SourceGen {
            try {
                format = (FormatDescriptor.Type)Enum.Parse(
                    typeof(FormatDescriptor.Type), sfd.Format);
-                subFormat = (FormatDescriptor.SubType)Enum.Parse(
-                    typeof(FormatDescriptor.SubType), sfd.SubFormat);
+                if ("Ascii".Equals(sfd.SubFormat)) {
+                    // File version 1 used "Ascii" for all character data in numeric operands.
+                    // It applied to both low and high ASCII.
+                    subFormat = FormatDescriptor.SubType.ASCII_GENERIC;
+                } else {
+                    subFormat = (FormatDescriptor.SubType)Enum.Parse(
+                        typeof(FormatDescriptor.SubType), sfd.SubFormat);
+                }
+
            } catch (ArgumentException) {
                report.Add(FileLoadItem.Type.Warning, Res.Strings.ERR_BAD_FD_FORMAT +
                    ": " + sfd.Format + "/" + sfd.SubFormat);
--- a/SourceGen/PseudoOp.cs
+++ b/SourceGen/PseudoOp.cs
@ -541,11 +541,13 @@ namespace SourceGen {
                    return formatter.FormatDecimalValue(operandValue);
                case FormatDescriptor.SubType.Binary:
                    return formatter.FormatBinaryValue(operandValue, hexMinLen * 4);
-                case FormatDescriptor.SubType.Ascii:
+                case FormatDescriptor.SubType.LowAscii:
+                case FormatDescriptor.SubType.HighAscii:
                case FormatDescriptor.SubType.C64Petscii:
                case FormatDescriptor.SubType.C64Screen:
                    // TODO(petscii): convert encoding; use a helper function *not* in
                    //   formatter -- pass converted char value in along with operandValue
+                    // TODO: pass in a "make high ASCII" string, e.g. "| 0x80", that fixes char
                    return formatter.FormatAsciiOrHex(operandValue);
                case FormatDescriptor.SubType.Symbol:
                    if (symbolTable.TryGetValue(dfd.SymbolRef.Label, out Symbol sym)) {
@ -575,6 +577,7 @@ namespace SourceGen {
                        return formatter.FormatHexValue(operandValue, hexMinLen);
                    }
                default:
+                    // should not see REMOVE or ASCII_GENERIC here
                    Debug.Assert(false);
                    return "???";
            }
--- a/SourceGen/Tests/GenTest.cs
+++ b/SourceGen/Tests/GenTest.cs
@ -463,7 +463,8 @@ namespace SourceGen.Tests {
                    return null;
                }

-                project.SetFileData(fileData, Path.GetFileName(dataPathName));
+                FileLoadReport unused = new FileLoadReport("test");
+                project.SetFileData(fileData, Path.GetFileName(dataPathName), ref unused);
                project.ProjectPathName = projectPathName;
                project.LoadExternalFiles();
            }
--- a/SourceGen/WpfGui/EditDataOperand.xaml.cs
+++ b/SourceGen/WpfGui/EditDataOperand.xaml.cs
@ -527,7 +527,8 @@ namespace SourceGen.WpfGui {
                            case FormatDescriptor.SubType.Binary:
                                radioSimpleDataBinary.IsChecked = true;
                                break;
-                            case FormatDescriptor.SubType.Ascii:
+                            case FormatDescriptor.SubType.LowAscii:
+                            case FormatDescriptor.SubType.HighAscii:
                            case FormatDescriptor.SubType.C64Petscii:
                            case FormatDescriptor.SubType.C64Screen:
                                // TODO(petscii): update UI
@ -631,8 +632,8 @@ namespace SourceGen.WpfGui {
                } else if (radioSimpleDataBinary.IsChecked == true) {
                    subType = FormatDescriptor.SubType.Binary;
                } else if (radioSimpleDataAscii.IsChecked == true) {
-                    // TODO(petscii): configure subType correctly
-                    subType = FormatDescriptor.SubType.Ascii;
+                    // TODO(petscii): add PETSCII buttons
+                    subType = FormatDescriptor.SubType.ASCII_GENERIC;
                } else if (radioSimpleDataAddress.IsChecked == true) {
                    subType = FormatDescriptor.SubType.Address;
                } else if (radioSimpleDataSymbolic.IsChecked == true) {
@ -681,25 +682,27 @@ namespace SourceGen.WpfGui {
                type = FormatDescriptor.Type.Dense;
            } else if (radioFill.IsChecked == true) {
                type = FormatDescriptor.Type.Fill;
-                subType = FormatDescriptor.SubType.Ascii;    // TODO(petscii): set encoding
            } else if (radioStringMixed.IsChecked == true) {
+                // TODO(petscii): encoding format will come from a combo box; that determines
+                //   the subType and the arg to the string-creation functions, which use the
+                //   appropriate char encoding methods to break up the strings
                type = FormatDescriptor.Type.StringGeneric;
-                subType = FormatDescriptor.SubType.Ascii;
+                subType = FormatDescriptor.SubType.LowAscii;
            } else if (radioStringMixedReverse.IsChecked == true) {
                type = FormatDescriptor.Type.StringReverse;
-                subType = FormatDescriptor.SubType.Ascii;
+                subType = FormatDescriptor.SubType.LowAscii;
            } else if (radioStringNullTerm.IsChecked == true) {
                type = FormatDescriptor.Type.StringNullTerm;
-                subType = FormatDescriptor.SubType.Ascii;
+                subType = FormatDescriptor.SubType.LowAscii;
            } else if (radioStringLen8.IsChecked == true) {
                type = FormatDescriptor.Type.StringL8;
-                subType = FormatDescriptor.SubType.Ascii;
+                subType = FormatDescriptor.SubType.LowAscii;
            } else if (radioStringLen16.IsChecked == true) {
                type = FormatDescriptor.Type.StringL16;
-                subType = FormatDescriptor.SubType.Ascii;
+                subType = FormatDescriptor.SubType.LowAscii;
            } else if (radioStringDci.IsChecked == true) {
                type = FormatDescriptor.Type.StringDci;
-                subType = FormatDescriptor.SubType.Ascii;
+                subType = FormatDescriptor.SubType.LowAscii;
            } else {
                Debug.Assert(false);
                // default/none
@ -762,8 +765,8 @@ namespace SourceGen.WpfGui {
            // length.  Either way, we only need to create the descriptor once.  (This is
            // safe because FormatDescriptor instances are immutable.)
            //
-            // Because certain details, like the fill byte and high-vs-low ASCII, are pulled
-            // out of the data stream at format time, we don't have to dig for them now.
+            // The one exception to this is ASCII values for non-string data, because we have
+            // to dig the low vs. high value out of the data itself.
            FormatDescriptor dfd;
            if (subType == FormatDescriptor.SubType.Symbol) {
                dfd = FormatDescriptor.Create(chunkLength, symbolRef,
@ -771,8 +774,19 @@ namespace SourceGen.WpfGui {
            } else {
                dfd = FormatDescriptor.Create(chunkLength, type, subType);
            }
-
            while (low <= high) {
+                if (subType == FormatDescriptor.SubType.ASCII_GENERIC) {
+                    Debug.Assert(dfd.IsNumeric);
+                    int val = RawData.GetWord(mFileData, low, dfd.Length,
+                        type == FormatDescriptor.Type.NumericBE);
+                    FormatDescriptor.SubType actualSubType = (val > 0x7f) ?
+                        FormatDescriptor.SubType.HighAscii : FormatDescriptor.SubType.LowAscii;
+                    if (actualSubType != dfd.FormatSubType) {
+                        // replace the descriptor
+                        dfd = FormatDescriptor.Create(chunkLength, type, actualSubType);
+                    }
+                }
+
                Results.Add(low, dfd);
                low += chunkLength;
            }
@ -833,13 +847,12 @@ namespace SourceGen.WpfGui {
        /// <param name="offset">Offset of first byte.</param>
        /// <param name="length">Length of string.</param>
        /// <param name="subType">String sub-type.</param>
-        private void CreateStringOrByte(int offset, int length,
-                FormatDescriptor.SubType subType) {
+        private void CreateStringOrByte(int offset, int length, FormatDescriptor.SubType subType) {
            Debug.Assert(length > 0);
            if (length == 1) {
-                // single byte, output as single ASCII char rather than 1-byte string
-                // TODO(petscii): low/high?
-                CreateByteFD(offset, FormatDescriptor.SubType.Ascii);
+                // Single byte, output as single char rather than 1-byte string.  We use the
+                // same encoding as the rest of the string.
+                CreateByteFD(offset, subType);
            } else {
                FormatDescriptor dfd;
                dfd = FormatDescriptor.Create(length,
--- a/SourceGen/WpfGui/EditInstructionOperand.xaml.cs
+++ b/SourceGen/WpfGui/EditInstructionOperand.xaml.cs
@ -332,7 +332,8 @@ namespace SourceGen.WpfGui {
                case FormatDescriptor.SubType.Binary:
                    preview.Append(mFormatter.FormatBinaryValue(mOperandValue, 8));
                    break;
-                case FormatDescriptor.SubType.Ascii:
+                case FormatDescriptor.SubType.LowAscii:
+                case FormatDescriptor.SubType.HighAscii:
                    // TODO(petscii): encoding
                    preview.Append(mFormatter.FormatAsciiOrHex(mOperandValue));
                    break;
@ -470,8 +471,9 @@ namespace SourceGen.WpfGui {
                        case FormatDescriptor.SubType.Binary:
                            binaryButton.IsChecked = true;
                            break;
-                        case FormatDescriptor.SubType.Ascii:
-                        // TODO(petscii): encoding
+                        case FormatDescriptor.SubType.LowAscii:
+                        case FormatDescriptor.SubType.HighAscii:
+                            // TODO(petscii): encoding
                            asciiButton.IsChecked = true;
                            break;
                        case FormatDescriptor.SubType.Symbol:
@ -552,7 +554,11 @@ namespace SourceGen.WpfGui {
                subType = FormatDescriptor.SubType.Binary;
            } else if (asciiButton.IsChecked == true) {
                // TODO(petscii): encoding
-                subType = FormatDescriptor.SubType.Ascii;
+                if (mOperandValue > 0x7f) {
+                    subType = FormatDescriptor.SubType.HighAscii;
+                } else {
+                    subType = FormatDescriptor.SubType.LowAscii;
+                }
            } else if (symbolButton.IsChecked == true) {
                subType = FormatDescriptor.SubType.Symbol;
            } else {