Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into the Asm65 library. The assembly source generators have been updated to use it. This makes the individual generators simpler, and by virtue of avoiding "test runs" should make them slightly faster. This also introduces byte-to-character converters, though we're currently still only supporting low/high ASCII. Regression test output is unchanged.
2026-04-20 19:16:34 +00:00 · 2019-08-09 16:42:30 -07:00
parent 7a40d7f9bf
commit dae76d9b45
7 changed files with 383 additions and 443 deletions
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2019 faddenSoft
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Asm65 {
+    /// <summary>
+    /// Helpers for mapping raw byte values onto printable characters.
+    /// </summary>
+    public static class CharEncoding {
+        public const char UNPRINTABLE_CHAR = '\ufffd';
+
+        /// <summary>
+        /// Reports whether the byte value is a member of a character set.
+        /// </summary>
+        public delegate bool InclusionTest(byte val);
+
+        /// <summary>
+        /// Maps the byte value to a printable character.  Yields UNPRINTABLE_CHAR when
+        /// the value has no printable representation.
+        /// </summary>
+        /// <remarks>
+        /// The result is a single char, so the mapped character is assumed to live in
+        /// the Unicode BMP.  That should hold for 8-bit computer character sets.
+        /// </remarks>
+        public delegate char Convert(byte val);
+
+        //
+        // Standard ASCII.
+        //
+
+        // Printable means space ($20) through tilde ($7e); DEL ($7f) is excluded.
+        public static bool IsPrintableLowAscii(byte val) => val >= 0x20 && val <= 0x7e;
+
+        // Printable characters plus line feed ($0a) and carriage return ($0d).
+        public static bool IsExtendedLowAscii(byte val) =>
+            val == 0x0a || val == 0x0d || IsPrintableLowAscii(val);
+
+        public static char ConvertLowAscii(byte val) =>
+            IsPrintableLowAscii(val) ? (char)val : UNPRINTABLE_CHAR;
+
+        //
+        // Standard ASCII, but with the high bit set.
+        //
+
+        // Same range as low ASCII, shifted up by $80 ($a0 through $fe).
+        public static bool IsPrintableHighAscii(byte val) => val >= 0xa0 && val <= 0xfe;
+
+        // Printable characters plus high-bit line feed ($8a) and carriage return ($8d).
+        public static bool IsExtendedHighAscii(byte val) =>
+            val == 0x8a || val == 0x8d || IsPrintableHighAscii(val);
+
+        public static char ConvertHighAscii(byte val) =>
+            IsPrintableHighAscii(val) ? (char)(val & 0x7f) : UNPRINTABLE_CHAR;
+
+        //
+        // High *or* low ASCII.
+        //
+        public static char ConvertLowAndHighAscii(byte val) {
+            // Both printable ranges collapse onto $20-$7e once the high bit is
+            // stripped, so a single range check on the masked value covers them.
+            byte stripped = (byte)(val & 0x7f);
+            return IsPrintableLowAscii(stripped) ? (char)stripped : UNPRINTABLE_CHAR;
+        }
+
+    }
+}
@@ -14,35 +14,30 @@
 * limitations under the License.
 */
 using System;
+using System.Collections.Generic;
 using System.Diagnostics;

-namespace SourceGen.AsmGen {
+namespace Asm65 {
    /// <summary>
-    /// Multi-line string gatherer.  Accumulates characters and raw bytes, emitting
-    /// them when we have a full operand's worth.
-    /// 
-    /// If the delimiter character appears, it will be output inline as a raw byte.
-    /// The low-ASCII string ['hello'world'] will become [27,'hello',27,'world',27]
-    /// (or something similar).
+    /// String pseudo-op formatter.  Handles character encoding conversion and quoting of
+    /// delimiters and non-printable characters.
    /// </summary>
-    public class StringGather {
-        // Inputs.
-        public IGenerator Gen { get; private set; }
-        public string Label { get; private set; }
-        public string Opcode { get; private set; }
-        public string Comment { get; private set; }
-        public char Delimiter { get; private set; }
-        public char DelimiterReplacement { get; private set; }
-        public ByteStyle ByteStyleX { get; private set; }
-        public int MaxOperandLen { get; private set; }
-        public bool IsTestRun { get; private set; }
+    public class StringOpFormatter {
+        public CharEncoding.Convert CharConv { get; set; }

-        public enum ByteStyle { DenseHex, CommaSep };
+        private char Delimiter { get; set; }
+        private RawOutputStyle RawStyle { get; set; }
+        private int MaxOperandLen { get; set; }
+
+        // Output format for raw (non-printable) characters.  Most assemblers use comma-separated
+        // hex values, some allow dense hex strings.
+        public enum RawOutputStyle { DenseHex, CommaSep };

        // Outputs.
-        public bool HasDelimiter { get; private set; }
-        public int NumLinesOutput { get; private set; }
+        public bool HasEscapedText { get; private set; }
+        public List<string> Lines { get; private set; }

+        // Reference to array with 16 hex digits.  (May be upper or lower case.)
        private char[] mHexChars;

        /// <summary>
@@ -50,12 +45,13 @@ namespace SourceGen.AsmGen {
        /// because they're mixed with bytes, particularly when we have to escape the
        /// delimiter character.  Strings might start or end with escaped delimiters,
        /// so we don't add them until we have to.
+        /// </summary>
        private char[] mBuffer;

        /// <summary>
        /// Next available character position.
        /// </summary>
-        private int mIndex = 0;
+        private int mIndex;

        /// <summary>
        /// State of the buffer, based on the last thing we added.
@@ -64,48 +60,51 @@ namespace SourceGen.AsmGen {
            Unknown = 0,
            StartOfLine,
            InQuote,
-            OutQuote
+            OutQuote,
+            Finished
        }
-        private State mState = State.StartOfLine;
+        private State mState;

        /// <summary>
        /// Constructor.
        /// </summary>
-        /// <param name="gen">Reference back to generator, for output function and
-        ///   format options.</param>
-        /// <param name="label">Line label.  Appears on first output line only.</param>
-        /// <param name="opcode">Opcode to use for all lines.</param>
-        /// <param name="comment">End-of-line comment.  Appears on first output line
-        ///   only.</param>
+        /// <param name="formatter">Reference to text formatter.</param>
        /// <param name="delimiter">String delimiter character.</param>
-        /// <param name="isTestRun">If true, no file output is produced.</param>
-        public StringGather(IGenerator gen, string label, string opcode,
-                string comment, char delimiter, char delimReplace, ByteStyle byteStyle,
-                int maxOperandLen, bool isTestRun) {
-            Gen = gen;
-            Label = label;
-            Opcode = opcode;
-            Comment = comment;
+        /// <param name="byteStyle">How to format raw byte data.</param>
+        /// <param name="maxOperandLen">Maximum line length.</param>
+        /// <param name="charConv">Character conversion delegate.</param>
+        public StringOpFormatter(Formatter formatter, char delimiter, RawOutputStyle byteStyle,
+                int maxOperandLen, CharEncoding.Convert charConv) {
            Delimiter = delimiter;
-            DelimiterReplacement = delimReplace;
-            ByteStyleX = byteStyle;
+            RawStyle = byteStyle;
            MaxOperandLen = maxOperandLen;
-            IsTestRun = isTestRun;
+            CharConv = charConv;

+            // Scratch buffer is sized to hold one full operand.
            mBuffer = new char[MaxOperandLen];
-            mHexChars = Gen.SourceFormatter.HexDigits;
+            // Hex digit table is supplied by the text formatter.
+            mHexChars = formatter.HexDigits;
+            Lines = new List<string>();
+
+            // Establish the initial state; Lines must exist before this call.
+            Reset();
+        }
+
+        /// <summary>
+        /// Returns the formatter to its initial state so another string can be fed in.
+        /// Discards any accumulated output lines and clears the escaped-text flag.
+        /// </summary>
+        public void Reset() {
+            mState = State.StartOfLine;
+            mIndex = 0;
+            // Output flag describes the current string only; don't let a value from
+            // the previous run leak into the next one.
+            HasEscapedText = false;
+            Lines.Clear();
        }

        /// <summary>
-        /// Write a character into the buffer.
+        /// Write a character into the buffer.  If the character matches the delimiter, or
+        /// isn't printable, the raw character value will be written as a byte instead.
        /// </summary>
-        /// <param name="ch">Character to add.</param>
-        public void WriteChar(char ch) {
-            Debug.Assert(ch >= 0 && ch <= 0xff);
-            if (ch == Delimiter) {
+        /// <param name="rawCh">Raw character value.</param>
+        public void WriteChar(byte rawCh) {
+            Debug.Assert(mState != State.Finished);
+
+            char ch = CharConv(rawCh);
+            if (ch == Delimiter || ch == CharEncoding.UNPRINTABLE_CHAR) {
                // Must write it as a byte.
-                HasDelimiter = true;
-                WriteByte((byte)DelimiterReplacement);
+                WriteByte(rawCh);
                return;
            }

@@ -146,6 +145,10 @@ namespace SourceGen.AsmGen {
        /// </summary>
        /// <param name="val">Value to add.</param>
        public void WriteByte(byte val) {
+            Debug.Assert(mState != State.Finished);
+
+            HasEscapedText = true;
+
            // If we're at the start of a line, just output the byte.
            // If we're inside quotes, emit a delimiter, comma, and the byte.  We must
            //   have space for four (DenseHex) or five (CommaSep) chars.
@@ -155,7 +158,7 @@ namespace SourceGen.AsmGen {
                case State.StartOfLine:
                    break;
                case State.InQuote:
-                    int minWidth = (ByteStyleX == ByteStyle.CommaSep) ? 5 : 4;
+                    int minWidth = (RawStyle == RawOutputStyle.CommaSep) ? 5 : 4;
                    if (mIndex + minWidth > MaxOperandLen) {
                        Flush();
                    } else {
@@ -164,11 +167,11 @@ namespace SourceGen.AsmGen {
                    }
                    break;
                case State.OutQuote:
-                    minWidth = (ByteStyleX == ByteStyle.CommaSep) ? 4 : 2;
+                    minWidth = (RawStyle == RawOutputStyle.CommaSep) ? 4 : 2;
                    if (mIndex + minWidth > MaxOperandLen) {
                        Flush();
                    } else {
-                        if (ByteStyleX == ByteStyle.CommaSep) {
+                        if (RawStyle == RawOutputStyle.CommaSep) {
                            mBuffer[mIndex++] = ',';
                        }
                    }
@@ -178,7 +181,7 @@ namespace SourceGen.AsmGen {
                    break;
            }

-            if (ByteStyleX == ByteStyle.CommaSep) {
+            if (RawStyle == RawOutputStyle.CommaSep) {
                mBuffer[mIndex++] = '$';
            }
            mBuffer[mIndex++] = mHexChars[val >> 4];
@@ -202,26 +205,65 @@ namespace SourceGen.AsmGen {
                    // empty string; put out a pair of delimiters
                    mBuffer[mIndex++] = Delimiter;
                    mBuffer[mIndex++] = Delimiter;
-                    NumLinesOutput++;
                    break;
                case State.InQuote:
                    // add delimiter and finish
                    mBuffer[mIndex++] = Delimiter;
-                    NumLinesOutput++;
                    break;
                case State.OutQuote:
                    // just output it
-                    NumLinesOutput++;
                    break;
            }
-            if (!IsTestRun) {
-                Gen.OutputLine(Label, Opcode, new string(mBuffer, 0, mIndex),
-                    Comment);
-            }
-            mIndex = 0;

-            // Erase these after first use so we don't put them on every line.
-            Label = Comment = string.Empty;
+            string newStr = new string(mBuffer, 0, mIndex);
+            Debug.Assert(newStr.Length <= MaxOperandLen);
+            Lines.Add(newStr);
+
+            mState = State.Finished;
+
+            mIndex = 0;
+        }
+
+        /// <summary>
+        /// Feeds a run of bytes through the formatter, then finishes the output.
+        /// </summary>
+        /// <param name="data">Buffer holding the string data.</param>
+        /// <param name="offset">Offset of the first byte, including any leading bytes.</param>
+        /// <param name="length">Total number of bytes to process, including the
+        ///   leading bytes.</param>
+        /// <param name="leadingBytes">Number of bytes at the start that are written as
+        ///   raw byte values rather than characters.</param>
+        /// <param name="reverse">If true, the text is written in reverse byte order, one
+        ///   line-sized chunk at a time.</param>
+        public void FeedBytes(byte[] data, int offset, int length, int leadingBytes,
+                bool reverse) {
+            int strEndOffset = offset + length;
+
+            // Write leading bytes.  This is used for the 8- or 16-bit length (when no
+            // appropriate pseudo-op is available), because we want to output that as hex
+            // even if it maps to a printable character.
+            while (leadingBytes-- > 0) {
+                WriteByte(data[offset++]);
+            }
+            if (reverse) {
+                // Max per line is line length minus the two delimiters.  We don't allow
+                // any hex quoting in reversed text, so this always works.  (If somebody
+                // does try to reverse text with delimiters or unprintable chars, we'll
+                // blow out the line limit, but for a cross-assembler that should be purely
+                // cosmetic.)
+                //
+                // The text to reverse begins after the leading bytes.  Starting from the
+                // original offset would emit the leading bytes a second time as characters.
+                int textStart = offset;
+                int textLength = strEndOffset - textStart;
+                int maxPerLine = MaxOperandLen - 2;
+                int numBlockLines = (textLength + maxPerLine - 1) / maxPerLine;
+
+                for (int chunk = 0; chunk < numBlockLines; chunk++) {
+                    int chunkOffset = textStart + chunk * maxPerLine;
+                    int endOffset = Math.Min(chunkOffset + maxPerLine, strEndOffset);
+                    for (int off = endOffset - 1; off >= chunkOffset; off--) {
+                        WriteChar(data[off]);
+                    }
+                }
+            } else {
+                for (; offset < strEndOffset; offset++) {
+                    WriteChar(data[offset]);
+                }
+            }
+
+            Finish();
        }
    }
 }
@@ -542,74 +542,25 @@ namespace SourceGen.AsmGen {

            bool highAscii = false;
            int leadingBytes = 0;
-            int trailingBytes = 0;
-            bool showLeading = false;
-            bool showTrailing = false;

            switch (dfd.FormatType) {
                case FormatDescriptor.Type.StringGeneric:
-                    highAscii = (data[offset] & 0x80) != 0;
-                    break;
+                case FormatDescriptor.Type.StringReverse:
+                case FormatDescriptor.Type.StringNullTerm:
                case FormatDescriptor.Type.StringDci:
                    highAscii = (data[offset] & 0x80) != 0;
-                    trailingBytes = 1;
-                    showTrailing = true;
-                    break;
-                case FormatDescriptor.Type.StringReverse:
-                    highAscii = (data[offset] & 0x80) != 0;
-                    break;
-                case FormatDescriptor.Type.StringNullTerm:
-                    highAscii = (data[offset] & 0x80) != 0;
-                    trailingBytes = 1;
-                    showTrailing = true;
                    break;
                case FormatDescriptor.Type.StringL8:
                    if (dfd.Length > 1) {
                        highAscii = (data[offset + 1] & 0x80) != 0;
                    }
                    leadingBytes = 1;
-                    showLeading = true;
                    break;
                case FormatDescriptor.Type.StringL16:
                    if (dfd.Length > 2) {
                        highAscii = (data[offset + 2] & 0x80) != 0;
                    }
                    leadingBytes = 2;
-                    showLeading = true;
-                    break;
-                default:
-                    Debug.Assert(false);
-                    return;
-            }
-
-            char delim = '"';
-            StringGather gath = null;
-
-            // Run the string through so we can see if it'll fit on one line.  As a minor
-            // optimization, we skip this step for "generic" strings, which are probably
-            // the most common thing.
-            if (dfd.FormatSubType != FormatDescriptor.SubType.None || highAscii) {
-                gath = new StringGather(this, labelStr, "???", commentStr, delim,
-                        delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, true);
-                FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading,
-                    trailingBytes, showTrailing);
-                Debug.Assert(gath.NumLinesOutput > 0);
-            }
-
-            string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric);
-
-            switch (dfd.FormatType) {
-                case FormatDescriptor.Type.StringGeneric:
-                    // TODO(someday): something fancy with encodings to handle high-ASCII text?
-                    break;
-                case FormatDescriptor.Type.StringDci:
-                case FormatDescriptor.Type.StringReverse:
-                    // Fully configured above.
-                    break;
-                case FormatDescriptor.Type.StringNullTerm:
-                case FormatDescriptor.Type.StringL8:
-                case FormatDescriptor.Type.StringL16:
-                    // Implement as macro?
                    break;
                default:
                    Debug.Assert(false);
@@ -621,35 +572,16 @@ namespace SourceGen.AsmGen {
                return;
            }

-            // Create a new StringGather, with the final opcode choice.
-            gath = new StringGather(this, labelStr, opcodeStr, commentStr, delim,
-                delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, false);
-            FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading,
-                trailingBytes, showTrailing);
-        }
+            StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, '"',
+                StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN,
+                CharEncoding.ConvertLowAscii);
+            stropf.FeedBytes(data, offset, dfd.Length, leadingBytes, false);

-        /// <summary>
-        /// Feeds the bytes into the StringGather.
-        /// </summary>
-        private void FeedGath(StringGather gath, byte[] data, int offset, int length,
-                int leadingBytes, bool showLeading, int trailingBytes, bool showTrailing) {
-            int startOffset = offset;
-            int strEndOffset = offset + length - trailingBytes;
-
-            if (showLeading) {
-                while (leadingBytes-- > 0) {
-                    gath.WriteByte(data[offset++]);
-                }
-            } else {
-                offset += leadingBytes;
+            string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric);
+            foreach (string str in stropf.Lines) {
+                OutputLine(labelStr, opcodeStr, str, commentStr);
+                labelStr = commentStr = string.Empty;       // only show on first
            }
-            for (; offset < strEndOffset; offset++) {
-                gath.WriteChar((char)(data[offset] & 0x7f));
-            }
-            while (showTrailing && trailingBytes-- > 0) {
-                gath.WriteByte(data[offset++]);
-            }
-            gath.Finish();
        }
    }

@@ -598,135 +598,89 @@ namespace SourceGen.AsmGen {
            bool highAscii = false;
            int leadingBytes = 0;
            int trailingBytes = 0;
-            bool showLeading = false;
-            bool showTrailing = false;

            switch (dfd.FormatType) {
                case FormatDescriptor.Type.StringGeneric:
-                    highAscii = (data[offset] & 0x80) != 0;
-                    break;
-                case FormatDescriptor.Type.StringDci:
-                    highAscii = (data[offset] & 0x80) != 0;
-                    trailingBytes = 1;
-                    showTrailing = true;
-                    break;
                case FormatDescriptor.Type.StringReverse:
+                case FormatDescriptor.Type.StringDci:
                    highAscii = (data[offset] & 0x80) != 0;
                    break;
                case FormatDescriptor.Type.StringNullTerm:
                    highAscii = (data[offset] & 0x80) != 0;
                    trailingBytes = 1;
-                    showTrailing = true;
                    break;
                case FormatDescriptor.Type.StringL8:
                    if (dfd.Length > 1) {
                        highAscii = (data[offset + 1] & 0x80) != 0;
                    }
                    leadingBytes = 1;
-                    showLeading = true;
                    break;
                case FormatDescriptor.Type.StringL16:
                    if (dfd.Length > 2) {
                        highAscii = (data[offset + 2] & 0x80) != 0;
                    }
                    leadingBytes = 2;
-                    showLeading = true;
                    break;
                default:
                    Debug.Assert(false);
                    return;
            }

-            char delim = '"';
-            StringGather gath = null;
-
-            // Run the string through so we can see if it'll fit on one line.  As a minor
-            // optimization, we skip this step for "generic" strings, which are probably
-            // the most common thing.
-            if (dfd.FormatSubType != FormatDescriptor.SubType.None || highAscii) {
-                gath = new StringGather(this, labelStr, "???", commentStr, delim,
-                        delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, true);
-                FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading,
-                    trailingBytes, showTrailing);
-                Debug.Assert(gath.NumLinesOutput > 0);
-            }
-
-            string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric);
-
-            switch (dfd.FormatType) {
-                case FormatDescriptor.Type.StringGeneric:
-                    // Special case for simple short high-ASCII strings.  These have no
-                    // leading or trailing bytes.  We can improve this a bit by handling
-                    // arbitrarily long strings by simply breaking them across lines.
-                    Debug.Assert(leadingBytes == 0);
-                    Debug.Assert(trailingBytes == 0);
-                    if (highAscii && gath.NumLinesOutput == 1 && !gath.HasDelimiter) {
-                        if (!mHighAsciiMacroOutput) {
-                            mHighAsciiMacroOutput = true;
-                            // Output a macro for high-ASCII strings.
-                            OutputLine(".macro", "HiAscii", "Arg", string.Empty);
-                            OutputLine(string.Empty, ".repeat", ".strlen(Arg), I", string.Empty);
-                            OutputLine(string.Empty, ".byte", ".strat(Arg, I) | $80", string.Empty);
-                            OutputLine(string.Empty, ".endrep", string.Empty, string.Empty);
-                            OutputLine(".endmacro", string.Empty, string.Empty, string.Empty);
-                        }
-                        opcodeStr = formatter.FormatPseudoOp("HiAscii");
-                        highAscii = false;
-                    }
-                    break;
-                case FormatDescriptor.Type.StringDci:
-                case FormatDescriptor.Type.StringReverse:
-                    // Full configured above.
-                    break;
-                case FormatDescriptor.Type.StringNullTerm:
-                    if (gath.NumLinesOutput == 1 && !gath.HasDelimiter) {
-                        opcodeStr = sDataOpNames.StrNullTerm;
-                        showTrailing = false;
-                    }
-                    break;
-                case FormatDescriptor.Type.StringL8:
-                case FormatDescriptor.Type.StringL16:
-                    // Implement macros?
-                    break;
-                default:
-                    Debug.Assert(false);
-                    return;
-            }
-
-            if (highAscii) {
+            if (highAscii && dfd.FormatType != FormatDescriptor.Type.StringGeneric) {
                OutputNoJoy(offset, dfd.Length, labelStr, commentStr);
                return;
            }

-            // Create a new StringGather, with the final opcode choice.
-            gath = new StringGather(this, labelStr, opcodeStr, commentStr, delim,
-                delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, false);
-            FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading,
-                trailingBytes, showTrailing);
-        }
-
-        /// <summary>
-        /// Feeds the bytes into the StringGather.
-        /// </summary>
-        private void FeedGath(StringGather gath, byte[] data, int offset, int length,
-                int leadingBytes, bool showLeading, int trailingBytes, bool showTrailing) {
-            int startOffset = offset;
-            int strEndOffset = offset + length - trailingBytes;
-
-            if (showLeading) {
-                while (leadingBytes-- > 0) {
-                    gath.WriteByte(data[offset++]);
-                }
+            CharEncoding.Convert charConv;
+            if (highAscii) {
+                charConv = CharEncoding.ConvertHighAscii;
            } else {
-                offset += leadingBytes;
+                charConv = CharEncoding.ConvertLowAscii;
            }
-            for (; offset < strEndOffset; offset++) {
-                gath.WriteChar((char)(data[offset] & 0x7f));
+
+            StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, '"',
+                StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN,
+                charConv);
+            stropf.FeedBytes(data, offset, dfd.Length - trailingBytes, leadingBytes, false);
+
+            string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric);
+
+            if (highAscii) {
+                // Does this fit the narrow definition of what we can do with a macro?
+                Debug.Assert(dfd.FormatType == FormatDescriptor.Type.StringGeneric);
+                if (stropf.Lines.Count == 1 && !stropf.HasEscapedText) {
+                    if (!mHighAsciiMacroOutput) {
+                        mHighAsciiMacroOutput = true;
+                        // Output a macro for high-ASCII strings.
+                        OutputLine(".macro", "HiAscii", "Arg", string.Empty);
+                        OutputLine(string.Empty, ".repeat", ".strlen(Arg), I", string.Empty);
+                        OutputLine(string.Empty, ".byte", ".strat(Arg, I) | $80", string.Empty);
+                        OutputLine(string.Empty, ".endrep", string.Empty, string.Empty);
+                        OutputLine(".endmacro", string.Empty, string.Empty, string.Empty);
+                    }
+                    opcodeStr = formatter.FormatPseudoOp("HiAscii");
+                } else {
+                    // didn't work out, dump hex
+                    OutputNoJoy(offset, dfd.Length, labelStr, commentStr);
+                    return;
+                }
            }
-            while (showTrailing && trailingBytes-- > 0) {
-                gath.WriteByte(data[offset++]);
+
+            if (dfd.FormatType == FormatDescriptor.Type.StringNullTerm) {
+                if (stropf.Lines.Count == 1 && !stropf.HasEscapedText) {
+                    // Keep it.
+                    opcodeStr = sDataOpNames.StrNullTerm;
+                } else {
+                    // Didn't fit, so re-emit it, this time with the terminating null byte.
+                    stropf.Reset();
+                    stropf.FeedBytes(data, offset, dfd.Length, leadingBytes, false);
+                }
+            }
+
+            foreach (string str in stropf.Lines) {
+                OutputLine(labelStr, opcodeStr, str, commentStr);
+                labelStr = commentStr = string.Empty;       // only show on first
            }
-            gath.Finish();
        }
    }

@@ -448,8 +448,6 @@ namespace SourceGen.AsmGen {
        }


-        private enum RevMode { Forward, Reverse, BlockReverse };
-
        private void OutputString(int offset, string labelStr, string commentStr) {
            // This gets complicated.
            //
@@ -467,6 +465,11 @@ namespace SourceGen.AsmGen {
            // For aesthetic purposes, zero-length CString, L8String, and L16String
            // should be output as DFB/DW zeroes rather than an empty string -- makes
            // it easier to read.
+            //
+            // NOTE: we generally assume that the input is in the correct format, e.g.
+            // the length byte in a StringL8 matches dfd.Length, and the high bits in DCI strings
+            // have the right pattern.  If not, we will generate bad output.  This would need
+            // to be scanned and corrected at a higher level.

            Formatter formatter = SourceFormatter;
            byte[] data = Project.FileData;
@@ -477,127 +480,115 @@ namespace SourceGen.AsmGen {
            Debug.Assert(dfd.Length > 0);

            bool highAscii = false;
-            int showZeroes = 0;
+            bool reverse = false;
            int leadingBytes = 0;
-            int trailingBytes = 0;
-            bool showLeading = false;
-            bool showTrailing = false;
-            RevMode revMode = RevMode.Forward;
+            string opcodeStr;

            switch (dfd.FormatType) {
                case FormatDescriptor.Type.StringGeneric:
-                    highAscii = (data[offset] & 0x80) != 0;
-                    break;
-                case FormatDescriptor.Type.StringDci:
+                    opcodeStr = sDataOpNames.StrGeneric;
                    highAscii = (data[offset] & 0x80) != 0;
                    break;
                case FormatDescriptor.Type.StringReverse:
+                    opcodeStr = sDataOpNames.StrReverse;
                    highAscii = (data[offset] & 0x80) != 0;
-                    revMode = RevMode.Reverse;
+                    reverse = true;
                    break;
                case FormatDescriptor.Type.StringNullTerm:
+                    opcodeStr = sDataOpNames.StrGeneric;        // no pseudo-op for this
                    highAscii = (data[offset] & 0x80) != 0;
                    if (dfd.Length == 1) {
-                        showZeroes = 1;     // empty null-terminated string
+                        // Empty string.  Just output the null terminator.
+                        GenerateShortSequence(offset, 1, out string opcode, out string operand);
+                        OutputLine(labelStr, opcode, operand, commentStr);
+                        return;
                    }
-                    trailingBytes = 1;
-                    showTrailing = true;
                    break;
                case FormatDescriptor.Type.StringL8:
+                    opcodeStr = sDataOpNames.StrLen8;
                    if (dfd.Length > 1) {
                        highAscii = (data[offset + 1] & 0x80) != 0;
-                    } else {
-                        //showZeroes = 1;
                    }
                    leadingBytes = 1;
                    break;
                case FormatDescriptor.Type.StringL16:
+                    opcodeStr = sDataOpNames.StrLen16;
                    if (dfd.Length > 2) {
                        highAscii = (data[offset + 2] & 0x80) != 0;
-                    } else {
-                        //showZeroes = 2;
                    }
                    leadingBytes = 2;
                    break;
+                case FormatDescriptor.Type.StringDci:
+                    opcodeStr = sDataOpNames.StrDci;
+                    highAscii = (data[offset] & 0x80) != 0;
+                    break;
                default:
                    Debug.Assert(false);
                    return;
            }

-            if (showZeroes != 0) {
-                // Empty string.  Just output the length byte(s) or null terminator.
-                GenerateShortSequence(offset, showZeroes, out string opcode, out string operand);
-                OutputLine(labelStr, opcode, operand, commentStr);
-                return;
-            }
-
            // Merlin 32 uses single-quote for low ASCII, double-quote for high ASCII.  When
            // quoting the delimiter we use a hexadecimal value.  We need to bear in mind that
            // we're forcing the characters to low ASCII, but the actual character being
            // escaped might be in high ASCII.  Hence delim vs. delimReplace.
            char delim = highAscii ? '"' : '\'';
-            char delimReplace = highAscii ? ((char)(delim | 0x80)) : delim;
-            StringGather gath = null;
-
-            // Run the string through so we can see if it'll fit on one line.  As a minor
-            // optimization, we skip this step for "generic" strings, which are probably
-            // the most common thing.
-            if (dfd.FormatSubType != FormatDescriptor.SubType.None) {
-                gath = new StringGather(this, labelStr, "???", commentStr, delim,
-                        delimReplace, StringGather.ByteStyle.DenseHex, MAX_OPERAND_LEN, true);
-                FeedGath(gath, data, offset, dfd.Length, revMode, leadingBytes, showLeading,
-                    trailingBytes, showTrailing);
-                Debug.Assert(gath.NumLinesOutput > 0);
+            CharEncoding.Convert charConv;
+            if (highAscii) {
+                charConv = CharEncoding.ConvertHighAscii;
+            } else {
+                charConv = CharEncoding.ConvertLowAscii;
            }

-            string opcodeStr;
+            StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, delim,
+                StringOpFormatter.RawOutputStyle.DenseHex, MAX_OPERAND_LEN, charConv);
+            if (dfd.FormatType == FormatDescriptor.Type.StringDci) {
+                // DCI is awkward because the character encoding flips on the last byte.  Rather
+                // than clutter up StringOpFormatter for this rare item, we just accept both
+                // throughout.
+                stropf.CharConv = CharEncoding.ConvertLowAndHighAscii;
+            }

+            // Feed bytes in, skipping over the leading length bytes.
+            stropf.FeedBytes(data, offset + leadingBytes,
+                dfd.Length - leadingBytes, 0, reverse);
+            Debug.Assert(stropf.Lines.Count > 0);
+
+            // See if we need to do this over.
+            bool redo = false;
            switch (dfd.FormatType) {
                case FormatDescriptor.Type.StringGeneric:
-                    opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric;
-                    break;
-                case FormatDescriptor.Type.StringDci:
-                    if (gath.NumLinesOutput == 1) {
-                        opcodeStr = highAscii ? sDataOpNames.StrDciHi : sDataOpNames.StrDci;
-                    } else {
-                        opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric;
-                        trailingBytes = 1;
-                        showTrailing = true;
-                    }
+                case FormatDescriptor.Type.StringNullTerm:
                    break;
                case FormatDescriptor.Type.StringReverse:
-                    if (gath.HasDelimiter) {
-                        // can't include escaped delimiters in REV
-                        opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric;
-                        revMode = RevMode.Forward;
-                    } else if (gath.NumLinesOutput > 1) {
-                        opcodeStr = highAscii ? sDataOpNames.StrReverseHi : sDataOpNames.StrReverse;
-                        revMode = RevMode.BlockReverse;
-                    } else {
-                        opcodeStr = highAscii ? sDataOpNames.StrReverseHi : sDataOpNames.StrReverse;
-                        Debug.Assert(revMode == RevMode.Reverse);
+                    if (stropf.HasEscapedText) {
+                        // can't include escaped characters in REV
+                        opcodeStr = sDataOpNames.StrGeneric;
+                        reverse = false;
+                        redo = true;
                    }
                    break;
-                case FormatDescriptor.Type.StringNullTerm:
-                    //opcodeStr = sDataOpNames.StrNullTerm[highAscii ? 1 : 0];
-                    opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric;
-                    break;
                case FormatDescriptor.Type.StringL8:
-                    if (gath.NumLinesOutput == 1) {
-                        opcodeStr = highAscii ? sDataOpNames.StrLen8Hi : sDataOpNames.StrLen8;
-                    } else {
-                        opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric;
+                    if (stropf.Lines.Count != 1) {
+                        // single-line only
+                        opcodeStr = sDataOpNames.StrGeneric;
                        leadingBytes = 1;
-                        showLeading = true;
+                        redo = true;
                    }
                    break;
                case FormatDescriptor.Type.StringL16:
-                    if (gath.NumLinesOutput == 1) {
-                        opcodeStr = highAscii ? sDataOpNames.StrLen16Hi : sDataOpNames.StrLen16;
-                    } else {
-                        opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric;
+                    if (stropf.Lines.Count != 1) {
+                        // single-line only
+                        opcodeStr = sDataOpNames.StrGeneric;
                        leadingBytes = 2;
-                        showLeading = true;
+                        redo = true;
+                    }
+                    break;
+                case FormatDescriptor.Type.StringDci:
+                    if (stropf.Lines.Count != 1) {
+                        // single-line only
+                        opcodeStr = sDataOpNames.StrGeneric;
+                        stropf.CharConv = charConv;
+                        redo = true;
                    }
                    break;
                default:
@@ -605,61 +596,21 @@ namespace SourceGen.AsmGen {
                    return;
            }

+            if (redo) {
+                //Debug.WriteLine("REDO off=+" + offset.ToString("x6") + ": " + dfd.FormatType);
+
+                // This time, instead of skipping over leading length bytes, we include them
+                // explicitly.
+                stropf.Reset();
+                stropf.FeedBytes(data, offset, dfd.Length, leadingBytes, reverse);
+            }
+
            opcodeStr = formatter.FormatPseudoOp(opcodeStr);

-            // Create a new StringGather, with the final opcode choice.
-            gath = new StringGather(this, labelStr, opcodeStr, commentStr, delim,
-                delimReplace, StringGather.ByteStyle.DenseHex, MAX_OPERAND_LEN, false);
-            FeedGath(gath, data, offset, dfd.Length, revMode, leadingBytes, showLeading,
-                trailingBytes, showTrailing);
-        }
-
-        /// <summary>
-        /// Feeds the bytes into the StringGather.
-        /// </summary>
-        private void FeedGath(StringGather gath, byte[] data, int offset, int length,
-                RevMode revMode, int leadingBytes, bool showLeading,
-                int trailingBytes, bool showTrailing) {
-            int startOffset = offset;
-            int strEndOffset = offset + length - trailingBytes;
-
-            if (showLeading) {
-                while (leadingBytes-- > 0) {
-                    gath.WriteByte(data[offset++]);
-                }
-            } else {
-                offset += leadingBytes;
+            foreach (string str in stropf.Lines) {
+                OutputLine(labelStr, opcodeStr, str, commentStr);
+                labelStr = commentStr = string.Empty;       // only show on first
            }
-            if (revMode == RevMode.BlockReverse) {
-                const int maxPerLine = MAX_OPERAND_LEN - 2;
-                int numBlockLines = (length + maxPerLine - 1) / maxPerLine;
-
-                for (int chunk = 0; chunk < numBlockLines; chunk++) {
-                    int chunkOffset = startOffset + chunk * maxPerLine;
-                    int endOffset = chunkOffset + maxPerLine;
-                    if (endOffset > strEndOffset) {
-                        endOffset = strEndOffset;
-                    }
-                    for (int off = endOffset - 1; off >= chunkOffset; off--) {
-                        gath.WriteChar((char)(data[off] & 0x7f));
-                    }
-                }
-            } else {
-                for (; offset < strEndOffset; offset++) {
-                    if (revMode == RevMode.Forward) {
-                        gath.WriteChar((char)(data[offset] & 0x7f));
-                    } else if (revMode == RevMode.Reverse) {
-                        int posn = startOffset + (strEndOffset - offset) - 1;
-                        gath.WriteChar((char)(data[posn] & 0x7f));
-                    } else {
-                        Debug.Assert(false);
-                    }
-                }
-            }
-            while (showTrailing && trailingBytes-- > 0) {
-                gath.WriteByte(data[offset++]);
-            }
-            gath.Finish();
        }
    }

@@ -533,85 +533,36 @@ namespace SourceGen.AsmGen {
            Debug.Assert(dfd.Length > 0);

            bool highAscii = false;
-            int leadingBytes = 0;
+            int hiddenLeadingBytes = 0;
+            int shownLeadingBytes = 0;
            int trailingBytes = 0;
-            bool showLeading = false;
-            bool showTrailing = false;
+            string opcodeStr;

            switch (dfd.FormatType) {
                case FormatDescriptor.Type.StringGeneric:
-                    highAscii = (data[offset] & 0x80) != 0;
-                    break;
-                case FormatDescriptor.Type.StringDci:
-                    highAscii = (data[offset] & 0x80) != 0;
-                    trailingBytes = 1;
-                    showTrailing = true;
-                    break;
                case FormatDescriptor.Type.StringReverse:
+                case FormatDescriptor.Type.StringDci:
+                    opcodeStr = sDataOpNames.StrGeneric;
                    highAscii = (data[offset] & 0x80) != 0;
                    break;
                case FormatDescriptor.Type.StringNullTerm:
+                    opcodeStr = sDataOpNames.StrNullTerm;
                    highAscii = (data[offset] & 0x80) != 0;
                    trailingBytes = 1;
-                    showTrailing = true;
                    break;
                case FormatDescriptor.Type.StringL8:
+                    opcodeStr = sDataOpNames.StrLen8;
                    if (dfd.Length > 1) {
                        highAscii = (data[offset + 1] & 0x80) != 0;
                    }
-                    leadingBytes = 1;
-                    showLeading = true;
+                    hiddenLeadingBytes = 1;
                    break;
                case FormatDescriptor.Type.StringL16:
+                    opcodeStr = sDataOpNames.StrGeneric;
                    if (dfd.Length > 2) {
                        highAscii = (data[offset + 2] & 0x80) != 0;
                    }
-                    leadingBytes = 2;
-                    showLeading = true;
-                    break;
-                default:
-                    Debug.Assert(false);
-                    return;
-            }
-
-            char delim = '"';
-            StringGather gath = null;
-
-            // Run the string through so we can see if it'll fit on one line.  As a minor
-            // optimization, we skip this step for "generic" strings, which are probably
-            // the most common thing.
-            if (dfd.FormatSubType != FormatDescriptor.SubType.None || highAscii) {
-                gath = new StringGather(this, labelStr, "???", commentStr, delim,
-                        delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, true);
-                FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading,
-                    trailingBytes, showTrailing);
-                Debug.Assert(gath.NumLinesOutput > 0);
-            }
-
-            string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric);
-
-            switch (dfd.FormatType) {
-                case FormatDescriptor.Type.StringGeneric:
-                    // TODO(someday): something fancy with encodings to handle high-ASCII text?
-                    break;
-                case FormatDescriptor.Type.StringDci:
-                case FormatDescriptor.Type.StringReverse:
-                    // Fully configured above.
-                    break;
-                case FormatDescriptor.Type.StringNullTerm:
-                    if (gath.NumLinesOutput == 1 && !gath.HasDelimiter) {
-                        opcodeStr = sDataOpNames.StrNullTerm;
-                        showTrailing = false;
-                    }
-                    break;
-                case FormatDescriptor.Type.StringL8:
-                    if (gath.NumLinesOutput == 1 && !gath.HasDelimiter) {
-                        opcodeStr = sDataOpNames.StrLen8;
-                        showLeading = false;
-                    }
-                    break;
-                case FormatDescriptor.Type.StringL16:
-                    // Implement as macro?
+                    shownLeadingBytes = 2;
                    break;
                default:
                    Debug.Assert(false);
@@ -623,35 +574,58 @@ namespace SourceGen.AsmGen {
                return;
            }

-            // Create a new StringGather, with the final opcode choice.
-            gath = new StringGather(this, labelStr, opcodeStr, commentStr, delim,
-                delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, false);
-            FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading,
-                trailingBytes, showTrailing);
-        }
+            StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, '"',
+                StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN,
+                CharEncoding.ConvertLowAscii);

-        /// <summary>
-        /// Feeds the bytes into the StringGather.
-        /// </summary>
-        private void FeedGath(StringGather gath, byte[] data, int offset, int length,
-                int leadingBytes, bool showLeading, int trailingBytes, bool showTrailing) {
-            int startOffset = offset;
-            int strEndOffset = offset + length - trailingBytes;
+            // Feed bytes in, skipping over hidden bytes (leading L8, trailing null).
+            stropf.FeedBytes(data, offset + hiddenLeadingBytes,
+                dfd.Length - hiddenLeadingBytes - trailingBytes, shownLeadingBytes, false);
+            Debug.Assert(stropf.Lines.Count > 0);

-            if (showLeading) {
-                while (leadingBytes-- > 0) {
-                    gath.WriteByte(data[offset++]);
-                }
-            } else {
-                offset += leadingBytes;
+            // See if we need to do this over.
+            bool redo = false;
+            switch (dfd.FormatType) {
+                case FormatDescriptor.Type.StringGeneric:
+                case FormatDescriptor.Type.StringReverse:
+                case FormatDescriptor.Type.StringL16:
+                case FormatDescriptor.Type.StringDci:
+                    // All good the first time.
+                    break;
+                case FormatDescriptor.Type.StringNullTerm:
+                    if (stropf.Lines.Count != 1 || stropf.HasEscapedText) {
+                        // Must be single-line without quoted chars.
+                        opcodeStr = sDataOpNames.StrGeneric;
+                        redo = true;
+                    }
+                    break;
+                case FormatDescriptor.Type.StringL8:
+                    if (stropf.Lines.Count != 1 || stropf.HasEscapedText) {
+                        // Must be single-line without quoted chars.
+                        opcodeStr = sDataOpNames.StrGeneric;
+                        redo = true;
+                    }
+                    break;
+                default:
+                    Debug.Assert(false);
+                    return;
            }
-            for (; offset < strEndOffset; offset++) {
-                gath.WriteChar((char)(data[offset] & 0x7f));
+
+            if (redo) {
+                //Debug.WriteLine("REDO off=+" + offset.ToString("x6") + ": " + dfd.FormatType);
+
+                // This time, instead of skipping over the hidden bytes (leading length byte,
+                // trailing null terminator), we include them explicitly.
+                stropf.Reset();
+                stropf.FeedBytes(data, offset, dfd.Length, hiddenLeadingBytes, false);
            }
-            while (showTrailing && trailingBytes-- > 0) {
-                gath.WriteByte(data[offset++]);
+
+            opcodeStr = formatter.FormatPseudoOp(opcodeStr);
+
+            foreach (string str in stropf.Lines) {
+                OutputLine(labelStr, opcodeStr, str, commentStr);
+                labelStr = commentStr = string.Empty;       // only show on first
            }
-            gath.Finish();
        }
    }

@@ -73,7 +73,6 @@
    <Compile Include="AsmGen\IAssembler.cs" />
    <Compile Include="AsmGen\IGenerator.cs" />
    <Compile Include="AsmGen\LabelLocalizer.cs" />
-    <Compile Include="AsmGen\StringGather.cs" />
    <Compile Include="Tests\GenTest.cs" />
    <Compile Include="Tests\ProgressMessage.cs" />
    <Compile Include="Tests\WpfGui\GenTestRunner.xaml.cs">