diff --git a/Asm65/CharEncoding.cs b/Asm65/CharEncoding.cs new file mode 100644 index 0000000..7f7b6a5 --- /dev/null +++ b/Asm65/CharEncoding.cs @@ -0,0 +1,88 @@ +/* + * Copyright 2019 faddenSoft + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Asm65 { + /// + /// Character encoding helper methods. + /// + public static class CharEncoding { + public const char UNPRINTABLE_CHAR = '\ufffd'; // U+FFFD; returned by the Convert* methods for unmapped bytes + + /// + /// Determines whether the byte represents a character in the character set. + /// + public delegate bool InclusionTest(byte val); + + /// + /// Converts the byte to a printable character. Returns UNPRINTABLE_CHAR if the value + /// does not map to something printable. + /// + /// + /// Yes, I'm assuming it all fits in the Unicode BMP. Should be a safe assumption + /// for 8-bit computer character sets. + /// + public delegate char Convert(byte val); + + // + // Standard ASCII. Printable is 0x20-0x7e; "extended" also admits 0x0a and 0x0d. + // + public static bool IsPrintableLowAscii(byte val) { + return (val >= 0x20 && val < 0x7f); + } + public static bool IsExtendedLowAscii(byte val) { + return IsPrintableLowAscii(val) || val == 0x0a || val == 0x0d; + } + public static char ConvertLowAscii(byte val) { + if (IsPrintableLowAscii(val)) { + return (char)val; + } else { + return UNPRINTABLE_CHAR; + } + } + + // + // Standard ASCII, but with the high bit set. Printable is 0xa0-0xfe; "extended" also admits 0x8a and 0x8d. Conversion strips the high bit. 
+ // + public static bool IsPrintableHighAscii(byte val) { + return (val >= 0xa0 && val < 0xff); + } + public static bool IsExtendedHighAscii(byte val) { + return IsPrintableHighAscii(val) || val == 0x8a || val == 0x8d; + } + public static char ConvertHighAscii(byte val) { + if (IsPrintableHighAscii(val)) { + return (char)(val & 0x7f); + } else { + return UNPRINTABLE_CHAR; + } + } + + // + // High *or* low ASCII. Accepts a byte in either printable range and strips the high bit. + // + public static char ConvertLowAndHighAscii(byte val) { + if (IsPrintableLowAscii(val) || IsPrintableHighAscii(val)) { + return (char)(val & 0x7f); + } else { + return UNPRINTABLE_CHAR; + } + } + + } +} diff --git a/SourceGen/AsmGen/StringGather.cs b/Asm65/StringOpFormatter.cs similarity index 55% rename from SourceGen/AsmGen/StringGather.cs rename to Asm65/StringOpFormatter.cs index 5be7085..6144fe7 100644 --- a/SourceGen/AsmGen/StringGather.cs +++ b/Asm65/StringOpFormatter.cs @@ -14,35 +14,30 @@ * limitations under the License. */ using System; +using System.Collections.Generic; using System.Diagnostics; -namespace SourceGen.AsmGen { +namespace Asm65 { /// - /// Multi-line string gatherer. Accumulates characters and raw bytes, emitting - /// them when we have a full operand's worth. - /// - /// If the delimiter character appears, it will be output inline as a raw byte. - /// The low-ASCII string ['hello'world'] will become [27,'hello',27,'world',27] - /// (or something similar). + /// String pseudo-op formatter. Handles character encoding conversion and quoting of + /// delimiters and non-printable characters. /// - public class StringGather { - // Inputs. 
- public IGenerator Gen { get; private set; } - public string Label { get; private set; } - public string Opcode { get; private set; } - public string Comment { get; private set; } - public char Delimiter { get; private set; } - public char DelimiterReplacement { get; private set; } - public ByteStyle ByteStyleX { get; private set; } - public int MaxOperandLen { get; private set; } - public bool IsTestRun { get; private set; } + public class StringOpFormatter { + public CharEncoding.Convert CharConv { get; set; } - public enum ByteStyle { DenseHex, CommaSep }; + private char Delimiter { get; set; } + private RawOutputStyle RawStyle { get; set; } + private int MaxOperandLen { get; set; } + + // Output format for raw (non-printable) characters. Most assemblers use comma-separated + // hex values, some allow dense hex strings. + public enum RawOutputStyle { DenseHex, CommaSep }; // Outputs. - public bool HasDelimiter { get; private set; } - public int NumLinesOutput { get; private set; } + public bool HasEscapedText { get; private set; } + public List Lines { get; private set; } + // Reference to array with 16 hex digits. (May be upper or lower case.) private char[] mHexChars; /// @@ -50,12 +45,13 @@ namespace SourceGen.AsmGen { /// because they're mixed with bytes, particularly when we have to escape the /// delimiter character. Strings might start or end with escaped delimiters, /// so we don't add them until we have to. + /// private char[] mBuffer; /// /// Next available character position. /// - private int mIndex = 0; + private int mIndex; /// /// State of the buffer, based on the last thing we added. @@ -64,48 +60,53 @@ namespace SourceGen.AsmGen { Unknown = 0, StartOfLine, InQuote, - OutQuote + OutQuote, + Finished } - private State mState = State.StartOfLine; + private State mState; /// /// Constructor. /// - /// Reference back to generator, for output function and - /// format options. - /// Line label. Appears on first output line only. 
- /// Opcode to use for all lines. - /// End-of-line comment. Appears on first output line - /// only. + /// Reference to text formatter. /// String delimiter character. - /// If true, no file output is produced. - public StringGather(IGenerator gen, string label, string opcode, - string comment, char delimiter, char delimReplace, ByteStyle byteStyle, - int maxOperandLen, bool isTestRun) { - Gen = gen; - Label = label; - Opcode = opcode; - Comment = comment; + /// How to format raw byte data. + /// Maximum line length. + /// Character conversion delegate. + public StringOpFormatter(Formatter formatter, char delimiter, RawOutputStyle byteStyle, + int maxOperandLen, CharEncoding.Convert charConv) { Delimiter = delimiter; - DelimiterReplacement = delimReplace; - ByteStyleX = byteStyle; + RawStyle = byteStyle; MaxOperandLen = maxOperandLen; - IsTestRun = isTestRun; + CharConv = charConv; mBuffer = new char[MaxOperandLen]; - mHexChars = Gen.SourceFormatter.HexDigits; + mHexChars = formatter.HexDigits; + Lines = new List(); + + Reset(); + } + + // Returns the formatter to its initial state so it can be fed again; clears every output. + public void Reset() { + mState = State.StartOfLine; + mIndex = 0; + HasEscapedText = false; // don't report escapes left over from a previous pass + Lines.Clear(); } /// - /// Write a character into the buffer. + /// Write a character into the buffer. If the character matches the delimiter, or + /// isn't printable, the raw character value will be written as a byte instead. /// - /// Character to add. - public void WriteChar(char ch) { - Debug.Assert(ch >= 0 && ch <= 0xff); - if (ch == Delimiter) { + /// Raw character value. + public void WriteChar(byte rawCh) { + Debug.Assert(mState != State.Finished); + + char ch = CharConv(rawCh); + if (ch == Delimiter || ch == CharEncoding.UNPRINTABLE_CHAR) { // Must write it as a byte. - HasDelimiter = true; - WriteByte((byte)DelimiterReplacement); + WriteByte(rawCh); return; } @@ -146,6 +147,10 @@ namespace SourceGen.AsmGen { /// /// Value to add. 
public void WriteByte(byte val) { + Debug.Assert(mState != State.Finished); + + HasEscapedText = true; + // If we're at the start of a line, just output the byte. // If we're inside quotes, emit a delimiter, comma, and the byte. We must // have space for four (DenseHex) or five (CommaSep) chars. @@ -155,7 +158,7 @@ namespace SourceGen.AsmGen { case State.StartOfLine: break; case State.InQuote: - int minWidth = (ByteStyleX == ByteStyle.CommaSep) ? 5 : 4; + int minWidth = (RawStyle == RawOutputStyle.CommaSep) ? 5 : 4; if (mIndex + minWidth > MaxOperandLen) { Flush(); } else { @@ -164,11 +167,11 @@ namespace SourceGen.AsmGen { } break; case State.OutQuote: - minWidth = (ByteStyleX == ByteStyle.CommaSep) ? 4 : 2; + minWidth = (RawStyle == RawOutputStyle.CommaSep) ? 4 : 2; if (mIndex + minWidth > MaxOperandLen) { Flush(); } else { - if (ByteStyleX == ByteStyle.CommaSep) { + if (RawStyle == RawOutputStyle.CommaSep) { mBuffer[mIndex++] = ','; } } @@ -178,7 +181,7 @@ namespace SourceGen.AsmGen { break; } - if (ByteStyleX == ByteStyle.CommaSep) { + if (RawStyle == RawOutputStyle.CommaSep) { mBuffer[mIndex++] = '$'; } mBuffer[mIndex++] = mHexChars[val >> 4]; @@ -202,26 +205,65 @@ namespace SourceGen.AsmGen { // empty string; put out a pair of delimiters mBuffer[mIndex++] = Delimiter; mBuffer[mIndex++] = Delimiter; - NumLinesOutput++; break; case State.InQuote: // add delimiter and finish mBuffer[mIndex++] = Delimiter; - NumLinesOutput++; break; case State.OutQuote: // just output it - NumLinesOutput++; break; } - if (!IsTestRun) { - Gen.OutputLine(Label, Opcode, new string(mBuffer, 0, mIndex), - Comment); - } - mIndex = 0; - // Erase these after first use so we don't put them on every line. - Label = Comment = string.Empty; + string newStr = new string(mBuffer, 0, mIndex); + Debug.Assert(newStr.Length <= MaxOperandLen); + Lines.Add(newStr); + + mState = State.Finished; + + mIndex = 0; + } + + /// + /// Feeds the bytes into the StringOpFormatter. 
+ /// + public void FeedBytes(byte[] data, int offset, int length, int leadingBytes, + bool reverse) { + int strEndOffset = offset + length; + + // Write leading bytes. This is used for the 8- or 16-bit length (when no + // appropriate pseudo-op is available), because we want to output that as hex + // even if it maps to a printable character. + while (leadingBytes-- > 0) { + WriteByte(data[offset++]); + } + if (reverse) { + // Max per line is line length minus the two delimiters. We don't allow + // any hex quoting in reversed text, so this always works. (If somebody + // does try to reverse text with delimiters or unprintable chars, we'll + // blow out the line limit, but for a cross-assembler that should be purely + // cosmetic.) + int maxPerLine = MaxOperandLen - 2; + int startOffset = offset; // first character; leading bytes were already written as hex above + int numBlockLines = (strEndOffset - startOffset + maxPerLine - 1) / maxPerLine; + + for (int chunk = 0; chunk < numBlockLines; chunk++) { + int chunkOffset = startOffset + chunk * maxPerLine; + int endOffset = chunkOffset + maxPerLine; + if (endOffset > strEndOffset) { + endOffset = strEndOffset; + } + for (int off = endOffset - 1; off >= chunkOffset; off--) { + WriteChar(data[off]); + } + } + } else { + for (; offset < strEndOffset; offset++) { + WriteChar(data[offset]); + } + } + + Finish(); } } } diff --git a/SourceGen/AsmGen/AsmAcme.cs b/SourceGen/AsmGen/AsmAcme.cs index 37d73f2..6f08de9 100644 --- a/SourceGen/AsmGen/AsmAcme.cs +++ b/SourceGen/AsmGen/AsmAcme.cs @@ -542,74 +542,25 @@ namespace SourceGen.AsmGen { bool highAscii = false; int leadingBytes = 0; - int trailingBytes = 0; - bool showLeading = false; - bool showTrailing = false; switch (dfd.FormatType) { case FormatDescriptor.Type.StringGeneric: - highAscii = (data[offset] & 0x80) != 0; - break; + case FormatDescriptor.Type.StringReverse: + case FormatDescriptor.Type.StringNullTerm: case FormatDescriptor.Type.StringDci: highAscii = (data[offset] & 0x80) != 0; - trailingBytes = 1; - showTrailing = true; - break; - case 
FormatDescriptor.Type.StringReverse: - highAscii = (data[offset] & 0x80) != 0; - break; - case FormatDescriptor.Type.StringNullTerm: - highAscii = (data[offset] & 0x80) != 0; - trailingBytes = 1; - showTrailing = true; break; case FormatDescriptor.Type.StringL8: if (dfd.Length > 1) { highAscii = (data[offset + 1] & 0x80) != 0; } leadingBytes = 1; - showLeading = true; break; case FormatDescriptor.Type.StringL16: if (dfd.Length > 2) { highAscii = (data[offset + 2] & 0x80) != 0; } leadingBytes = 2; - showLeading = true; - break; - default: - Debug.Assert(false); - return; - } - - char delim = '"'; - StringGather gath = null; - - // Run the string through so we can see if it'll fit on one line. As a minor - // optimization, we skip this step for "generic" strings, which are probably - // the most common thing. - if (dfd.FormatSubType != FormatDescriptor.SubType.None || highAscii) { - gath = new StringGather(this, labelStr, "???", commentStr, delim, - delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, true); - FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading, - trailingBytes, showTrailing); - Debug.Assert(gath.NumLinesOutput > 0); - } - - string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric); - - switch (dfd.FormatType) { - case FormatDescriptor.Type.StringGeneric: - // TODO(someday): something fancy with encodings to handle high-ASCII text? - break; - case FormatDescriptor.Type.StringDci: - case FormatDescriptor.Type.StringReverse: - // Fully configured above. - break; - case FormatDescriptor.Type.StringNullTerm: - case FormatDescriptor.Type.StringL8: - case FormatDescriptor.Type.StringL16: - // Implement as macro? break; default: Debug.Assert(false); @@ -621,35 +572,16 @@ namespace SourceGen.AsmGen { return; } - // Create a new StringGather, with the final opcode choice. 
- gath = new StringGather(this, labelStr, opcodeStr, commentStr, delim, - delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, false); - FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading, - trailingBytes, showTrailing); - } + StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, '"', + StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN, + CharEncoding.ConvertLowAscii); + stropf.FeedBytes(data, offset, dfd.Length, leadingBytes, false); - /// - /// Feeds the bytes into the StringGather. - /// - private void FeedGath(StringGather gath, byte[] data, int offset, int length, - int leadingBytes, bool showLeading, int trailingBytes, bool showTrailing) { - int startOffset = offset; - int strEndOffset = offset + length - trailingBytes; - - if (showLeading) { - while (leadingBytes-- > 0) { - gath.WriteByte(data[offset++]); - } - } else { - offset += leadingBytes; + string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric); + foreach (string str in stropf.Lines) { + OutputLine(labelStr, opcodeStr, str, commentStr); + labelStr = commentStr = string.Empty; // only show on first } - for (; offset < strEndOffset; offset++) { - gath.WriteChar((char)(data[offset] & 0x7f)); - } - while (showTrailing && trailingBytes-- > 0) { - gath.WriteByte(data[offset++]); - } - gath.Finish(); } } diff --git a/SourceGen/AsmGen/AsmCc65.cs b/SourceGen/AsmGen/AsmCc65.cs index a8fd438..1a4164b 100644 --- a/SourceGen/AsmGen/AsmCc65.cs +++ b/SourceGen/AsmGen/AsmCc65.cs @@ -598,135 +598,89 @@ namespace SourceGen.AsmGen { bool highAscii = false; int leadingBytes = 0; int trailingBytes = 0; - bool showLeading = false; - bool showTrailing = false; switch (dfd.FormatType) { case FormatDescriptor.Type.StringGeneric: - highAscii = (data[offset] & 0x80) != 0; - break; - case FormatDescriptor.Type.StringDci: - highAscii = (data[offset] & 0x80) != 0; - trailingBytes = 1; - showTrailing = true; - break; case FormatDescriptor.Type.StringReverse: + case 
FormatDescriptor.Type.StringDci: highAscii = (data[offset] & 0x80) != 0; break; case FormatDescriptor.Type.StringNullTerm: highAscii = (data[offset] & 0x80) != 0; trailingBytes = 1; - showTrailing = true; break; case FormatDescriptor.Type.StringL8: if (dfd.Length > 1) { highAscii = (data[offset + 1] & 0x80) != 0; } leadingBytes = 1; - showLeading = true; break; case FormatDescriptor.Type.StringL16: if (dfd.Length > 2) { highAscii = (data[offset + 2] & 0x80) != 0; } leadingBytes = 2; - showLeading = true; break; default: Debug.Assert(false); return; } - char delim = '"'; - StringGather gath = null; - - // Run the string through so we can see if it'll fit on one line. As a minor - // optimization, we skip this step for "generic" strings, which are probably - // the most common thing. - if (dfd.FormatSubType != FormatDescriptor.SubType.None || highAscii) { - gath = new StringGather(this, labelStr, "???", commentStr, delim, - delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, true); - FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading, - trailingBytes, showTrailing); - Debug.Assert(gath.NumLinesOutput > 0); - } - - string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric); - - switch (dfd.FormatType) { - case FormatDescriptor.Type.StringGeneric: - // Special case for simple short high-ASCII strings. These have no - // leading or trailing bytes. We can improve this a bit by handling - // arbitrarily long strings by simply breaking them across lines. - Debug.Assert(leadingBytes == 0); - Debug.Assert(trailingBytes == 0); - if (highAscii && gath.NumLinesOutput == 1 && !gath.HasDelimiter) { - if (!mHighAsciiMacroOutput) { - mHighAsciiMacroOutput = true; - // Output a macro for high-ASCII strings. 
- OutputLine(".macro", "HiAscii", "Arg", string.Empty); - OutputLine(string.Empty, ".repeat", ".strlen(Arg), I", string.Empty); - OutputLine(string.Empty, ".byte", ".strat(Arg, I) | $80", string.Empty); - OutputLine(string.Empty, ".endrep", string.Empty, string.Empty); - OutputLine(".endmacro", string.Empty, string.Empty, string.Empty); - } - opcodeStr = formatter.FormatPseudoOp("HiAscii"); - highAscii = false; - } - break; - case FormatDescriptor.Type.StringDci: - case FormatDescriptor.Type.StringReverse: - // Full configured above. - break; - case FormatDescriptor.Type.StringNullTerm: - if (gath.NumLinesOutput == 1 && !gath.HasDelimiter) { - opcodeStr = sDataOpNames.StrNullTerm; - showTrailing = false; - } - break; - case FormatDescriptor.Type.StringL8: - case FormatDescriptor.Type.StringL16: - // Implement macros? - break; - default: - Debug.Assert(false); - return; - } - - if (highAscii) { + if (highAscii && dfd.FormatType != FormatDescriptor.Type.StringGeneric) { OutputNoJoy(offset, dfd.Length, labelStr, commentStr); return; } - // Create a new StringGather, with the final opcode choice. - gath = new StringGather(this, labelStr, opcodeStr, commentStr, delim, - delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, false); - FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading, - trailingBytes, showTrailing); - } - - /// - /// Feeds the bytes into the StringGather. 
- /// - private void FeedGath(StringGather gath, byte[] data, int offset, int length, - int leadingBytes, bool showLeading, int trailingBytes, bool showTrailing) { - int startOffset = offset; - int strEndOffset = offset + length - trailingBytes; - - if (showLeading) { - while (leadingBytes-- > 0) { - gath.WriteByte(data[offset++]); - } + CharEncoding.Convert charConv; + if (highAscii) { + charConv = CharEncoding.ConvertHighAscii; } else { - offset += leadingBytes; + charConv = CharEncoding.ConvertLowAscii; } - for (; offset < strEndOffset; offset++) { - gath.WriteChar((char)(data[offset] & 0x7f)); + + StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, '"', + StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN, + charConv); + stropf.FeedBytes(data, offset, dfd.Length - trailingBytes, leadingBytes, false); + + string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric); + + if (highAscii) { + // Does this fit the narrow definition of what we can do with a macro? + Debug.Assert(dfd.FormatType == FormatDescriptor.Type.StringGeneric); + if (stropf.Lines.Count == 1 && !stropf.HasEscapedText) { + if (!mHighAsciiMacroOutput) { + mHighAsciiMacroOutput = true; + // Output a macro for high-ASCII strings. + OutputLine(".macro", "HiAscii", "Arg", string.Empty); + OutputLine(string.Empty, ".repeat", ".strlen(Arg), I", string.Empty); + OutputLine(string.Empty, ".byte", ".strat(Arg, I) | $80", string.Empty); + OutputLine(string.Empty, ".endrep", string.Empty, string.Empty); + OutputLine(".endmacro", string.Empty, string.Empty, string.Empty); + } + opcodeStr = formatter.FormatPseudoOp("HiAscii"); + } else { + // didn't work out, dump hex + OutputNoJoy(offset, dfd.Length, labelStr, commentStr); + return; + } } - while (showTrailing && trailingBytes-- > 0) { - gath.WriteByte(data[offset++]); + + if (dfd.FormatType == FormatDescriptor.Type.StringNullTerm) { + if (stropf.Lines.Count == 1 && !stropf.HasEscapedText) { + // Keep it. 
+ opcodeStr = sDataOpNames.StrNullTerm; + } else { + // Didn't fit, so re-emit it, this time with the terminating null byte. + stropf.Reset(); + stropf.FeedBytes(data, offset, dfd.Length, leadingBytes, false); + } + } + + foreach (string str in stropf.Lines) { + OutputLine(labelStr, opcodeStr, str, commentStr); + labelStr = commentStr = string.Empty; // only show on first } - gath.Finish(); } } diff --git a/SourceGen/AsmGen/AsmMerlin32.cs b/SourceGen/AsmGen/AsmMerlin32.cs index c308979..08c42c4 100644 --- a/SourceGen/AsmGen/AsmMerlin32.cs +++ b/SourceGen/AsmGen/AsmMerlin32.cs @@ -448,8 +448,6 @@ namespace SourceGen.AsmGen { } - private enum RevMode { Forward, Reverse, BlockReverse }; - private void OutputString(int offset, string labelStr, string commentStr) { // This gets complicated. // @@ -467,6 +465,11 @@ namespace SourceGen.AsmGen { // For aesthetic purposes, zero-length CString, L8String, and L16String // should be output as DFB/DW zeroes rather than an empty string -- makes // it easier to read. + // + // NOTE: we generally assume that the input is in the correct format, e.g. + // the length byte in a StringL8 matches dfd.Length, and the high bits in DCI strings + // have the right pattern. If not, we will generate bad output. This would need + // to be scanned and corrected at a higher level. 
Formatter formatter = SourceFormatter; byte[] data = Project.FileData; @@ -477,127 +480,115 @@ namespace SourceGen.AsmGen { Debug.Assert(dfd.Length > 0); bool highAscii = false; - int showZeroes = 0; + bool reverse = false; int leadingBytes = 0; - int trailingBytes = 0; - bool showLeading = false; - bool showTrailing = false; - RevMode revMode = RevMode.Forward; + string opcodeStr; switch (dfd.FormatType) { case FormatDescriptor.Type.StringGeneric: - highAscii = (data[offset] & 0x80) != 0; - break; - case FormatDescriptor.Type.StringDci: + opcodeStr = sDataOpNames.StrGeneric; highAscii = (data[offset] & 0x80) != 0; break; case FormatDescriptor.Type.StringReverse: + opcodeStr = sDataOpNames.StrReverse; highAscii = (data[offset] & 0x80) != 0; - revMode = RevMode.Reverse; + reverse = true; break; case FormatDescriptor.Type.StringNullTerm: + opcodeStr = sDataOpNames.StrGeneric; // no pseudo-op for this highAscii = (data[offset] & 0x80) != 0; if (dfd.Length == 1) { - showZeroes = 1; // empty null-terminated string + // Empty string. Just output the length byte(s) or null terminator. + GenerateShortSequence(offset, 1, out string opcode, out string operand); + OutputLine(labelStr, opcode, operand, commentStr); + return; } - trailingBytes = 1; - showTrailing = true; break; case FormatDescriptor.Type.StringL8: + opcodeStr = sDataOpNames.StrLen8; if (dfd.Length > 1) { highAscii = (data[offset + 1] & 0x80) != 0; - } else { - //showZeroes = 1; } leadingBytes = 1; break; case FormatDescriptor.Type.StringL16: + opcodeStr = sDataOpNames.StrLen16; if (dfd.Length > 2) { highAscii = (data[offset + 2] & 0x80) != 0; - } else { - //showZeroes = 2; } leadingBytes = 2; break; + case FormatDescriptor.Type.StringDci: + opcodeStr = sDataOpNames.StrDci; + highAscii = (data[offset] & 0x80) != 0; + break; default: Debug.Assert(false); return; } - if (showZeroes != 0) { - // Empty string. Just output the length byte(s) or null terminator. 
- GenerateShortSequence(offset, showZeroes, out string opcode, out string operand); - OutputLine(labelStr, opcode, operand, commentStr); - return; - } - // Merlin 32 uses single-quote for low ASCII, double-quote for high ASCII. When // quoting the delimiter we use a hexadecimal value. We need to bear in mind that // we're forcing the characters to low ASCII, but the actual character being // escaped might be in high ASCII. Hence delim vs. delimReplace. char delim = highAscii ? '"' : '\''; - char delimReplace = highAscii ? ((char)(delim | 0x80)) : delim; - StringGather gath = null; - - // Run the string through so we can see if it'll fit on one line. As a minor - // optimization, we skip this step for "generic" strings, which are probably - // the most common thing. - if (dfd.FormatSubType != FormatDescriptor.SubType.None) { - gath = new StringGather(this, labelStr, "???", commentStr, delim, - delimReplace, StringGather.ByteStyle.DenseHex, MAX_OPERAND_LEN, true); - FeedGath(gath, data, offset, dfd.Length, revMode, leadingBytes, showLeading, - trailingBytes, showTrailing); - Debug.Assert(gath.NumLinesOutput > 0); + CharEncoding.Convert charConv; + if (highAscii) { + charConv = CharEncoding.ConvertHighAscii; + } else { + charConv = CharEncoding.ConvertLowAscii; } - string opcodeStr; + StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, delim, + StringOpFormatter.RawOutputStyle.DenseHex, MAX_OPERAND_LEN, charConv); + if (dfd.FormatType == FormatDescriptor.Type.StringDci) { + // DCI is awkward because the character encoding flips on the last byte. Rather + // than clutter up StringOpFormatter for this rare item, we just accept both + // throughout. + stropf.CharConv = CharEncoding.ConvertLowAndHighAscii; + } + // Feed bytes in, skipping over the leading length bytes. + stropf.FeedBytes(data, offset + leadingBytes, + dfd.Length - leadingBytes, 0, reverse); + Debug.Assert(stropf.Lines.Count > 0); + + // See if we need to do this over. 
+ bool redo = false; switch (dfd.FormatType) { case FormatDescriptor.Type.StringGeneric: - opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric; - break; - case FormatDescriptor.Type.StringDci: - if (gath.NumLinesOutput == 1) { - opcodeStr = highAscii ? sDataOpNames.StrDciHi : sDataOpNames.StrDci; - } else { - opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric; - trailingBytes = 1; - showTrailing = true; - } + case FormatDescriptor.Type.StringNullTerm: break; case FormatDescriptor.Type.StringReverse: - if (gath.HasDelimiter) { - // can't include escaped delimiters in REV - opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric; - revMode = RevMode.Forward; - } else if (gath.NumLinesOutput > 1) { - opcodeStr = highAscii ? sDataOpNames.StrReverseHi : sDataOpNames.StrReverse; - revMode = RevMode.BlockReverse; - } else { - opcodeStr = highAscii ? sDataOpNames.StrReverseHi : sDataOpNames.StrReverse; - Debug.Assert(revMode == RevMode.Reverse); + if (stropf.HasEscapedText) { + // can't include escaped characters in REV + opcodeStr = sDataOpNames.StrGeneric; + reverse = false; + redo = true; } break; - case FormatDescriptor.Type.StringNullTerm: - //opcodeStr = sDataOpNames.StrNullTerm[highAscii ? 1 : 0]; - opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric; - break; case FormatDescriptor.Type.StringL8: - if (gath.NumLinesOutput == 1) { - opcodeStr = highAscii ? sDataOpNames.StrLen8Hi : sDataOpNames.StrLen8; - } else { - opcodeStr = highAscii ? sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric; + if (stropf.Lines.Count != 1) { + // single-line only + opcodeStr = sDataOpNames.StrGeneric; leadingBytes = 1; - showLeading = true; + redo = true; } break; case FormatDescriptor.Type.StringL16: - if (gath.NumLinesOutput == 1) { - opcodeStr = highAscii ? sDataOpNames.StrLen16Hi : sDataOpNames.StrLen16; - } else { - opcodeStr = highAscii ? 
sDataOpNames.StrGenericHi : sDataOpNames.StrGeneric; + if (stropf.Lines.Count != 1) { + // single-line only + opcodeStr = sDataOpNames.StrGeneric; leadingBytes = 2; - showLeading = true; + redo = true; + } + break; + case FormatDescriptor.Type.StringDci: + if (stropf.Lines.Count != 1) { + // single-line only + opcodeStr = sDataOpNames.StrGeneric; + stropf.CharConv = charConv; + redo = true; } break; default: @@ -605,61 +596,21 @@ namespace SourceGen.AsmGen { return; } + if (redo) { + //Debug.WriteLine("REDO off=+" + offset.ToString("x6") + ": " + dfd.FormatType); + + // This time, instead of skipping over leading length bytes, we include them + // explicitly. + stropf.Reset(); + stropf.FeedBytes(data, offset, dfd.Length, leadingBytes, reverse); + } + opcodeStr = formatter.FormatPseudoOp(opcodeStr); - // Create a new StringGather, with the final opcode choice. - gath = new StringGather(this, labelStr, opcodeStr, commentStr, delim, - delimReplace, StringGather.ByteStyle.DenseHex, MAX_OPERAND_LEN, false); - FeedGath(gath, data, offset, dfd.Length, revMode, leadingBytes, showLeading, - trailingBytes, showTrailing); - } - - /// - /// Feeds the bytes into the StringGather. 
- /// - private void FeedGath(StringGather gath, byte[] data, int offset, int length, - RevMode revMode, int leadingBytes, bool showLeading, - int trailingBytes, bool showTrailing) { - int startOffset = offset; - int strEndOffset = offset + length - trailingBytes; - - if (showLeading) { - while (leadingBytes-- > 0) { - gath.WriteByte(data[offset++]); - } - } else { - offset += leadingBytes; + foreach (string str in stropf.Lines) { + OutputLine(labelStr, opcodeStr, str, commentStr); + labelStr = commentStr = string.Empty; // only show on first } - if (revMode == RevMode.BlockReverse) { - const int maxPerLine = MAX_OPERAND_LEN - 2; - int numBlockLines = (length + maxPerLine - 1) / maxPerLine; - - for (int chunk = 0; chunk < numBlockLines; chunk++) { - int chunkOffset = startOffset + chunk * maxPerLine; - int endOffset = chunkOffset + maxPerLine; - if (endOffset > strEndOffset) { - endOffset = strEndOffset; - } - for (int off = endOffset - 1; off >= chunkOffset; off--) { - gath.WriteChar((char)(data[off] & 0x7f)); - } - } - } else { - for (; offset < strEndOffset; offset++) { - if (revMode == RevMode.Forward) { - gath.WriteChar((char)(data[offset] & 0x7f)); - } else if (revMode == RevMode.Reverse) { - int posn = startOffset + (strEndOffset - offset) - 1; - gath.WriteChar((char)(data[posn] & 0x7f)); - } else { - Debug.Assert(false); - } - } - } - while (showTrailing && trailingBytes-- > 0) { - gath.WriteByte(data[offset++]); - } - gath.Finish(); } } diff --git a/SourceGen/AsmGen/AsmTass64.cs b/SourceGen/AsmGen/AsmTass64.cs index 1bc7fe2..55035c4 100644 --- a/SourceGen/AsmGen/AsmTass64.cs +++ b/SourceGen/AsmGen/AsmTass64.cs @@ -533,85 +533,36 @@ namespace SourceGen.AsmGen { Debug.Assert(dfd.Length > 0); bool highAscii = false; - int leadingBytes = 0; + int hiddenLeadingBytes = 0; + int shownLeadingBytes = 0; int trailingBytes = 0; - bool showLeading = false; - bool showTrailing = false; + string opcodeStr; switch (dfd.FormatType) { case 
FormatDescriptor.Type.StringGeneric: - highAscii = (data[offset] & 0x80) != 0; - break; - case FormatDescriptor.Type.StringDci: - highAscii = (data[offset] & 0x80) != 0; - trailingBytes = 1; - showTrailing = true; - break; case FormatDescriptor.Type.StringReverse: + case FormatDescriptor.Type.StringDci: + opcodeStr = sDataOpNames.StrGeneric; highAscii = (data[offset] & 0x80) != 0; break; case FormatDescriptor.Type.StringNullTerm: + opcodeStr = sDataOpNames.StrNullTerm; highAscii = (data[offset] & 0x80) != 0; trailingBytes = 1; - showTrailing = true; break; case FormatDescriptor.Type.StringL8: + opcodeStr = sDataOpNames.StrLen8; if (dfd.Length > 1) { highAscii = (data[offset + 1] & 0x80) != 0; } - leadingBytes = 1; - showLeading = true; + hiddenLeadingBytes = 1; break; case FormatDescriptor.Type.StringL16: + opcodeStr = sDataOpNames.StrGeneric; if (dfd.Length > 2) { highAscii = (data[offset + 2] & 0x80) != 0; } - leadingBytes = 2; - showLeading = true; - break; - default: - Debug.Assert(false); - return; - } - - char delim = '"'; - StringGather gath = null; - - // Run the string through so we can see if it'll fit on one line. As a minor - // optimization, we skip this step for "generic" strings, which are probably - // the most common thing. - if (dfd.FormatSubType != FormatDescriptor.SubType.None || highAscii) { - gath = new StringGather(this, labelStr, "???", commentStr, delim, - delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, true); - FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading, - trailingBytes, showTrailing); - Debug.Assert(gath.NumLinesOutput > 0); - } - - string opcodeStr = formatter.FormatPseudoOp(sDataOpNames.StrGeneric); - - switch (dfd.FormatType) { - case FormatDescriptor.Type.StringGeneric: - // TODO(someday): something fancy with encodings to handle high-ASCII text? - break; - case FormatDescriptor.Type.StringDci: - case FormatDescriptor.Type.StringReverse: - // Fully configured above. 
- break; - case FormatDescriptor.Type.StringNullTerm: - if (gath.NumLinesOutput == 1 && !gath.HasDelimiter) { - opcodeStr = sDataOpNames.StrNullTerm; - showTrailing = false; - } - break; - case FormatDescriptor.Type.StringL8: - if (gath.NumLinesOutput == 1 && !gath.HasDelimiter) { - opcodeStr = sDataOpNames.StrLen8; - showLeading = false; - } - break; - case FormatDescriptor.Type.StringL16: - // Implement as macro? + shownLeadingBytes = 2; break; default: Debug.Assert(false); @@ -623,35 +574,58 @@ namespace SourceGen.AsmGen { return; } - // Create a new StringGather, with the final opcode choice. - gath = new StringGather(this, labelStr, opcodeStr, commentStr, delim, - delim, StringGather.ByteStyle.CommaSep, MAX_OPERAND_LEN, false); - FeedGath(gath, data, offset, dfd.Length, leadingBytes, showLeading, - trailingBytes, showTrailing); - } + StringOpFormatter stropf = new StringOpFormatter(SourceFormatter, '"', + StringOpFormatter.RawOutputStyle.CommaSep, MAX_OPERAND_LEN, + CharEncoding.ConvertLowAscii); - /// - /// Feeds the bytes into the StringGather. - /// - private void FeedGath(StringGather gath, byte[] data, int offset, int length, - int leadingBytes, bool showLeading, int trailingBytes, bool showTrailing) { - int startOffset = offset; - int strEndOffset = offset + length - trailingBytes; + // Feed bytes in, skipping over hidden bytes (leading L8, trailing null). + stropf.FeedBytes(data, offset + hiddenLeadingBytes, + dfd.Length - hiddenLeadingBytes - trailingBytes, shownLeadingBytes, false); + Debug.Assert(stropf.Lines.Count > 0); - if (showLeading) { - while (leadingBytes-- > 0) { - gath.WriteByte(data[offset++]); - } - } else { - offset += leadingBytes; + // See if we need to do this over. + bool redo = false; + switch (dfd.FormatType) { + case FormatDescriptor.Type.StringGeneric: + case FormatDescriptor.Type.StringReverse: + case FormatDescriptor.Type.StringL16: + case FormatDescriptor.Type.StringDci: + // All good the first time. 
+ break; + case FormatDescriptor.Type.StringNullTerm: + if (stropf.Lines.Count != 1 || stropf.HasEscapedText) { + // Must be single-line without quoted chars. + opcodeStr = sDataOpNames.StrGeneric; + redo = true; + } + break; + case FormatDescriptor.Type.StringL8: + if (stropf.Lines.Count != 1 || stropf.HasEscapedText) { + // Must be single-line without quoted chars. + opcodeStr = sDataOpNames.StrGeneric; + redo = true; + } + break; + default: + Debug.Assert(false); + return; } - for (; offset < strEndOffset; offset++) { - gath.WriteChar((char)(data[offset] & 0x7f)); + + if (redo) { + //Debug.WriteLine("REDO off=+" + offset.ToString("x6") + ": " + dfd.FormatType); + + // This time, instead of skipping over leading length bytes, we include them + // explicitly. + stropf.Reset(); + stropf.FeedBytes(data, offset, dfd.Length, hiddenLeadingBytes, false); } - while (showTrailing && trailingBytes-- > 0) { - gath.WriteByte(data[offset++]); + + opcodeStr = formatter.FormatPseudoOp(opcodeStr); + + foreach (string str in stropf.Lines) { + OutputLine(labelStr, opcodeStr, str, commentStr); + labelStr = commentStr = string.Empty; // only show on first } - gath.Finish(); } } diff --git a/SourceGen/SourceGen.csproj b/SourceGen/SourceGen.csproj index 619e1fd..0b48b8c 100644 --- a/SourceGen/SourceGen.csproj +++ b/SourceGen/SourceGen.csproj @@ -73,7 +73,6 @@ -