6502bench/Asm65/CharEncoding.cs

/*
 * Copyright 2019 faddenSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using System.Diagnostics;

namespace Asm65 {
    /// <summary>
    /// Character encoding helper methods: inclusion tests and byte-to-character converters
    /// for plain ASCII, high ASCII, C64 PETSCII, and C64 screen codes.
    /// </summary>
    /// <remarks>
    /// For each encoding there is an "IsPrintable" test (glyphs only), an "IsExtended" test
    /// (glyphs plus a handful of control codes that commonly appear inside strings), and a
    /// "Convert" method that yields the matching ASCII character or
    /// <see cref="UNPRINTABLE_CHAR"/>.  The method signatures match the
    /// <see cref="InclusionTest"/> and <see cref="Convert"/> delegates.
    /// </remarks>
    public static class CharEncoding {
        public const char UNPRINTABLE_CHAR = '\ufffd';  // Unicode REPLACEMENT CHARACTER

        /// <summary>
        /// Determines whether the byte represents a member of the character set.  The
        /// specifics (e.g. printable only) are defined by the method.
        /// </summary>
        public delegate bool InclusionTest(byte val);

        /// <summary>
        /// Converts the byte to a printable character.  Returns UNPRINTABLE_CHAR if the value
        /// does not map to something printable.
        /// </summary>
        /// <remarks>
        /// Yes, I'm assuming it all fits in a UTF-16 char.  PETSCII has some glyphs that
        /// aren't part of the BMP, but we're targeting a variety of cross-assemblers with
        /// potentially different notions of Unicode mappings, so anything non-ASCII is
        /// getting hexified anyway.
        /// </remarks>
        public delegate char Convert(byte val);

        /// <summary>
        /// Character encoding.
        /// </summary>
        public enum Encoding {
            Unknown = 0,
            Ascii,
            HighAscii,
            C64Petscii,
            C64ScreenCode,
        }

        //
        // Plain ASCII.
        //
        // We recognize BELL, LF, and CR as control characters that may be present in
        // text strings.  This allows us to generate:
        //
        //  .str "hello",$0d
        //
        // instead of:
        //
        //  .str "hello"
        //  .dd1  $0d
        //

        /// <summary>
        /// Returns true if the value is a printable ASCII character, i.e. in [$20,$7e].
        /// </summary>
        public static bool IsPrintableAscii(byte val) {
            return (val >= 0x20 && val < 0x7f);
        }

        /// <summary>
        /// Returns true if the value is printable ASCII, or one of the control characters
        /// BELL ($07), LF ($0a), or CR ($0d).
        /// </summary>
        public static bool IsExtendedAscii(byte val) {
            return IsPrintableAscii(val) || val == 0x07 || val == 0x0a || val == 0x0d;
        }

        /// <summary>
        /// Converts a plain ASCII value to a character.  Control characters, DEL, and
        /// anything with the high bit set yield UNPRINTABLE_CHAR.
        /// </summary>
        public static char ConvertAscii(byte val) {
            if (IsPrintableAscii(val)) {
                return (char)val;
            } else {
                return UNPRINTABLE_CHAR;
            }
        }

        //
        // High ASCII: plain ASCII with the high bit set.
        //

        /// <summary>
        /// Returns true if the value is a printable ASCII character with the high bit
        /// set, i.e. in [$a0,$fe].
        /// </summary>
        public static bool IsPrintableHighAscii(byte val) {
            return (val >= 0xa0 && val < 0xff);
        }

        /// <summary>
        /// Returns true if the value is printable high ASCII, or one of the high-bit-set
        /// control characters BELL ($87), LF ($8a), or CR ($8d).
        /// </summary>
        public static bool IsExtendedHighAscii(byte val) {
            return IsPrintableHighAscii(val) || val == 0x87 || val == 0x8a || val == 0x8d;
        }

        /// <summary>
        /// Converts a high ASCII value to a character by stripping the high bit.  Values
        /// outside [$a0,$fe] yield UNPRINTABLE_CHAR.
        /// </summary>
        public static char ConvertHighAscii(byte val) {
            if (IsPrintableHighAscii(val)) {
                return (char)(val & 0x7f);
            } else {
                return UNPRINTABLE_CHAR;
            }
        }

        //
        // High and/or low ASCII.
        //

        /// <summary>
        /// Returns true if the value, with the high bit ignored, is printable ASCII.
        /// </summary>
        public static bool IsPrintableLowOrHighAscii(byte val) {
            return IsPrintableAscii((byte)(val & 0x7f));
        }

        /// <summary>
        /// Returns true if the value, with the high bit ignored, is printable ASCII or
        /// one of BELL, LF, or CR.
        /// </summary>
        public static bool IsExtendedLowOrHighAscii(byte val) {
            return IsExtendedAscii((byte)(val & 0x7f));
        }

        /// <summary>
        /// Converts a low or high ASCII value to a character.  The high bit is ignored.
        /// </summary>
        public static char ConvertLowAndHighAscii(byte val) {
            // Stripping the high bit first lets the plain ASCII converter handle both
            // the low and high forms.
            return ConvertAscii((byte)(val & 0x7f));
        }

        //
        // ATASCII (Atari 400/800)
        //
        // Substantially similar to ASCII, but with printable symbols in the control character
        // range ($00-1f).  Characters $60 and $7b-7f don't correspond to ASCII symbols.
        //
        // Characters with the high bit set are shown with colors reversed.
        //
        // (No ATASCII tests or converters are defined here.)
        //


        //
        // PETSCII (C64 variant)
        //
        // Assemblers like ACME use the C64 character set 2, a/k/a shifted mode, lower case
        // mode, or text mode.
        //
        // Comparison to ASCII:
        //  $00-1f: control codes, many with C64-specific meanings
        //  $20-3f: same as ASCII
        //  $40-5f: lower case letters (rather than upper case); backslash, caret, and underscore
        //   replaced with non-ASCII symbols (though the up-arrow in place of caret is close)
        //  $60-7f: upper case letters (rather than lower case); backquote, curly braces,
        //   vertical bar, and tilde replaced with non-ASCII symbols
        //  $80-9f: more control codes
        //  $a0-bf: non-ASCII symbols
        //  $c0-df: clone of $60-7f; by convention this is used for upper case, since it's
        //   equal to lower case with the high bit set
        //  $e0-ff: non-ASCII symbols (mostly a clone of $a0-bf)
        //
        // The printable ASCII set (glyphs in [$20,$7e]) is [$20,$5b]+$5d+[$c1,$da].
        // (Looks like the Pet had $5c=backslash, but C64 went with a \u00a3 POUND SIGN instead.)
        // Anything outside that range will get printed as hex to ensure proper conversion.
        //
        // Note for the pedantic: in ASCII-1963, up-arrow and left-arrow characters were
        // assigned to the caret and underscore values.  So arguably those are "ASCII" as
        // well, unless you're sane and define ASCII more narrowly.
        //
        // Control codes that we might expect to appear in the middle of a string:
        //  $05 1c 1e 1f 81 90 95 96 97 98 99 9a 9b 9c 9e 9f - set text color
        //  $93 - clear
        //  $12 92 - reverse on/off
        //  $07 0a 0d - bell, LF, CR (note CR is favored for EOL)
        //
        // Other Commodore systems use variants on PETSCII, but the ASCII correspondence remains
        // the same -- only the non-ASCII symbols change.  (On the original PET, $60-7f was a
        // duplicate of $20-3f rather than a duplicate of the upper-case letters, which might be
        // why $c0-df is preferred for upper case.)
        //
        // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
        //

        // Lookup tables, indexed by byte value.  They are built once and never reassigned.
        //
        // NOTE: static field initializers run in textual order.  sPrintablePetscii must be
        // declared ahead of sPetsciiToUnicode, because (in debug builds) the assertion in
        // CreatePetsciiToUnicodeMap() reads it through IsPrintableC64Petscii().
        private static readonly bool[] sPrintablePetscii = CreatePrintablePetsciiMap();
        private static readonly bool[] sExtendedPetscii = CreateExtendedPetsciiMap();

        /// <summary>
        /// Builds the 256-entry table of PETSCII values that have a printable ASCII
        /// equivalent: [$20,$5b], $5d, and [$c1,$da].
        /// </summary>
        private static bool[] CreatePrintablePetsciiMap() {
            bool[] map = new bool[256];
            for (int i = 0x20; i <= 0x5b; i++) {
                map[i] = true;
            }
            map[0x5d] = true;
            for (int i = 0xc1; i <= 0xda; i++) {
                map[i] = true;
            }
            return map;
        }

        /// <summary>
        /// Builds the 256-entry table of printable PETSCII values plus the control codes
        /// listed in the section comment (color, clear, reverse, bell/LF/CR).
        /// </summary>
        private static bool[] CreateExtendedPetsciiMap() {
            // Start from a fresh copy of the printable set so the two tables stay independent.
            bool[] map = CreatePrintablePetsciiMap();
            // control codes that we might expect to find in strings
            map[0x05] = map[0x1c] = map[0x1e] = map[0x1f] = map[0x81] = map[0x90] = map[0x95] =
                map[0x96] = map[0x97] = map[0x98] = map[0x99] = map[0x9a] = map[0x9b] =
                map[0x9c] = map[0x9e] = map[0x9f] = true;       // set text color
            map[0x93] = map[0x12] = map[0x92] = true;           // clear, reverse on/off
            map[0x07] = map[0x0a] = map[0x0d] = true;           // bell, LF, CR
            return map;
        }

        /// <summary>
        /// Returns true if the C64 PETSCII value has a printable ASCII equivalent.
        /// </summary>
        public static bool IsPrintableC64Petscii(byte val) {
            return sPrintablePetscii[val];
        }

        /// <summary>
        /// Returns true if the C64 PETSCII value is printable, or is one of the control
        /// codes we expect to find embedded in strings.
        /// </summary>
        public static bool IsExtendedC64Petscii(byte val) {
            return sExtendedPetscii[val];
        }

        private static readonly char[] sPetsciiToUnicode = CreatePetsciiToUnicodeMap();

        /// <summary>
        /// Builds the 256-entry PETSCII-to-character table.  Entries with no printable
        /// ASCII equivalent hold UNPRINTABLE_CHAR.
        /// </summary>
        private static char[] CreatePetsciiToUnicodeMap() {
            // There are performance arguments for doing this with and without a table.  For
            // x64 with fast memory and large caches, table seems reasonable.
            char[] map = new char[256];
            for (int val = 0; val < 256; val++) {
                char ch;
                if ((val >= 0x20 && val <= 0x40) || val == 0x5b || val == 0x5d) {
                    ch = (char)val;               // number/symbols, '[', ']'
                } else if (val >= 0x41 && val <= 0x5a) {
                    ch = (char)(val + 0x20);      // lower case
                } else if (val >= 0xc1 && val <= 0xda) {
                    ch = (char)(val - 0x80);      // upper case
                } else {
                    // Sanity check: the converter and the inclusion test must agree.
                    Debug.Assert(!IsPrintableC64Petscii((byte)val));
                    ch = UNPRINTABLE_CHAR;
                }
                map[val] = ch;
            }
            return map;
        }

        /// <summary>
        /// Converts a C64 PETSCII value to the equivalent ASCII character, or
        /// UNPRINTABLE_CHAR if there isn't one.
        /// </summary>
        public static char ConvertC64Petscii(byte val) {
            return sPetsciiToUnicode[val];
        }

        /// <summary>
        /// Converts a C64 PETSCII value to a character after stripping the high bit.  As a
        /// consequence, values in [$c1,$da] are folded onto [$41,$5a] and come back as
        /// lower case letters.
        /// </summary>
        public static char ConvertLowAndHighC64Petscii(byte val) {
            // This is an odd one.  Some programs use DCI with PETSCII, which means the
            // string is all lower case except for the last letteR.
            //
            // There's no such thing as "high PETSCII", in the same sense that ASCII or
            // C64 screen codes have it, but I'm giving the method a similar name for
            // the sake of consistency.
            return ConvertC64Petscii((byte)(val & 0x7f));
        }

        //
        // C64 Screen Codes
        //
        // Using character set 2, which includes lower case letters.
        //
        //  $00-1f: lower case letters (PETSCII $40-5f)
        //  $20-3f: same as ASCII (PETSCII $20-3f)
        //  $40-5f: upper case letters (PETSCII $60-7f / $c0-df)
        //  $60-7f: non-ASCII symbols (PETSCII $a0-bf)
        //
        // With the high bit set, character colors are reversed.  The printable ASCII set
        // is [$00,$1b]+$1d+[$20,$3f]+[$41,$5a].  By definition, only printable characters
        // are included in the set, so there are no control codes.
        //
        // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
        //

        // Declared ahead of sScreenCodeToUnicode for the same initialization-order reason
        // as the PETSCII tables: the assertion in CreateScreenCodeToUnicodeMap() reads it.
        private static readonly bool[] sPrintableScreenCode = CreatePrintableScreenCodeMap();

        /// <summary>
        /// Builds the 256-entry table of screen codes that have a printable ASCII
        /// equivalent: [$00,$1b], $1d, [$20,$3f], and [$41,$5a].
        /// </summary>
        private static bool[] CreatePrintableScreenCodeMap() {
            bool[] map = new bool[256];
            for (int i = 0x00; i <= 0x1b; i++) {
                map[i] = true;
            }
            map[0x1d] = true;
            for (int i = 0x20; i <= 0x3f; i++) {
                map[i] = true;
            }
            for (int i = 0x41; i <= 0x5a; i++) {
                map[i] = true;
            }
            return map;
        }

        /// <summary>
        /// Returns true if the C64 screen code has a printable ASCII equivalent.
        /// </summary>
        public static bool IsPrintableC64ScreenCode(byte val) {
            return sPrintableScreenCode[val];
        }

        /// <summary>
        /// Same as <see cref="IsPrintableC64ScreenCode"/>: screen codes have no control
        /// codes, so the extended set is identical to the printable set.
        /// </summary>
        public static bool IsExtendedC64ScreenCode(byte val) {
            return sPrintableScreenCode[val];
        }

        private static readonly char[] sScreenCodeToUnicode = CreateScreenCodeToUnicodeMap();

        /// <summary>
        /// Builds the 256-entry screen-code-to-character table.  Entries with no printable
        /// ASCII equivalent hold UNPRINTABLE_CHAR.
        /// </summary>
        private static char[] CreateScreenCodeToUnicodeMap() {
            char[] map = new char[256];
            for (int val = 0; val < 256; val++) {
                char ch;
                if (val == 0x00 || val == 0x1b || val == 0x1d) {
                    ch = (char)(val + 0x40);      // '@', '[', ']'
                } else if (val >= 0x01 && val <= 0x1a) {
                    ch = (char)(val + 0x60);      // lower case
                } else if (val >= 0x20 && val <= 0x3f) {
                    ch = (char)(val);             // numbers/symbols
                } else if (val >= 0x41 && val <= 0x5a) {
                    ch = (char)(val);             // upper case
                } else {
                    // Sanity check: the converter and the inclusion test must agree.
                    Debug.Assert(!IsPrintableC64ScreenCode((byte)val));
                    ch = UNPRINTABLE_CHAR;
                }
                map[val] = ch;
            }
            return map;
        }

        /// <summary>
        /// Converts a C64 screen code to the equivalent ASCII character, or
        /// UNPRINTABLE_CHAR if there isn't one.
        /// </summary>
        public static char ConvertC64ScreenCode(byte val) {
            return sScreenCodeToUnicode[val];
        }

        /// <summary>
        /// Converts a C64 screen code to a character, ignoring the high (reverse-colors) bit.
        /// </summary>
        public static char ConvertLowAndHighC64ScreenCode(byte val) {
            return ConvertC64ScreenCode((byte)(val & 0x7f));
        }
    }
}
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								/*
 								 * Copyright 2019 faddenSoft
 								 *
 								 * Licensed under the Apache License, Version 2.0 (the "License");
 								 * you may not use this file except in compliance with the License.
 								 * You may obtain a copy of the License at
 								 *
 								 *     http://www.apache.org/licenses/LICENSE-2.0
 								 *
 								 * Unless required by applicable law or agreed to in writing, software
 								 * distributed under the License is distributed on an "AS IS" BASIS,
 								 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								 * See the License for the specific language governing permissions and
 								 * limitations under the License.
 								 */
 								using System;
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								using System.Diagnostics;
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
 								namespace Asm65 {
 								    /// <summary>
 								    /// Character encoding helper methods.
 								    /// </summary>
 								    public static class CharEncoding {
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
+								        public const char UNPRINTABLE_CHAR = '\ufffd';  // Unicode REPLACEMENT CHARACTER
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
 								        /// <summary>
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
+								        /// Determines whether the byte represents a member of the character set.  The
 								        /// specifics (e.g. printable only) are defined by the method.
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        /// </summary>
 								        public delegate bool InclusionTest(byte val);
 								        /// <summary>
 								        /// Converts the byte to a printable character.  Returns UNPRINTABLE_CHAR if the value
 								        /// does not map to something printable.
 								        /// </summary>
 								        /// <remarks>
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
+								        /// Yes, I'm assuming it all fits in a UTF-16 char.  PETSCII has some glyphs that
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								        /// aren't part of the BMP, but we're targeting a variety of cross-assemblers with
 								        /// potentially different notions of Unicode mappings, so anything non-ASCII is
 								        /// getting hexified anyway.
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        /// </remarks>
 								        public delegate char Convert(byte val);
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								        /// <summary>
 								        /// Character encoding.
 								        /// </summary>
-												Replace character operand output method

The previous code output a character in single-quotes if it was
standard ASCII, double-quotes if high ASCII, or hex if it was neither
of those.  If a flag was set, high ASCII would also be output as
hex.

The new system takes the character value and an encoding identifier.
The identifier selects the character converter and delimiter
pattern, and puts the two together to generate the operand.

While doing this I realized that I could trivially support high
ASCII character arguments in all assemblers by setting the delimiter
pattern to "'#' | $80".

In FormatDescriptor, I had previously renamed the "Ascii" sub-type
"LowAscii" so it wouldn't be confused, but I dislike filling the
project file with "LowAscii" when "Ascii" is more accurate and less
confusing.  So I switched it back, and we now check the project
file version number when deciding what to do with an ASCII item.
The CharEncoding tests/converters were also renamed.

Moved the default delimiter patterns to the string table.

Widened the delimiter pattern input fields slightly.  Added a read-
only TextBox with assorted non-typewriter quotes and things so
people have something to copy text from.

											
										
										
											2019-08-12 00:59:20 +00:00
+								        public enum Encoding {
 								            Unknown = 0,
 								            Ascii,
 								            HighAscii,
 								            C64Petscii,
 								            C64ScreenCode,
 								        }
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        //
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								        // Plain ASCII.
 								        //
 								        // We recognize BELL, LF, and CR as control characters that may be present in
 								        // text strings.  This allows use to generate:
 								        //
 								        //  .str "hello",$0d
 								        //
 								        // instead of:
 								        //
 								        //  .str "hello"
 								        //  .dd1  $0d
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        //
-												Replace character operand output method

The previous code output a character in single-quotes if it was
standard ASCII, double-quotes if high ASCII, or hex if it was neither
of those.  If a flag was set, high ASCII would also be output as
hex.

The new system takes the character value and an encoding identifier.
The identifier selects the character converter and delimiter
pattern, and puts the two together to generate the operand.

While doing this I realized that I could trivially support high
ASCII character arguments in all assemblers by setting the delimiter
pattern to "'#' | $80".

In FormatDescriptor, I had previously renamed the "Ascii" sub-type
"LowAscii" so it wouldn't be confused, but I dislike filling the
project file with "LowAscii" when "Ascii" is more accurate and less
confusing.  So I switched it back, and we now check the project
file version number when deciding what to do with an ASCII item.
The CharEncoding tests/converters were also renamed.

Moved the default delimiter patterns to the string table.

Widened the delimiter pattern input fields slightly.  Added a read-
only TextBox with assorted non-typewriter quotes and things so
people have something to copy text from.

											
										
										
											2019-08-12 00:59:20 +00:00
+								        public static bool IsPrintableAscii(byte val) {
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								            return (val >= 0x20 && val < 0x7f);
 								        }
-												Replace character operand output method

The previous code output a character in single-quotes if it was
standard ASCII, double-quotes if high ASCII, or hex if it was neither
of those.  If a flag was set, high ASCII would also be output as
hex.

The new system takes the character value and an encoding identifier.
The identifier selects the character converter and delimiter
pattern, and puts the two together to generate the operand.

While doing this I realized that I could trivially support high
ASCII character arguments in all assemblers by setting the delimiter
pattern to "'#' | $80".

In FormatDescriptor, I had previously renamed the "Ascii" sub-type
"LowAscii" so it wouldn't be confused, but I dislike filling the
project file with "LowAscii" when "Ascii" is more accurate and less
confusing.  So I switched it back, and we now check the project
file version number when deciding what to do with an ASCII item.
The CharEncoding tests/converters were also renamed.

Moved the default delimiter patterns to the string table.

Widened the delimiter pattern input fields slightly.  Added a read-
only TextBox with assorted non-typewriter quotes and things so
people have something to copy text from.

											
										
										
											2019-08-12 00:59:20 +00:00
+								        public static bool IsExtendedAscii(byte val) {
 								            return IsPrintableAscii(val) || val == 0x07 || val == 0x0a || val == 0x0d;
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        }
-												Replace character operand output method

The previous code output a character in single-quotes if it was
standard ASCII, double-quotes if high ASCII, or hex if it was neither
of those.  If a flag was set, high ASCII would also be output as
hex.

The new system takes the character value and an encoding identifier.
The identifier selects the character converter and delimiter
pattern, and puts the two together to generate the operand.

While doing this I realized that I could trivially support high
ASCII character arguments in all assemblers by setting the delimiter
pattern to "'#' | $80".

In FormatDescriptor, I had previously renamed the "Ascii" sub-type
"LowAscii" so it wouldn't be confused, but I dislike filling the
project file with "LowAscii" when "Ascii" is more accurate and less
confusing.  So I switched it back, and we now check the project
file version number when deciding what to do with an ASCII item.
The CharEncoding tests/converters were also renamed.

Moved the default delimiter patterns to the string table.

Widened the delimiter pattern input fields slightly.  Added a read-
only TextBox with assorted non-typewriter quotes and things so
people have something to copy text from.

											
										
										
											2019-08-12 00:59:20 +00:00
+								        public static char ConvertAscii(byte val) {
 								            if (IsPrintableAscii(val)) {
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								                return (char)val;
 								            } else {
 								                return UNPRINTABLE_CHAR;
 								            }
 								        }
 								        //
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								        // High ASCII: plain ASCII with the high bit set.
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        //
 								        public static bool IsPrintableHighAscii(byte val) {
 								            return (val >= 0xa0 && val < 0xff);
 								        }
 								        public static bool IsExtendedHighAscii(byte val) {
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
+								            return IsPrintableHighAscii(val) || val == 0x87 || val == 0x8a || val == 0x8d;
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        }
 								        public static char ConvertHighAscii(byte val) {
 								            // Printable values are returned with the high bit stripped; anything else
 								            // maps to UNPRINTABLE_CHAR.
 								            return IsPrintableHighAscii(val) ? (char)(val & 0x7f) : UNPRINTABLE_CHAR;
 								        }
 								        //
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								        // High and/or low ASCII.
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        //
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								        public static bool IsPrintableLowOrHighAscii(byte val) {
 								            // Clear the high bit, then apply the plain ASCII printable test.
 								            byte lowVal = (byte)(val & 0x7f);
 								            return IsPrintableAscii(lowVal);
 								        }
 								        public static bool IsExtendedLowOrHighAscii(byte val) {
 								            // Clear the high bit, then apply the plain ASCII extended test.
 								            byte lowVal = (byte)(val & 0x7f);
 								            return IsExtendedAscii(lowVal);
 								        }
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        public static char ConvertLowAndHighAscii(byte val) {
-												Add C64 encodings to instruction and data operand editors

Both dialogs got a couple extra radio buttons for selection of
single character operands.  The data operand editor got a combo box
that lets you specify how it scans for viable strings.

Various string scanning methods were made more generic.  This got a
little strange with auto-detection of low/high ASCII, but that was
mostly a matter of keeping the previous code around as a special
case.

Made C64 Screen Code DCI strings a thing that works.

											
										
										
											2019-08-16 00:53:12 +00:00
+								            //if (IsPrintableAscii(val) || IsPrintableHighAscii(val)) {
 								            //    return (char)(val & 0x7f);
 								            //} else {
 								            //    return UNPRINTABLE_CHAR;
 								            //}
 								            return ConvertAscii((byte)(val & 0x7f));
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								        }
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
+								        //
-												Add combo box for default text scan mode

It's not quite the same as the character encoding -- sometimes we
want a mix of things -- so it gets its own enum.  The value is
saved to the project file, but not actually used yet.

Also, moved some combo box strings into XAML resources.

											
										
										
											2019-08-13 00:01:50 +00:00
+								        // ATASCII (Atari 400/800)
 								        //
 								        // Substantially similar to ASCII, but with printable symbols in the control character
 								        // range ($00-1f).  Characters $60 and $7b-7f don't correspond to ASCII symbols.
 								        //
 								        // Characters with the high bit set are shown with colors reversed.
 								        //
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
-												Add combo box for default text scan mode

It's not quite the same as the character encoding -- sometimes we
want a mix of things -- so it gets its own enum.  The value is
saved to the project file, but not actually used yet.

Also, moved some combo box strings into XAML resources.

											
										
										
											2019-08-13 00:01:50 +00:00
+								        //
 								        // PETSCII (C64 variant)
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
+								        //
 								        // Assemblers like ACME use the C64 character set 2, a/k/a shifted mode, lower case
 								        // mode, or text mode.
 								        //
 								        // Comparison to ASCII:
 								        //  $00-1f: control codes, many with C64-specific meanings
 								        //  $20-3f: same as ASCII
 								        //  $40-5f: lower case letters (rather than upper case); backslash, caret, and underscore
 								        //   replaced with non-ASCII symbols (though the up-arrow in place of caret is close)
 								        //  $60-7f: upper case letters (rather than lower case); backquote, curly braces,
 								        //   vertical bar, and tilde replaced with non-ASCII symbols
 								        //  $80-9f: more control codes
 								        //  $a0-bf: non-ASCII symbols
 								        //  $c0-df: clone of $60-7f; by convention this is used for upper case, since it's
 								        //   equal to lower case with the high bit set
 								        //  $e0-ff: non-ASCII symbols (mostly a clone of $a0-bf)
 								        //
 								        // The printable ASCII set (glyphs in [$20,$7e]) is [$20,$5b]+$5d+[$c1,$da].
 								        // (Looks like the Pet had $5c=backslash, but C64 went with a \u00a3 POUND SIGN instead.)
 								        // Anything outside that range will get printed as hex to ensure proper conversion.
 								        //
 								        // Note for the pedantic: in ASCII-1963, up-arrow and left-arrow characters were
 								        // assigned to the caret and underscore values.  So arguably those are "ASCII" as
 								        // well, unless you're sane and define ASCII more narrowly.
 								        //
 								        // Control codes that we might expect to appear in the middle of a string:
 								        //  $05 1c 1e 1f 81 90 95 96 97 98 99 9a 9b 9c 9e 9f - set text color
 								        //  $93 - clear
 								        //  $12 92 - reverse on/off
 								        //  $07 0a 0d - bell, LF, CR (note CR is favored for EOL)
 								        //
-												Add combo box for default text scan mode

It's not quite the same as the character encoding -- sometimes we
want a mix of things -- so it gets its own enum.  The value is
saved to the project file, but not actually used yet.

Also, moved some combo box strings into XAML resources.

											
										
										
											2019-08-13 00:01:50 +00:00
+								        // Other Commodore systems use variants on PETSCII, but the ASCII correspondence remains
 								        // the same -- only the non-ASCII symbols change.  (On the original PET, $60-7f was a
 								        // duplicate of $20-3f rather than a duplicate of the upper-case letters, which might be
 								        // why $c0-df is preferred for upper case.)
 								        //
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
+								        // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
 								        //
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								        private static bool[] sPrintablePetscii = CreatePrintablePetsciiMap();
 								        private static bool[] sExtendedPetscii = CreateExtendedPetsciiMap();
 								        private static bool[] CreatePrintablePetsciiMap() {
 								            // Builds a 256-entry table, indexed by byte value, that is true for
 								            // [$20,$5b], $5d, and [$c1,$da], and false everywhere else.
 								            bool[] printable = new bool[256];
 								            for (int code = 0; code < printable.Length; code++) {
 								                bool inLowRange = (code >= 0x20 && code <= 0x5b);
 								                bool inHighRange = (code >= 0xc1 && code <= 0xda);
 								                printable[code] = inLowRange || code == 0x5d || inHighRange;
 								            }
 								            return printable;
 								        }
 								        private static bool[] CreateExtendedPetsciiMap() {
 								            // Start with a fresh copy of the printable table, then mark the control
 								            // codes that we might expect to find in strings.
 								            bool[] extended = CreatePrintablePetsciiMap();
 								            byte[] controlCodes = {
 								                0x05, 0x1c, 0x1e, 0x1f, 0x81, 0x90, 0x95, 0x96,
 								                0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9e, 0x9f,
 								                0x93, 0x12, 0x92,
 								                0x07, 0x0a, 0x0d,
 								            };
 								            foreach (byte code in controlCodes) {
 								                extended[code] = true;
 								            }
 								            return extended;
 								        }
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								        public static bool IsPrintableC64Petscii(byte val) {
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								            return sPrintablePetscii[val];
 								        }
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								        public static bool IsExtendedC64Petscii(byte val) {
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								            return sExtendedPetscii[val];
 								        }
-												Various improvements

The PseudoOpNames class is increasingly being used in situations
where mutability is undesirable.  This change makes instances
immutable, eliminating the Copy() method and adding a constructor
that takes a Dictionary.  The serialization code now operates on a
Dictionary instead of the class properties, but the JSON encoding is
identical, so this doesn't invalidate app settings file data.

Added an equality test to PseudoOpNames.  In LineListGen, don't
reset the line list if the names haven't actually changed.

Use a table lookup for C64 character conversions.  I figure that
should be faster than multiple conditionals on a modern x64 system.

Fixed a 64tass generator issue where we tried to query project
properties in a call that might not have a project available
(specifically, getting FormatConfig values out of the generator for
use in the "quick set" buttons for Display Format).

Fixed a regression test harness issue where, if the assembler reported
success but didn't actually generate output, an exception would be
thrown that halted the tests.

Increased the width of text entry fields on the Pseudo-Op tab of app
settings.  The previous 8-character limit wasn't wide enough to hold
ACME's "!pseudopc".  Also, use TrimEnd() to remove trailing spaces
(leading spaces are still allowed).

In the last couple of months, Win10 started stalling for a fraction
of a second when executing assemblers.  It doesn't do this every
time; mostly it happens if it has been a while since the assembler
was run.  My guess is this has to do with changes to the built-in
malware scanner.  Whatever the case, we now change the mouse pointer
to a wait cursor while updating the assembler version cache.

											
										
										
											2019-08-17 18:14:05 +00:00
+								        private static char[] sPetsciiToUnicode = CreatePetsciiToUnicodeMap();
 								        private static char[] CreatePetsciiToUnicodeMap() {
 								            // There are performance arguments for doing this with and without a table.  For
 								            // x64 with fast memory and large caches, table seems reasonable.
 								            char[] map = new char[256];
 								            for (int val = 0; val < 256; val++) {
 								                char ch;
 								                if ((val >= 0x20 && val <= 0x40) || val == 0x5b || val == 0x5d) {
 								                    ch = (char)val;               // number/symbols, '[', ']'
 								                } else if (val >= 0x41 && val <= 0x5a) {
 								                    ch = (char)(val + 0x20);      // lower case
 								                } else if (val >= 0xc1 && val <= 0xda) {
 								                    ch = (char)(val - 0x80);      // upper case
 								                } else {
 								                    Debug.Assert(!IsPrintableC64Petscii((byte)val));
 								                    ch = UNPRINTABLE_CHAR;
 								                }
 								                map[val] = ch;
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								            }
-												Various improvements

The PseudoOpNames class is increasingly being used in situations
where mutability is undesirable.  This change makes instances
immutable, eliminating the Copy() method and adding a constructor
that takes a Dictionary.  The serialization code now operates on a
Dictionary instead of the class properties, but the JSON encoding is
identical, so this doesn't invalidate app settings file data.

Added an equality test to PseudoOpNames.  In LineListGen, don't
reset the line list if the names haven't actually changed.

Use a table lookup for C64 character conversions.  I figure that
should be faster than multiple conditionals on a modern x64 system.

Fixed a 64tass generator issue where we tried to query project
properties in a call that might not have a project available
(specifically, getting FormatConfig values out of the generator for
use in the "quick set" buttons for Display Format).

Fixed a regression test harness issue where, if the assembler reported
success but didn't actually generate output, an exception would be
thrown that halted the tests.

Increased the width of text entry fields on the Pseudo-Op tab of app
settings.  The previous 8-character limit wasn't wide enough to hold
ACME's "!pseudopc".  Also, use TrimEnd() to remove trailing spaces
(leading spaces are still allowed).

In the last couple of months, Win10 started stalling for a fraction
of a second when executing assemblers.  It doesn't do this every
time; mostly it happens if it has been a while since the assembler
was run.  My guess is this has to do with changes to the built-in
malware scanner.  Whatever the case, we now change the mouse pointer
to a wait cursor while updating the assembler version cache.

											
										
										
											2019-08-17 18:14:05 +00:00
+								            return map;
 								        }
 								        public static char ConvertC64Petscii(byte val) {
 								            return sPetsciiToUnicode[val];
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								        }
-												PETSCII does DCI

I didn't think it made sense, but I found something that used it,
so apparently it's a thing.  This updates the operand editor to
let you choose PETSCII+DCI, and updates the assemblers to handle
it correctly (really just 64tass, since the others either don't
have a DCI directive or don't deal with PETSCII at all).

Changed the char-encoding sample from "bad dcI" to "pet dcI", and
updated the documentation.

											
										
										
											2019-08-21 00:55:12 +00:00
+								        public static char ConvertLowAndHighC64Petscii(byte val) {
 								            // An odd case: some programs use DCI with PETSCII, so the string is all
 								            // lower case except for the last letter.
 								            //
 								            // "High PETSCII" isn't really a thing the way high ASCII or high C64 screen
 								            // codes are; the method name simply mirrors the other converters for the
 								            // sake of consistency.
 								            //
 								            // The high bit is cleared before doing the regular PETSCII conversion.
 								            byte lowVal = (byte)(val & 0x7f);
 								            return ConvertC64Petscii(lowVal);
 								        }
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
 								        //
 								        // C64 Screen Codes
 								        //
 								        // Using character set 2, which includes lower case letters.
 								        //
 								        //  $00-1f: lower case letters (PETSCII $40-5f)
 								        //  $20-3f: same as ASCII (PETSCII $20-3f)
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								        //  $40-5f: upper case letters (PETSCII $60-7f / $c0-df)
-												Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.

											
										
										
											2019-08-11 18:27:09 +00:00
+								        //  $60-7f: non-ASCII symbols (PETSCII $a0-bf)
 								        //
 								        // With the high bit set, character colors are reversed.  The printable ASCII set
 								        // is [$00,$1b]+$1d+[$20,$3f]+[$41,$5a].  By definition, only printable characters
 								        // are included in the set, so there are no control codes.
 								        //
 								        // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
 								        //
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								        private static bool[] sPrintableScreenCode = CreatePrintableScreenCodeMap();
 								        private static bool[] CreatePrintableScreenCodeMap() {
 								            // Builds a 256-entry table, indexed by byte value, that is true for
 								            // [$00,$1b], $1d, [$20,$3f], and [$41,$5a], and false everywhere else.
 								            bool[] printable = new bool[256];
 								            for (int code = 0; code < printable.Length; code++) {
 								                bool inFirstRange = (code >= 0x00 && code <= 0x1b);
 								                bool inSymbolRange = (code >= 0x20 && code <= 0x3f);
 								                bool inLetterRange = (code >= 0x41 && code <= 0x5a);
 								                printable[code] =
 								                    inFirstRange || code == 0x1d || inSymbolRange || inLetterRange;
 								            }
 								            return printable;
 								        }
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								        public static bool IsPrintableC64ScreenCode(byte val) {
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								            return sPrintableScreenCode[val];
 								        }
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								        public static bool IsExtendedC64ScreenCode(byte val) {
-												Add multiple encoding support to uncategorized data analyzer

The code that searches for character strings in uncategorized data
now recognizes the C64 encodings when selected in the project
properties.

The new code avoids some redundant comparisons when runs of
printable characters are found.  I suspect the new implementation
loses on overall performance because we're now calling through
delegates instead of testing characters directly, but I haven't
tested for that.

											
										
										
											2019-08-13 21:08:27 +00:00
+								            return sPrintableScreenCode[val];
 								        }
-												Various improvements

The PseudoOpNames class is increasingly being used in situations
where mutability is undesirable.  This change makes instances
immutable, eliminating the Copy() method and adding a constructor
that takes a Dictionary.  The serialization code now operates on a
Dictionary instead of the class properties, but the JSON encoding is
identical, so this doesn't invalidate app settings file data.

Added an equality test to PseudoOpNames.  In LineListGen, don't
reset the line list if the names haven't actually changed.

Use a table lookup for C64 character conversions.  I figure that
should be faster than multiple conditionals on a modern x64 system.

Fixed a 64tass generator issue where we tried to query project
properties in a call that might not have a project available
(specifically, getting FormatConfig values out of the generator for
use in the "quick set" buttons for Display Format).

Fixed a regression test harness issue where, if the assembler reported
success but didn't actually generate output, an exception would be
thrown that halted the tests.

Increased the width of text entry fields on the Pseudo-Op tab of app
settings.  The previous 8-character limit wasn't wide enough to hold
ACME's "!pseudopc".  Also, use TrimEnd() to remove trailing spaces
(leading spaces are still allowed).

In the last couple of months, Win10 started stalling for a fraction
of a second when executing assemblers.  It doesn't do this every
time; mostly it happens if it has been a while since the assembler
was run.  My guess is this has to do with changes to the built-in
malware scanner.  Whatever the case, we now change the mouse pointer
to a wait cursor while updating the assembler version cache.

											
										
										
											2019-08-17 18:14:05 +00:00
+								        private static char[] sScreenCodeToUnicode = CreateScreenCodeToUnicodeMap();
 								        private static char[] CreateScreenCodeToUnicodeMap() {
 								            char[] map = new char[256];
 								            for (int val = 0; val < 256; val++) {
 								                char ch;
 								                if (val == 0x00 || val == 0x1b || val == 0x1d) {
 								                    ch = (char)(val + 0x40);      // '@', '[', ']'
 								                } else if (val >= 0x01 && val <= 0x1a) {
 								                    ch = (char)(val + 0x60);      // lower case
 								                } else if (val >= 0x20 && val <= 0x3f) {
 								                    ch = (char)(val);             // numbers/symbols
 								                } else if (val >= 0x41 && val <= 0x5a) {
 								                    ch = (char)(val);             // upper case
 								                } else {
 								                    Debug.Assert(!IsPrintableC64ScreenCode((byte)val));
 								                    ch = UNPRINTABLE_CHAR;
 								                }
 								                map[val] = ch;
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								            }
-												Various improvements

The PseudoOpNames class is increasingly being used in situations
where mutability is undesirable.  This change makes instances
immutable, eliminating the Copy() method and adding a constructor
that takes a Dictionary.  The serialization code now operates on a
Dictionary instead of the class properties, but the JSON encoding is
identical, so this doesn't invalidate app settings file data.

Added an equality test to PseudoOpNames.  In LineListGen, don't
reset the line list if the names haven't actually changed.

Use a table lookup for C64 character conversions.  I figure that
should be faster than multiple conditionals on a modern x64 system.

Fixed a 64tass generator issue where we tried to query project
properties in a call that might not have a project available
(specifically, getting FormatConfig values out of the generator for
use in the "quick set" buttons for Display Format).

Fixed a regression test harness issue where, if the assembler reported
success but didn't actually generate output, an exception would be
thrown that halted the tests.

Increased the width of text entry fields on the Pseudo-Op tab of app
settings.  The previous 8-character limit wasn't wide enough to hold
ACME's "!pseudopc".  Also, use TrimEnd() to remove trailing spaces
(leading spaces are still allowed).

In the last couple of months, Win10 started stalling for a fraction
of a second when executing assemblers.  It doesn't do this every
time; mostly it happens if it has been a while since the assembler
was run.  My guess is this has to do with changes to the built-in
malware scanner.  Whatever the case, we now change the mouse pointer
to a wait cursor while updating the assembler version cache.

											
										
										
											2019-08-17 18:14:05 +00:00
+								            return map;
 								        }
 								        public static char ConvertC64ScreenCode(byte val) {
 								            return sScreenCodeToUnicode[val];
-												Replace on-screen string operand formatting

The previous functions just grabbed 62 characters and slapped quotes
on the ends, but that doesn't work if we want to show strings with
embedded control characters.  This change replaces the simple
formatter with the one used to generate assembly source code.  This
increases the cost of refreshing the display list, so a cache will
need to be added in a future change.

Converters for C64 PETSCII and C64 Screen Code have been defined.
The results of changing the auto-scan encoding can now be viewed.

The string operand formatter was using a single delimiter, but for
the on-screen version we want open-quote and close-quote, and might
want to identify some encodings with a prefix.  The formatter now
takes a class that defines the various parts.  (It might be worth
replacing the delimiter patterns recently added for single-character
operands with this, so we don't have two mechanisms for very nearly
the same thing.)

While working on this change I remembered why there were two kinds
of "reverse" in the old Merlin 32 string operand generator: what you
want for assembly code is different from what you want on screen.
The ReverseMode enum has been resurrected.

											
										
										
											2019-08-14 00:22:21 +00:00
+								        }
-												Add C64 encodings to instruction and data operand editors

Both dialogs got a couple extra radio buttons for selection of
single character operands.  The data operand editor got a combo box
that lets you specify how it scans for viable strings.

Various string scanning methods were made more generic.  This got a
little strange with auto-detection of low/high ASCII, but that was
mostly a matter of keeping the previous code around as a special
case.

Made C64 Screen Code DCI strings a thing that works.

											
										
										
											2019-08-16 00:53:12 +00:00
+								        public static char ConvertLowAndHighC64ScreenCode(byte val) {
 								            // Clear bit 7 first, so a value in 0x80-0xff converts exactly
 								            // like its 0x00-0x7f counterpart; the actual mapping is done
 								            // by ConvertC64ScreenCode.
 								            byte lowVal = (byte)(val & 0x7f);
 								            return ConvertC64ScreenCode(lowVal);
 								        }
-												Rework string operand formatting

This generalizes the string pseudo-operand formatter, moving it into
the Asm65 library.  The assembly source generators have been updated
to use it.  This makes the individual generators simpler, and by
virtue of avoiding "test runs" should make them slightly faster.

This also introduces byte-to-character converters, though we're
currently still only supporting low/high ASCII.

Regression test output is unchanged.

											
										
										
											2019-08-09 23:42:30 +00:00
+								    }
 								}