From bc633288ad5f36058f4f6e2403293cb24da4439e Mon Sep 17 00:00:00 2001
From: Andy McFadden <fadden@fadden.com>
Date: Sun, 11 Aug 2019 11:27:09 -0700
Subject: [PATCH] Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.
---
 Asm65/CharEncoding.cs                         |  67 ++++++++-
 SourceGen/RuntimeData/Help/codegen.html       |   9 ++
 SourceGen/SGTestData/README.md                |   5 +-
 .../SGTestData/Source/2016-char-encoding.S    | 141 ++++++++++++++++++
 4 files changed, 214 insertions(+), 8 deletions(-)
 create mode 100644 SourceGen/SGTestData/Source/2016-char-encoding.S
diff --git a/Asm65/CharEncoding.cs b/Asm65/CharEncoding.cs
index 7f7b6a5..dbf1461 100644
--- a/Asm65/CharEncoding.cs
+++ b/Asm65/CharEncoding.cs
@@ -22,10 +22,11 @@ namespace Asm65 {
     /// Character encoding helper methods.
     /// </summary>
     public static class CharEncoding {
-        public const char UNPRINTABLE_CHAR = '\ufffd';
+        public const char UNPRINTABLE_CHAR = '\ufffd';  // Unicode REPLACEMENT CHARACTER
 
         /// <summary>
-        /// Determines whether the byte represents a character in the character set.
+        /// Determines whether the byte represents a member of the character set.  The
+        /// specifics (e.g. printable only) are defined by the method.
         /// </summary>
         public delegate bool InclusionTest(byte val);
 
@@ -34,11 +35,13 @@ namespace Asm65 {
         /// does not map to something printable.
         /// </summary>
         /// <remarks>
-        /// Yes, I'm assuming it all fits in the Unicode BMP.  Should be a safe assumption
-        /// for 8-bit computer character sets.
+        /// Yes, I'm assuming every character fits in a single UTF-16 code unit.  PETSCII
+        /// has some glyphs that aren't part of the BMP, but we're targeting a variety of
+        /// cross-assemblers, so anything non-ASCII is getting hexified anyway.
         /// </remarks>
         public delegate char Convert(byte val);
 
+
         //
         // Standard ASCII.
         //
@@ -46,7 +49,7 @@ namespace Asm65 {
             return (val >= 0x20 && val < 0x7f);
         }
         public static bool IsExtendedLowAscii(byte val) {
-            return IsPrintableLowAscii(val) || val == 0x0a || val == 0x0d;
+            return IsPrintableLowAscii(val) || val == 0x07 || val == 0x0a || val == 0x0d;
         }
         public static char ConvertLowAscii(byte val) {
             if (IsPrintableLowAscii(val)) {
@@ -63,7 +66,7 @@ namespace Asm65 {
             return (val >= 0xa0 && val < 0xff);
         }
         public static bool IsExtendedHighAscii(byte val) {
-            return IsPrintableHighAscii(val) || val == 0x8a || val == 0x8d;
+            return IsPrintableHighAscii(val) || val == 0x87 || val == 0x8a || val == 0x8d;
         }
         public static char ConvertHighAscii(byte val) {
             if (IsPrintableHighAscii(val)) {
@@ -84,5 +87,57 @@ namespace Asm65 {
             }
         }
 
+        //
+        // C64 PETSCII
+        //
+        // Assemblers like ACME use the C64 character set 2, a/k/a shifted mode, lower case
+        // mode, or text mode.
+        //
+        // Comparison to ASCII:
+        //  $00-1f: control codes, many with C64-specific meanings
+        //  $20-3f: same as ASCII
+        //  $40-5f: lower case letters (rather than upper case); backslash, caret, and underscore
+        //   replaced with non-ASCII symbols (though the up-arrow in place of caret is close)
+        //  $60-7f: upper case letters (rather than lower case); backquote, curly braces,
+        //   vertical bar, and tilde replaced with non-ASCII symbols
+        //  $80-9f: more control codes
+        //  $a0-bf: non-ASCII symbols
+        //  $c0-df: clone of $60-7f; by convention this is used for upper case, since it's
+        //   equal to lower case with the high bit set
+        //  $e0-ff: non-ASCII symbols (mostly a clone of $a0-bf)
+        //
+        // The printable ASCII set (glyphs in [$20,$7e]) is [$20,$5b]+$5d+[$c1,$da].
+        // (Looks like the PET had $5c=backslash, but C64 went with a \u00a3 POUND SIGN instead.)
+        // Anything outside that range will get printed as hex to ensure proper conversion.
+        //
+        // Note for the pedantic: in ASCII-1963, up-arrow and left-arrow characters were
+        // assigned to the caret and underscore values.  So arguably those are "ASCII" as
+        // well, unless you're sane and define ASCII more narrowly.
+        //
+        // Control codes that we might expect to appear in the middle of a string:
+        //  $05 1c 1e 1f 81 90 95 96 97 98 99 9a 9b 9c 9e 9f - set text color
+        //  $93 - clear
+        //  $12 92 - reverse on/off
+        //  $07 0a 0d - bell, LF, CR (note CR is favored for EOL)
+        //
+        // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
+        //
+
+        //
+        // C64 Screen Codes
+        //
+        // Using character set 2, which includes lower case letters.
+        //
+        //  $00-1f: lower case letters (PETSCII $40-5f)
+        //  $20-3f: same as ASCII (PETSCII $20-3f)
+        //  $40-5f: upper case letters (PETSCII $60-7f)
+        //  $60-7f: non-ASCII symbols (PETSCII $a0-bf)
+        //
+        // With the high bit set, character colors are reversed.  The printable ASCII set
+        // is [$00,$1b]+$1d+[$20,$3f]+[$41,$5a].  By definition, only printable characters
+        // are included in the set, so there are no control codes.
+        //
+        // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
+        //
     }
 }
diff --git a/SourceGen/RuntimeData/Help/codegen.html b/SourceGen/RuntimeData/Help/codegen.html
index 759fb58..a7549ef 100644
--- a/SourceGen/RuntimeData/Help/codegen.html
+++ b/SourceGen/RuntimeData/Help/codegen.html
@@ -156,6 +156,15 @@ code, but also needs to know how to handle the corner cases.</p>
   <li>For 65816, selecting the bank byte is done with the back-quote ('`')
     rather than the caret ('^').  (There's a note in the docs to the effect
     that they plan to move to carets.)</li>
+  <li>By default, the assembler assumes that the input is PETSCII, but
+    doesn't convert characters in text strings.  So PETSCII source files
+    generate PETSCII strings, and ASCII source files generate ASCII
+    strings.  However, if you use the built-in "screen" encoding, you will
+    get the wrong behavior if you assemble an ASCII source without the
+    "--ascii" command-line flag, because it expects to convert from
+    PETSCII.  To get the behavior expected of a cross-assembler, it's
+    necessary to pass "--ascii" and explicitly define an ASCII encoding
+    for use with ASCII text strings.</li>
 </ul>
 
 
diff --git a/SourceGen/SGTestData/README.md b/SourceGen/SGTestData/README.md
index e4c2fba..3c01a94 100644
--- a/SourceGen/SGTestData/README.md
+++ b/SourceGen/SGTestData/README.md
@@ -4,10 +4,11 @@ This directory contains various regression tests.
 
 NOTE: some tests may fail if you use a version of the assembler that is
 different from the one used to generate the expected output.  The current
-set was generated with:
+set was generated for:
 
  * 64tass v1.53.1515
- * cc65 v2.17
+ * ACME v0.96.4
+ * cc65 v2.18
  * Merlin 32 v1.0
 
 
diff --git a/SourceGen/SGTestData/Source/2016-char-encoding.S b/SourceGen/SGTestData/Source/2016-char-encoding.S
new file mode 100644
index 0000000..7635f1c
--- /dev/null
+++ b/SourceGen/SGTestData/Source/2016-char-encoding.S
@@ -0,0 +1,141 @@
+; Copyright 2019 faddenSoft. All Rights Reserved.
+; See the LICENSE.txt file for distribution terms (Apache 2.0).
+;
+; Assembler: ACME (good PETSCII/screen code support)
+
+        !cpu    65816
+*       =       $1000
+
+        clc
+        xce
+        sep     #$30
+        !as
+        !rs
+
+; Single-byte operand
+        lda     #'A'            ;format as low ASCII
+        lda     #'A' | $80      ;format as high ASCII
+        lda     #'A' | $80      ;format as PETSCII
+        lda     #'A'            ;format as screen code
+
+        ldx     #'a'            ;format as low ASCII
+        ldx     #'a' | $80      ;format as high ASCII
+        ldx     #'a' - $20      ;format as PETSCII
+        ldx     #$01            ;format as screen code
+
+        lda     #$7f            ;EDIT: force to low ASCII
+        lda     #$7f            ;EDIT: force to high ASCII
+        lda     #$7f            ;EDIT: force to PETSCII
+        lda     #$7f            ;EDIT: force to screen code
+
+; Single letter in a 16-bit immediate
+        rep     #$30
+        !al
+        !rl
+        lda     #'B'
+        lda     #'B' | $80
+        lda     #'B' | $80
+        lda     #'B'
+
+        sep     #$30
+        !as
+        !rs
+        rts
+
+; Single-byte data items
+        !byte   'C'
+        !byte   'C' | $80
+        !byte   'C' | $80
+        !byte   'C'
+
+; Double-byte data items
+        !byte   'd', 0
+        !byte   'd' | $80, 0
+        !byte   'd' - $20, 0
+        !byte   $04, 0
+
+; Double-byte big-endian data items
+        !byte   0, 'E'
+        !byte   0, 'E' | $80
+        !byte   0, 'E' | $80
+        !byte   0, 'E'
+
+; Start with the basics
+        !byte   $80
+        !text   "low ASCII str"
+        !byte   $80
+        !xor $80 {
+            !text   "high ASCII str"
+        }
+        !byte   $80
+        !pet    "PETSCII str"
+        !byte   $80
+        !scr    "Screen Code str"
+
+; Get a bit fancy
+        !byte   $82
+        !text   "Low ASCII CRLF",$07,$0d,$0a
+        !byte   $82
+        !xor $80 {
+            !text   "High ASCII CRLF",$07,$0d,$0a
+        }
+        !byte   $82
+        !pet    $93,"PETSCII with ",$96,"control",$05," codes",$0d
+        ; no control chars in screen code
+
+; Test the ASCII $20-7e range.
+        !byte   $83
+        !text   " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
+        !text   "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
+        !text   "`abcdefghijklmnopqrstuvwxyz{|}~"
+        !byte   $83
+        !xor $80 {
+            !text   " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
+            !text   "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
+            !text   "`abcdefghijklmnopqrstuvwxyz{|}~"
+        }
+        !byte   $83
+        !pet    " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
+        !pet    "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
+        !pet    "`abcdefghijklmnopqrstuvwxyz{|}~"
+        !byte   $83
+        !scr    " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
+        !scr    "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
+        !scr    "`abcdefghijklmnopqrstuvwxyz{|}~"
+
+; The 2005 test exercises low/high ASCII strings, so no need to do that here.
+; Do a quick test with PETSCII.
+        !byte   $84
+        !pet    "IICSTEP esrever"               ;format as StringReverse
+        !pet    "null term PETSCII",0           ;format as StringNullTerm
+        !pet    "This null-terminated string is too long to fit on a single line, and will be split.",0
+        !pet    19,"PETSCII with length"        ;format as StringL8
+        !pet    20,0,"PETSCII with length2"     ;format as StringL16
+
+        !byte   $84
+        !scr    "edoC neercS esrever"           ;format as StringReverse
+        !scr    "null term Screen Code",0       ;format as StringNullTerm
+        !scr    "This null-terminated string is too long to fit on a single line, and will be split.",0
+        !scr    23,"Screen Code with length"    ;format as StringL8
+        !scr    24,0,"Screen Code with length2" ;format as StringL16
+
+        !byte   $85
+
+; All bytes, from 00-ff.  Handy for seeing what the auto-scanner picks up.
+allbytes
+        !hex    000102030405060708090a0b0c0d0e0f
+        !hex    101112131415161718191a1b1c1d1e1f
+        !hex    202122232425262728292a2b2c2d2e2f
+        !hex    303132333435363738393a3b3c3d3e3f
+        !hex    404142434445464748494a4b4c4d4e4f
+        !hex    505152535455565758595a5b5c5d5e5f
+        !hex    606162636465666768696a6b6c6d6e6f
+        !hex    707172737475767778797a7b7c7d7e7f
+        !hex    808182838485868788898a8b8c8d8e8f
+        !hex    909192939495969798999a9b9c9d9e9f
+        !hex    a0a1a2a3a4a5a6a7a8a9aaabacadaeaf
+        !hex    b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+        !hex    c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+        !hex    d0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+        !hex    e0e1e2e3e4e5e6e7e8e9eaebecedeeef
+        !hex    f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff