From bc633288ad5f36058f4f6e2403293cb24da4439e Mon Sep 17 00:00:00 2001
From: Andy McFadden
Date: Sun, 11 Aug 2019 11:27:09 -0700
Subject: [PATCH] Prep work for multi-encoding support
Wrote down research into C64 encodings.
Added source for a first cut at 2016-char-encoding test.
---
Asm65/CharEncoding.cs | 67 ++++++++-
SourceGen/RuntimeData/Help/codegen.html | 9 ++
SourceGen/SGTestData/README.md | 5 +-
.../SGTestData/Source/2016-char-encoding.S | 141 ++++++++++++++++++
4 files changed, 214 insertions(+), 8 deletions(-)
create mode 100644 SourceGen/SGTestData/Source/2016-char-encoding.S
diff --git a/Asm65/CharEncoding.cs b/Asm65/CharEncoding.cs
index 7f7b6a5..dbf1461 100644
--- a/Asm65/CharEncoding.cs
+++ b/Asm65/CharEncoding.cs
@@ -22,10 +22,11 @@ namespace Asm65 {
/// Character encoding helper methods.
///
public static class CharEncoding {
- public const char UNPRINTABLE_CHAR = '\ufffd';
+ public const char UNPRINTABLE_CHAR = '\ufffd'; // Unicode REPLACEMENT CHARACTER
///
- /// Determines whether the byte represents a character in the character set.
+ /// Determines whether the byte represents a member of the character set. The
+ /// specifics (e.g. printable only) are defined by the method.
///
public delegate bool InclusionTest(byte val);
@@ -34,11 +35,13 @@ namespace Asm65 {
/// does not map to something printable.
///
///
- /// Yes, I'm assuming it all fits in the Unicode BMP. Should be a safe assumption
- /// for 8-bit computer character sets.
+ /// Yes, I'm assuming it all fits in a UTF-16 char. PETSCII has some glyphs that
+ /// aren't part of the BMP, but we're targeting a variety of cross-assemblers, so
+ /// anything non-ASCII is getting hexified anyway.
///
public delegate char Convert(byte val);
+
//
// Standard ASCII.
//
@@ -46,7 +49,7 @@ namespace Asm65 {
return (val >= 0x20 && val < 0x7f);
}
public static bool IsExtendedLowAscii(byte val) {
- return IsPrintableLowAscii(val) || val == 0x0a || val == 0x0d;
+ return IsPrintableLowAscii(val) || val == 0x07 || val == 0x0a || val == 0x0d;
}
public static char ConvertLowAscii(byte val) {
if (IsPrintableLowAscii(val)) {
@@ -63,7 +66,7 @@ namespace Asm65 {
return (val >= 0xa0 && val < 0xff);
}
public static bool IsExtendedHighAscii(byte val) {
- return IsPrintableHighAscii(val) || val == 0x8a || val == 0x8d;
+ return IsPrintableHighAscii(val) || val == 0x87 || val == 0x8a || val == 0x8d;
}
public static char ConvertHighAscii(byte val) {
if (IsPrintableHighAscii(val)) {
@@ -84,5 +87,57 @@ namespace Asm65 {
}
}
+ //
+ // C64 PETSCII
+ //
+ // Assemblers like ACME use the C64 character set 2, a/k/a shifted mode, lower case
+ // mode, or text mode.
+ //
+ // Comparison to ASCII:
+ // $00-1f: control codes, many with C64-specific meanings
+ // $20-3f: same as ASCII
+ // $40-5f: lower case letters (rather than upper case); backslash, caret, and underscore
+ // replaced with non-ASCII symbols (though the up-arrow in place of caret is close)
+ // $60-7f: upper case letters (rather than lower case); backquote, curly braces,
+ // vertical bar, and tilde replaced with non-ASCII symbols
+ // $80-9f: more control codes
+ // $a0-bf: non-ASCII symbols
+ // $c0-df: clone of $60-7f; by convention this is used for upper case, since it's
+ // equal to lower case with the high bit set
+ // $e0-ff: non-ASCII symbols (mostly a clone of $a0-bf)
+ //
+ // The printable ASCII set (glyphs in [$20,$7e]) is [$20,$5b]+$5d+[$c1,$da].
+ // (The PET apparently had $5c=backslash; the C64 uses a \u00a3 POUND SIGN instead.)
+ // Anything outside that set will get printed as hex to ensure proper conversion.
+ //
+ // Note for the pedantic: in ASCII-1963, up-arrow and left-arrow characters were
+ // assigned to the caret and underscore values. So arguably those are "ASCII" as
+ // well, unless you're sane and define ASCII more narrowly.
+ //
+ // Control codes that we might expect to appear in the middle of a string:
+ // $05 1c 1e 1f 81 90 95 96 97 98 99 9a 9b 9c 9e 9f - set text color
+ // $93 - clear
+ // $12 92 - reverse on/off
+ // $07 0a 0d - bell, LF, CR (note CR is favored for EOL)
+ //
+ // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
+ //
+
+ //
+ // C64 Screen Codes
+ //
+ // Using character set 2, which includes lower case letters.
+ //
+ // $00-1f: lower case letters (PETSCII $40-5f)
+ // $20-3f: same as ASCII (PETSCII $20-3f)
+ // $40-5f: upper case letters (PETSCII $60-7f)
+ // $60-7f: non-ASCII symbols (PETSCII $a0-bf)
+ //
+ // With the high bit set, character colors are reversed. The printable ASCII set
+ // is [$00,$1b]+$1d+[$20,$3f]+[$41,$5a]. By definition, only printable characters
+ // are included in the set, so there are no control codes.
+ //
+ // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
+ //
}
}
diff --git a/SourceGen/RuntimeData/Help/codegen.html b/SourceGen/RuntimeData/Help/codegen.html
index 759fb58..a7549ef 100644
--- a/SourceGen/RuntimeData/Help/codegen.html
+++ b/SourceGen/RuntimeData/Help/codegen.html
@@ -156,6 +156,15 @@ code, but also needs to know how to handle the corner cases.
For 65816, selecting the bank byte is done with the back-quote ('`')
rather than the caret ('^'). (There's a note in the docs to the effect
that they plan to move to carets.)
+ By default, the assembler assumes that the input is PETSCII, but
+ doesn't convert characters in text strings. So PETSCII source files
+ generate PETSCII strings, and ASCII source files generate ASCII
+ strings. However, if you use the built-in "screen" encoding, you will
+ get the wrong behavior if you assemble an ASCII source without the
+ "--ascii" command-line flag, because it expects to convert from
+ PETSCII. To get the behavior expected of a cross-assembler, it's
+ necessary to pass "--ascii" and explicitly define an ASCII encoding
+ for use with ASCII text strings.
diff --git a/SourceGen/SGTestData/README.md b/SourceGen/SGTestData/README.md
index e4c2fba..3c01a94 100644
--- a/SourceGen/SGTestData/README.md
+++ b/SourceGen/SGTestData/README.md
@@ -4,10 +4,11 @@ This directory contains various regression tests.
NOTE: some tests may fail if you use a version of the assembler that is
different from the one used to generate the expected output. The current
-set was generated with:
+set was generated for:
* 64tass v1.53.1515
- * cc65 v2.17
+ * ACME v0.96.4
+ * cc65 v2.18
* Merlin 32 v1.0
diff --git a/SourceGen/SGTestData/Source/2016-char-encoding.S b/SourceGen/SGTestData/Source/2016-char-encoding.S
new file mode 100644
index 0000000..7635f1c
--- /dev/null
+++ b/SourceGen/SGTestData/Source/2016-char-encoding.S
@@ -0,0 +1,141 @@
+; Copyright 2019 faddenSoft. All Rights Reserved.
+; See the LICENSE.txt file for distribution terms (Apache 2.0).
+;
+; Assembler: ACME (good PETSCII/screen code support)
+
+ !cpu 65816
+* = $1000
+
+ clc
+ xce
+ sep #$30
+ !as
+ !rs
+
+; Single-byte operand
+ lda #'A' ;format as low ASCII
+ lda #'A' | $80 ;format as high ASCII
+ lda #'A' | $80 ;format as PETSCII
+ lda #'A' ;format as screen code
+
+ ldx #'a' ;format as low ASCII
+ ldx #'a' | $80 ;format as high ASCII
+ ldx #'a' - $20 ;format as PETSCII
+ ldx #$01 ;format as screen code
+
+ lda #$7f ;EDIT: force to low ASCII
+ lda #$7f ;EDIT: force to high ASCII
+ lda #$7f ;EDIT: force to PETSCII
+ lda #$7f ;EDIT: force to screen code
+
+; Single letter in a 16-bit immediate
+ rep #$30
+ !al
+ !rl
+ lda #'B'
+ lda #'B' | $80
+ lda #'B' | $80
+ lda #'B'
+
+ sep #$30
+ !as
+ !rs
+ rts
+
+; Single-byte data items
+ !byte 'C'
+ !byte 'C' | $80
+ !byte 'C' | $80
+ !byte 'C'
+
+; Double-byte data items
+ !byte 'd', 0
+ !byte 'd' | $80, 0
+ !byte 'd' - $20, 0
+ !byte $04, 0
+
+; Double-byte big-endian data items
+ !byte 0, 'E'
+ !byte 0, 'E' | $80
+ !byte 0, 'E' | $80
+ !byte 0, 'E'
+
+; Start with the basics
+ !byte $80
+ !text "low ASCII str"
+ !byte $80
+ !xor $80 {
+ !text "high ASCII str"
+ }
+ !byte $80
+ !pet "PETSCII str"
+ !byte $80
+ !scr "Screen Code str"
+
+; Get a bit fancy
+ !byte $82
+ !text "Low ASCII CRLF",$07,$0d,$0a
+ !byte $82
+ !xor $80 {
+ !text "High ASCII CRLF",$07,$0d,$0a
+ }
+ !byte $82
+ !pet $93,"PETSCII with ",$96,"control",$05," codes",$0d
+ ; no control chars in screen code
+
+; Test the ASCII $20-7e range.
+ !byte $83
+ !text " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
+ !text "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
+ !text "`abcdefghijklmnopqrstuvwxyz{|}~"
+ !byte $83
+ !xor $80 {
+ !text " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
+ !text "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
+ !text "`abcdefghijklmnopqrstuvwxyz{|}~"
+ }
+ !byte $83
+ !pet " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
+ !pet "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
+ !pet "`abcdefghijklmnopqrstuvwxyz{|}~"
+ !byte $83
+ !scr " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
+ !scr "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
+ !scr "`abcdefghijklmnopqrstuvwxyz{|}~"
+
+; The 2005 test exercises low/high ASCII strings, so no need to do that here.
+; Do a quick test with PETSCII.
+ !byte $84
+ !pet "IICSTEP esrever" ;format as StringReverse
+ !pet "null term PETSCII",0 ;format as StringNullTerm
+ !pet "This null-terminated string is too long to fit on a single line, and will be split.",0
+ !pet 19,"PETSCII with length" ;format as StringL8
+ !pet 20,0,"PETSCII with length2" ;format as StringL16
+
+ !byte $84
+ !scr "edoC neercS esrever" ;format as StringReverse
+ !scr "null term Screen Code",0 ;format as StringNullTerm
+ !scr "This null-terminated string is too long to fit on a single line, and will be split.",0
+ !scr 23,"Screen Code with length" ;format as StringL8
+ !scr 24,0,"Screen Code with length2" ;format as StringL16
+
+ !byte $85
+
+; All bytes, from 00-ff. Handy for seeing what the auto-scanner picks up.
+allbytes
+ !hex 000102030405060708090a0b0c0d0e0f
+ !hex 101112131415161718191a1b1c1d1e1f
+ !hex 202122232425262728292a2b2c2d2e2f
+ !hex 303132333435363738393a3b3c3d3e3f
+ !hex 404142434445464748494a4b4c4d4e4f
+ !hex 505152535455565758595a5b5c5d5e5f
+ !hex 606162636465666768696a6b6c6d6e6f
+ !hex 707172737475767778797a7b7c7d7e7f
+ !hex 808182838485868788898a8b8c8d8e8f
+ !hex 909192939495969798999a9b9c9d9e9f
+ !hex a0a1a2a3a4a5a6a7a8a9aaabacadaeaf
+ !hex b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ !hex c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+ !hex d0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ !hex e0e1e2e3e4e5e6e7e8e9eaebecedeeef
+ !hex f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff