From bc633288ad5f36058f4f6e2403293cb24da4439e Mon Sep 17 00:00:00 2001 From: Andy McFadden Date: Sun, 11 Aug 2019 11:27:09 -0700 Subject: [PATCH] Prep work for multi-encoding support Wrote down research into C64 encodings. Added source for a first cut at 2016-char-encoding test. --- Asm65/CharEncoding.cs | 67 ++++++++- SourceGen/RuntimeData/Help/codegen.html | 9 ++ SourceGen/SGTestData/README.md | 5 +- .../SGTestData/Source/2016-char-encoding.S | 141 ++++++++++++++++++ 4 files changed, 214 insertions(+), 8 deletions(-) create mode 100644 SourceGen/SGTestData/Source/2016-char-encoding.S diff --git a/Asm65/CharEncoding.cs b/Asm65/CharEncoding.cs index 7f7b6a5..dbf1461 100644 --- a/Asm65/CharEncoding.cs +++ b/Asm65/CharEncoding.cs @@ -22,10 +22,11 @@ namespace Asm65 { /// Character encoding helper methods. /// public static class CharEncoding { - public const char UNPRINTABLE_CHAR = '\ufffd'; + public const char UNPRINTABLE_CHAR = '\ufffd'; // Unicode REPLACEMENT CHARACTER /// - /// Determines whether the byte represents a character in the character set. + /// Determines whether the byte represents a member of the character set. The + /// specifics (e.g. printable only) are defined by the method. /// public delegate bool InclusionTest(byte val); @@ -34,11 +35,13 @@ namespace Asm65 { /// does not map to something printable. /// /// - /// Yes, I'm assuming it all fits in the Unicode BMP. Should be a safe assumption - /// for 8-bit computer character sets. + /// Yes, I'm assuming it all fits in a UTF-16 char. PETSCII has some glyphs that + /// aren't part of the BMP, but we're targeting a variety of cross-assemblers, so + /// anything non-ASCII is getting hexified anyway. /// public delegate char Convert(byte val); + // // Standard ASCII. // @@ -46,7 +49,7 @@ namespace Asm65 { return (val >= 0x20 && val < 0x7f); } public static bool IsExtendedLowAscii(byte val) { - return IsPrintableLowAscii(val) || val == 0x0a || val == 0x0d; + return IsPrintableLowAscii(val) || val == 0x07 || val == 0x0a || val == 0x0d; } public static char ConvertLowAscii(byte val) { if (IsPrintableLowAscii(val)) { @@ -63,7 +66,7 @@ namespace Asm65 { return (val >= 0xa0 && val < 0xff); } public static bool IsExtendedHighAscii(byte val) { - return IsPrintableHighAscii(val) || val == 0x8a || val == 0x8d; + return IsPrintableHighAscii(val) || val == 0x87 || val == 0x8a || val == 0x8d; } public static char ConvertHighAscii(byte val) { if (IsPrintableHighAscii(val)) { @@ -84,5 +87,57 @@ namespace Asm65 { } } + // + // C64 PETSCII + // + // Assemblers like ACME use the C64 character set 2, a/k/a shifted mode, lower case + // mode, or text mode. + // + // Comparison to ASCII: + // $00-1f: control codes, many with C64-specific meanings + // $20-3f: same as ASCII + // $40-5f: lower case letters (rather than upper case); backslash, caret, and underscore + // replaced with non-ASCII symbols (though the up-arrow in place of caret is close) + // $60-7f: upper case letters (rather than lower case); backquote, curly braces, + // vertical bar, and tilde replaced with non-ASCII symbols + // $80-9f: more control codes + // $a0-bf: non-ASCII symbols + // $c0-df: clone of $60-7f; by convention this is used for upper case, since it's + // equal to lower case with the high bit set + // $e0-ff: non-ASCII symbols (mostly a clone of $a0-bf) + // + // The printable ASCII set (glyphs in [$20,$7e]) is [$20,$5b]+$5d+[$c1,$da]. + // (Looks like the Pet had $5c=backslash, but C64 went with a \u00a3 POUND SIGN instead.) + // Anything outside that range will get printed as hex to ensure proper conversion. + // + // Note for the pedantic: in ASCII-1963, up-arrow and left-arrow characters were + // assigned to the caret and underscore values. So arguably those are "ASCII" as + // well, unless you're sane and define ASCII more narrowly. + // + // Control codes that we might expect to appear in the middle of a string: + // $05 1c 1e 1f 81 90 95 96 97 98 99 9a 9b 9c 9e 9f - set text color + // $93 - clear + // $12 92 - reverse on/off + // $07 0a 0d - bell, LF, CR (note CR is favored for EOL) + // + // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf + // + + // + // C64 Screen Codes + // + // Using character set 2, which includes lower case letters. + // + // $00-1f: lower case letters (PETSCII $40-5f) + // $20-3f: same as ASCII (PETSCII $20-3f) + // $40-5f: upper case letters (PETSCII $60-7f) + // $60-7f: non-ASCII symbols (PETSCII $a0-bf) + // + // With the high bit set, character colors are reversed. The printable ASCII set + // is [$00,$1b]+$1d+[$20,$3f]+[$41,$5a]. By definition, only printable characters + // are included in the set, so there are no control codes. + // + // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf + // } } diff --git a/SourceGen/RuntimeData/Help/codegen.html b/SourceGen/RuntimeData/Help/codegen.html index 759fb58..a7549ef 100644 --- a/SourceGen/RuntimeData/Help/codegen.html +++ b/SourceGen/RuntimeData/Help/codegen.html @@ -156,6 +156,15 @@ code, but also needs to know how to handle the corner cases.

  • For 65816, selecting the bank byte is done with the back-quote ('`') rather than the caret ('^'). (There's a note in the docs to the effect that they plan to move to carets.)
  • +
  • By default, the assembler assumes that the input is PETSCII, but + doesn't convert characters in text strings. So PETSCII source files + generate PETSCII strings, and ASCII source files generate ASCII + strings. However, if you use the built-in "screen" encoding, you will + get the wrong behavior if you compile an ASCII source without the + "--ascii" command-line flag, because it expects to convert from + PETSCII. To get the behavior expected of a cross-assembler, it's + necessary to pass "--ascii" and explicitly define an ASCII encoding + for use with ASCII text strings.
  • diff --git a/SourceGen/SGTestData/README.md b/SourceGen/SGTestData/README.md index e4c2fba..3c01a94 100644 --- a/SourceGen/SGTestData/README.md +++ b/SourceGen/SGTestData/README.md @@ -4,10 +4,11 @@ This directory contains various regression tests. NOTE: some tests may fail if you use a version of the assembler that is different from the one used to generate the expected output. The current -set was generated with: +set was generated for: * 64tass v1.53.1515 - * cc65 v2.17 + * ACME v0.96.4 + * cc65 v2.18 * Merlin 32 v1.0 diff --git a/SourceGen/SGTestData/Source/2016-char-encoding.S b/SourceGen/SGTestData/Source/2016-char-encoding.S new file mode 100644 index 0000000..7635f1c --- /dev/null +++ b/SourceGen/SGTestData/Source/2016-char-encoding.S @@ -0,0 +1,141 @@ +; Copyright 2019 faddenSoft. All Rights Reserved. +; See the LICENSE.txt file for distribution terms (Apache 2.0). +; +; Assembler: ACME (good PETSCII/screen code support) + + !cpu 65816 +* = $1000 + + clc + xce + sep #$30 + !as + !rs + +; Single-byte operand + lda #'A' ;format as low ASCII + lda #'A' | $80 ;format as high ASCII + lda #'A' | $80 ;format as PETSCII + lda #'A' ;format as screen code + + ldx #'a' ;format as low ASCII + ldx #'a' | $80 ;format as high ASCII + ldx #'a' - $20 ;format as PETSCII + ldx #$01 ;format as screen code + + lda #$7f ;EDIT: force to low ASCII + lda #$7f ;EDIT: force to high ASCII + lda #$7f ;EDIT: force to PETSCII + lda #$7f ;EDIT: force to screen code + +; Single letter in a 16-bit immediate + rep #$30 + !al + !rl + lda #'B' + lda #'B' | $80 + lda #'B' | $80 + lda #'B' + + sep #$30 + !as + !rs + rts + +; Single-byte data items + !byte 'C' + !byte 'C' | $80 + !byte 'C' | $80 + !byte 'C' + +; Double-byte data items + !byte 'd', 0 + !byte 'd' | $80, 0 + !byte 'd' - $20, 0 + !byte $04, 0 + +; Double-byte big-endian data items + !byte 0, 'E' + !byte 0, 'E' | $80 + !byte 0, 'E' | $80 + !byte 0, 'E' + +; Start with the basics + !byte $80 + !text "low ASCII str" + !byte $80 + !xor $80 { + !text "high ASCII str" + } + !byte $80 + !pet "PETSCII str" + !byte $80 + !scr "Screen Code str" + +; Get a bit fancy + !byte $82 + !text "Low ASCII CRLF",$07,$0d,$0a + !byte $82 + !xor $80 { + !text "High ASCII CRLF",$07,$0d,$0a + } + !byte $82 + !pet $93,"PETSCII with ",$96,"control",$05," codes",$0d + ; no control chars in screen code + +; Test the ASCII $20-7e range. + !byte $83 + !text " !",$22,"#$%&'()*+,-./0123456789:;<=>?" + !text "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_" + !text "`abcdefghijklmnopqrstuvwxyz{|}~" + !byte $83 + !xor $80 { + !text " !",$22,"#$%&'()*+,-./0123456789:;<=>?" + !text "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_" + !text "`abcdefghijklmnopqrstuvwxyz{|}~" + } + !byte $83 + !pet " !",$22,"#$%&'()*+,-./0123456789:;<=>?" + !pet "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_" + !pet "`abcdefghijklmnopqrstuvwxyz{|}~" + !byte $83 + !scr " !",$22,"#$%&'()*+,-./0123456789:;<=>?" + !scr "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_" + !scr "`abcdefghijklmnopqrstuvwxyz{|}~" + +; The 2005 test exercises low/high ASCII strings, so no need to do that here. +; Do a quick test with PETSCII. + !byte $84 + !pet "IICSTEP esrever" ;format as StringReverse + !pet "null term PETSCII",0 ;format as StringNullTerm + !pet "This null-terminated string is too long to fit on a single line, and will be split.",0 + !pet 19,"PETSCII with length" ;format as StringL8 + !pet 20,0,"PETSCII with length2" ;format as StringL16 + + !byte $84 + !scr "edoC neercS esrever" ;format as StringReverse + !scr "null term Screen Code",0 ;format as StringNullTerm + !scr "This null-terminated string is too long to fit on a single line, and will be split.",0 + !scr 23,"Screen Code with length" ;format as StringL8 + !scr 24,0,"Screen Code with length2" ;format as StringL16 + + !byte $85 + +; All bytes, from 00-ff. Handy for seeing what the auto-scanner picks up. +allbytes + !hex 000102030405060708090a0b0c0d0e0f + !hex 101112131415161718191a1b1c1d1e1f + !hex 202122232425262728292a2b2c2d2e2f + !hex 303132333435363738393a3b3c3d3e3f + !hex 404142434445464748494a4b4c4d4e4f + !hex 505152535455565758595a5b5c5d5e5f + !hex 606162636465666768696a6b6c6d6e6f + !hex 707172737475767778797a7b7c7d7e7f + !hex 808182838485868788898a8b8c8d8e8f + !hex 909192939495969798999a9b9c9d9e9f + !hex a0a1a2a3a4a5a6a7a8a9aaabacadaeaf + !hex b0b1b2b3b4b5b6b7b8b9babbbcbdbebf + !hex c0c1c2c3c4c5c6c7c8c9cacbcccdcecf + !hex d0d1d2d3d4d5d6d7d8d9dadbdcdddedf + !hex e0e1e2e3e4e5e6e7e8e9eaebecedeeef + !hex f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff