1
0
mirror of https://github.com/fadden/6502bench.git synced 2025-02-18 08:30:28 +00:00

Prep work for multi-encoding support

Wrote down research into C64 encodings.

Added source for a first cut at 2016-char-encoding test.
This commit is contained in:
Andy McFadden 2019-08-11 11:27:09 -07:00
parent 15d26c9ebd
commit bc633288ad
4 changed files with 214 additions and 8 deletions

View File

@ -22,10 +22,11 @@ namespace Asm65 {
/// Character encoding helper methods.
/// </summary>
public static class CharEncoding {
public const char UNPRINTABLE_CHAR = '\ufffd';
public const char UNPRINTABLE_CHAR = '\ufffd'; // Unicode REPLACEMENT CHARACTER
/// <summary>
/// Determines whether the byte represents a character in the character set.
/// Determines whether the byte represents a member of the character set. The
/// specifics (e.g. printable only) are defined by the method.
/// </summary>
public delegate bool InclusionTest(byte val);
@ -34,11 +35,13 @@ namespace Asm65 {
/// does not map to something printable.
/// </summary>
/// <remarks>
/// Yes, I'm assuming it all fits in the Unicode BMP. Should be a safe assumption
/// for 8-bit computer character sets.
/// Yes, I'm assuming it all fits in a UTF-16 char. PETSCII has some glyphs that
/// aren't part of the BMP, but we're targeting a variety of cross-assemblers, so
/// anything non-ASCII is getting hexified anyway.
/// </remarks>
public delegate char Convert(byte val);
//
// Standard ASCII.
//
@ -46,7 +49,7 @@ namespace Asm65 {
return (val >= 0x20 && val < 0x7f);
}
public static bool IsExtendedLowAscii(byte val) {
return IsPrintableLowAscii(val) || val == 0x0a || val == 0x0d;
return IsPrintableLowAscii(val) || val == 0x07 || val == 0x0a || val == 0x0d;
}
public static char ConvertLowAscii(byte val) {
if (IsPrintableLowAscii(val)) {
@ -63,7 +66,7 @@ namespace Asm65 {
return (val >= 0xa0 && val < 0xff);
}
public static bool IsExtendedHighAscii(byte val) {
return IsPrintableHighAscii(val) || val == 0x8a || val == 0x8d;
return IsPrintableHighAscii(val) || val == 0x87 || val == 0x8a || val == 0x8d;
}
public static char ConvertHighAscii(byte val) {
if (IsPrintableHighAscii(val)) {
@ -84,5 +87,57 @@ namespace Asm65 {
}
}
//
// C64 PETSCII
//
// Assemblers like ACME use the C64 character set 2, a/k/a shifted mode, lower case
// mode, or text mode.
//
// Comparison to ASCII:
// $00-1f: control codes, many with C64-specific meanings
// $20-3f: same as ASCII
// $40-5f: lower case letters (rather than upper case); backslash, caret, and underscore
// replaced with non-ASCII symbols (though the up-arrow in place of caret is close)
// $60-7f: upper case letters (rather than lower case); backquote, curly braces,
// vertical bar, and tilde replaced with non-ASCII symbols
// $80-9f: more control codes
// $a0-bf: non-ASCII symbols
// $c0-df: clone of $60-7f; by convention this is used for upper case, since it's
// equal to lower case with the high bit set
// $e0-ff: non-ASCII symbols (mostly a clone of $a0-bf)
//
// The printable ASCII set (glyphs in [$20,$7e]) is [$20,$5b]+$5d+[$c1,$da].
// (Looks like the PET had $5c=backslash, but the C64 went with a \u00a3 POUND SIGN instead.)
// Anything outside that range will get printed as hex to ensure proper conversion.
//
// Note for the pedantic: in ASCII-1963, up-arrow and left-arrow characters were
// assigned to the caret and underscore values. So arguably those are "ASCII" as
// well, unless you're sane and define ASCII more narrowly.
//
// Control codes that we might expect to appear in the middle of a string:
// $05 1c 1e 1f 81 90 95 96 97 98 99 9a 9b 9c 9e 9f - set text color
// $93 - clear
// $12 92 - reverse on/off
// $07 0a 0d - bell, LF, CR (note CR is favored for EOL)
//
// For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
//
//
// C64 Screen Codes
//
// Using character set 2, which includes lower case letters.
//
// $00-1f: lower case letters (PETSCII $40-5f)
// $20-3f: same as ASCII (PETSCII $20-3f)
// $40-5f: upper case letters (PETSCII $60-7f)
// $60-7f: non-ASCII symbols (PETSCII $a0-bf)
//
// With the high bit set, character colors are reversed. The printable ASCII set
// is [$00,$1b]+$1d+[$20,$3f]+[$41,$5a]. By definition, only printable characters
// are included in the set, so there are no control codes.
//
// For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
//
}
}

View File

@ -156,6 +156,15 @@ code, but also needs to know how to handle the corner cases.</p>
<li>For 65816, selecting the bank byte is done with the back-quote ('`')
rather than the caret ('^'). (There's a note in the docs to the effect
that they plan to move to carets.)</li>
<li>By default, the assembler assumes that the input is PETSCII, but
doesn't convert characters in text strings. So PETSCII source files
generate PETSCII strings, and ASCII source files generate ASCII
strings. However, if you use the built-in "screen" encoding, you will
get the wrong behavior if you assemble an ASCII source file without the
"--ascii" command-line flag, because the assembler expects to convert from
PETSCII. To get the behavior expected of a cross-assembler, it's
necessary to pass "--ascii" and explicitly define an ASCII encoding
for use with ASCII text strings.</li>
</ul>

View File

@ -4,10 +4,11 @@ This directory contains various regression tests.
NOTE: some tests may fail if you use a version of the assembler that is
different from the one used to generate the expected output. The current
set was generated with:
set was generated for:
* 64tass v1.53.1515
* cc65 v2.17
* ACME v0.96.4
* cc65 v2.18
* Merlin 32 v1.0

View File

@ -0,0 +1,141 @@
; Copyright 2019 faddenSoft. All Rights Reserved.
; See the LICENSE.txt file for distribution terms (Apache 2.0).
;
; Assembler: ACME (good PETSCII/screen code support)
; Target the 65816 and assemble at address $1000.
!cpu 65816
* = $1000
; clc + xce switches the CPU to native mode; sep #$30 then sets the M and X
; flags so the accumulator and index registers are 8 bits wide. !as / !rs tell
; ACME to assemble immediates with the matching short widths.
clc
xce
sep #$30
!as
!rs
; Single-byte operand
; Each group of four uses the same letter once per encoding; the trailing
; comment names the format that operand is meant to be given.
; Assuming an ASCII source file ('A' = $41, 'a' = $61):
;   'A' | $80 = $c1, used for both high-ASCII 'A' and PETSCII upper-case 'A'
;   'a' - $20 = $41, PETSCII lower-case 'a'; $01 is screen code 'a'
lda #'A' ;format as low ASCII
lda #'A' | $80 ;format as high ASCII
lda #'A' | $80 ;format as PETSCII
lda #'A' ;format as screen code
ldx #'a' ;format as low ASCII
ldx #'a' | $80 ;format as high ASCII
ldx #'a' - $20 ;format as PETSCII
ldx #$01 ;format as screen code
; $7f is ASCII DEL, not a printable character, so these four operands need
; their character format forced by hand (the "EDIT" notes).
lda #$7f ;EDIT: force to low ASCII
lda #$7f ;EDIT: force to high ASCII
lda #$7f ;EDIT: force to PETSCII
lda #$7f ;EDIT: force to screen code
; Single letter in a 16-bit immediate
; rep #$30 clears M and X for 16-bit registers; !al / !rl make ACME emit
; 16-bit immediates. The four loads presumably follow the same encoding order
; as the single-byte group (low ASCII, high ASCII, PETSCII, screen code).
rep #$30
!al
!rl
lda #'B'
lda #'B' | $80
lda #'B' | $80
lda #'B'
; Back to 8-bit registers before returning.
sep #$30
!as
!rs
rts
; Character constants stored as data rather than instruction operands.
; Each group has four lines, presumably in the same encoding order used for
; the operands earlier in the file: low ASCII, high ASCII, PETSCII, screen code.
; Single-byte data items
!byte 'C'
!byte 'C' | $80
!byte 'C' | $80
!byte 'C'
; Double-byte data items
; Character in the first (low) byte, zero in the second. Assuming an ASCII
; source, 'd' - $20 = $44 is PETSCII lower-case 'd' and $04 is screen code 'd'.
!byte 'd', 0
!byte 'd' | $80, 0
!byte 'd' - $20, 0
!byte $04, 0
; Double-byte big-endian data items
; Zero first, character in the second byte.
!byte 0, 'E'
!byte 0, 'E' | $80
!byte 0, 'E' | $80
!byte 0, 'E'
; One short string per encoding. The lone !byte $80 / $82 values between the
; strings appear to act as separators between test items -- confirm.
;   !text           emits the string using the current conversion table
;                   (unconverted source bytes unless a table has been selected)
;   !xor $80 {...}  XORs every byte emitted inside the braces with $80, which
;                   turns low ASCII into high ASCII
;   !pet / !scr     convert string characters to PETSCII / C64 screen codes;
;                   numeric arguments are emitted without conversion
; Start with the basics
!byte $80
!text "low ASCII str"
!byte $80
!xor $80 {
!text "high ASCII str"
}
!byte $80
!pet "PETSCII str"
!byte $80
!scr "Screen Code str"
; Get a bit fancy
; Same idea with control codes appended: $07 / $0d / $0a are bell, CR, LF
; (they become $87 / $8d / $8a inside the !xor block).
!byte $82
!text "Low ASCII CRLF",$07,$0d,$0a
!byte $82
!xor $80 {
!text "High ASCII CRLF",$07,$0d,$0a
}
!byte $82
; PETSCII control codes embedded mid-string: $93 = clear, $96 and $05 = set
; text color, $0d = CR.
!pet $93,"PETSCII with ",$96,"control",$05," codes",$0d
; no control chars in screen code
; Test the ASCII $20-7e range.
; The same three source lines are emitted four times: as-is (low ASCII),
; XORed with $80 (high ASCII), converted to PETSCII, and converted to screen
; codes. $22 is the double-quote character, given numerically because a
; literal quote would end the string.
; NOTE(review): newer ACME releases (0.97 and later, as far as I know) treat
; backslash as an escape character inside strings, so the "[\]" sequence may
; need attention when assembling with them -- confirm.
!byte $83
!text " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
!text "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
!text "`abcdefghijklmnopqrstuvwxyz{|}~"
!byte $83
!xor $80 {
!text " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
!text "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
!text "`abcdefghijklmnopqrstuvwxyz{|}~"
}
!byte $83
!pet " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
!pet "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
!pet "`abcdefghijklmnopqrstuvwxyz{|}~"
!byte $83
!scr " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
!scr "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
!scr "`abcdefghijklmnopqrstuvwxyz{|}~"
; The 2005 test exercises low/high ASCII strings, so no need to do that here.
; Do a quick test with PETSCII.
; Each string is laid out for one string format, named in its trailing comment:
;   StringReverse   text stored back to front
;   StringNullTerm  text followed by a 0 byte (the long line has no trailing
;                   comment but is also 0-terminated)
;   StringL8        one length byte, then the text (19 / 23 characters)
;   StringL16       two length bytes, low byte first, then the text (20 / 24)
; The numeric length and terminator values are emitted without conversion.
!byte $84
!pet "IICSTEP esrever" ;format as StringReverse
!pet "null term PETSCII",0 ;format as StringNullTerm
!pet "This null-terminated string is too long to fit on a single line, and will be split.",0
!pet 19,"PETSCII with length" ;format as StringL8
!pet 20,0,"PETSCII with length2" ;format as StringL16
; Same five cases again, as C64 screen codes.
!byte $84
!scr "edoC neercS esrever" ;format as StringReverse
!scr "null term Screen Code",0 ;format as StringNullTerm
!scr "This null-terminated string is too long to fit on a single line, and will be split.",0
!scr 23,"Screen Code with length" ;format as StringL8
!scr 24,0,"Screen Code with length2" ;format as StringL16
!byte $85
; All bytes, from 00-ff. Handy for seeing what the auto-scanner picks up.
; 16 rows of 16 bytes, every value $00-$ff exactly once in ascending order.
allbytes
!hex 000102030405060708090a0b0c0d0e0f
!hex 101112131415161718191a1b1c1d1e1f
!hex 202122232425262728292a2b2c2d2e2f
!hex 303132333435363738393a3b3c3d3e3f
!hex 404142434445464748494a4b4c4d4e4f
!hex 505152535455565758595a5b5c5d5e5f
!hex 606162636465666768696a6b6c6d6e6f
!hex 707172737475767778797a7b7c7d7e7f
!hex 808182838485868788898a8b8c8d8e8f
!hex 909192939495969798999a9b9c9d9e9f
!hex a0a1a2a3a4a5a6a7a8a9aaabacadaeaf
!hex b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
!hex c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
!hex d0d1d2d3d4d5d6d7d8d9dadbdcdddedf
!hex e0e1e2e3e4e5e6e7e8e9eaebecedeeef
!hex f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff