mirror of
https://github.com/fadden/6502bench.git
synced 2025-02-18 08:30:28 +00:00
Prep work for multi-encoding support
Wrote down research into C64 encodings. Added source for a first cut at 2016-char-encoding test.
This commit is contained in:
parent
15d26c9ebd
commit
bc633288ad
@ -22,10 +22,11 @@ namespace Asm65 {
|
||||
/// Character encoding helper methods.
|
||||
/// </summary>
|
||||
public static class CharEncoding {
|
||||
public const char UNPRINTABLE_CHAR = '\ufffd';
|
||||
public const char UNPRINTABLE_CHAR = '\ufffd'; // Unicode REPLACEMENT CHARACTER
|
||||
|
||||
/// <summary>
|
||||
/// Determines whether the byte represents a character in the character set.
|
||||
/// Determines whether the byte represents a member of the character set. The
|
||||
/// specifics (e.g. printable only) are defined by the method.
|
||||
/// </summary>
|
||||
public delegate bool InclusionTest(byte val);
|
||||
|
||||
@ -34,11 +35,13 @@ namespace Asm65 {
|
||||
/// does not map to something printable.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Yes, I'm assuming it all fits in the Unicode BMP. Should be a safe assumption
|
||||
/// for 8-bit computer character sets.
|
||||
/// Yes, I'm assuming it all fits in a UTF-16 char. PETSCII has some glyphs that
|
||||
/// aren't part of the BMP, but we're targeting a variety of cross-assemblers, so
|
||||
/// anything non-ASCII is getting hexified anyway.
|
||||
/// </remarks>
|
||||
public delegate char Convert(byte val);
|
||||
|
||||
|
||||
//
|
||||
// Standard ASCII.
|
||||
//
|
||||
@ -46,7 +49,7 @@ namespace Asm65 {
|
||||
return (val >= 0x20 && val < 0x7f);
|
||||
}
|
||||
public static bool IsExtendedLowAscii(byte val) {
|
||||
return IsPrintableLowAscii(val) || val == 0x0a || val == 0x0d;
|
||||
return IsPrintableLowAscii(val) || val == 0x07 || val == 0x0a || val == 0x0d;
|
||||
}
|
||||
public static char ConvertLowAscii(byte val) {
|
||||
if (IsPrintableLowAscii(val)) {
|
||||
@ -63,7 +66,7 @@ namespace Asm65 {
|
||||
return (val >= 0xa0 && val < 0xff);
|
||||
}
|
||||
public static bool IsExtendedHighAscii(byte val) {
|
||||
return IsPrintableHighAscii(val) || val == 0x8a || val == 0x8d;
|
||||
return IsPrintableHighAscii(val) || val == 0x87 || val == 0x8a || val == 0x8d;
|
||||
}
|
||||
public static char ConvertHighAscii(byte val) {
|
||||
if (IsPrintableHighAscii(val)) {
|
||||
@ -84,5 +87,57 @@ namespace Asm65 {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// C64 PETSCII
|
||||
//
|
||||
// Assemblers like ACME use the C64 character set 2, a/k/a shifted mode, lower case
|
||||
// mode, or text mode.
|
||||
//
|
||||
// Comparison to ASCII:
|
||||
// $00-1f: control codes, many with C64-specific meanings
|
||||
// $20-3f: same as ASCII
|
||||
// $40-5f: lower case letters (rather than upper case); backslash, caret, and underscore
|
||||
// replaced with non-ASCII symbols (though the up-arrow in place of caret is close)
|
||||
// $60-7f: upper case letters (rather than lower case); backquote, curly braces,
|
||||
// vertical bar, and tilde replaced with non-ASCII symbols
|
||||
// $80-9f: more control codes
|
||||
// $a0-bf: non-ASCII symbols
|
||||
// $c0-df: clone of $60-7f; by convention this is used for upper case, since it's
|
||||
// equal to lower case with the high bit set
|
||||
// $e0-ff: non-ASCII symbols (mostly a clone of $a0-bf)
|
||||
//
|
||||
// The printable ASCII set (glyphs in [$20,$7e]) is [$20,$5b]+$5d+[$c1,$da].
|
||||
// (Looks like the PET had $5c=backslash, but C64 went with a \u00a3 POUND SIGN instead.)
|
||||
// Anything outside that range will get printed as hex to ensure proper conversion.
|
||||
//
|
||||
// Note for the pedantic: in ASCII-1963, up-arrow and left-arrow characters were
|
||||
// assigned to the caret and underscore values. So arguably those are "ASCII" as
|
||||
// well, unless you're sane and define ASCII more narrowly.
|
||||
//
|
||||
// Control codes that we might expect to appear in the middle of a string:
|
||||
// $05 1c 1e 1f 81 90 95 96 97 98 99 9a 9b 9c 9e 9f - set text color
|
||||
// $93 - clear
|
||||
// $12 92 - reverse on/off
|
||||
// $07 0a 0d - bell, LF, CR (note CR is favored for EOL)
|
||||
//
|
||||
// For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
|
||||
//
|
||||
|
||||
//
|
||||
// C64 Screen Codes
|
||||
//
|
||||
// Using character set 2, which includes lower case letters.
|
||||
//
|
||||
// $00-1f: lower case letters (PETSCII $40-5f)
|
||||
// $20-3f: same as ASCII (PETSCII $20-3f)
|
||||
// $40-5f: upper case letters (PETSCII $60-7f)
|
||||
// $60-7f: non-ASCII symbols (PETSCII $a0-bf)
|
||||
//
|
||||
// With the high bit set, character colors are reversed. The printable ASCII set
|
||||
// is [$00,$1b]+$1d+[$20,$3f]+[$41,$5a]. By definition, only printable characters
|
||||
// are included in the set, so there are no control codes.
|
||||
//
|
||||
// For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
|
||||
//
|
||||
}
|
||||
}
|
||||
|
@ -156,6 +156,15 @@ code, but also needs to know how to handle the corner cases.</p>
|
||||
<li>For 65816, selecting the bank byte is done with the back-quote ('`')
|
||||
rather than the caret ('^'). (There's a note in the docs to the effect
|
||||
that they plan to move to carets.)</li>
|
||||
<li>By default, the assembler assumes that the input is PETSCII, but
|
||||
doesn't convert characters in text strings. So PETSCII source files
|
||||
generate PETSCII strings, and ASCII source files generate ASCII
|
||||
strings. However, if you use the built-in "screen" encoding, you will
|
||||
get the wrong behavior if you assemble an ASCII source without the
|
||||
"--ascii" command-line flag, because it expects to convert from
|
||||
PETSCII. To get the behavior expected of a cross-assembler, it's
|
||||
necessary to pass "--ascii" and explicitly define an ASCII encoding
|
||||
for use with ASCII text strings.</li>
|
||||
</ul>
|
||||
|
||||
|
||||
|
@ -4,10 +4,11 @@ This directory contains various regression tests.
|
||||
|
||||
NOTE: some tests may fail if you use a version of the assembler that is
|
||||
different from the one used to generate the expected output. The current
|
||||
set was generated with:
|
||||
set was generated for:
|
||||
|
||||
* 64tass v1.53.1515
|
||||
* cc65 v2.17
|
||||
* ACME v0.96.4
|
||||
* cc65 v2.18
|
||||
* Merlin 32 v1.0
|
||||
|
||||
|
||||
|
141
SourceGen/SGTestData/Source/2016-char-encoding.S
Normal file
141
SourceGen/SGTestData/Source/2016-char-encoding.S
Normal file
@ -0,0 +1,141 @@
|
||||
; Copyright 2019 faddenSoft. All Rights Reserved.
|
||||
; See the LICENSE.txt file for distribution terms (Apache 2.0).
|
||||
;
|
||||
; Assembler: ACME (good PETSCII/screen code support)
|
||||
|
||||
!cpu 65816
|
||||
* = $1000
|
||||
|
||||
clc
|
||||
xce
|
||||
sep #$30
|
||||
!as
|
||||
!rs
|
||||
|
||||
; Single-byte operand
|
||||
lda #'A' ;format as low ASCII
|
||||
lda #'A' | $80 ;format as high ASCII
|
||||
lda #'A' | $80 ;format as PETSCII
|
||||
lda #'A' ;format as screen code
|
||||
|
||||
ldx #'a' ;format as low ASCII
|
||||
ldx #'a' | $80 ;format as high ASCII
|
||||
ldx #'a' - $20 ;format as PETSCII
|
||||
ldx #$01 ;format as screen code
|
||||
|
||||
lda #$7f ;EDIT: force to low ASCII
|
||||
lda #$7f ;EDIT: force to high ASCII
|
||||
lda #$7f ;EDIT: force to PETSCII
|
||||
lda #$7f ;EDIT: force to screen code
|
||||
|
||||
; Single letter in a 16-bit immediate
|
||||
rep #$30
|
||||
!al
|
||||
!rl
|
||||
lda #'B'
|
||||
lda #'B' | $80
|
||||
lda #'B' | $80
|
||||
lda #'B'
|
||||
|
||||
sep #$30
|
||||
!as
|
||||
!rs
|
||||
rts
|
||||
|
||||
; Single-byte data items
|
||||
!byte 'C'
|
||||
!byte 'C' | $80
|
||||
!byte 'C' | $80
|
||||
!byte 'C'
|
||||
|
||||
; Double-byte data items
|
||||
!byte 'd', 0
|
||||
!byte 'd' | $80, 0
|
||||
!byte 'd' - $20, 0
|
||||
!byte $04, 0
|
||||
|
||||
; Double-byte big-endian data items
|
||||
!byte 0, 'E'
|
||||
!byte 0, 'E' | $80
|
||||
!byte 0, 'E' | $80
|
||||
!byte 0, 'E'
|
||||
|
||||
; Start with the basics
|
||||
!byte $80
|
||||
!text "low ASCII str"
|
||||
!byte $80
|
||||
!xor $80 {
|
||||
!text "high ASCII str"
|
||||
}
|
||||
!byte $80
|
||||
!pet "PETSCII str"
|
||||
!byte $80
|
||||
!scr "Screen Code str"
|
||||
|
||||
; Get a bit fancy
|
||||
!byte $82
|
||||
!text "Low ASCII CRLF",$07,$0d,$0a
|
||||
!byte $82
|
||||
!xor $80 {
|
||||
!text "High ASCII CRLF",$07,$0d,$0a
|
||||
}
|
||||
!byte $82
|
||||
!pet $93,"PETSCII with ",$96,"control",$05," codes",$0d
|
||||
; no control chars in screen code
|
||||
|
||||
; Test the ASCII $20-7e range.
|
||||
!byte $83
|
||||
!text " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
|
||||
!text "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
|
||||
!text "`abcdefghijklmnopqrstuvwxyz{|}~"
|
||||
!byte $83
|
||||
!xor $80 {
|
||||
!text " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
|
||||
!text "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
|
||||
!text "`abcdefghijklmnopqrstuvwxyz{|}~"
|
||||
}
|
||||
!byte $83
|
||||
!pet " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
|
||||
!pet "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
|
||||
!pet "`abcdefghijklmnopqrstuvwxyz{|}~"
|
||||
!byte $83
|
||||
!scr " !",$22,"#$%&'()*+,-./0123456789:;<=>?"
|
||||
!scr "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"
|
||||
!scr "`abcdefghijklmnopqrstuvwxyz{|}~"
|
||||
|
||||
; The 2005 test exercises low/high ASCII strings, so no need to do that here.
|
||||
; Do a quick test with PETSCII.
|
||||
!byte $84
|
||||
!pet "IICSTEP esrever" ;format as StringReverse
|
||||
!pet "null term PETSCII",0 ;format as StringNullTerm
|
||||
!pet "This null-terminated string is too long to fit on a single line, and will be split.",0
|
||||
!pet 19,"PETSCII with length" ;format as StringL8
|
||||
!pet 20,0,"PETSCII with length2" ;format as StringL16
|
||||
|
||||
!byte $84
|
||||
!scr "edoC neercS esrever" ;format as StringReverse
|
||||
!scr "null term Screen Code",0 ;format as StringNullTerm
|
||||
!scr "This null-terminated string is too long to fit on a single line, and will be split.",0
|
||||
!scr 23,"Screen Code with length" ;format as StringL8
|
||||
!scr 24,0,"Screen Code with length2" ;format as StringL16
|
||||
|
||||
!byte $85
|
||||
|
||||
; All bytes, from 00-ff. Handy for seeing what the auto-scanner picks up.
|
||||
allbytes
|
||||
!hex 000102030405060708090a0b0c0d0e0f
|
||||
!hex 101112131415161718191a1b1c1d1e1f
|
||||
!hex 202122232425262728292a2b2c2d2e2f
|
||||
!hex 303132333435363738393a3b3c3d3e3f
|
||||
!hex 404142434445464748494a4b4c4d4e4f
|
||||
!hex 505152535455565758595a5b5c5d5e5f
|
||||
!hex 606162636465666768696a6b6c6d6e6f
|
||||
!hex 707172737475767778797a7b7c7d7e7f
|
||||
!hex 808182838485868788898a8b8c8d8e8f
|
||||
!hex 909192939495969798999a9b9c9d9e9f
|
||||
!hex a0a1a2a3a4a5a6a7a8a9aaabacadaeaf
|
||||
!hex b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
|
||||
!hex c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
|
||||
!hex d0d1d2d3d4d5d6d7d8d9dadbdcdddedf
|
||||
!hex e0e1e2e3e4e5e6e7e8e9eaebecedeeef
|
||||
!hex f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
Loading…
x
Reference in New Issue
Block a user