6502bench/Asm65/CharEncoding.cs

296 lines
12 KiB
C#

/*
* Copyright 2019 faddenSoft
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Diagnostics;
namespace Asm65 {
/// <summary>
/// Character encoding helper methods.
/// </summary>
public static class CharEncoding {
public const char UNPRINTABLE_CHAR = '\ufffd'; // Unicode REPLACEMENT CHARACTER
/// <summary>
/// Determines whether the byte represents a member of the character set. The
/// specifics (e.g. printable only) are defined by the method.
/// </summary>
public delegate bool InclusionTest(byte val);
/// <summary>
/// Converts the byte to a printable character. Returns UNPRINTABLE_CHAR if the value
/// does not map to something printable.
/// </summary>
/// <remarks>
/// Yes, I'm assuming it all fits in a UTF-16 char. PETSCII has some glyphs that
/// aren't part of the BMP, but we're targeting a variety of cross-assemblers with
/// potentially different notions of Unicode mappings, so anything non-ASCII is
/// getting hexified anyway.
/// </remarks>
public delegate char Convert(byte val);
/// <summary>
/// Character encoding.
/// </summary>
public enum Encoding {
Unknown = 0,
Ascii,
HighAscii,
C64Petscii,
C64ScreenCode,
}
//
// Plain ASCII.
//
// We recognize BELL, LF, and CR as control characters that may be present in
// text strings. This allows use to generate:
//
// .str "hello",$0d
//
// instead of:
//
// .str "hello"
// .dd1 $0d
//
public static bool IsPrintableAscii(byte val) {
return (val >= 0x20 && val < 0x7f);
}
public static bool IsExtendedAscii(byte val) {
return IsPrintableAscii(val) || val == 0x07 || val == 0x0a || val == 0x0d;
}
public static char ConvertAscii(byte val) {
if (IsPrintableAscii(val)) {
return (char)val;
} else {
return UNPRINTABLE_CHAR;
}
}
//
// High ASCII: plain ASCII with the high bit set.
//
public static bool IsPrintableHighAscii(byte val) {
return (val >= 0xa0 && val < 0xff);
}
public static bool IsExtendedHighAscii(byte val) {
return IsPrintableHighAscii(val) || val == 0x87 || val == 0x8a || val == 0x8d;
}
public static char ConvertHighAscii(byte val) {
if (IsPrintableHighAscii(val)) {
return (char)(val & 0x7f);
} else {
return UNPRINTABLE_CHAR;
}
}
//
// High and/or low ASCII.
//
public static bool IsPrintableLowOrHighAscii(byte val) {
return IsPrintableAscii((byte)(val & 0x7f));
}
public static bool IsExtendedLowOrHighAscii(byte val) {
return IsExtendedAscii((byte)(val & 0x7f));
}
public static char ConvertLowAndHighAscii(byte val) {
//if (IsPrintableAscii(val) || IsPrintableHighAscii(val)) {
// return (char)(val & 0x7f);
//} else {
// return UNPRINTABLE_CHAR;
//}
return ConvertAscii((byte)(val & 0x7f));
}
//
// ATASCII (Atari 400/800)
//
// Substantially similar to ASCII, but with printable symbols in the control character
// range ($00-1f). Characters $60 and $7b-7f don't correspond to ASCII symbols.
//
// Characters with the high bit set are shown with colors reversed.
//
//
// PETSCII (C64 variant)
//
// Assemblers like ACME use the C64 character set 2, a/k/a shifted mode, lower case
// mode, or text mode.
//
// Comparison to ASCII:
// $00-1f: control codes, many with C64-specific meanings
// $20-3f: same as ASCII
// $40-5f: lower case letters (rather than upper case); backslash, caret, and underscore
// replaced with non-ASCII symbols (though the up-arrow in place of caret is close)
// $60-7f: upper case letters (rather than lower case); backquote, curly braces,
// vertical bar, and tilde replaced with non-ASCII symbols
// $80-9f: more control codes
// $a0-bf: non-ASCII symbols
// $c0-df: clone of $60-7f; by convention this is used for upper case, since it's
// equal to lower case with the high bit set
// $e0-ff: non-ASCII symbols (mostly a clone of $a0-bf)
//
// The printable ASCII set (glyphs in [$20,$7e]) is [$20,$5b]+$5d+[$c1,$da].
// (Looks like the Pet had $5c=backslash, but C64 went with a \u00a3 POUND SIGN instead.)
// Anything outside that range will get printed as hex to ensure proper conversion.
//
// Note for the pedantic: in ASCII-1963, up-arrow and left-arrow characters were
// assigned to the caret and underscore values. So arguably those are "ASCII" as
// well, unless you're sane and define ASCII more narrowly.
//
// Control codes that we might expect to appear in the middle of a string:
// $05 1c 1e 1f 81 90 95 96 97 98 99 9a 9b 9c 9e 9f - set text color
// $93 - clear
// $12 92 - reverse on/off
// $07 0a 0d - bell, LF, CR (note CR is favored for EOL)
//
// Other Commodore systems use variants on PETSCII, but the ASCII correspondence remains
// the same -- only the non-ASCII symbols change. (On the original PET, $60-7f was a
// duplicate of $20-3f rather than a duplicate of the upper-case letters, which might be
// why $c0-df is preferred for upper case.)
//
// For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
//
private static bool[] sPrintablePetscii = CreatePrintablePetsciiMap();
private static bool[] sExtendedPetscii = CreateExtendedPetsciiMap();
private static bool[] CreatePrintablePetsciiMap() {
bool[] map = new bool[256];
for (int i = 0x20; i <= 0x5b; i++) {
map[i] = true;
}
map[0x5d] = true;
for (int i = 0xc1; i <= 0xda; i++) {
map[i] = true;
}
return map;
}
private static bool[] CreateExtendedPetsciiMap() {
bool[] map = CreatePrintablePetsciiMap();
// control codes that we might expect to find in strings
map[0x05] = map[0x1c] = map[0x1e] = map[0x1f] = map[0x81] = map[0x90] = map[0x95] =
map[0x96] = map[0x97] = map[0x98] = map[0x99] = map[0x9a] = map[0x9b] =
map[0x9c] = map[0x9e] = map[0x9f] = true;
map[0x93] = map[0x12] = map[0x92] = true;
map[0x07] = map[0x0a] = map[0x0d] = true;
return map;
}
public static bool IsPrintableC64Petscii(byte val) {
return sPrintablePetscii[val];
}
public static bool IsExtendedC64Petscii(byte val) {
return sExtendedPetscii[val];
}
private static char[] sPetsciiToUnicode = CreatePetsciiToUnicodeMap();
private static char[] CreatePetsciiToUnicodeMap() {
// There are performance arguments for doing this with and without a table. For
// x64 with fast memory and large caches, table seems reasonable.
char[] map = new char[256];
for (int val = 0; val < 256; val++) {
char ch;
if ((val >= 0x20 && val <= 0x40) || val == 0x5b || val == 0x5d) {
ch = (char)val; // number/symbols, '[', ']'
} else if (val >= 0x41 && val <= 0x5a) {
ch = (char)(val + 0x20); // lower case
} else if (val >= 0xc1 && val <= 0xda) {
ch = (char)(val - 0x80); // upper case
} else {
Debug.Assert(!IsPrintableC64Petscii((byte)val));
ch = UNPRINTABLE_CHAR;
}
map[val] = ch;
}
return map;
}
public static char ConvertC64Petscii(byte val) {
return sPetsciiToUnicode[val];
}
public static char ConvertLowAndHighC64Petscii(byte val) {
// This is an odd one. Some programs use DCI with PETSCII, which means the
// string is allow lower case except for the last letteR.
//
// There's no such thing as "high PETSCII", in the same sense that ASCII or
// C64 screen codes have it, but I'm giving the method a similar name for
// the sake of consistency.
return ConvertC64Petscii((byte)(val & 0x7f));
}
//
// C64 Screen Codes
//
// Using character set 2, which includes lower case letters.
//
// $00-1f: lower case letters (PETSCII $40-5f)
// $20-3f: same as ASCII (PETSCII $20-3f)
// $40-5f: upper case letters (PETSCII $60-7f / $c0-df)
// $60-7f: non-ASCII symbols (PETSCII $a0-bf)
//
// With the high bit set, character colors are reversed. The printable ASCII set
// is [$00,$1b]+$1d+[$20,$3f]+[$41,$5a]. By definition, only printable characters
// are included in the set, so there are no control codes.
//
// For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
//
private static bool[] sPrintableScreenCode = CreatePrintableScreenCodeMap();
private static bool[] CreatePrintableScreenCodeMap() {
bool[] map = new bool[256];
for (int i = 0x00; i <= 0x1b; i++) {
map[i] = true;
}
map[0x1d] = true;
for (int i = 0x20; i <= 0x3f; i++) {
map[i] = true;
}
for (int i = 0x41; i <= 0x5a; i++) {
map[i] = true;
}
return map;
}
public static bool IsPrintableC64ScreenCode(byte val) {
return sPrintableScreenCode[val];
}
public static bool IsExtendedC64ScreenCode(byte val) {
return sPrintableScreenCode[val];
}
private static char[] sScreenCodeToUnicode = CreateScreenCodeToUnicodeMap();
private static char[] CreateScreenCodeToUnicodeMap() {
char[] map = new char[256];
for (int val = 0; val < 256; val++) {
char ch;
if (val == 0x00 || val == 0x1b || val == 0x1d) {
ch = (char)(val + 0x40); // '@', '[', ']'
} else if (val >= 0x01 && val <= 0x1a) {
ch = (char)(val + 0x60); // lower case
} else if (val >= 0x20 && val <= 0x3f) {
ch = (char)(val); // numbers/symbols
} else if (val >= 0x41 && val <= 0x5a) {
ch = (char)(val); // upper case
} else {
Debug.Assert(!IsPrintableC64ScreenCode((byte)val));
ch = UNPRINTABLE_CHAR;
}
map[val] = ch;
}
return map;
}
public static char ConvertC64ScreenCode(byte val) {
return sScreenCodeToUnicode[val];
}
public static char ConvertLowAndHighC64ScreenCode(byte val) {
return ConvertC64ScreenCode((byte)(val & 0x7f));
}
}
}