// Source: 6502bench/Asm65/CharEncoding.cs
// (mirror of https://github.com/fadden/6502bench.git, synced 2024-11-19 21:31:30 +00:00)

/*
* Copyright 2019 faddenSoft
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Diagnostics;
namespace Asm65 {
/// <summary>
/// Character encoding helper methods.
/// </summary>
public static class CharEncoding {
    /// <summary>
    /// Value returned by the conversion methods when a byte has no printable mapping.
    /// </summary>
    public const char UNPRINTABLE_CHAR = '\ufffd';  // Unicode REPLACEMENT CHARACTER

    /// <summary>
    /// Determines whether the byte represents a member of the character set.  The
    /// specifics (e.g. printable only) are defined by the method.
    /// </summary>
    public delegate bool InclusionTest(byte val);

    /// <summary>
    /// Converts the byte to a printable character.  Returns UNPRINTABLE_CHAR if the value
    /// does not map to something printable.
    /// </summary>
    /// <remarks>
    /// Yes, I'm assuming it all fits in a UTF-16 char.  PETSCII has some glyphs that
    /// aren't part of the BMP, but we're targeting a variety of cross-assemblers with
    /// potentially different notions of Unicode mappings, so anything non-ASCII is
    /// getting hexified anyway.
    /// </remarks>
    public delegate char Convert(byte val);

    /// <summary>
    /// Character encoding.
    /// </summary>
    public enum Encoding {
        Unknown = 0,
        Ascii,
        HighAscii,
        C64Petscii,
        C64ScreenCode,
    }


    //
    // Plain ASCII.
    //
    // We recognize BELL, LF, and CR as control characters that may be present in
    // text strings.  This allows us to generate:
    //
    //   .str "hello",$0d
    //
    // instead of:
    //
    //   .str "hello"
    //   .dd1 $0d
    //

    /// <summary>
    /// Returns true if the value is in the printable ASCII range, [$20,$7e].
    /// </summary>
    public static bool IsPrintableAscii(byte val) {
        return (val >= 0x20 && val < 0x7f);
    }

    /// <summary>
    /// Returns true if the value is printable ASCII, or one of the control characters
    /// accepted inside strings: $07 (BELL), $0a (LF), $0d (CR).
    /// </summary>
    public static bool IsExtendedAscii(byte val) {
        return IsPrintableAscii(val) || val == 0x07 || val == 0x0a || val == 0x0d;
    }

    /// <summary>
    /// Converts a printable ASCII value to the matching char.
    /// </summary>
    /// <returns>The character, or UNPRINTABLE_CHAR if the value isn't printable
    ///   ASCII (this includes the "extended" control characters).</returns>
    public static char ConvertAscii(byte val) {
        if (IsPrintableAscii(val)) {
            return (char)val;
        } else {
            return UNPRINTABLE_CHAR;
        }
    }


    //
    // High ASCII: plain ASCII with the high bit set.
    //

    /// <summary>
    /// Returns true if the value is printable ASCII with the high bit set, [$a0,$fe].
    /// </summary>
    public static bool IsPrintableHighAscii(byte val) {
        return (val >= 0xa0 && val < 0xff);
    }

    /// <summary>
    /// Returns true if the value is printable high ASCII, or one of the accepted
    /// control characters with the high bit set: $87, $8a, $8d.
    /// </summary>
    public static bool IsExtendedHighAscii(byte val) {
        return IsPrintableHighAscii(val) || val == 0x87 || val == 0x8a || val == 0x8d;
    }

    /// <summary>
    /// Converts a printable high-ASCII value to the matching char by stripping the
    /// high bit.
    /// </summary>
    /// <returns>The character, or UNPRINTABLE_CHAR if the value isn't printable
    ///   high ASCII.  Values without the high bit set are not accepted.</returns>
    public static char ConvertHighAscii(byte val) {
        if (IsPrintableHighAscii(val)) {
            return (char)(val & 0x7f);
        } else {
            return UNPRINTABLE_CHAR;
        }
    }


    //
    // High and/or low ASCII.
    //

    /// <summary>
    /// Returns true if the value, with the high bit ignored, is printable ASCII.
    /// </summary>
    public static bool IsPrintableLowOrHighAscii(byte val) {
        return IsPrintableAscii((byte)(val & 0x7f));
    }

    /// <summary>
    /// Returns true if the value, with the high bit ignored, is extended ASCII
    /// (printable, or BELL / LF / CR).
    /// </summary>
    public static bool IsExtendedLowOrHighAscii(byte val) {
        return IsExtendedAscii((byte)(val & 0x7f));
    }

    /// <summary>
    /// Converts a low- or high-ASCII value to the matching char.  The high bit is
    /// masked off before the value is tested, so both forms yield the same result.
    /// </summary>
    /// <returns>The character, or UNPRINTABLE_CHAR if the masked value isn't
    ///   printable ASCII.</returns>
    public static char ConvertLowAndHighAscii(byte val) {
        return ConvertAscii((byte)(val & 0x7f));
    }


    //
    // ATASCII (Atari 400/800)
    //
    // Substantially similar to ASCII, but with printable symbols in the control character
    // range ($00-1f).  Characters $60 and $7b-7f don't correspond to ASCII symbols.
    //
    // Characters with the high bit set are shown with colors reversed.
    //
    // (No ATASCII methods are defined in this class.)
    //


    //
    // PETSCII (C64 variant)
    //
    // Assemblers like ACME use the C64 character set 2, a/k/a shifted mode, lower case
    // mode, or text mode.
    //
    // Comparison to ASCII:
    //  $00-1f: control codes, many with C64-specific meanings
    //  $20-3f: same as ASCII
    //  $40-5f: lower case letters (rather than upper case); backslash, caret, and underscore
    //    replaced with non-ASCII symbols (though the up-arrow in place of caret is close)
    //  $60-7f: upper case letters (rather than lower case); backquote, curly braces,
    //    vertical bar, and tilde replaced with non-ASCII symbols
    //  $80-9f: more control codes
    //  $a0-bf: non-ASCII symbols
    //  $c0-df: clone of $60-7f; by convention this is used for upper case, since it's
    //    equal to lower case with the high bit set
    //  $e0-ff: non-ASCII symbols (mostly a clone of $a0-bf)
    //
    // The printable ASCII set (glyphs in [$20,$7e]) is [$20,$5b]+$5d+[$c1,$da].
    // (Looks like the Pet had $5c=backslash, but C64 went with a \u00a3 POUND SIGN instead.)
    // Anything outside that range will get printed as hex to ensure proper conversion.
    //
    // Note for the pedantic: in ASCII-1963, up-arrow and left-arrow characters were
    // assigned to the caret and underscore values.  So arguably those are "ASCII" as
    // well, unless you're sane and define ASCII more narrowly.
    //
    // Control codes that we might expect to appear in the middle of a string:
    //  $05 1c 1e 1f 81 90 95 96 97 98 99 9a 9b 9c 9e 9f - set text color
    //  $93 - clear
    //  $12 92 - reverse on/off
    //  $07 0a 0d - bell, LF, CR (note CR is favored for EOL)
    //
    // Other Commodore systems use variants on PETSCII, but the ASCII correspondence remains
    // the same -- only the non-ASCII symbols change.  (On the original PET, $60-7f was a
    // duplicate of $20-3f rather than a duplicate of the upper-case letters, which might be
    // why $c0-df is preferred for upper case.)
    //
    // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
    //

    // Membership tables, indexed by byte value.  These must be declared ahead of
    // sPetsciiToUnicode: static initializers run in textual order, and the builder for
    // that table calls IsPrintableC64Petscii() from a Debug.Assert.
    private static readonly bool[] sPrintablePetscii = CreatePrintablePetsciiMap();
    private static readonly bool[] sExtendedPetscii = CreateExtendedPetsciiMap();

    /// <summary>
    /// Builds the 256-entry table of PETSCII values that have a printable ASCII
    /// equivalent: [$20,$5b], $5d, and [$c1,$da].
    /// </summary>
    private static bool[] CreatePrintablePetsciiMap() {
        bool[] map = new bool[256];
        for (int i = 0x20; i <= 0x5b; i++) {
            map[i] = true;
        }
        map[0x5d] = true;
        for (int i = 0xc1; i <= 0xda; i++) {
            map[i] = true;
        }
        return map;
    }

    /// <summary>
    /// Builds the 256-entry table of PETSCII values accepted in strings: the printable
    /// set plus the control codes listed in the PETSCII notes above.
    /// </summary>
    private static bool[] CreateExtendedPetsciiMap() {
        // Start from a fresh copy of the printable set; the two tables are distinct arrays.
        bool[] map = CreatePrintablePetsciiMap();

        // control codes that we might expect to find in strings
        // set text color
        map[0x05] = map[0x1c] = map[0x1e] = map[0x1f] = map[0x81] = map[0x90] = map[0x95] =
            map[0x96] = map[0x97] = map[0x98] = map[0x99] = map[0x9a] = map[0x9b] =
            map[0x9c] = map[0x9e] = map[0x9f] = true;
        // clear, reverse on/off
        map[0x93] = map[0x12] = map[0x92] = true;
        // bell, LF, CR
        map[0x07] = map[0x0a] = map[0x0d] = true;
        return map;
    }

    /// <summary>
    /// Returns true if the PETSCII value has a printable ASCII equivalent.
    /// </summary>
    public static bool IsPrintableC64Petscii(byte val) {
        return sPrintablePetscii[val];
    }

    /// <summary>
    /// Returns true if the PETSCII value is printable, or is one of the control codes
    /// accepted inside strings.
    /// </summary>
    public static bool IsExtendedC64Petscii(byte val) {
        return sExtendedPetscii[val];
    }

    // PETSCII-to-char conversion table, indexed by byte value.
    private static readonly char[] sPetsciiToUnicode = CreatePetsciiToUnicodeMap();

    /// <summary>
    /// Builds the 256-entry PETSCII conversion table.  Entries without a printable
    /// ASCII equivalent hold UNPRINTABLE_CHAR.
    /// </summary>
    private static char[] CreatePetsciiToUnicodeMap() {
        // There are performance arguments for doing this with and without a table.  For
        // x64 with fast memory and large caches, table seems reasonable.
        char[] map = new char[256];
        for (int val = 0; val < 256; val++) {
            char ch;
            if ((val >= 0x20 && val <= 0x40) || val == 0x5b || val == 0x5d) {
                ch = (char)val;             // number/symbols, '[', ']'
            } else if (val >= 0x41 && val <= 0x5a) {
                ch = (char)(val + 0x20);    // lower case
            } else if (val >= 0xc1 && val <= 0xda) {
                ch = (char)(val - 0x80);    // upper case
            } else {
                // The conversion ranges must agree with the printable-membership table.
                Debug.Assert(!IsPrintableC64Petscii((byte)val));
                ch = UNPRINTABLE_CHAR;
            }
            map[val] = ch;
        }
        return map;
    }

    /// <summary>
    /// Converts a C64 PETSCII value to the matching char.
    /// </summary>
    /// <returns>The character, or UNPRINTABLE_CHAR if the value has no printable
    ///   ASCII equivalent.</returns>
    public static char ConvertC64Petscii(byte val) {
        return sPetsciiToUnicode[val];
    }

    /// <summary>
    /// Converts a C64 PETSCII value to the matching char after masking off the high bit.
    /// </summary>
    /// <returns>The character, or UNPRINTABLE_CHAR if the masked value has no printable
    ///   ASCII equivalent.</returns>
    public static char ConvertLowAndHighC64Petscii(byte val) {
        // This is an odd one.  Some programs use DCI with PETSCII, which means the
        // string is all lower case except for the last letter.  Because the high bit
        // is stripped here, values in $c1-da come back as lower-case letters.
        //
        // There's no such thing as "high PETSCII", in the same sense that ASCII or
        // C64 screen codes have it, but I'm giving the method a similar name for
        // the sake of consistency.
        return ConvertC64Petscii((byte)(val & 0x7f));
    }


    //
    // C64 Screen Codes
    //
    // Using character set 2, which includes lower case letters.
    //
    //  $00-1f: lower case letters (PETSCII $40-5f)
    //  $20-3f: same as ASCII (PETSCII $20-3f)
    //  $40-5f: upper case letters (PETSCII $60-7f / $c0-df)
    //  $60-7f: non-ASCII symbols (PETSCII $a0-bf)
    //
    // With the high bit set, character colors are reversed.  The printable ASCII set
    // is [$00,$1b]+$1d+[$20,$3f]+[$41,$5a].  By definition, only printable characters
    // are included in the set, so there are no control codes.
    //
    // For full details, see the chart at https://www.aivosto.com/articles/petscii.pdf
    //

    // Membership table, indexed by byte value.  Must be declared ahead of
    // sScreenCodeToUnicode, whose builder calls IsPrintableC64ScreenCode() from a
    // Debug.Assert.
    private static readonly bool[] sPrintableScreenCode = CreatePrintableScreenCodeMap();

    /// <summary>
    /// Builds the 256-entry table of screen codes that have a printable ASCII
    /// equivalent: [$00,$1b], $1d, [$20,$3f], and [$41,$5a].
    /// </summary>
    private static bool[] CreatePrintableScreenCodeMap() {
        bool[] map = new bool[256];
        for (int i = 0x00; i <= 0x1b; i++) {
            map[i] = true;
        }
        map[0x1d] = true;
        for (int i = 0x20; i <= 0x3f; i++) {
            map[i] = true;
        }
        for (int i = 0x41; i <= 0x5a; i++) {
            map[i] = true;
        }
        return map;
    }

    /// <summary>
    /// Returns true if the screen code has a printable ASCII equivalent.
    /// </summary>
    public static bool IsPrintableC64ScreenCode(byte val) {
        return sPrintableScreenCode[val];
    }

    /// <summary>
    /// Returns true if the screen code is acceptable inside a string.  Screen codes have
    /// no control characters, so this uses the same table as IsPrintableC64ScreenCode.
    /// </summary>
    public static bool IsExtendedC64ScreenCode(byte val) {
        return sPrintableScreenCode[val];
    }

    // Screen-code-to-char conversion table, indexed by byte value.
    private static readonly char[] sScreenCodeToUnicode = CreateScreenCodeToUnicodeMap();

    /// <summary>
    /// Builds the 256-entry screen code conversion table.  Entries without a printable
    /// ASCII equivalent hold UNPRINTABLE_CHAR.
    /// </summary>
    private static char[] CreateScreenCodeToUnicodeMap() {
        char[] map = new char[256];
        for (int val = 0; val < 256; val++) {
            char ch;
            if (val == 0x00 || val == 0x1b || val == 0x1d) {
                ch = (char)(val + 0x40);    // '@', '[', ']'
            } else if (val >= 0x01 && val <= 0x1a) {
                ch = (char)(val + 0x60);    // lower case
            } else if (val >= 0x20 && val <= 0x3f) {
                ch = (char)(val);           // numbers/symbols
            } else if (val >= 0x41 && val <= 0x5a) {
                ch = (char)(val);           // upper case
            } else {
                // The conversion ranges must agree with the printable-membership table.
                Debug.Assert(!IsPrintableC64ScreenCode((byte)val));
                ch = UNPRINTABLE_CHAR;
            }
            map[val] = ch;
        }
        return map;
    }

    /// <summary>
    /// Converts a C64 screen code to the matching char.
    /// </summary>
    /// <returns>The character, or UNPRINTABLE_CHAR if the value has no printable
    ///   ASCII equivalent.  Values with the high bit set are not accepted.</returns>
    public static char ConvertC64ScreenCode(byte val) {
        return sScreenCodeToUnicode[val];
    }

    /// <summary>
    /// Converts a C64 screen code to the matching char after masking off the high bit,
    /// so normal and reversed forms yield the same result.
    /// </summary>
    /// <returns>The character, or UNPRINTABLE_CHAR if the masked value has no printable
    ///   ASCII equivalent.</returns>
    public static char ConvertLowAndHighC64ScreenCode(byte val) {
        return ConvertC64ScreenCode((byte)(val & 0x7f));
    }
}
}