add "kata" string encoding (Katakana)

This commit is contained in:
Irmen de Jong 2024-08-20 21:40:43 +02:00
parent d12b7ccc6b
commit b4facaeb3c
8 changed files with 102 additions and 9 deletions

View File

@ -8,7 +8,8 @@ enum class Encoding(val prefix: String) {
ISO("iso"), // cx16 (iso-8859-15)
ISO5("iso5"), // cx16 (iso-8859-5, cyrillic)
ISO16("iso16"), // cx16 (iso-8859-16, eastern european)
CP437("cp437") // cx16 (ibm pc, codepage 437)
CP437("cp437"), // cx16 (ibm pc, codepage 437)
KATAKANA("kata") // cx16 (katakana)
}
interface IStringEncoding {

View File

@ -19,6 +19,7 @@ object Encoder: IStringEncoding {
Encoding.ISO5 -> IsoCyrillicEncoding.encode(str)
Encoding.ISO16 -> IsoEasternEncoding.encode(str)
Encoding.CP437 -> Cp437Encoding.encode(str)
Encoding.KATAKANA -> KatakanaEncoding.encode(str)
else -> throw InternalCompilerException("unsupported encoding $encoding")
}
return coded.fold(
@ -35,6 +36,7 @@ object Encoder: IStringEncoding {
Encoding.ISO5 -> IsoCyrillicEncoding.decode(bytes)
Encoding.ISO16 -> IsoEasternEncoding.decode(bytes)
Encoding.CP437 -> Cp437Encoding.decode(bytes)
Encoding.KATAKANA -> KatakanaEncoding.decode(bytes)
else -> throw InternalCompilerException("unsupported encoding $encoding")
}
return decoded.fold(

View File

@ -0,0 +1,72 @@
package prog8.code.target.encodings
import com.github.michaelbull.result.Err
import com.github.michaelbull.result.Ok
import com.github.michaelbull.result.Result
import java.io.CharConversionException
import java.nio.charset.Charset
/**
 * The "kata" string encoding (half-width Katakana, used for the cx16 katakana character set).
 *
 * Conversion is based on the JVM's JIS_X0201 charset, extended with a table of extra
 * symbols ([specialCodes]) that are not handled by that charset and are therefore
 * translated here. The same table is used in both directions, so that text containing
 * these symbols survives an [encode] / [decode] round trip.
 */
object KatakanaEncoding {
    val charset: Charset = Charset.forName("JIS_X0201")

    /**
     * Characters that are translated by this object itself instead of by [charset],
     * together with the byte value they are encoded as. Every byte value occurs only once,
     * which is what allows the table to be inverted for decoding.
     */
    private val specialCodes: Map<Char, UByte> = listOf(
        '\u0000' to 0x00,
        '\u00a0' to 0xa0,   // $a0 isn't technically a part of JIS X 0201 spec, and so we need to handle this ourselves
        '♥' to 0xe3,
        '♦' to 0xe4,
        '♣' to 0xe5,
        '♠' to 0xe6,
        '大' to 0xea,
        '中' to 0xeb,
        '小' to 0xec,
        '百' to 0xed,
        '千' to 0xee,
        '万' to 0xef,
        '♪' to 0xf0,
        '土' to 0xf1,
        '金' to 0xf2,
        '木' to 0xf3,
        '水' to 0xf4,
        '火' to 0xf5,
        '月' to 0xf6,
        '日' to 0xf7,
        '時' to 0xf8,
        '分' to 0xf9,
        '秒' to 0xfa,
        '年' to 0xfb,
        '円' to 0xfc,
        '人' to 0xfd,
        '生' to 0xfe,
        '〒' to 0xff,
    ).associate { (chr, code) -> chr to code.toUByte() }

    /** Inverse of [specialCodes]: byte value back to the character it stands for. */
    private val specialChars: Map<UByte, Char> =
        specialCodes.entries.associate { (chr, code) -> code to chr }

    /**
     * Encodes [str] into "kata" bytes, one byte per character of the input.
     *
     * Per character, in this order:
     *  - characters listed in [specialCodes] become their table value;
     *  - characters in the range U+8000..U+80FF are an escape: the lower 8 bits are used as the byte value directly;
     *  - everything else is converted by [charset].
     *
     * NOTE: a character that [charset] cannot map does not produce an error; the java library
     * encodes it as its replacement byte ('?'). The catch below is therefore only a safety net.
     *
     * @return Ok with the encoded bytes, or Err if a CharConversionException was thrown during conversion.
     */
    fun encode(str: String): Result<List<UByte>, CharConversionException> {
        return try {
            // TODO: Convert regular katakana to halfwidth katakana (java lib doesn't do that for us
            //       and simply returns '?' upon reaching a regular katakana character)
            // NOTE: we probably need to somehow do that before we reach this per-character mapping,
            //       as one regular katakana character often results in two HW katakana characters
            //       due to differences in how diacritics are handled.
            val mapped = str.map { chr ->
                specialCodes[chr] ?: when (chr) {
                    in '\u8000'..'\u80ff' -> {
                        // special case: take the lower 8 bit hex value directly
                        (chr.code - 0x8000).toUByte()
                    }
                    else -> charset.encode(chr.toString())[0].toUByte()
                }
            }
            Ok(mapped)
        } catch (ce: CharConversionException) {
            Err(ce)
        }
    }

    /**
     * Decodes "kata" [bytes] back into a string, one character per input byte.
     *
     * Byte values listed in [specialCodes] are translated back to their symbol; all other
     * bytes are decoded by [charset]. The U+8000..U+80FF escape accepted by [encode] is not
     * reversed, because the resulting byte cannot be told apart from a regularly encoded one.
     *
     * @return Ok with the decoded text, or Err if a CharConversionException was thrown during conversion.
     */
    fun decode(bytes: Iterable<UByte>): Result<String, CharConversionException> {
        return try {
            val text = buildString {
                for (code in bytes) {
                    val special = specialChars[code]
                    if (special != null)
                        append(special)
                    else
                        append(String(byteArrayOf(code.toByte()), charset))   // single-byte charset: decode byte by byte
                }
            }
            Ok(text)
        } catch (ce: CharConversionException) {
            Err(ce)
        }
    }
}

View File

@ -262,6 +262,12 @@ sub iso16() {
cx16.screen_set_charset(10, 0) ; charset
}
sub kata() {
; -- switch to katakana character set
; First sends control code $0f via CHROUT to put the screen in ISO mode,
; then selects charset number 12.
; NOTE(review): the second argument (0) is presumably an unused custom-charset pointer -- confirm against the cx16 definition.
cbm.CHROUT($0f) ; iso mode
cx16.screen_set_charset(12, 0) ; charset
}
asmsub scroll_left() clobbers(A, X, Y) {
; ---- scroll the whole screen 1 character to the left
; contents of the rightmost column are unchanged, you should clear/refill this yourself

View File

@ -548,13 +548,14 @@ String length is limited to 255 characters.
Here are examples of the various encodings:
- ``"hello"`` a string translated into the default character encoding (PETSCII on the CBM machines)
- ``petscii:"hello"`` string in CBM PETSCII encoding
- ``sc:"my name is Alice"`` string in CBM screencode encoding
- ``iso:"Ich heiße François"`` string in iso-8859-15 encoding (Latin)
- ``iso5:"Хозяин и Работник"`` string in iso-8859-5 encoding (Cyrillic)
- ``iso16:"zażółć gęślą jaźń"`` string in iso-8859-16 encoding (Eastern Europe)
- ``atascii:"I am Atari!"`` string in "atascii" encoding (Atari 8-bit)
- ``cp437:"≈ IBM Pc ≈ ♂♀♪☺¶"`` string in "cp437" encoding (IBM PC codepage 437)
- ``petscii:"hello"`` string in CBM PETSCII encoding
- ``sc:"my name is Alice"`` string in CBM screencode encoding
- ``iso:"Ich heiße François"`` string in iso-8859-15 encoding (Latin)
- ``iso5:"Хозяин и Работник"`` string in iso-8859-5 encoding (Cyrillic)
- ``iso16:"zażółć gęślą jaźń"`` string in iso-8859-16 encoding (Eastern Europe)
- ``atascii:"I am Atari!"`` string in "atascii" encoding (Atari 8-bit)
- ``cp437:"≈ IBM Pc ≈ ♂♀♪☺¶"`` string in "cp437" encoding (IBM PC codepage 437)
- ``kata:"アノ ニホンジン ワ ガイコクジン。"`` string in "kata" encoding (Katakana; only half-width characters are supported for now)
There are several escape sequences available to put special characters into your string value:

View File

@ -18,6 +18,7 @@ Compiler:
- Relax newline / bracket in parser so that you can put open and close brackets on the same line or on the next line if you so wish. For example be able to write a true one liner?
- Can we support signed % (remainder) somehow?
- IR: implement missing operators in AssignmentGen (array shifts etc)
- expand the kata encoding to somehow translate normal (full-width) katakana to half-width katakana? (see comment in KatakanaEncoding)
- instead of copy-pasting inline asmsubs, make them into a 64tass macro and use that instead.
that will allow them to be reused from custom user written assembly code as well.
- Multidimensional arrays and chained indexing, purely as syntactic sugar over regular arrays.

View File

@ -13,7 +13,10 @@ main {
wait()
eastern()
wait()
kata()
wait()
ibmpc()
wait()
}
sub latin() {
@ -48,6 +51,13 @@ main {
txt.nl()
}
sub kata() {
; -- demo of the "kata" string encoding.
; Calls txt.kata() (switch to the katakana character set), prints three newlines,
; then outputs the same kata-encoded string twice: once through write_screencodes()
; and once through txt.print().
txt.kata()
repeat 3 txt.nl()
write_screencodes(kata:"Katakana hw: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。")
txt.print(kata:"Katakana hw: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。")
}
sub wait() {
txt.print("\n\npress enter: ")
void txt.input_chars(buf)

View File

@ -12,7 +12,7 @@
<option name="HAS_STRING_ESCAPES" value="true" />
</options>
<keywords keywords="&amp;;-&gt;;@;and;as;asmsub;break;clobbers;continue;do;downto;else;false;for;goto;if;if_cc;if_cs;if_eq;if_mi;if_ne;if_neg;if_nz;if_pl;if_pos;if_vc;if_vs;if_z;in;inline;not;or;repeat;return;romsub;step;sub;to;true;unroll;until;when;while;xor;~" ignore_case="false" />
<keywords2 keywords="%address;%asm;%asmbinary;%asminclude;%breakpoint;%encoding;%import;%ir;%launcher;%option;%output;%zeropage;%zpallowed;%zpreserved;atascii:;cp437:;default:;iso16:;iso5:;iso:;petscii:;sc:" />
<keywords2 keywords="%address;%asm;%asmbinary;%asminclude;%breakpoint;%encoding;%import;%ir;%launcher;%option;%output;%zeropage;%zpallowed;%zpreserved;atascii:;cp437:;default:;iso16:;iso5:;iso:;kata:;petscii:;sc:" />
<keywords3 keywords="@nozp;@requirezp;@shared;@split;@zp;bool;byte;const;float;str;ubyte;uword;void;word" />
<keywords4 keywords="abs;call;callfar;clamp;cmp;divmod;len;lsb;max;memory;min;mkword;msb;peek;peekf;peekw;poke;pokef;pokew;rol;rol2;ror;ror2;rrestore;rrestorex;rsave;rsavex;setlsb;setmsb;sgn;sizeof;sqrt" />
</highlighting>