diff --git a/codeCore/src/prog8/code/core/IStringEncoding.kt b/codeCore/src/prog8/code/core/IStringEncoding.kt index 843a3f1c2..e80e69bb4 100644 --- a/codeCore/src/prog8/code/core/IStringEncoding.kt +++ b/codeCore/src/prog8/code/core/IStringEncoding.kt @@ -8,7 +8,8 @@ enum class Encoding(val prefix: String) { ISO("iso"), // cx16 (iso-8859-15) ISO5("iso5"), // cx16 (iso-8859-5, cyrillic) ISO16("iso16"), // cx16 (iso-8859-16, eastern european) - CP437("cp437") // cx16 (ibm pc, codepage 437) + CP437("cp437"), // cx16 (ibm pc, codepage 437) + KATAKANA("kata") // cx16 (katakana) } interface IStringEncoding { diff --git a/codeCore/src/prog8/code/target/Encoder.kt b/codeCore/src/prog8/code/target/Encoder.kt index b1294c95d..9b1284aa5 100644 --- a/codeCore/src/prog8/code/target/Encoder.kt +++ b/codeCore/src/prog8/code/target/Encoder.kt @@ -19,6 +19,7 @@ object Encoder: IStringEncoding { Encoding.ISO5 -> IsoCyrillicEncoding.encode(str) Encoding.ISO16 -> IsoEasternEncoding.encode(str) Encoding.CP437 -> Cp437Encoding.encode(str) + Encoding.KATAKANA -> KatakanaEncoding.encode(str) else -> throw InternalCompilerException("unsupported encoding $encoding") } return coded.fold( @@ -35,6 +36,7 @@ object Encoder: IStringEncoding { Encoding.ISO5 -> IsoCyrillicEncoding.decode(bytes) Encoding.ISO16 -> IsoEasternEncoding.decode(bytes) Encoding.CP437 -> Cp437Encoding.decode(bytes) + Encoding.KATAKANA -> KatakanaEncoding.decode(bytes) else -> throw InternalCompilerException("unsupported encoding $encoding") } return decoded.fold( diff --git a/codeCore/src/prog8/code/target/encodings/KatakanaEncoding.kt b/codeCore/src/prog8/code/target/encodings/KatakanaEncoding.kt new file mode 100644 index 000000000..3a4e11ba4 --- /dev/null +++ b/codeCore/src/prog8/code/target/encodings/KatakanaEncoding.kt @@ -0,0 +1,72 @@ +package prog8.code.target.encodings + +import com.github.michaelbull.result.Err +import com.github.michaelbull.result.Ok +import com.github.michaelbull.result.Result +import 
java.io.CharConversionException +import java.nio.charset.Charset + +object KatakanaEncoding { + val charset: Charset = Charset.forName("JIS_X0201") + + fun encode(str: String): Result<List<UByte>, CharConversionException> { + return try { + val mapped = str.map { chr -> + when (chr) { + // TODO: Convert regular katakana to halfwidth katakana (java lib doesn't do that for us + // and simply returns '?' upon reaching a regular katakana character) + // NOTE: we probably need to somehow do that before we reach this `when`, + // as one regular katakana character often results in two HW katakana characters + // due to differences in how diacritics are handled. + + '\u0000' -> 0u + '\u00a0' -> 0xa0u // $a0 isn't technically a part of JIS X 0201 spec, and so we need to handle this ourselves + + '♥' -> 0xe3u + '♦' -> 0xe4u + '♣' -> 0xe5u + '♠' -> 0xe6u + + '大' -> 0xeau + '中' -> 0xebu + '小' -> 0xecu + '百' -> 0xedu + '千' -> 0xeeu + '万' -> 0xefu + '♪' -> 0xf0u + '土' -> 0xf1u + '金' -> 0xf2u + '木' -> 0xf3u + '水' -> 0xf4u + '火' -> 0xf5u + '月' -> 0xf6u + '日' -> 0xf7u + '時' -> 0xf8u + '分' -> 0xf9u + '秒' -> 0xfau + '年' -> 0xfbu + '円' -> 0xfcu + '人' -> 0xfdu + '生' -> 0xfeu + '〒' -> 0xffu + in '\u8000'..'\u80ff' -> { + // special case: take the lower 8 bit hex value directly + (chr.code - 0x8000).toUByte() + } + else -> charset.encode(chr.toString())[0].toUByte() + } + } + Ok(mapped) + } catch (ce: CharConversionException) { + Err(ce) + } + } + + fun decode(bytes: Iterable<UByte>): Result<String, CharConversionException> { + return try { + Ok(String(bytes.map { it.toByte() }.toByteArray(), charset)) + } catch (ce: CharConversionException) { + Err(ce) + } + } +} \ No newline at end of file diff --git a/compiler/res/prog8lib/cx16/textio.p8 b/compiler/res/prog8lib/cx16/textio.p8 index 6cc002184..8957c36a8 100644 --- a/compiler/res/prog8lib/cx16/textio.p8 +++ b/compiler/res/prog8lib/cx16/textio.p8 @@ -262,6 +262,12 @@ sub iso16() { cx16.screen_set_charset(10, 0) ; charset } +sub kata() { + ; -- switch to katakana character set + 
cbm.CHROUT($0f) ; iso mode + cx16.screen_set_charset(12, 0) ; charset +} + asmsub scroll_left() clobbers(A, X, Y) { ; ---- scroll the whole screen 1 character to the left ; contents of the rightmost column are unchanged, you should clear/refill this yourself diff --git a/docs/source/syntaxreference.rst b/docs/source/syntaxreference.rst index b1fe97278..856b6a2bf 100644 --- a/docs/source/syntaxreference.rst +++ b/docs/source/syntaxreference.rst @@ -548,13 +548,14 @@ String length is limited to 255 characters. Here are examples of the various encodings: - ``"hello"`` a string translated into the default character encoding (PETSCII on the CBM machines) - - ``petscii:"hello"`` string in CBM PETSCII encoding - - ``sc:"my name is Alice"`` string in CBM screencode encoding - - ``iso:"Ich heiße François"`` string in iso-8859-15 encoding (Latin) - - ``iso5:"Хозяин и Работник"`` string in iso-8859-5 encoding (Cyrillic) - - ``iso16:"zażółć gęślą jaźń"`` string in iso-8859-16 encoding (Eastern Europe) - - ``atascii:"I am Atari!"`` string in "atascii" encoding (Atari 8-bit) - - ``cp437:"≈ IBM Pc ≈ ♂♀♪☺¶"`` string in "cp437" encoding (IBM PC codepage 437) + - ``petscii:"hello"`` string in CBM PETSCII encoding + - ``sc:"my name is Alice"`` string in CBM screencode encoding + - ``iso:"Ich heiße François"`` string in iso-8859-15 encoding (Latin) + - ``iso5:"Хозяин и Работник"`` string in iso-8859-5 encoding (Cyrillic) + - ``iso16:"zażółć gęślą jaźń"`` string in iso-8859-16 encoding (Eastern Europe) + - ``atascii:"I am Atari!"`` string in "atascii" encoding (Atari 8-bit) + - ``cp437:"≈ IBM Pc ≈ ♂♀♪☺¶"`` string in "cp437" encoding (IBM PC codepage 437) + - ``kata:"アノ ニホンジン ワ ガイコクジン。"`` string in "kata" encoding (Katakana half-width support only for now) There are several escape sequences available to put special characters into your string value: diff --git a/docs/source/todo.rst b/docs/source/todo.rst index 45a687f17..e20b8a576 100644 --- a/docs/source/todo.rst +++ 
b/docs/source/todo.rst @@ -18,6 +18,7 @@ Compiler: - Relax newline / bracket in parser so that you can put open and close brackets on the same line or on the next line if you so wish. For example be able to write a true one liner? - Can we support signed % (remainder) somehow? - IR: implement missing operators in AssignmentGen (array shifts etc) +- expand the kata encoding to somehow translate normal katakana to half-widths? (see comment in KatakanaEncoding) - instead of copy-pasting inline asmsubs, make them into a 64tass macro and use that instead. that will allow them to be reused from custom user written assembly code as well. - Multidimensional arrays and chained indexing, purely as syntactic sugar over regular arrays. diff --git a/examples/cx16/charsets.p8 b/examples/cx16/charsets.p8 index 893204233..63027f894 100644 --- a/examples/cx16/charsets.p8 +++ b/examples/cx16/charsets.p8 @@ -13,7 +13,10 @@ main { wait() eastern() wait() + kata() + wait() ibmpc() + wait() } sub latin() { @@ -48,6 +51,13 @@ main { txt.nl() } + sub kata() { + txt.kata() + repeat 3 txt.nl() + write_screencodes(kata:"Katakana hw: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。") + txt.print(kata:"Katakana hw: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。") + } + sub wait() { txt.print("\n\npress enter: ") void txt.input_chars(buf) diff --git a/syntax-files/IDEA/Prog8.xml b/syntax-files/IDEA/Prog8.xml index b0b633718..b2459c545 100644 --- a/syntax-files/IDEA/Prog8.xml +++ b/syntax-files/IDEA/Prog8.xml @@ -12,7 +12,7 @@