add "kata" string encoding (Katakana)

2024-11-25 19:31:36 +00:00 · 2024-08-20 21:40:43 +02:00 · 2024-08-20 21:40:43 +02:00 · b4facaeb3c
commit b4facaeb3c
parent d12b7ccc6b
8 changed files with 102 additions and 9 deletions
--- a/codeCore/src/prog8/code/core/IStringEncoding.kt
+++ b/codeCore/src/prog8/code/core/IStringEncoding.kt
@ -8,7 +8,8 @@ enum class Encoding(val prefix: String) {
    ISO("iso"),                 // cx16  (iso-8859-15)
    ISO5("iso5"),               // cx16  (iso-8859-5, cyrillic)
    ISO16("iso16"),             // cx16  (iso-8859-16, eastern european)
-    CP437("cp437")              // cx16  (ibm pc, codepage 437)
+    CP437("cp437"),             // cx16  (ibm pc, codepage 437)
+    KATAKANA("kata")                 // cx16  (katakana)
 }

 interface IStringEncoding {
--- a/codeCore/src/prog8/code/target/Encoder.kt
+++ b/codeCore/src/prog8/code/target/Encoder.kt
@ -19,6 +19,7 @@ object Encoder: IStringEncoding {
            Encoding.ISO5 -> IsoCyrillicEncoding.encode(str)
            Encoding.ISO16 -> IsoEasternEncoding.encode(str)
            Encoding.CP437 -> Cp437Encoding.encode(str)
+            Encoding.KATAKANA -> KatakanaEncoding.encode(str)
            else -> throw InternalCompilerException("unsupported encoding $encoding")
        }
        return coded.fold(
@ -35,6 +36,7 @@ object Encoder: IStringEncoding {
            Encoding.ISO5 -> IsoCyrillicEncoding.decode(bytes)
            Encoding.ISO16 -> IsoEasternEncoding.decode(bytes)
            Encoding.CP437 -> Cp437Encoding.decode(bytes)
+            Encoding.KATAKANA -> KatakanaEncoding.decode(bytes)
            else -> throw InternalCompilerException("unsupported encoding $encoding")
        }
        return decoded.fold(
--- a/codeCore/src/prog8/code/target/encodings/KatakanaEncoding.kt
+++ b/codeCore/src/prog8/code/target/encodings/KatakanaEncoding.kt
@ -0,0 +1,72 @@
+package prog8.code.target.encodings
+
+import com.github.michaelbull.result.Err
+import com.github.michaelbull.result.Ok
+import com.github.michaelbull.result.Result
+import java.io.CharConversionException
+import java.nio.charset.Charset
+
+object KatakanaEncoding {   // "kata" string encoding: the JVM JIS_X0201 charset plus a hand-written table for extra glyphs
+    val charset: Charset = Charset.forName("JIS_X0201")   // used for every character that encode() does not special-case
+
+    fun encode(str: String): Result<List<UByte>, CharConversionException> {   // one output byte per input Char; Ok(bytes) or Err(exception)
+        return try {
+            val mapped = str.map { chr ->
+                when (chr) {
+                    // TODO: convert regular (full-width) katakana to half-width katakana. The Java library does not
+                    //       do that for us; it simply returns '?' upon reaching a regular katakana character.
+                    //       NOTE: this probably has to happen before we reach this `when`, because one regular
+                    //             katakana character often results in two half-width katakana characters,
+                    //             due to differences in how diacritics are handled.
+
+                    '\u0000' -> 0u
+                    '\u00a0' -> 0xa0u // $a0 isn't technically part of the JIS X 0201 spec, so we have to map it ourselves
+
+                    '♥' -> 0xe3u   // card suit symbols occupy $e3-$e6
+                    '♦' -> 0xe4u
+                    '♣' -> 0xe5u
+                    '♠' -> 0xe6u
+
+                    '大' -> 0xeau   // kanji and symbol glyphs occupy $ea-$ff
+                    '中' -> 0xebu
+                    '小' -> 0xecu
+                    '百' -> 0xedu
+                    '千' -> 0xeeu
+                    '万' -> 0xefu
+                    '♪' -> 0xf0u
+                    '土' -> 0xf1u
+                    '金' -> 0xf2u
+                    '木' -> 0xf3u
+                    '水' -> 0xf4u
+                    '火' -> 0xf5u
+                    '月' -> 0xf6u
+                    '日' -> 0xf7u
+                    '時' -> 0xf8u
+                    '分' -> 0xf9u
+                    '秒' -> 0xfau
+                    '年' -> 0xfbu
+                    '円' -> 0xfcu
+                    '人' -> 0xfdu
+                    '生' -> 0xfeu
+                    '〒' -> 0xffu
+                    in '\u8000'..'\u80ff' -> {
+                        // special case: characters U+8000..U+80FF pass their lower 8 bits through as the raw byte value
+                        (chr.code - 0x8000).toUByte()
+                    }
+                    else -> charset.encode(chr.toString())[0].toUByte()   // first byte of the charset's encoding of this one character
+                }
+            }
+            Ok(mapped)
+        } catch (ce: CharConversionException) {   // NOTE(review): per the TODO above the charset yields '?' rather than throwing, so this may never fire - confirm
+            Err(ce)
+        }
+    }
+
+    fun decode(bytes: Iterable<UByte>): Result<String, CharConversionException> {   // decodes the bytes using the JVM charset only
+        return try {
+            Ok(String(bytes.map { it.toByte() }.toByteArray(), charset))   // NOTE(review): the custom byte values assigned in encode() are not mapped back here - confirm that is intended
+        } catch (ce: CharConversionException) {
+            Err(ce)
+        }
+    }
+}
--- a/compiler/res/prog8lib/cx16/textio.p8
+++ b/compiler/res/prog8lib/cx16/textio.p8
@ -262,6 +262,12 @@ sub iso16() {
    cx16.screen_set_charset(10, 0)  ; charset
 }

+sub kata() {
+    ; -- switch the screen to the katakana character set
+    cbm.CHROUT($0f)                 ; put the screen in iso mode first
+    cx16.screen_set_charset(12, 0)  ; then select charset number 12
+}
+
 asmsub  scroll_left() clobbers(A, X, Y)  {
 	; ---- scroll the whole screen 1 character to the left
 	;      contents of the rightmost column are unchanged, you should clear/refill this yourself
--- a/docs/source/syntaxreference.rst
+++ b/docs/source/syntaxreference.rst
@ -548,13 +548,14 @@ String length is limited to 255 characters.
 Here are examples of the various encodings:

    - ``"hello"``   a string translated into the default character encoding (PETSCII on the CBM machines)
-    - ``petscii:"hello"``            string in CBM PETSCII encoding
-    - ``sc:"my name is Alice"``      string in CBM screencode encoding
-    - ``iso:"Ich heiße François"``   string in iso-8859-15 encoding (Latin)
-    - ``iso5:"Хозяин и Работник"``   string in iso-8859-5 encoding (Cyrillic)
-    - ``iso16:"zażółć gęślą jaźń"``  string in iso-8859-16 encoding (Eastern Europe)
-    - ``atascii:"I am Atari!"``      string in "atascii" encoding (Atari 8-bit)
-    - ``cp437:"≈ IBM Pc ≈ ♂♀♪☺¶"``  string in "cp437" encoding (IBM PC codepage 437)
+    - ``petscii:"hello"``               string in CBM PETSCII encoding
+    - ``sc:"my name is Alice"``         string in CBM screencode encoding
+    - ``iso:"Ich heiße François"``      string in iso-8859-15 encoding (Latin)
+    - ``iso5:"Хозяин и Работник"``      string in iso-8859-5 encoding (Cyrillic)
+    - ``iso16:"zażółć gęślą jaźń"``     string in iso-8859-16 encoding (Eastern Europe)
+    - ``atascii:"I am Atari!"``         string in "atascii" encoding (Atari 8-bit)
+    - ``cp437:"≈ IBM Pc ≈ ♂♀♪☺¶"``     string in "cp437" encoding (IBM PC codepage 437)
+    - ``kata:"ｱﾉ ﾆﾎﾝｼﾞﾝ ﾜ ｶﾞｲｺｸｼﾞﾝ｡"``  string in "kata" encoding (Katakana; only half-width characters are supported for now)


 There are several escape sequences available to put special characters into your string value:
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@ -18,6 +18,7 @@ Compiler:
 - Relax newline / bracket in parser so that you can put open and close brackets on the same line or on the next line if you so wish. For example be able to write a true one liner?
 - Can we support signed % (remainder) somehow?
 - IR: implement missing operators in AssignmentGen  (array shifts etc)
+- expand the kata encoding to somehow translate normal katakana to half-width?  (see comment in KatakanaEncoding)
 - instead of copy-pasting inline asmsubs, make them into a 64tass macro and use that instead.
  that will allow them to be reused from custom user written assembly code as well.
 - Multidimensional arrays and chained indexing, purely as syntactic sugar over regular arrays.
--- a/examples/cx16/charsets.p8
+++ b/examples/cx16/charsets.p8
@ -13,7 +13,10 @@ main {
        wait()
        eastern()
        wait()
+        kata()
+        wait()
        ibmpc()
+        wait()
    }

    sub latin() {
@ -48,6 +51,13 @@ main {
        txt.nl()
    }

+    sub kata() {
+        txt.kata()   ; presumably the textio routine that switches to the katakana character set
+        repeat 3 txt.nl()   ; three newlines before the demo text
+        write_screencodes(kata:"Katakana hw: ｱﾉ ﾆﾎﾝｼﾞﾝ ﾜ ｶﾞｲｺｸｼﾞﾝ ﾉ ﾆﾎﾝｺﾞ ｶﾞ ｼﾞｮｳｽﾞ ﾀﾞｯﾃ ﾕｯﾀ｡")   ; kata-encoded text handed to write_screencodes (defined outside this hunk)
+        txt.print(kata:"Katakana hw: ｱﾉ ﾆﾎﾝｼﾞﾝ ﾜ ｶﾞｲｺｸｼﾞﾝ ﾉ ﾆﾎﾝｺﾞ ｶﾞ ｼﾞｮｳｽﾞ ﾀﾞｯﾃ ﾕｯﾀ｡")   ; the same kata-encoded text printed via txt.print
+    }
+
    sub wait() {
        txt.print("\n\npress enter: ")
        void txt.input_chars(buf)
--- a/syntax-files/IDEA/Prog8.xml
+++ b/syntax-files/IDEA/Prog8.xml
@ -12,7 +12,7 @@
      <option name="HAS_STRING_ESCAPES" value="true" />
    </options>
    <keywords keywords="&amp;;-&gt;;@;and;as;asmsub;break;clobbers;continue;do;downto;else;false;for;goto;if;if_cc;if_cs;if_eq;if_mi;if_ne;if_neg;if_nz;if_pl;if_pos;if_vc;if_vs;if_z;in;inline;not;or;repeat;return;romsub;step;sub;to;true;unroll;until;when;while;xor;~" ignore_case="false" />
-    <keywords2 keywords="%address;%asm;%asmbinary;%asminclude;%breakpoint;%encoding;%import;%ir;%launcher;%option;%output;%zeropage;%zpallowed;%zpreserved;atascii:;cp437:;default:;iso16:;iso5:;iso:;petscii:;sc:" />
+    <keywords2 keywords="%address;%asm;%asmbinary;%asminclude;%breakpoint;%encoding;%import;%ir;%launcher;%option;%output;%zeropage;%zpallowed;%zpreserved;atascii:;cp437:;default:;iso16:;iso5:;iso:;kata:;petscii:;sc:" />
    <keywords3 keywords="@nozp;@requirezp;@shared;@split;@zp;bool;byte;const;float;str;ubyte;uword;void;word" />
    <keywords4 keywords="abs;call;callfar;clamp;cmp;divmod;len;lsb;max;memory;min;mkword;msb;peek;peekf;peekw;poke;pokef;pokew;rol;rol2;ror;ror2;rrestore;rrestorex;rsave;rsavex;setlsb;setmsb;sgn;sizeof;sqrt" />
  </highlighting>