diff --git a/codeCore/src/prog8/code/target/encodings/KatakanaEncoding.kt b/codeCore/src/prog8/code/target/encodings/KatakanaEncoding.kt
index 3a4e11ba4..b808bd093 100644
--- a/codeCore/src/prog8/code/target/encodings/KatakanaEncoding.kt
+++ b/codeCore/src/prog8/code/target/encodings/KatakanaEncoding.kt
@@ -6,6 +6,64 @@ import com.github.michaelbull.result.Result
 import java.io.CharConversionException
 import java.nio.charset.Charset
 
+
+/**
+ * Converts Japanese kana to the half-width katakana that the JIS_X0201 charset used by KatakanaEncoding can encode.
+ * Adapted from https://github.com/raminduw/Japanese-Character-Converter
+ *
+ * Half-width characters are written as \u escapes so the table cannot be folded back into
+ * full-width glyphs by tools that apply Unicode compatibility normalization to this file.
+ */
+object JapaneseCharacterConverter {
+    // full-width katakana block (small a .. small ke), 86 consecutive code points
+    private const val ZENKAKU_KATAKANA_FIRST = '\u30A1'
+    private const val ZENKAKU_KATAKANA_LAST = '\u30F6'
+
+    // hiragana block (small a .. small ke); same order as the katakana block, 0x60 code points lower
+    private const val HIRAGANA_FIRST = '\u3041'
+    private const val HIRAGANA_LAST = '\u3096'
+
+    // Half-width spelling of each full-width katakana, indexed by (code - ZENKAKU_KATAKANA_FIRST).
+    // Voiced kana become base + \uFF9E (dakuten) or \uFF9F (handakuten), i.e. two characters.
+    // Kana without a half-width form reuse a neighbour: small wa -> wa, wi -> i, we -> e, small ka/ke -> ka/ke.
+    private val HANKAKU_KATAKANA = arrayOf(
+        "\uFF67", "\uFF71", "\uFF68", "\uFF72", "\uFF69", "\uFF73", "\uFF6A", "\uFF74", "\uFF6B", "\uFF75",    // a i u e o, small form before each
+        "\uFF76", "\uFF76\uFF9E", "\uFF77", "\uFF77\uFF9E", "\uFF78", "\uFF78\uFF9E", "\uFF79", "\uFF79\uFF9E", "\uFF7A", "\uFF7A\uFF9E",    // ka ga ki gi ku gu ke ge ko go
+        "\uFF7B", "\uFF7B\uFF9E", "\uFF7C", "\uFF7C\uFF9E", "\uFF7D", "\uFF7D\uFF9E", "\uFF7E", "\uFF7E\uFF9E", "\uFF7F", "\uFF7F\uFF9E",    // sa za shi ji su zu se ze so zo
+        "\uFF80", "\uFF80\uFF9E", "\uFF81", "\uFF81\uFF9E", "\uFF6F", "\uFF82", "\uFF82\uFF9E", "\uFF83", "\uFF83\uFF9E", "\uFF84", "\uFF84\uFF9E",    // ta da chi di (small tsu) tsu du te de to do
+        "\uFF85", "\uFF86", "\uFF87", "\uFF88", "\uFF89",    // na ni nu ne no
+        "\uFF8A", "\uFF8A\uFF9E", "\uFF8A\uFF9F", "\uFF8B", "\uFF8B\uFF9E", "\uFF8B\uFF9F", "\uFF8C", "\uFF8C\uFF9E", "\uFF8C\uFF9F",    // ha ba pa hi bi pi fu bu pu
+        "\uFF8D", "\uFF8D\uFF9E", "\uFF8D\uFF9F", "\uFF8E", "\uFF8E\uFF9E", "\uFF8E\uFF9F",    // he be pe ho bo po
+        "\uFF8F", "\uFF90", "\uFF91", "\uFF92", "\uFF93",    // ma mi mu me mo
+        "\uFF6C", "\uFF94", "\uFF6D", "\uFF95", "\uFF6E", "\uFF96",    // ya yu yo, small form before each
+        "\uFF97", "\uFF98", "\uFF99", "\uFF9A", "\uFF9B",    // ra ri ru re ro
+        "\uFF9C", "\uFF9C", "\uFF72", "\uFF74", "\uFF66", "\uFF9D",    // (small wa) wa wi we wo n
+        "\uFF73\uFF9E", "\uFF76", "\uFF79"    // vu (small ka) (small ke)
+    )
+
+    init {
+        check(HANKAKU_KATAKANA.size == ZENKAKU_KATAKANA_LAST - ZENKAKU_KATAKANA_FIRST + 1) { "half-width katakana table must cover the whole full-width block" }
+    }
+
+    // half-width spelling (one or two characters) of a full-width katakana; any other character passes through
+    private fun zenkakuKatakanaToHankakuKatakana(c: Char): String =
+        if (c in ZENKAKU_KATAKANA_FIRST..ZENKAKU_KATAKANA_LAST) HANKAKU_KATAKANA[c - ZENKAKU_KATAKANA_FIRST] else c.toString()
+
+    // full-width katakana at the same position in its block as the given hiragana; any other character passes through
+    private fun hiraganaToZenkakuKatakana(c: Char): Char =
+        if (c in HIRAGANA_FIRST..HIRAGANA_LAST) ZENKAKU_KATAKANA_FIRST + (c - HIRAGANA_FIRST) else c
+
+    /**
+     * Returns [s] with every hiragana and full-width katakana replaced by half-width katakana.
+     * The result can be longer than [s]: a voiced kana becomes its base kana plus a separate voicing mark.
+     * Every other character (including text that is already half-width) is copied unchanged.
+     */
+    fun zenkakuKatakanaToHankakuKatakana(s: String): String = buildString {
+        for (element in s)
+            append(zenkakuKatakanaToHankakuKatakana(hiraganaToZenkakuKatakana(element)))
+    }
+}
+
 object KatakanaEncoding {
     val charset: Charset = Charset.forName("JIS_X0201")
 
@@ -13,11 +71,6 @@ object KatakanaEncoding {
         return try {
             val mapped = str.map { chr ->
                 when (chr) {
-                    // TODO: Convert regular katakana to halfwidth katakana (java lib doesn't do that for us
-                    //       and simply returns '?' upon reaching a regular katakana character)
-                    // NOTE: we probably need to somehow do that before we reach this `when`,
-                    //       as one regular katakana character often results in two HW katakana characters
-                    //       due to differences in how diacritics are handled.
'\u0000' -> 0u '\u00a0' -> 0xa0u // $a0 isn't technically a part of JIS X 0201 spec, and so we need to handle this ourselves diff --git a/codeOptimizers/src/prog8/optimizer/ConstantFoldingOptimizer.kt b/codeOptimizers/src/prog8/optimizer/ConstantFoldingOptimizer.kt index d0db88efb..589e21b21 100644 --- a/codeOptimizers/src/prog8/optimizer/ConstantFoldingOptimizer.kt +++ b/codeOptimizers/src/prog8/optimizer/ConstantFoldingOptimizer.kt @@ -91,7 +91,7 @@ class ConstantFoldingOptimizer(private val program: Program, private val errors: program.encoding.encodeString(leftString.value, leftString.encoding) + program.encoding.encodeString(rightString.value, rightString.encoding), leftString.encoding) } - val concatStr = StringLiteral(concatenated, leftString.encoding, expr.position) + val concatStr = StringLiteral.create(concatenated, leftString.encoding, expr.position) return listOf(IAstModification.ReplaceNode(expr, concatStr, parent)) } else if(expr.operator=="*" && rightconst!=null && expr.left is StringLiteral) { @@ -99,7 +99,7 @@ class ConstantFoldingOptimizer(private val program: Program, private val errors: val part = expr.left as StringLiteral if(part.value.isEmpty()) errors.warn("resulting string has length zero", part.position) - val newStr = StringLiteral(part.value.repeat(rightconst.number.toInt()), part.encoding, expr.position) + val newStr = StringLiteral.create(part.value.repeat(rightconst.number.toInt()), part.encoding, expr.position) return listOf(IAstModification.ReplaceNode(expr, newStr, parent)) } } diff --git a/compiler/res/prog8lib/cx16/textio.p8 b/compiler/res/prog8lib/cx16/textio.p8 index 8957c36a8..32e81a692 100644 --- a/compiler/res/prog8lib/cx16/textio.p8 +++ b/compiler/res/prog8lib/cx16/textio.p8 @@ -263,7 +263,7 @@ sub iso16() { } sub kata() { - ; -- switch to katakana character set + ; -- switch to katakana character set (requires rom 48+) cbm.CHROUT($0f) ; iso mode cx16.screen_set_charset(12, 0) ; charset } diff --git 
a/compiler/src/prog8/compiler/BuiltinFunctions.kt b/compiler/src/prog8/compiler/BuiltinFunctions.kt index d67cfd2d1..7be27983e 100644 --- a/compiler/src/prog8/compiler/BuiltinFunctions.kt +++ b/compiler/src/prog8/compiler/BuiltinFunctions.kt @@ -75,17 +75,6 @@ private fun oneFloatArgOutputFloat(args: List, position: Position, p return NumericLiteral(DataType.FLOAT, function(constval.number), args[0].position) } -private fun collectionArgBoolResult(args: List, position: Position, program: Program, function: (arg: List)->Boolean): NumericLiteral { - if(args.size!=1) - throw SyntaxError("builtin function requires one non-scalar argument", position) - - val array= args[0] as? ArrayLiteral ?: throw NotConstArgumentException() - val constElements = array.value.map{it.constValue(program)?.number} - if(constElements.contains(null)) - throw NotConstArgumentException() - return NumericLiteral.fromBoolean(function(constElements.mapNotNull { it }), args[0].position) -} - private fun builtinAbs(args: List, position: Position, program: Program): NumericLiteral { // 1 arg, type = int, result type= uword if(args.size!=1) @@ -138,6 +127,8 @@ private fun builtinLen(args: List, position: Position, program: Prog return NumericLiteral.optimalInteger(arraySize, position) if(args[0] is ArrayLiteral) return NumericLiteral.optimalInteger((args[0] as ArrayLiteral).value.size, position) + if(args[0] is StringLiteral) + return NumericLiteral.optimalInteger((args[0] as StringLiteral).value.length, position) if(args[0] !is IdentifierReference) throw SyntaxError("len argument should be an identifier", position) val target = (args[0] as IdentifierReference).targetVarDecl(program) diff --git a/compiler/src/prog8/compiler/astprocessing/AstIdentifiersChecker.kt b/compiler/src/prog8/compiler/astprocessing/AstIdentifiersChecker.kt index e90d5c3c3..c5bbf1e36 100644 --- a/compiler/src/prog8/compiler/astprocessing/AstIdentifiersChecker.kt +++ 
b/compiler/src/prog8/compiler/astprocessing/AstIdentifiersChecker.kt @@ -177,7 +177,7 @@ internal class AstIdentifiersChecker(private val errors: IErrorReporter, '_' }.joinToString("") val textEncoding = (call as Node).definingModule.textEncoding - call.args[0] = StringLiteral(processed, textEncoding, name.position) + call.args[0] = StringLiteral.create(processed, textEncoding, name.position) call.args[0].linkParents(call as Node) } } diff --git a/compiler/test/TestNumericLiteral.kt b/compiler/test/TestNumericLiteral.kt index ca556480b..68007fd60 100644 --- a/compiler/test/TestNumericLiteral.kt +++ b/compiler/test/TestNumericLiteral.kt @@ -96,11 +96,11 @@ class TestNumericLiteral: FunSpec({ } test("testEqualsRef") { - (StringLiteral("hello", Encoding.PETSCII, dummyPos) == StringLiteral("hello", Encoding.PETSCII, dummyPos)) shouldBe true - (StringLiteral("hello", Encoding.PETSCII, dummyPos) != StringLiteral("bye", Encoding.PETSCII, dummyPos)) shouldBe true - (StringLiteral("hello", Encoding.SCREENCODES, dummyPos) == StringLiteral("hello", Encoding.SCREENCODES, dummyPos)) shouldBe true - (StringLiteral("hello", Encoding.SCREENCODES, dummyPos) != StringLiteral("bye", Encoding.SCREENCODES, dummyPos)) shouldBe true - (StringLiteral("hello", Encoding.SCREENCODES, dummyPos) != StringLiteral("hello", Encoding.PETSCII, dummyPos)) shouldBe true + (StringLiteral.create("hello", Encoding.PETSCII, dummyPos) == StringLiteral.create("hello", Encoding.PETSCII, dummyPos)) shouldBe true + (StringLiteral.create("hello", Encoding.PETSCII, dummyPos) != StringLiteral.create("bye", Encoding.PETSCII, dummyPos)) shouldBe true + (StringLiteral.create("hello", Encoding.SCREENCODES, dummyPos) == StringLiteral.create("hello", Encoding.SCREENCODES, dummyPos)) shouldBe true + (StringLiteral.create("hello", Encoding.SCREENCODES, dummyPos) != StringLiteral.create("bye", Encoding.SCREENCODES, dummyPos)) shouldBe true + (StringLiteral.create("hello", Encoding.SCREENCODES, dummyPos) != 
StringLiteral.create("hello", Encoding.PETSCII, dummyPos)) shouldBe true
 
         val lvOne = NumericLiteral(DataType.UBYTE, 1.0, dummyPos)
         val lvTwo = NumericLiteral(DataType.UBYTE, 2.0, dummyPos)
diff --git a/compiler/test/TestStringEncodings.kt b/compiler/test/TestStringEncodings.kt
index 8a2e31610..c988dbbc1 100644
--- a/compiler/test/TestStringEncodings.kt
+++ b/compiler/test/TestStringEncodings.kt
@@ -3,15 +3,18 @@ package prog8tests
 import com.github.michaelbull.result.Ok
 import com.github.michaelbull.result.expectError
 import com.github.michaelbull.result.getOrElse
+import io.kotest.assertions.throwables.shouldThrow
 import io.kotest.assertions.withClue
 import io.kotest.core.spec.style.FunSpec
 import io.kotest.matchers.shouldBe
 import io.kotest.matchers.shouldNotBe
+import prog8.ast.expressions.CharLiteral
 import prog8.ast.expressions.NumericLiteral
 import prog8.ast.expressions.StringLiteral
 import prog8.ast.statements.Assignment
 import prog8.ast.statements.VarDecl
 import prog8.code.core.Encoding
+import prog8.code.core.Position
 import prog8.code.core.unescape
 import prog8.code.target.C64Target
 import prog8.code.target.Cx16Target
@@ -21,6 +24,7 @@ import prog8.code.target.encodings.IsoEncoding
 import prog8.code.target.encodings.PetsciiEncoding
 import prog8tests.helpers.ErrorReporterForTests
 import prog8tests.helpers.compileText
+import java.io.CharConversionException
 
 
 class TestStringEncodings: FunSpec({
@@ -231,6 +235,22 @@ class TestStringEncodings: FunSpec({
         }
     }
 
+    context("kata") {
+        test("kata translation to half width glyphs") {
+            val orig = "\uFF76 \u304C \u30AC"    // half-width ka, hiragana ga, full-width ga
+            orig.length shouldBe 5
+            val str = StringLiteral.create(orig, Encoding.KATAKANA, Position.DUMMY)
+            str.value shouldBe "\uFF76 \uFF76\uFF9E \uFF76\uFF9E"    // 7 chars: each voiced kana becomes ka + dakuten
+
+            val character = CharLiteral.create('\u30AB', Encoding.KATAKANA, Position.DUMMY)
+            character.value shouldBe '\uFF76'    // full-width ka fits in one half-width character
+
+            shouldThrow<CharConversionException> {
+                CharLiteral.create('\u30AC', Encoding.KATAKANA, Position.DUMMY)    // full-width ga needs two half-width characters
+            }
+        }
+    }
+
     test("special pass-through") {
         val passthroughEscaped= """\x00\x1b\x99\xff"""
         val 
passthrough = passthroughEscaped.unescape() @@ -299,12 +319,22 @@ class TestStringEncodings: FunSpec({ str string1 = "default" str string2 = sc:"screencodes" str string3 = iso:"iso" - str string4 = petscii:"petscii" + str string4 = iso5:"Хозяин и Работник" + str string5 = iso16:"zażółć gęślą jaźń" + str string6 = cp437:"≈ IBM Pc ≈ ♂♀♪☺¶" + str string7 = petscii:"petscii" + str string8 = atascii:"atascii" + str string9 = kata:"クジン。 # が # ガ" ubyte char1 = 'd' ubyte char2 = sc:'s' ubyte char3 = iso:'i' - ubyte char4 = petscii:'p' + ubyte char4 = iso5:'и' + ubyte char5 = iso16:'ł' + ubyte char6 = cp437:'☺' + ubyte char7 = petscii:'p' + ubyte char8 = atascii:'p' + ubyte char9 = kata:'カ' sub start() { } diff --git a/compiler/test/ast/TestProg8Parser.kt b/compiler/test/ast/TestProg8Parser.kt index e11c12a06..b090e12e6 100644 --- a/compiler/test/ast/TestProg8Parser.kt +++ b/compiler/test/ast/TestProg8Parser.kt @@ -612,9 +612,9 @@ class TestProg8Parser: FunSpec( { } test("testCharLiteralConstValue") { - val char1 = CharLiteral('A', Encoding.PETSCII, Position.DUMMY) - val char2 = CharLiteral('z', Encoding.SCREENCODES, Position.DUMMY) - val char3 = CharLiteral('_', Encoding.ISO, Position.DUMMY) + val char1 = CharLiteral.create('A', Encoding.PETSCII, Position.DUMMY) + val char2 = CharLiteral.create('z', Encoding.SCREENCODES, Position.DUMMY) + val char3 = CharLiteral.create('_', Encoding.ISO, Position.DUMMY) val program = Program("test", DummyFunctions, DummyMemsizer, AsciiStringEncoder) char1.constValue(program).number.toInt() shouldBe 65 @@ -640,8 +640,8 @@ class TestProg8Parser: FunSpec( { (ten <= ten) shouldBe true (ten < ten) shouldBe false - val abc = StringLiteral("abc", Encoding.PETSCII, Position.DUMMY) - val abd = StringLiteral("abd", Encoding.PETSCII, Position.DUMMY) + val abc = StringLiteral.create("abc", Encoding.PETSCII, Position.DUMMY) + val abd = StringLiteral.create("abd", Encoding.PETSCII, Position.DUMMY) abc shouldBe abc (abc!=abd) shouldBe true (abc!=abc) 
shouldBe false
diff --git a/compilerAst/src/prog8/ast/expressions/AstExpressions.kt b/compilerAst/src/prog8/ast/expressions/AstExpressions.kt
index f6c3fbbba..65611f7e8 100644
--- a/compilerAst/src/prog8/ast/expressions/AstExpressions.kt
+++ b/compilerAst/src/prog8/ast/expressions/AstExpressions.kt
@@ -10,6 +10,8 @@ import prog8.ast.statements.*
 import prog8.ast.walk.AstWalker
 import prog8.ast.walk.IAstVisitor
 import prog8.code.core.*
+import prog8.code.target.encodings.JapaneseCharacterConverter
+import java.io.CharConversionException
 import java.util.*
 import kotlin.math.abs
 import kotlin.math.floor
@@ -713,7 +715,7 @@ class NumericLiteral(val type: DataType, // only numerical types allowed
     }
 }
 
-class CharLiteral(val value: Char,
+class CharLiteral private constructor(val value: Char,
                   var encoding: Encoding,
                   override val position: Position) : Expression() {
     override lateinit var parent: Node
@@ -723,9 +725,20 @@ class CharLiteral(val value: Char,
     }
 
     companion object {
+        // Factory for character literals; a katakana literal is stored as the converter's half-width output.
+        fun create(character: Char, encoding: Encoding, position: Position): CharLiteral {
+            if(encoding!=Encoding.KATAKANA)
+                return CharLiteral(character, encoding, position)
+            val halfWidth = JapaneseCharacterConverter.zenkakuKatakanaToHankakuKatakana(character.toString())
+            // the converted text can be longer than one character, which a character literal cannot hold
+            if(halfWidth.length!=1)
+                throw CharConversionException("character literal encodes into multiple bytes at $position")
+            return CharLiteral(halfWidth[0], encoding, position)
+        }
+
         fun fromEscaped(raw: String, encoding: Encoding, position: Position): CharLiteral {
             val unescaped = raw.unescape()
-            return CharLiteral(unescaped[0], encoding, position)
+            return create(unescaped[0], encoding, position)
         }
     }
 
@@ -756,7 +769,7 @@ class CharLiteral(val value: Char,
     }
 }
 
-class StringLiteral(val value: String,
+class StringLiteral private constructor(val value: String,
                     var encoding: Encoding,
                     override val position: Position) : Expression() {
     override lateinit var parent: Node
@@ -766,10 +779,15 @@ class StringLiteral(val value: 
String,
     }
 
     companion object {
-        fun fromEscaped(raw: String, encoding: Encoding, position: Position): StringLiteral {
-            val unescaped = raw.unescape()
-            return StringLiteral(unescaped, encoding, position)
+        fun create(str: String, encoding: Encoding, position: Position): StringLiteral {
+            // katakana text is stored as the converter's half-width output; every other encoding keeps the text as written
+            val text =
+                if (encoding == Encoding.KATAKANA) JapaneseCharacterConverter.zenkakuKatakanaToHankakuKatakana(str)
+                else str
+            return StringLiteral(text, encoding, position)
         }
+
+        fun fromEscaped(raw: String, encoding: Encoding, position: Position): StringLiteral = create(raw.unescape(), encoding, position)
     }
 
     override val isSimple = true
diff --git a/docs/source/syntaxreference.rst b/docs/source/syntaxreference.rst
index 856b6a2bf..a811f5402 100644
--- a/docs/source/syntaxreference.rst
+++ b/docs/source/syntaxreference.rst
@@ -555,7 +555,7 @@ Here are examples of the various encodings:
     - ``iso16:"zażółć gęślą jaźń"`` string in iso-8859-16 encoding (Eastern Europe)
     - ``atascii:"I am Atari!"`` string in "atascii" encoding (Atari 8-bit)
     - ``cp437:"≈ IBM Pc ≈ ♂♀♪☺¶"`` string in "cp437" encoding (IBM PC codepage 437)
-    - ``kata:"アノ ニホンジン ワ ガイコクジン。"`` string in "kata" encoding (Katakana half-width support only for now)
+    - ``kata:"アノ ニホンジン ワ ガイコクジン。 # が # ガ"`` string in "kata" encoding (Katakana)
 
 There are several escape sequences available to put special characters into your string value:
 
@@ -976,7 +976,7 @@ where can be just a single statement or a block again::
     if_XX {
 
     } else {
-        
+
     }
 
 The XX corresponds to one of the processor's branching instructions, so the possibilities are:
diff --git a/examples/cx16/charsets.p8 b/examples/cx16/charsets.p8
index 63027f894..c45ad3bd9 100644
--- a/examples/cx16/charsets.p8
+++ b/examples/cx16/charsets.p8
@@ -54,8 +54,8 @@ main {
     sub kata() {
         txt.kata()
         repeat 3 txt.nl()
-        write_screencodes(kata:"Katakana hw: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。")
-        txt.print(kata:"Katakana hw: アノ ニホンジン ワ 
ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。") + write_screencodes(kata:"Katakana: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。 ## がが ## ガガ") + txt.print(kata:"Katakana: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。 ## がが ## ガガ") } sub wait() {