half width katakana conversion

This commit is contained in:
Irmen de Jong 2024-08-21 18:23:30 +02:00
parent b4facaeb3c
commit 8f6eaeac2c
11 changed files with 131 additions and 42 deletions

View File

@ -6,6 +6,61 @@ import com.github.michaelbull.result.Result
import java.io.CharConversionException
import java.nio.charset.Charset
/**
 * Converts full-width (zenkaku) kana into half-width (hankaku) katakana.
 *
 * Adapted from https://github.com/raminduw/Japanese-Character-Converter
 *
 * Hiragana is first mapped onto the katakana at the same table position, after which
 * the katakana is replaced by its half-width form. Voiced and semi-voiced kana have no
 * single half-width code point: they become the base kana followed by a separate
 * half-width sound mark (U+FF9E or U+FF9F), so one input character can produce two
 * output characters. Every other character is passed through unchanged.
 */
object JapaneseCharacterConverter {
    // Full-width katakana in code point order, U+30A1 (small a) up to U+30F6 (small ke).
    // The lookups below index by code point offset, so this table must stay contiguous.
    private val ZENKAKU_KATAKANA = charArrayOf(
        'ァ', 'ア', 'ィ', 'イ', 'ゥ',
        'ウ', 'ェ', 'エ', 'ォ', 'オ', 'カ', 'ガ', 'キ', 'ギ', 'ク', 'グ', 'ケ', 'ゲ',
        'コ', 'ゴ', 'サ', 'ザ', 'シ', 'ジ', 'ス', 'ズ', 'セ', 'ゼ', 'ソ', 'ゾ', 'タ',
        'ダ', 'チ', 'ヂ', 'ッ', 'ツ', 'ヅ', 'テ', 'デ', 'ト', 'ド', 'ナ', 'ニ', 'ヌ',
        'ネ', 'ノ', 'ハ', 'バ', 'パ', 'ヒ', 'ビ', 'ピ', 'フ', 'ブ', 'プ', 'ヘ', 'ベ',
        'ペ', 'ホ', 'ボ', 'ポ', 'マ', 'ミ', 'ム', 'メ', 'モ', 'ャ', 'ヤ', 'ュ', 'ユ',
        'ョ', 'ヨ', 'ラ', 'リ', 'ル', 'レ', 'ロ', 'ヮ', 'ワ', 'ヰ', 'ヱ', 'ヲ', 'ン',
        'ヴ', 'ヵ', 'ヶ'
    )

    // Hiragana in code point order, U+3041 up to U+3096; position i corresponds to
    // ZENKAKU_KATAKANA[i]. (Hiragana only exists in full-width form.)
    private val ZENKAKU_HIRAGANA = charArrayOf(
        'ぁ', 'あ', 'ぃ', 'い', 'ぅ', 'う', 'ぇ', 'え',
        'ぉ', 'お', 'か', 'が', 'き', 'ぎ', 'く', 'ぐ',
        'け', 'げ', 'こ', 'ご', 'さ', 'ざ', 'し', 'じ',
        'す', 'ず', 'せ', 'ぜ', 'そ', 'ぞ', 'た', 'だ',
        'ち', 'ぢ', 'っ', 'つ', 'づ', 'て', 'で', 'と',
        'ど', 'な', 'に', 'ぬ', 'ね', 'の', 'は', 'ば',
        'ぱ', 'ひ', 'び', 'ぴ', 'ふ', 'ぶ', 'ぷ', 'へ',
        'べ', 'ぺ', 'ほ', 'ぼ', 'ぽ', 'ま', 'み', 'む',
        'め', 'も', 'ゃ', 'や', 'ゅ', 'ゆ', 'ょ', 'よ',
        'ら', 'り', 'る', 'れ', 'ろ', 'ゎ', 'わ', 'ゐ',
        'ゑ', 'を', 'ん', 'ゔ', 'ゕ', 'ゖ'
    )

    // Half-width replacement for ZENKAKU_KATAKANA[i]. Written as \u escapes so the
    // table survives editors and tools that normalise or drop half-width forms.
    // The comment above each row names the full-width kana that row replaces.
    private val HANKAKU_KATAKANA = arrayOf(
        // ァ ア ィ イ ゥ
        "\uFF67", "\uFF71", "\uFF68", "\uFF72", "\uFF69",
        // ウ ェ エ ォ オ
        "\uFF73", "\uFF6A", "\uFF74", "\uFF6B", "\uFF75",
        // カ ガ キ ギ ク グ ケ ゲ コ ゴ
        "\uFF76", "\uFF76\uFF9E", "\uFF77", "\uFF77\uFF9E", "\uFF78", "\uFF78\uFF9E", "\uFF79", "\uFF79\uFF9E", "\uFF7A", "\uFF7A\uFF9E",
        // サ ザ シ ジ ス ズ セ ゼ ソ ゾ
        "\uFF7B", "\uFF7B\uFF9E", "\uFF7C", "\uFF7C\uFF9E", "\uFF7D", "\uFF7D\uFF9E", "\uFF7E", "\uFF7E\uFF9E", "\uFF7F", "\uFF7F\uFF9E",
        // タ ダ チ ヂ ッ ツ ヅ テ デ ト ド
        "\uFF80", "\uFF80\uFF9E", "\uFF81", "\uFF81\uFF9E", "\uFF6F", "\uFF82", "\uFF82\uFF9E", "\uFF83", "\uFF83\uFF9E", "\uFF84", "\uFF84\uFF9E",
        // ナ ニ ヌ ネ ノ
        "\uFF85", "\uFF86", "\uFF87", "\uFF88", "\uFF89",
        // ハ バ パ ヒ ビ ピ フ ブ プ ヘ ベ ペ ホ ボ ポ
        "\uFF8A", "\uFF8A\uFF9E", "\uFF8A\uFF9F", "\uFF8B", "\uFF8B\uFF9E", "\uFF8B\uFF9F", "\uFF8C", "\uFF8C\uFF9E", "\uFF8C\uFF9F",
        "\uFF8D", "\uFF8D\uFF9E", "\uFF8D\uFF9F", "\uFF8E", "\uFF8E\uFF9E", "\uFF8E\uFF9F",
        // マ ミ ム メ モ
        "\uFF8F", "\uFF90", "\uFF91", "\uFF92", "\uFF93",
        // ャ ヤ ュ ユ ョ ヨ
        "\uFF6C", "\uFF94", "\uFF6D", "\uFF95", "\uFF6E", "\uFF96",
        // ラ リ ル レ ロ
        "\uFF97", "\uFF98", "\uFF99", "\uFF9A", "\uFF9B",
        // ヮ ワ ヰ ヱ ヲ ン (no half-width small wa / wi / we exist; nearest kana is used)
        "\uFF9C", "\uFF9C", "\uFF72", "\uFF74", "\uFF66", "\uFF9D",
        // ヴ ヵ ヶ (no half-width small ka / ke exist; the regular-size kana is used)
        "\uFF73\uFF9E", "\uFF76", "\uFF79"
    )

    private val ZENKAKU_KATAKANA_FIRST_CHAR_CODE = ZENKAKU_KATAKANA.first().code
    private val ZENKAKU_HIRAGANA_FIRST_CHAR_CODE = ZENKAKU_HIRAGANA.first().code

    init {
        // All lookups index one table with an offset computed from another.
        check(ZENKAKU_HIRAGANA.size == ZENKAKU_KATAKANA.size && HANKAKU_KATAKANA.size == ZENKAKU_KATAKANA.size) {
            "kana conversion tables must have the same length"
        }
    }

    /** Half-width replacement of a full-width katakana; any other character is returned as-is. */
    private fun zenkakuKatakanaToHankakuKatakana(c: Char): String {
        val index = c.code - ZENKAKU_KATAKANA_FIRST_CHAR_CODE
        return if (index in HANKAKU_KATAKANA.indices) HANKAKU_KATAKANA[index] else c.toString()
    }

    /** Full-width katakana counterpart of a hiragana; any other character is returned as-is. */
    private fun hiraganaToZenkakuKatakana(c: Char): Char {
        val index = c.code - ZENKAKU_HIRAGANA_FIRST_CHAR_CODE
        return if (index in ZENKAKU_KATAKANA.indices) ZENKAKU_KATAKANA[index] else c
    }

    /**
     * Replaces every hiragana and full-width katakana in [s] by half-width katakana.
     *
     * The result can be longer than [s] because (semi-)voiced kana expand into two characters.
     *
     * @param s the text to convert; may be empty
     * @return the converted text, with all non-kana characters left untouched
     */
    fun zenkakuKatakanaToHankakuKatakana(s: String): String = buildString {
        for (element in s) {
            append(zenkakuKatakanaToHankakuKatakana(hiraganaToZenkakuKatakana(element)))
        }
    }
}
object KatakanaEncoding {
val charset: Charset = Charset.forName("JIS_X0201")
@ -13,11 +68,6 @@ object KatakanaEncoding {
return try {
val mapped = str.map { chr ->
when (chr) {
// TODO: Convert regular katakana to halfwidth katakana (java lib doesn't do that for us
// and simply returns '?' upon reaching a regular katakana character)
// NOTE: we probably need to somehow do that before we reach this `when`,
// as one regular katakana character often results in two HW katakana characters
// due to differences in how diacritics are handled.
'\u0000' -> 0u
'\u00a0' -> 0xa0u // $a0 isn't technically a part of JIS X 0201 spec, and so we need to handle this ourselves

View File

@ -91,7 +91,7 @@ class ConstantFoldingOptimizer(private val program: Program, private val errors:
program.encoding.encodeString(leftString.value, leftString.encoding) + program.encoding.encodeString(rightString.value, rightString.encoding),
leftString.encoding)
}
val concatStr = StringLiteral(concatenated, leftString.encoding, expr.position)
val concatStr = StringLiteral.create(concatenated, leftString.encoding, expr.position)
return listOf(IAstModification.ReplaceNode(expr, concatStr, parent))
}
else if(expr.operator=="*" && rightconst!=null && expr.left is StringLiteral) {
@ -99,7 +99,7 @@ class ConstantFoldingOptimizer(private val program: Program, private val errors:
val part = expr.left as StringLiteral
if(part.value.isEmpty())
errors.warn("resulting string has length zero", part.position)
val newStr = StringLiteral(part.value.repeat(rightconst.number.toInt()), part.encoding, expr.position)
val newStr = StringLiteral.create(part.value.repeat(rightconst.number.toInt()), part.encoding, expr.position)
return listOf(IAstModification.ReplaceNode(expr, newStr, parent))
}
}

View File

@ -263,7 +263,7 @@ sub iso16() {
}
sub kata() {
; -- switch to katakana character set
; -- switch to katakana character set (requires rom 48+)
cbm.CHROUT($0f) ; iso mode
cx16.screen_set_charset(12, 0) ; charset
}

View File

@ -75,17 +75,6 @@ private fun oneFloatArgOutputFloat(args: List<Expression>, position: Position, p
return NumericLiteral(DataType.FLOAT, function(constval.number), args[0].position)
}
/**
 * Evaluates a builtin that takes one constant array literal and yields a boolean.
 *
 * @throws SyntaxError when the number of arguments is not exactly one
 * @throws NotConstArgumentException when the argument is not an array literal,
 *         or when any of its elements has no constant value
 */
private fun collectionArgBoolResult(args: List<Expression>, position: Position, program: Program, function: (arg: List<Double>)->Boolean): NumericLiteral {
    if(args.size!=1)
        throw SyntaxError("builtin function requires one non-scalar argument", position)
    val literal = args[0] as? ArrayLiteral
        ?: throw NotConstArgumentException()
    // evaluate every element first; a single non-constant element rejects the whole array
    val numbers = literal.value.map { element -> element.constValue(program)?.number }
    if(numbers.any { it==null })
        throw NotConstArgumentException()
    val outcome = function(numbers.filterNotNull())
    return NumericLiteral.fromBoolean(outcome, args[0].position)
}
private fun builtinAbs(args: List<Expression>, position: Position, program: Program): NumericLiteral {
// 1 arg, type = int, result type= uword
if(args.size!=1)
@ -138,6 +127,8 @@ private fun builtinLen(args: List<Expression>, position: Position, program: Prog
return NumericLiteral.optimalInteger(arraySize, position)
if(args[0] is ArrayLiteral)
return NumericLiteral.optimalInteger((args[0] as ArrayLiteral).value.size, position)
if(args[0] is StringLiteral)
return NumericLiteral.optimalInteger((args[0] as StringLiteral).value.length, position)
if(args[0] !is IdentifierReference)
throw SyntaxError("len argument should be an identifier", position)
val target = (args[0] as IdentifierReference).targetVarDecl(program)

View File

@ -177,7 +177,7 @@ internal class AstIdentifiersChecker(private val errors: IErrorReporter,
'_'
}.joinToString("")
val textEncoding = (call as Node).definingModule.textEncoding
call.args[0] = StringLiteral(processed, textEncoding, name.position)
call.args[0] = StringLiteral.create(processed, textEncoding, name.position)
call.args[0].linkParents(call as Node)
}
}

View File

@ -96,11 +96,11 @@ class TestNumericLiteral: FunSpec({
}
test("testEqualsRef") {
(StringLiteral("hello", Encoding.PETSCII, dummyPos) == StringLiteral("hello", Encoding.PETSCII, dummyPos)) shouldBe true
(StringLiteral("hello", Encoding.PETSCII, dummyPos) != StringLiteral("bye", Encoding.PETSCII, dummyPos)) shouldBe true
(StringLiteral("hello", Encoding.SCREENCODES, dummyPos) == StringLiteral("hello", Encoding.SCREENCODES, dummyPos)) shouldBe true
(StringLiteral("hello", Encoding.SCREENCODES, dummyPos) != StringLiteral("bye", Encoding.SCREENCODES, dummyPos)) shouldBe true
(StringLiteral("hello", Encoding.SCREENCODES, dummyPos) != StringLiteral("hello", Encoding.PETSCII, dummyPos)) shouldBe true
(StringLiteral.create("hello", Encoding.PETSCII, dummyPos) == StringLiteral.create("hello", Encoding.PETSCII, dummyPos)) shouldBe true
(StringLiteral.create("hello", Encoding.PETSCII, dummyPos) != StringLiteral.create("bye", Encoding.PETSCII, dummyPos)) shouldBe true
(StringLiteral.create("hello", Encoding.SCREENCODES, dummyPos) == StringLiteral.create("hello", Encoding.SCREENCODES, dummyPos)) shouldBe true
(StringLiteral.create("hello", Encoding.SCREENCODES, dummyPos) != StringLiteral.create("bye", Encoding.SCREENCODES, dummyPos)) shouldBe true
(StringLiteral.create("hello", Encoding.SCREENCODES, dummyPos) != StringLiteral.create("hello", Encoding.PETSCII, dummyPos)) shouldBe true
val lvOne = NumericLiteral(DataType.UBYTE, 1.0, dummyPos)
val lvTwo = NumericLiteral(DataType.UBYTE, 2.0, dummyPos)

View File

@ -3,15 +3,18 @@ package prog8tests
import com.github.michaelbull.result.Ok
import com.github.michaelbull.result.expectError
import com.github.michaelbull.result.getOrElse
import io.kotest.assertions.throwables.shouldThrow
import io.kotest.assertions.withClue
import io.kotest.core.spec.style.FunSpec
import io.kotest.matchers.shouldBe
import io.kotest.matchers.shouldNotBe
import prog8.ast.expressions.CharLiteral
import prog8.ast.expressions.NumericLiteral
import prog8.ast.expressions.StringLiteral
import prog8.ast.statements.Assignment
import prog8.ast.statements.VarDecl
import prog8.code.core.Encoding
import prog8.code.core.Position
import prog8.code.core.unescape
import prog8.code.target.C64Target
import prog8.code.target.Cx16Target
@ -21,6 +24,7 @@ import prog8.code.target.encodings.IsoEncoding
import prog8.code.target.encodings.PetsciiEncoding
import prog8tests.helpers.ErrorReporterForTests
import prog8tests.helpers.compileText
import java.io.CharConversionException
class TestStringEncodings: FunSpec({
@ -231,6 +235,22 @@ class TestStringEncodings: FunSpec({
}
}
// Checks that literals created with the KATAKANA encoding hold the converted text.
context("kata") {
test("kata translation to half width glyphs") {
// three kana separated by two spaces
val orig = "カ が ガ"
orig.length shouldBe 5
val str = StringLiteral.create(orig, Encoding.KATAKANA, Position.DUMMY)
// the stored value is two characters longer than the input: the two voiced kana
// (one hiragana, one katakana) each take up two characters after conversion
str.value.length shouldBe 7
// this kana converts to exactly one character, so it is accepted as a character literal
val character = CharLiteral.create('カ', Encoding.KATAKANA, Position.DUMMY)
character.value shouldBe 'カ'
// a voiced kana converts to more than one character, which a character literal rejects
shouldThrow<CharConversionException> {
CharLiteral.create('ガ', Encoding.KATAKANA, Position.DUMMY)
}
}
}
test("special pass-through") {
val passthroughEscaped= """\x00\x1b\x99\xff"""
val passthrough = passthroughEscaped.unescape()
@ -299,12 +319,22 @@ class TestStringEncodings: FunSpec({
str string1 = "default"
str string2 = sc:"screencodes"
str string3 = iso:"iso"
str string4 = petscii:"petscii"
str string4 = iso5:"Хозяин и Работник"
str string5 = iso16:"zażółć gęślą jaźń"
str string6 = cp437:"≈ IBM Pc ≈ ♂♀♪☺¶"
str string7 = petscii:"petscii"
str string8 = atascii:"atascii"
str string9 = kata:"クジン。 # が # ガ"
ubyte char1 = 'd'
ubyte char2 = sc:'s'
ubyte char3 = iso:'i'
ubyte char4 = petscii:'p'
ubyte char4 = iso5:'и'
ubyte char5 = iso16:'ł'
ubyte char6 = cp437:'☺'
ubyte char7 = petscii:'p'
ubyte char8 = atascii:'p'
ubyte char9 = kata:'カ'
sub start() {
}

View File

@ -612,9 +612,9 @@ class TestProg8Parser: FunSpec( {
}
test("testCharLiteralConstValue") {
val char1 = CharLiteral('A', Encoding.PETSCII, Position.DUMMY)
val char2 = CharLiteral('z', Encoding.SCREENCODES, Position.DUMMY)
val char3 = CharLiteral('_', Encoding.ISO, Position.DUMMY)
val char1 = CharLiteral.create('A', Encoding.PETSCII, Position.DUMMY)
val char2 = CharLiteral.create('z', Encoding.SCREENCODES, Position.DUMMY)
val char3 = CharLiteral.create('_', Encoding.ISO, Position.DUMMY)
val program = Program("test", DummyFunctions, DummyMemsizer, AsciiStringEncoder)
char1.constValue(program).number.toInt() shouldBe 65
@ -640,8 +640,8 @@ class TestProg8Parser: FunSpec( {
(ten <= ten) shouldBe true
(ten < ten) shouldBe false
val abc = StringLiteral("abc", Encoding.PETSCII, Position.DUMMY)
val abd = StringLiteral("abd", Encoding.PETSCII, Position.DUMMY)
val abc = StringLiteral.create("abc", Encoding.PETSCII, Position.DUMMY)
val abd = StringLiteral.create("abd", Encoding.PETSCII, Position.DUMMY)
abc shouldBe abc
(abc!=abd) shouldBe true
(abc!=abc) shouldBe false

View File

@ -10,6 +10,8 @@ import prog8.ast.statements.*
import prog8.ast.walk.AstWalker
import prog8.ast.walk.IAstVisitor
import prog8.code.core.*
import prog8.code.target.encodings.JapaneseCharacterConverter
import java.io.CharConversionException
import java.util.*
import kotlin.math.abs
import kotlin.math.floor
@ -713,7 +715,7 @@ class NumericLiteral(val type: DataType, // only numerical types allowed
}
}
class CharLiteral(val value: Char,
class CharLiteral private constructor(val value: Char,
var encoding: Encoding,
override val position: Position) : Expression() {
override lateinit var parent: Node
@ -723,9 +725,20 @@ class CharLiteral(val value: Char,
}
companion object {
/**
 * Builds a character literal for [character] in the given [encoding].
 *
 * For [Encoding.KATAKANA] the character is first run through
 * [JapaneseCharacterConverter.zenkakuKatakanaToHankakuKatakana]; the literal then
 * holds the converted character.
 *
 * @throws CharConversionException when the katakana conversion does not yield exactly one character
 */
fun create(character: Char, encoding: Encoding, position: Position): CharLiteral {
    if(encoding!=Encoding.KATAKANA)
        return CharLiteral(character, encoding, position)

    val halfWidth = JapaneseCharacterConverter.zenkakuKatakanaToHankakuKatakana(character.toString())
    // a character literal holds a single Char, so a multi-character conversion cannot be stored
    if(halfWidth.length!=1)
        throw CharConversionException("character literal encodes into multiple bytes at $position")
    return CharLiteral(halfWidth[0], encoding, position)
}
fun fromEscaped(raw: String, encoding: Encoding, position: Position): CharLiteral {
val unescaped = raw.unescape()
return CharLiteral(unescaped[0], encoding, position)
return create(unescaped[0], encoding, position)
}
}
@ -756,7 +769,7 @@ class CharLiteral(val value: Char,
}
}
class StringLiteral(val value: String,
class StringLiteral private constructor(val value: String,
var encoding: Encoding,
override val position: Position) : Expression() {
override lateinit var parent: Node
@ -766,10 +779,15 @@ class StringLiteral(val value: String,
}
companion object {
fun fromEscaped(raw: String, encoding: Encoding, position: Position): StringLiteral {
val unescaped = raw.unescape()
return StringLiteral(unescaped, encoding, position)
/**
 * Builds a string literal for [str] in the given [encoding].
 *
 * For [Encoding.KATAKANA] the text is first run through
 * [JapaneseCharacterConverter.zenkakuKatakanaToHankakuKatakana], so the stored value
 * can differ from (and be longer than) [str]. Other encodings store [str] unchanged.
 */
fun create(str: String, encoding: Encoding, position: Position): StringLiteral {
    val text =
        if (encoding == Encoding.KATAKANA)
            JapaneseCharacterConverter.zenkakuKatakanaToHankakuKatakana(str)
        else
            str
    return StringLiteral(text, encoding, position)
}
/** Resolves the escape sequences in [raw] and builds the literal through [create]. */
fun fromEscaped(raw: String, encoding: Encoding, position: Position): StringLiteral {
    val unescaped = raw.unescape()
    return create(unescaped, encoding, position)
}
}
override val isSimple = true

View File

@ -555,7 +555,7 @@ Here are examples of the various encodings:
- ``iso16:"zażółć gęślą jaźń"`` string in iso-8859-16 encoding (Eastern Europe)
- ``atascii:"I am Atari!"`` string in "atascii" encoding (Atari 8-bit)
- ``cp437:"≈ IBM Pc ≈ ♂♀♪☺¶"`` string in "cp437" encoding (IBM PC codepage 437)
- ``kata:"アノ ニホンジン ワ ガイコクジン。"`` string in "kata" encoding (Katakana half-width support only for now)
- ``kata:"アノ ニホンジン ワ ガイコクジン。 # が # ガ"`` string in "kata" encoding (Katakana)
There are several escape sequences available to put special characters into your string value:
@ -976,7 +976,7 @@ where <statements> can be just a single statement or a block again::
if_XX {
<statements>
} else {
<alternative statements>
<alternative statements>
}
The XX corresponds to one of the processor's branching instructions, so the possibilities are:

View File

@ -54,8 +54,8 @@ main {
sub kata() {
txt.kata()
repeat 3 txt.nl()
write_screencodes(kata:"Katakana hw: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。")
txt.print(kata:"Katakana hw: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。")
write_screencodes(kata:"Katakana: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。 ## がが ## ガガ")
txt.print(kata:"Katakana: アノ ニホンジン ワ ガイコクジン ノ ニホンゴ ガ ジョウズ ダッテ ユッタ。 ## がが ## ガガ")
}
sub wait() {