half width katakana conversion

This commit is contained in:
Irmen de Jong
2024-08-21 18:23:30 +02:00
parent b4facaeb3c
commit 8f6eaeac2c
11 changed files with 131 additions and 42 deletions
@@ -6,6 +6,61 @@ import com.github.michaelbull.result.Result
import java.io.CharConversionException
import java.nio.charset.Charset
/**
 * Converts full-width (zenkaku) katakana, and hiragana, to half-width (hankaku) katakana.
 *
 * Half-width katakana has no precomposed voiced forms, so a voiced kana such as 'ガ' expands
 * to two characters: the half-width base kana followed by the half-width (han)dakuten mark.
 * The result can therefore be longer than the input.
 *
 * Only the kana in U+30A1..U+30F6 (and the parallel hiragana U+3041..U+3096) are converted;
 * every other character is passed through unchanged.
 * NOTE(review): this means the prolonged sound mark 'ー' (U+30FC) and full-width Japanese
 * punctuation are not converted here - confirm whether the caller handles those.
 */
object JapaneseCharacterConverter {
    // adapted from https://github.com/raminduw/Japanese-Character-Converter

    /** Full-width katakana covered by the table: 'ァ' (U+30A1) up to and including 'ヶ' (U+30F6). */
    private val ZENKAKU_KATAKANA: CharRange = '\u30A1'..'\u30F6'

    /** Hiragana 'ぁ' (U+3041) up to and including 'ゖ' (U+3096); same order and length as [ZENKAKU_KATAKANA]. */
    private val HIRAGANA: CharRange = '\u3041'..'\u3096'

    /**
     * Half-width replacement for each character of [ZENKAKU_KATAKANA], indexed by
     * `char - ZENKAKU_KATAKANA.first`.
     *
     * Written with unicode escapes on purpose: half-width kana are easily lost or normalised
     * back to full-width by editors and text tooling. U+FF9E is the half-width dakuten,
     * U+FF9F the half-width handakuten.
     */
    private val HANKAKU_KATAKANA: Array<String> = arrayOf(
        // ァ ア ィ イ ゥ
        "\uFF67", "\uFF71", "\uFF68", "\uFF72", "\uFF69",
        // ウ ェ エ ォ オ
        "\uFF73", "\uFF6A", "\uFF74", "\uFF6B", "\uFF75",
        // カ ガ キ ギ ク グ ケ ゲ コ ゴ
        "\uFF76", "\uFF76\uFF9E", "\uFF77", "\uFF77\uFF9E", "\uFF78", "\uFF78\uFF9E",
        "\uFF79", "\uFF79\uFF9E", "\uFF7A", "\uFF7A\uFF9E",
        // サ ザ シ ジ ス ズ セ ゼ ソ ゾ
        "\uFF7B", "\uFF7B\uFF9E", "\uFF7C", "\uFF7C\uFF9E", "\uFF7D", "\uFF7D\uFF9E",
        "\uFF7E", "\uFF7E\uFF9E", "\uFF7F", "\uFF7F\uFF9E",
        // タ ダ チ ヂ ッ ツ ヅ テ デ ト ド
        "\uFF80", "\uFF80\uFF9E", "\uFF81", "\uFF81\uFF9E", "\uFF6F", "\uFF82", "\uFF82\uFF9E",
        "\uFF83", "\uFF83\uFF9E", "\uFF84", "\uFF84\uFF9E",
        // ナ ニ ヌ ネ ノ
        "\uFF85", "\uFF86", "\uFF87", "\uFF88", "\uFF89",
        // ハ バ パ ヒ ビ ピ フ ブ プ
        "\uFF8A", "\uFF8A\uFF9E", "\uFF8A\uFF9F", "\uFF8B", "\uFF8B\uFF9E", "\uFF8B\uFF9F",
        "\uFF8C", "\uFF8C\uFF9E", "\uFF8C\uFF9F",
        // ヘ ベ ペ ホ ボ ポ
        "\uFF8D", "\uFF8D\uFF9E", "\uFF8D\uFF9F", "\uFF8E", "\uFF8E\uFF9E", "\uFF8E\uFF9F",
        // マ ミ ム メ モ
        "\uFF8F", "\uFF90", "\uFF91", "\uFF92", "\uFF93",
        // ャ ヤ ュ ユ ョ ヨ
        "\uFF6C", "\uFF94", "\uFF6D", "\uFF95", "\uFF6E", "\uFF96",
        // ラ リ ル レ ロ
        "\uFF97", "\uFF98", "\uFF99", "\uFF9A", "\uFF9B",
        // ヮ ワ ヰ ヱ ヲ ン  (ヮ, ヰ and ヱ have no half-width form: nearest base kana is used)
        "\uFF9C", "\uFF9C", "\uFF72", "\uFF74", "\uFF66", "\uFF9D",
        // ヴ ヵ ヶ  (ヵ and ヶ have no half-width form: nearest base kana is used)
        "\uFF73\uFF9E", "\uFF76", "\uFF79",
    )

    init {
        // The table is addressed by code point offset, so a missing or extra entry would
        // silently shift every later mapping. Fail fast instead.
        check(HANKAKU_KATAKANA.size == ZENKAKU_KATAKANA.last - ZENKAKU_KATAKANA.first + 1) {
            "half-width katakana table has ${HANKAKU_KATAKANA.size} entries, which does not match the full-width range"
        }
    }

    /** Returns the half-width spelling of a full-width katakana [c], or [c] itself for anything else. */
    private fun zenkakuKatakanaToHankakuKatakana(c: Char): String =
        if (c in ZENKAKU_KATAKANA) HANKAKU_KATAKANA[c - ZENKAKU_KATAKANA.first] else c.toString()

    /** Returns the full-width katakana at the same position as hiragana [c], or [c] itself for anything else. */
    private fun hiraganaToZenkakuKatakana(c: Char): Char =
        if (c in HIRAGANA) ZENKAKU_KATAKANA.first + (c - HIRAGANA.first) else c

    /**
     * Converts every full-width katakana and hiragana character in [s] to half-width katakana.
     *
     * Hiragana is first mapped to the corresponding full-width katakana and then converted like
     * any other katakana. Characters outside both kana ranges are copied unchanged.
     *
     * @param s the text to convert; may be empty.
     * @return the converted text, possibly longer than [s] because voiced kana become two characters.
     */
    fun zenkakuKatakanaToHankakuKatakana(s: String): String = buildString(s.length) {
        for (c in s) {
            append(zenkakuKatakanaToHankakuKatakana(hiraganaToZenkakuKatakana(c)))
        }
    }
}
object KatakanaEncoding {
val charset: Charset = Charset.forName("JIS_X0201")
@@ -13,11 +68,6 @@ object KatakanaEncoding {
return try {
val mapped = str.map { chr ->
when (chr) {
// TODO: Convert regular katakana to halfwidth katakana (java lib doesn't do that for us
// and simply returns '?' upon reaching a regular katakana character)
// NOTE: we probably need to somehow do that before we reach this `when`,
// as one regular katakana character often results in two HW katakana characters
// due to differences in how diacritics are handled.
'\u0000' -> 0u
'\u00a0' -> 0xa0u // $a0 isn't technically a part of JIS X 0201 spec, and so we need to handle this ourselves