1
0
mirror of https://github.com/KarolS/millfork.git synced 2024-05-31 18:41:30 +00:00
millfork/src/main/scala/millfork/parser/TextCodec.scala
2019-07-31 00:20:18 +02:00

710 lines
25 KiB
Scala
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package millfork.parser
import java.util.Locale
import millfork.CompilationOptions
import millfork.error.{ConsoleLogger, Logger}
import millfork.node.Position
/**
  * Translates the characters of a string literal into the numeric codes of one target character encoding.
  *
  * @param name            display name of the encoding, quoted in warnings and errors
  * @param map             code table: the index of a character in this string is its code
  * @param extra           explicit character-to-code overrides, checked before `map`
  * @param decompositions  characters that are encoded as a sequence of other characters
  * @param escapeSequences codes produced by each named `{...}` escape sequence
  * @author Karol Stasiak
  */
class TextCodec(val name: String,
                private val map: String,
                private val extra: Map[Char, Int],
                private val decompositions: Map[Char, String],
                private val escapeSequences: Map[String, List[Int]]) {

  /** Tells whether `c` can be shown verbatim in a diagnostic message, judging by its Unicode general category. */
  private def isPrintable(c: Char): Boolean = c.getType match {
    case Character.LOWERCASE_LETTER | Character.UPPERCASE_LETTER | Character.TITLECASE_LETTER |
         Character.OTHER_LETTER | Character.LETTER_NUMBER |
         Character.DECIMAL_DIGIT_NUMBER | Character.OTHER_NUMBER |
         Character.DASH_PUNCTUATION | Character.START_PUNCTUATION | Character.END_PUNCTUATION |
         Character.INITIAL_QUOTE_PUNCTUATION | Character.FINAL_QUOTE_PUNCTUATION | Character.OTHER_PUNCTUATION |
         Character.CURRENCY_SYMBOL | Character.OTHER_SYMBOL | Character.MATH_SYMBOL |
         Character.SPACE_SEPARATOR => true
    // separators other than plain spaces, controls, modifiers, surrogates and combining marks are shown as code points only
    case _ => false
  }

  /** Renders a character for diagnostics: the glyph (if printable) followed by its U+XXXX code point. */
  private def format(c: Char): String = {
    val u = f"U+${c.toInt}%04X"
    if (isPrintable(c)) f"`$c%c` ($u%s)"
    else u
  }

  /** Renders a string for diagnostics: the text (if fully printable) followed by its comma-separated code points. */
  private def format(s: String): String = {
    val u = s.map(c => f"U+${c.toInt}%04X").mkString(",")
    if (s.forall(isPrintable)) f"`$s%s` ($u%s)"
    else u
  }

  /**
    * Encodes a single character.
    *
    * Lookup order: `decompositions`, then `extra`, then the position of the character in `map`.
    *
    * @param lenient when true, an unencodable character is replaced by the first fully encodable entry of
    *                [[TextCodec.lossyAlternatives]] (or by `?`, or by nothing) and a warning is logged
    * @return the codes for `c`, or `None` if it cannot be encoded and `lenient` is false
    */
  private def encodeChar(log: Logger, position: Option[Position], c: Char, lenient: Boolean): Option[List[Int]] = {
    decompositions.get(c) match {
      case Some(parts) =>
        // The parts of a decomposition are chosen by the codec itself and may be raw code values
        // (for example the U+0001 prefix used by the MSX tables), so they are always encoded strictly
        // and fall back to their own numeric value. Passing `lenient` through would replace such
        // raw prefixes with `?` and log a misleading warning.
        Some(parts.toList.flatMap { part =>
          encodeChar(log, position, part, lenient = false).getOrElse(List(part.toInt))
        })
      case None =>
        extra.get(c) match {
          case Some(code) => Some(List(code))
          case None =>
            val index = map.indexOf(c)
            if (index >= 0) {
              Some(List(index))
            } else if (lenient) {
              // "?" is the replacement of last resort; if even that is unavailable, the character is dropped
              val candidates = TextCodec.lossyAlternatives.getOrElse(c, Nil) :+ "?"
              val alternative = candidates
                .find(_.forall(alt => encodeChar(log, position, alt, lenient = false).isDefined))
                .getOrElse("")
              log.warn(s"Cannot encode ${format(c)} in encoding `$name`, replaced it with ${format(alternative)}", position)
              Some(alternative.toList.flatMap(alt => encodeChar(log, position, alt, lenient = false).getOrElse(Nil)))
            } else {
              None
            }
        }
    }
  }

  /**
    * Encodes a whole string.
    *
    * Text between `{` and the next `}` is treated as an escape sequence. An unclosed `{` is reported
    * as an error and ends the encoding; the codes produced so far are still returned.
    * A character that cannot be encoded is reported as an error and skipped.
    *
    * @param log      receiver of warnings and errors
    * @param position source position attached to the diagnostics
    * @param s        characters to encode
    * @param lenient  whether unencodable characters and unknown escape sequences are tolerated with a warning
    * @return the encoded values, in order
    */
  def encode(log: Logger, position: Option[Position], s: List[Char], lenient: Boolean): List[Int] = {
    val codes = List.newBuilder[Int]

    // Iterative so that long literals cannot exhaust the stack.
    @scala.annotation.tailrec
    def loop(rest: List[Char]): Unit = rest match {
      case '{' :: tail =>
        val (escSeq, closingBrace) = tail.span(_ != '}')
        closingBrace match {
          case '}' :: afterEscape =>
            codes ++= encodeEscapeSequence(log, escSeq.mkString(""), position, lenient)
            loop(afterEscape)
          case _ =>
            log.error(f"Unclosed escape sequence", position)
        }
      case head :: tail =>
        encodeChar(log, position, head, lenient) match {
          case Some(encoded) => codes ++= encoded
          case None => log.error(f"Invalid character ${format(head)} in string", position)
        }
        loop(tail)
      case Nil =>
    }

    loop(s)
    codes.result()
  }

  /**
    * Encodes the content of one `{...}` escape sequence (without the braces).
    *
    * A three-character sequence made of `X`, `x` or `$` followed by two hexadecimal digits yields that
    * byte value directly; anything else is looked up in `escapeSequences`.
    * Unknown sequences produce no codes and are reported as a warning (lenient) or an error.
    */
  private def encodeEscapeSequence(log: Logger, escSeq: String, position: Option[Position], lenient: Boolean): List[Int] = {
    // Both characters must be real digits: Integer.parseInt alone would also accept a sign, e.g. "-5".
    val isHexEscape = escSeq.length == 3 &&
      "Xx$".indexOf(escSeq(0)) >= 0 &&
      escSeq.tail.forall(digit => Character.digit(digit, 16) >= 0)
    if (isHexEscape) {
      List(Integer.parseInt(escSeq.tail, 16))
    } else {
      escapeSequences.getOrElse(escSeq, {
        if (lenient) {
          log.warn(s"Cannot encode escape sequence {$escSeq} in encoding `$name`, skipped it", position)
        } else {
          log.error(s"Invalid escape sequence {$escSeq} for encoding `$name`", position)
        }
        Nil
      })
    }
  }

  /**
    * Looks up the character stored in `map` at the low 8 bits of `by`.
    *
    * @return the character at that position, or [[TextCodec.NotAChar]] if the table is shorter than that
    */
  def decode(by: Int): Char = {
    val index = by & 0xff
    if (index < map.length) map(index) else TextCodec.NotAChar
  }

  /** Prints the decoded characters for codes 0-255, 32 per line, each line prefixed with the hex code of its first entry. */
  def dump(): Unit = {
    (0 until 256).map(decode).zipWithIndex.grouped(32)
      .map(row => row.head._2.toHexString + "\t" + row.map(_._1).mkString(""))
      .foreach(println(_))
  }
}
object TextCodec {
/**
  * Resolves an encoding name to a codec.
  *
  * A trailing `z` in the name is stripped before the lookup and reported through the second element
  * of the result. Unknown names are logged as an error and resolved to the ASCII codec.
  *
  * @param name     encoding name, optionally with the `z` suffix
  * @param position source position attached to the error message
  * @param log      receiver of the error message
  * @return the codec and whether the name carried the `z` suffix
  */
def forName(name: String, position: Option[Position], log: Logger): (TextCodec, Boolean) = {
  val zeroTerminated = name.endsWith("z")
  val codec = name.stripSuffix("z") match {
    case "ascii" => TextCodec.Ascii
    case "petscii" | "pet" => TextCodec.Petscii
    case "petsciijp" | "petjp" => TextCodec.PetsciiJp
    case "oldpetscii" | "oldpet" => TextCodec.OldPetscii
    case "origpetscii" | "origpet" => TextCodec.OriginalPetscii
    case "cbmscr" | "petscr" => TextCodec.CbmScreencodes
    case "cbmscrjp" | "petscrjp" => TextCodec.CbmScreencodesJp
    case "atascii" | "atari" => TextCodec.Atascii
    case "atasciiscr" | "atariscr" => TextCodec.AtasciiScreencodes
    case "bbc" => TextCodec.Bbc
    case "sinclair" => TextCodec.Sinclair
    case "apple2" => TextCodec.Apple2
    case "jis" | "jisx" => TextCodec.Jis
    case "iso_de" => TextCodec.IsoIec646De
    case "iso_no" | "iso_dk" => TextCodec.IsoIec646No
    case "iso_se" | "iso_fi" => TextCodec.IsoIec646Se
    case "iso_yu" => TextCodec.IsoIec646Yu
    case "msx_intl" | "msx_us" | "msx_uk" | "msx_de" | "msx_fr" | "msx_es" => TextCodec.MsxWest
    case "msx_ru" => TextCodec.MsxRu
    case "msx_jp" => TextCodec.MsxJp
    case _ =>
      // the message quotes the name exactly as it was given, suffix included
      log.error(s"Unknown string encoding: `$name`", position)
      TextCodec.Ascii
  }
  (codec, zeroTerminated)
}
/** Placeholder (U+FFFD) used for table positions that hold no character. */
val NotAChar: Char = '\ufffd'
/** Maps the characters U+2400..U+2420 to the codes 0..32 and U+2421 to 127. */
private val DefaultOverrides: Map[Char, Int] = {
  val lowCodes = (0 to 0x20).map(code => (0x2400 + code).toChar -> code)
  (lowCodes :+ ('\u2421' -> 127)).toMap
}
/** Escape sequences shared by the ASCII-like encodings; `{n}` expands to the two codes 13, 10. */
//noinspection ScalaUnusedSymbol
private val AsciiEscapeSequences: Map[String, List[Int]] = {
  val lineBreak = "n" -> List(13, 10)
  val singleCodes = List(
    "t" -> 9,
    "b" -> 8,
    "q" -> '"'.toInt,
    "apos" -> '\''.toInt,
    "lbrace" -> '{'.toInt,
    "rbrace" -> '}'.toInt
  ).map { case (escape, code) => escape -> List(code) }
  (lineBreak :: singleCodes).toMap
}
/** The smallest escape-sequence set: only the two quote characters. */
//noinspection ScalaUnusedSymbol
private val MinimalEscapeSequencesWithoutBraces: Map[String, List[Int]] =
  List("apos" -> '\'', "q" -> '"')
    .map { case (escape, character) => escape -> List(character.toInt) }
    .toMap
/** The two quote characters plus the two curly braces. */
//noinspection ScalaUnusedSymbol
private val MinimalEscapeSequencesWithBraces: Map[String, List[Int]] =
  List("apos" -> '\'', "q" -> '"', "lbrace" -> '{', "rbrace" -> '}')
    .map { case (escape, character) => escape -> List(character.toInt) }
    .toMap
/** Splits each marked katakana into its base kana followed by a separate voicing (or semi-voicing) mark. */
private val StandardKatakanaDecompositions: Map[Char, String] = {
  val plain = "カキクケコサシスセソタチツテトハヒフヘホ"
  val voiced = "ガギグゲゴザジズゼゾダヂヅデドバビブベボ"
  val plainH = "ハヒフヘホ"
  val semiVoiced = "パピプペポ"
  val withVoicingMark = for ((base, marked) <- plain.zip(voiced)) yield marked -> s"${base}゛"
  val withSemiVoicingMark = for ((base, marked) <- plainH.zip(semiVoiced)) yield marked -> s"${base}゜"
  (withVoicingMark ++ withSemiVoicingMark).toMap
}
/** Splits each marked hiragana into its base kana followed by a separate voicing (or semi-voicing) mark. */
private val StandardHiraganaDecompositions: Map[Char, String] = {
  val plain = "かきくけこさしすせそたちつてとはひふへほ"
  val voiced = "がぎぐげござじずぜぞだぢづでどばびぶべぼ"
  val plainH = "はひふへほ"
  val semiVoiced = "ぱぴぷぺぽ"
  val withVoicingMark = for ((base, marked) <- plain.zip(voiced)) yield marked -> s"${base}゛"
  val withSemiVoicingMark = for ((base, marked) <- plainH.zip(semiVoiced)) yield marked -> s"${base}゜"
  (withVoicingMark ++ withSemiVoicingMark).toMap
}
/** ASCII: codes 32-126 map to themselves, codes 0-31 hold no character. */
val Ascii = new TextCodec(
  name = "ASCII",
  map = (0 until 127).map(code => if (code >= 32) code.toChar else NotAChar).mkString,
  extra = Map.empty,
  decompositions = Map.empty,
  escapeSequences = AsciiEscapeSequences)
/** Apple II: the characters U+0020..U+005F are stored at 0xA0..0xDF; lowercase letters are overridden to 0xC1.. */
val Apple2 = new TextCodec(
  name = "APPLE-II",
  map = "\ufffd" * 0xa0 + // 00-9f
    (0x20 until 0x60).map(_.toChar).mkString + // a0-df
    "\ufffd" * 31, // e0-fe
  extra = ('a' to 'z').map(letter => letter -> (0xC1 + (letter - 'a'))).toMap,
  decompositions = Map.empty,
  escapeSequences = MinimalEscapeSequencesWithBraces)
/**
  * German variant of ISO/IEC 646.
  *
  * Fixes relative to the previous table:
  *  - position 0x26 holds `&` again (it held a second `^`, which made `&` unencodable and `^` encode as 0x26);
  *  - the umlaut escape sequences follow the table below: Ä/ä are at 0x5B/0x7B and Ü/ü at 0x5D/0x7D,
  *    so `{AE}`/`{ae}` and `{UE}`/`{ue}` no longer produce each other's codes.
  */
val IsoIec646De = new TextCodec("ISO-IEC-646-DE",
  "\ufffd" * 32 + // 00-1f
    " !\"#$%&'()*+,-./0123456789:;<=>?" + // 20-3f
    "§ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ^_" + // 40-5f
    "`abcdefghijklmnopqrstuvwxyzäöüß", // 60-7e
  DefaultOverrides, Map.empty, AsciiEscapeSequences ++ Map(
    "AE" -> List('['.toInt), // 0x5B
    "OE" -> List('\\'.toInt), // 0x5C
    "UE" -> List(']'.toInt), // 0x5D
    "ae" -> List('{'.toInt), // 0x7B
    "oe" -> List('|'.toInt), // 0x7C
    "ue" -> List('}'.toInt), // 0x7D
    "ss" -> List('~'.toInt) // 0x7E
  )
)
/**
  * Swedish/Finnish variant of ISO/IEC 646.
  *
  * Fixes relative to the previous table:
  *  - position 0x26 holds `&` again (it held a second `^`);
  *  - `$` is mapped to 0x24, the position of `¤` in this table; it used to be mapped to the
  *    Unicode value of `¤` (0xA4), which lies outside the 7-bit range of this encoding.
  */
val IsoIec646Se = new TextCodec("ISO-IEC-646-SE",
  "\ufffd" * 32 + // 00-1f
    " !\"#¤%&'()*+,-./0123456789:;<=>?" + // 20-3f
    "@ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÅ^_" + // 40-5f
    "`abcdefghijklmnopqrstuvwxyzäöå~", // 60-7e
  Map('¯' -> '~'.toInt,
    '‾' -> '~'.toInt,
    'É' -> '@'.toInt,
    'é' -> '`'.toInt,
    'Ü' -> '^'.toInt,
    'ü' -> '~'.toInt,
    '$' -> 0x24),
  Map.empty, AsciiEscapeSequences ++ Map(
    "AE" -> List('['.toInt),
    "OE" -> List('\\'.toInt),
    "AA" -> List(']'.toInt),
    "ae" -> List('{'.toInt),
    "oe" -> List('|'.toInt),
    "aa" -> List('}'.toInt)
  )
)
/**
  * Norwegian/Danish variant of ISO/IEC 646.
  *
  * Position 0x26 holds `&` again; it previously held a second `^`, which made `&` unencodable
  * and made `^` encode as 0x26 instead of 0x5E.
  */
val IsoIec646No = new TextCodec("ISO-IEC-646-NO",
  "\ufffd" * 32 + // 00-1f
    " !\"#$%&'()*+,-./0123456789:;<=>?" + // 20-3f
    "@ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅ^_" + // 40-5f
    "`abcdefghijklmnopqrstuvwxyzæøå~", // 60-7e
  Map('¯' -> '~'.toInt,
    '‾' -> '~'.toInt,
    '|' -> '~'.toInt,
    '¤' -> '$'.toInt,
    'Ä' -> '@'.toInt,
    'ä' -> '`'.toInt,
    'Ü' -> '^'.toInt,
    'ü' -> '~'.toInt,
    '«' -> '"'.toInt,
    '»' -> '"'.toInt,
    '§' -> '#'.toInt),
  Map.empty, AsciiEscapeSequences ++ Map(
    "AE" -> List('['.toInt),
    "OE" -> List('\\'.toInt),
    "AA" -> List(']'.toInt),
    "ae" -> List('{'.toInt),
    "oe" -> List('|'.toInt),
    "aa" -> List('}'.toInt)
  )
)
/**
  * Yugoslav variant of ISO/IEC 646.
  *
  * Position 0x26 holds `&` again; it previously held `^`, which made `&` unencodable and gave `^`
  * the code 0x26 even though this table has no `^` (0x5E is taken by `Č`).
  */
val IsoIec646Yu = new TextCodec("ISO-IEC-646-YU",
  "\ufffd" * 32 + // 00-1f
    " !\"#$%&'()*+,-./0123456789:;<=>?" + // 20-3f
    "ŽABCDEFGHIJKLMNOPQRSTUVWXYZŠĐĆČ_" + // 40-5f
    "žabcdefghijklmnopqrstuvwxyzšđćč", // 60-7e
  Map('Ë' -> '$'.toInt, 'ë' -> '_'.toInt),
  Map.empty, AsciiEscapeSequences)
/**
  * Commodore screen codes.
  *
  * The uppercase letters start at 0x41, so position 0x40 is filled with a placeholder. Without it
  * every entry of the last row was one code too low (`A` at 0x40, `π` at 0x5D), which disagreed
  * with the overrides below (for example `♠` -> 0x41).
  */
val CbmScreencodes = new TextCodec("CBM-Screen",
  "@abcdefghijklmnopqrstuvwxyz[£]↑←" + // 00-1f
    0x20.to(0x3f).map(_.toChar).mkString + // 20-3f
    "\ufffdABCDEFGHIJKLMNOPQRSTUVWXYZ\ufffd\ufffd\ufffdπ", // 40-5e
  Map('^' -> 0x1E, '♥' -> 0x53, '♡' -> 0x53, '♠' -> 0x41, '♣' -> 0x58, '♢' -> 0x5A, '•' -> 0x51),
  Map.empty, MinimalEscapeSequencesWithoutBraces
)
/**
  * Commodore screen codes, Japanese character set.
  *
  * The eight unassigned positions 0x68-0x6F are now present in the table. They were missing, so the
  * row documented as 70-7f actually started at 0x68 and `ア`..`ソ` were encoded eight codes too low,
  * in disagreement with the overrides below (for example `ァ` -> 0x71 and the halfwidth forms -> 0x71..).
  */
val CbmScreencodesJp = new TextCodec("CBM-Screen-JP",
  "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[¥]↑←" + // 00-1f
    0x20.to(0x3f).map(_.toChar).mkString + // 20-3f
    "タチツテトナニヌネノハヒフヘホマ" + // 40-4f
    "ミムメモヤユヨラリルレロワン゛゜" + // 50-5f
    "\ufffd円年月\ufffd\ufffdヲ\ufffd" + // 60-67
    "\ufffd" * 8 + // 68-6f
    "πアイウエオカキクケコサシスセソ" + // 70-7f
    "",
  Map('^' -> 0x1E, '\\' -> 0x1C,
    '♥' -> 0x44, '♡' -> 0x44, '♠' -> 0x41, '♣' -> 0x7B, '♢' -> 0x42, '•' -> 0x5D,
    'ー' -> '-'.toInt, 0xff70.toChar -> '-'.toInt, 0xff66.toChar -> 0x66,
    'ヮ' -> 0x5C, 'ヵ' -> 0x76, 'ヶ' -> 0x79,
    'ァ' -> 0x71, 0xff67.toChar -> 0x71,
    'ィ' -> 0x72, 0xff68.toChar -> 0x72,
    'ゥ' -> 0x73, 0xff69.toChar -> 0x73,
    'ェ' -> 0x74, 0xff6a.toChar -> 0x74,
    'ォ' -> 0x75, 0xff6b.toChar -> 0x75,
    'ャ' -> 0x54, 0xff6c.toChar -> 0x54,
    'ュ' -> 0x55, 0xff6d.toChar -> 0x55,
    'ョ' -> 0x56, 0xff6e.toChar -> 0x56,
    'ッ' -> 0x42, 0xff6f.toChar -> 0x42
  ) ++
    ('a' to 'z').map(l => l -> (l - 'a' + 1)) ++
    (1 to 0xf).map(i => (i + 0xff70).toChar -> (i + 0x70)) ++
    (0x10 to 0x2f).map(i => (i + 0xff70).toChar -> (i + 0x40)),
  StandardKatakanaDecompositions, MinimalEscapeSequencesWithoutBraces
)
/**
  * PETSCII.
  *
  * The uppercase letters start at 0xC1, so position 0xC0 is filled with a placeholder. Without it
  * every entry of the last row was one code too low (`A` at 0xC0, `π` at 0xDD), which disagreed
  * with the overrides below (for example `♠` -> 0xC1).
  */
val Petscii = new TextCodec("PETSCII",
  "\ufffd" * 32 + // 00-1f
    0x20.to(0x3f).map(_.toChar).mkString + // 20-3f
    "@abcdefghijklmnopqrstuvwxyz[£]↑←" + // 40-5f
    "\ufffd" * 32 + // 60-7f
    "\ufffd" * 32 + // 80-9f
    "\ufffd" * 32 + // a0-bf
    "\ufffdABCDEFGHIJKLMNOPQRSTUVWXYZ\ufffd\ufffd\ufffdπ", // c0-de
  Map('^' -> 0x5E, '♥' -> 0xD3, '♡' -> 0xD3, '♠' -> 0xC1, '♣' -> 0xD8, '♢' -> 0xDA, '•' -> 0xD1), Map.empty, Map(
    "n" -> List(13),
    "q" -> List('\"'.toInt),
    "apos" -> List('\''.toInt),
    "up" -> List(0x91),
    "down" -> List(0x11),
    "left" -> List(0x9d),
    "right" -> List(0x1d),
    "white" -> List(5),
    "black" -> List(0x90),
    "red" -> List(0x1c),
    "blue" -> List(0x1f),
    "green" -> List(0x1e),
    "cyan" -> List(0x9f),
    "purple" -> List(0x9c),
    "yellow" -> List(0x9e),
    "reverse" -> List(0x12),
    "reverseoff" -> List(0x92)
  )
)
/**
  * PETSCII, Japanese character set.
  *
  * `♡` now shares the code of `♥` (0xC4), as it does in every other table of this file.
  * It used to be mapped to 0x73, a position that holds no character in this table.
  */
val PetsciiJp = new TextCodec("PETSCII-JP",
  "\ufffd" * 32 + // 00-1f
    0x20.to(0x3f).map(_.toChar).mkString + // 20-3f
    "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[¥]↑←" + // 40-5f
    "\ufffd" * 32 + // 60-7f
    "\ufffd" * 32 + // 80-9f
    "\ufffd円年月\ufffd\ufffdヲ\ufffd" + // a0-a7
    "\ufffd" * 8 + // a8-af
    "πアイウエオカキクケコサシスセソ" + // b0-bf
    "タチツテトナニヌネノハヒフヘホマ" + // c0-cf
    "ミムメモヤユヨラリルレロワン゛゜", // d0-df
  Map('^' -> 0x5E, '\\' -> 0x5C,
    '♥' -> 0xC4, '♡' -> 0xC4, '♠' -> 0xC1, '♣' -> 0xBB, '♢' -> 0xC2, '•' -> 0xDD,
    'ー' -> '-'.toInt, 0xff70.toChar -> '-'.toInt, 0xff66.toChar -> 0xa6,
    'ヮ' -> 0xDC, 'ヵ' -> 0xB6, 'ヶ' -> 0xB9,
    'ァ' -> 0xB1, 0xff67.toChar -> 0xB1,
    'ィ' -> 0xB2, 0xff68.toChar -> 0xB2,
    'ゥ' -> 0xB3, 0xff69.toChar -> 0xB3,
    'ェ' -> 0xB4, 0xff6a.toChar -> 0xB4,
    'ォ' -> 0xB5, 0xff6b.toChar -> 0xB5,
    'ャ' -> 0xD4, 0xff6c.toChar -> 0xD4,
    'ュ' -> 0xD5, 0xff6d.toChar -> 0xD5,
    'ョ' -> 0xD6, 0xff6e.toChar -> 0xD6,
    'ッ' -> 0xC2, 0xff6f.toChar -> 0xC2) ++
    ('a' to 'z').map(l => l -> l.toUpper.toInt) ++
    (1 to 0x2f).map(i => (i + 0xff70).toChar -> (i + 0xb0)),
  StandardKatakanaDecompositions, Map(
    "n" -> List(13),
    "q" -> List('\"'.toInt),
    "apos" -> List('\''.toInt),
    "up" -> List(0x91),
    "down" -> List(0x11),
    "left" -> List(0x9d),
    "right" -> List(0x1d),
    "white" -> List(5),
    "black" -> List(0x90),
    "red" -> List(0x1c),
    "blue" -> List(0x1f),
    "green" -> List(0x1e),
    "cyan" -> List(0x9f),
    "purple" -> List(0x9c),
    "yellow" -> List(0x9e),
    "reverse" -> List(0x12),
    "reverseoff" -> List(0x92)
  )
)
/**
  * PETSCII as used by older machines (backslash instead of the pound sign, no colour escapes).
  *
  * Fixes relative to the previous table:
  *  - position 0xC0 is filled with a placeholder so that the uppercase letters start at 0xC1;
  *    without it every entry of the last row was one code too low, in disagreement with the
  *    overrides (for example `♠` -> 0xC1);
  *  - `♣` is mapped to 0xD8, consistently with the PETSCII table and with the screen-code
  *    table (0x58); it was 0xC8.
  */
val OldPetscii = new TextCodec("Old PETSCII",
  "\ufffd" * 32 + // 00-1f
    0x20.to(0x3f).map(_.toChar).mkString + // 20-3f
    "@abcdefghijklmnopqrstuvwxyz[\\]↑←" + // 40-5f
    "\ufffd" * 32 + // 60-7f
    "\ufffd" * 32 + // 80-9f
    "\ufffd" * 32 + // a0-bf
    "\ufffdABCDEFGHIJKLMNOPQRSTUVWXYZ\ufffd\ufffd\ufffdπ", // c0-de
  Map('^' -> 0x5E, '♥' -> 0xD3, '♡' -> 0xD3, '♠' -> 0xC1, '♣' -> 0xD8, '♢' -> 0xDA, '•' -> 0xD1), Map.empty, Map(
    "n" -> List(13),
    "q" -> List('\"'.toInt),
    "apos" -> List('\''.toInt),
    "up" -> List(0x91),
    "down" -> List(0x11),
    "left" -> List(0x9d),
    "right" -> List(0x1d),
    "reverse" -> List(0x12),
    "reverseoff" -> List(0x92)
  )
)
/**
  * The original PETSCII layout: uppercase letters at 0x41.., lowercase letters at 0xC1...
  *
  * Fixes relative to the previous table:
  *  - position 0xC0 is filled with a placeholder so that the lowercase letters start at 0xC1;
  *    without it every entry of the last row was one code too low, in disagreement with the
  *    overrides (for example `♠` -> 0xC1);
  *  - `♣` is mapped to 0xD8, consistently with the PETSCII table and with the screen-code
  *    table (0x58); it was 0xC8.
  */
val OriginalPetscii = new TextCodec("Original PETSCII",
  "\ufffd" * 32 + // 00-1f
    0x20.to(0x3f).map(_.toChar).mkString + // 20-3f
    "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]↑←" + // 40-5f
    "\ufffd" * 32 + // 60-7f
    "\ufffd" * 32 + // 80-9f
    "\ufffd" * 32 + // a0-bf
    "\ufffdabcdefghijklmnopqrstuvwxyz\ufffd\ufffd\ufffdπ", // c0-de
  Map('^' -> 0x5E, '♥' -> 0xD3, '♡' -> 0xD3, '♠' -> 0xC1, '♣' -> 0xD8, '♢' -> 0xDA, '•' -> 0xD1), Map.empty, Map(
    "n" -> List(13),
    "q" -> List('\"'.toInt),
    "apos" -> List('\''.toInt),
    "up" -> List(0x91),
    "down" -> List(0x11),
    "left" -> List(0x9d),
    "right" -> List(0x1d),
    "reverse" -> List(0x12),
    "reverseoff" -> List(0x92)
  )
)
/**
  * ATASCII.
  *
  * The block before the ASCII range is now exactly 32 positions long. It was 31 positions long,
  * so every character from `•` onwards was encoded one code too low (space as 0x1F, `a` as 0x60)
  * and `•` (0x13) disagreed with the `·` -> 0x14 override. A third placeholder between `♣` and `•`
  * restores the alignment.
  */
val Atascii = new TextCodec("ATASCII",
  "♡" + // 00
    "\ufffd" * 15 + // 01-0f
    "♣\ufffd\ufffd\ufffd•" + // 10-14
    "\ufffd" * 11 + // 15-1f
    0x20.to(0x5f).map(_.toChar).mkString + // 20-5f
    "♢abcdefghijklmnopqrstuvwxyz♠|", // 60-7c
  Map('♥' -> 0, '·' -> 0x14), Map.empty, MinimalEscapeSequencesWithoutBraces ++ Seq(
    "n" -> List(0x9b),
    "up" -> List(0x1c),
    "down" -> List(0x1d),
    "left" -> List(0x1e),
    "right" -> List(0x1f),
    "b" -> List(0x7e),
  )
)
/**
  * Atari screen codes: the ASCII range 0x20-0x5F comes first, followed by the graphics block and
  * the lowercase letters.
  *
  * The graphics block (0x40-0x5F) is now exactly 32 positions long. It was 31 positions long, so
  * `•`, the arrows and the lowercase letters were encoded one code too low and `•` (0x53)
  * disagreed with the `·` -> 0x54 override. A third placeholder between `♣` and `•` restores
  * the alignment.
  */
val AtasciiScreencodes = new TextCodec("ATASCII-Screen",
  0x20.to(0x3f).map(_.toChar).mkString + // 00-1f
    0x40.to(0x5f).map(_.toChar).mkString + // 20-3f
    "♡" + // 40
    "\ufffd" * 15 + // 41-4f
    "♣\ufffd\ufffd\ufffd•" + // 50-54
    "\ufffd" * 7 + "↑↓←→" + // 55-5b, 5c-5f
    "♢abcdefghijklmnopqrstuvwxyz♠|", // 60-7c
  Map('♥' -> 0x40, '·' -> 0x54), Map.empty, MinimalEscapeSequencesWithoutBraces
)
/** BBC: ASCII layout with `£` at 0x60 and `©` at 0x7F; `↑` is encoded as `^`; `{n}` is a single code 13. */
val Bbc = {
  val asciiUpTo5f = (0x20 to 0x5f).map(_.toChar).mkString
  val asciiFrom61 = (0x61 to 0x7E).map(_.toChar).mkString
  new TextCodec(
    name = "BBC",
    map = "\ufffd" * 32 + asciiUpTo5f + "£" + asciiFrom61 + "©",
    extra = Map('↑' -> '^'.toInt),
    decompositions = Map.empty,
    escapeSequences = MinimalEscapeSequencesWithBraces + ("n" -> List(13)))
}
/**
  * Sinclair: ASCII layout with `£` at 0x60 and `©` at 0x7F; `↑` is encoded as `^`.
  * Colour escapes expand to two codes: 0x10 (or 0x11 for the `bg` variants) followed by the colour number.
  */
val Sinclair = {
  val colours = List(
    "white" -> 7, "black" -> 8, "red" -> 2, "blue" -> 1,
    "green" -> 4, "cyan" -> 5, "purple" -> 3, "yellow" -> 6)
  val foreground = colours.map { case (colour, number) => colour -> List(0x10, number) }
  val background = colours.map { case (colour, number) => s"bg$colour" -> List(0x11, number) }
  val basics = List(
    "n" -> List(13),
    "q" -> List('"'.toInt),
    "apos" -> List('\''.toInt),
    "lbrace" -> List('{'.toInt),
    "rbrace" -> List('}'.toInt),
    "up" -> List(11),
    "down" -> List(10),
    "left" -> List(8),
    "right" -> List(9),
    "reverse" -> List(0x14, 1),
    "reverseoff" -> List(0x14, 0))
  new TextCodec(
    name = "Sinclair",
    map = "\ufffd" * 32 +
      (0x20 to 0x5f).map(_.toChar).mkString +
      "£" + (0x61 to 0x7E).map(_.toChar).mkString + "©",
    extra = Map('↑' -> '^'.toInt),
    decompositions = Map.empty,
    escapeSequences = (basics ++ foreground ++ background).toMap)
}
/** 64 table positions of punctuation and katakana, in four rows of 16; the first position holds no character. */
private val jisHalfwidthKatakanaOrder: String = List(
  "\ufffd。「」、・ヲァィゥェォャュョッ",
  "ーアイウエオカキクケコサシスセソ",
  "タチツテトナニヌネノハヒフヘホマ",
  "ミムメモヤユヨラリルレロワン゛゜"
).mkString
/**
  * JIS X 0201: an ASCII-like lower half with `¥` at 0x5C, the kana block at 0xA0-0xDF and a few
  * symbols above it. Halfwidth forms U+FF61..U+FF9F are mapped straight to 0xA1..0xDF.
  */
//noinspection ScalaUnnecessaryParentheses
val Jis = {
  val lowerHalf =
    "\ufffd" * 32 + // 00-1f
      (' ' to 'Z').mkString + // 20-5a
      "[¥]^_" + // 5b-5f
      "`" + ('a' to 'z').mkString + "{|}~\ufffd" // 60-7f
  val upperHalf =
    "\ufffd" * 32 + // 80-9f
      jisHalfwidthKatakanaOrder + // a0-df
      "\ufffd" * 8 + // e0-e7
      "♠♡♢♣" + // e8-eb
      "\ufffd" * 4 + // ec-ef
      "円年月日時分秒" + // f0-f6
      "\ufffd" * 3 + "\\" // f7-f9, fa
  val halfwidthForms = (1 to 0x3F).map(i => (i + 0xff60).toChar -> (i + 0xA0)).toMap
  new TextCodec(
    name = "JIS-X-0201",
    map = lowerHalf + upperHalf,
    extra = Map('¯' -> '~'.toInt, '‾' -> '~'.toInt, '♥' -> 0xE9) ++ halfwidthForms,
    decompositions = StandardKatakanaDecompositions,
    escapeSequences = MinimalEscapeSequencesWithBraces + ("n" -> List(13, 10)))
}
/**
  * MSX international character set.
  *
  * Fixes relative to the previous table:
  *  - row a0-af has 16 characters again: a single Cyrillic letter stood where the two characters
  *    U+00D1 (capital N with tilde) and U+00AA (feminine ordinal indicator) belong;
  *  - row b0-bf has 16 characters again: the two ligatures U+0132 and U+0133 had been spelled out
  *    as two letters each.
  * Together these shifted every later position by one, so the Greek and mathematical rows no
  * longer matched the `ß` -> 0xE1 override. The affected characters are written as Unicode
  * escapes so that the row lengths cannot be altered by text normalization.
  * The duplicated `♢` decomposition entry was removed.
  */
val MsxWest = new TextCodec("MSX-International",
  "\ufffd" * 32 + // 00-1f
    (0x20 to 0x7e).map(_.toChar).mkString("") + // 20-7e
    "\ufffd" + // 7f
    "ÇüéâäàåçêëèïîìÄÅ" + // 80-8f
    "ÉæÆôöòûùÿÖÜ¢£¥₧ƒ" + // 90-9f
    "áíóúñ\u00d1\u00aaº¿⌐¬½¼¡«»" + // a0-af
    "ÃãĨĩÕõŨũ\u0132\u0133¾\ufffd\ufffd‰¶§" + // b0-bf
    "\ufffd" * 24 + // c0-d7
    "Δ\ufffdω\ufffd\ufffd\ufffd\ufffd\ufffd" + // d8-df
    "αβΓΠΣσµγΦθΩδ∞∅∈∩" + // e0-ef
    "≡±≥≤\ufffd\ufffd÷\ufffd\ufffd\ufffd\ufffd\ufffdⁿ²", // f0-fd
  Map('ß' -> 0xE1, '¦' -> 0x7C),
  // each of these symbols is encoded as the raw code 1 followed by a letter
  Map('♥' -> "\u0001C", '♡' -> "\u0001C", '♢' -> "\u0001D", '♣' -> "\u0001E", '♠' -> "\u0001F", '·' -> "\u0001G"),
  MinimalEscapeSequencesWithBraces + ("n" -> List(13, 10))
)
/** MSX Russian character set: Greek and mathematical symbols at 0x98-0xBD, Cyrillic letters from 0xC0. */
val MsxRu = {
  val table =
    "\ufffd" * 32 + // 00-1f
      (0x20 to 0x7e).map(_.toChar).mkString + // 20-7e
      "\ufffd" * 25 + // 7f-97
      "Δ\ufffdω\ufffd\ufffd\ufffd\ufffd\ufffd" + // 98-9f
      "αβΓΠΣσµγΦθΩδ∞∅∈∩" + // a0-af
      "≡±≥≤\ufffd\ufffd÷\ufffd\ufffd\ufffd\ufffd\ufffdⁿ²\ufffd\ufffd" + // b0-bf
      "юабцдефгхийклмнопярстужвьызшэщчъ" + // c0-df
      "ЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩ" // e0-fe
  // each of these symbols is encoded as the raw code 1 followed by a letter
  val symbolSequences = Map(
    '♥' -> "\u0001C",
    '♡' -> "\u0001C",
    '♢' -> "\u0001D",
    '♣' -> "\u0001E",
    '♠' -> "\u0001F",
    '·' -> "\u0001G")
  new TextCodec(
    name = "MSX-RU",
    map = table,
    extra = Map('ß' -> 0xA1, '¦' -> 0x7C),
    decompositions = symbolSequences,
    escapeSequences = MinimalEscapeSequencesWithBraces + ("n" -> List(13, 10)))
}
/**
  * MSX Japanese character set: ASCII with `¥` at 0x5C, hiragana at 0x86-0x9F and 0xE0-0xFD,
  * the kana block of `jisHalfwidthKatakanaOrder` at 0xA0-0xDF.
  *
  * Position 0x84 now holds the usual U+FFFD placeholder. It held U+FFDD, an unassigned code point
  * that differs from the placeholder by one digit; as a result that code point was encodable as
  * 0x84 and `decode(0x84)` did not return `NotAChar`.
  */
val MsxJp = new TextCodec("MSX-JP",
  "\ufffd" * 32 + // 00-1f
    (0x20 to 0x7e).map(c => if (c == 0x5c) '¥' else c.toChar).mkString("") + // 20-7e
    "\ufffd" + // 7f
    "♠♡♣♢\ufffd·をぁぃぅぇぉゃゅょっ" + // 80-8f
    " あいうえおかきくけこさしすせそ" + // 90-9f
    jisHalfwidthKatakanaOrder + // a0-df
    "たちつてとなにぬねのはひふへほま" + // e0-ef
    "みむめもやゆよらりるれろわん" + // f0-fd
    "" +
    "",
  Map('♥' -> 0x81, '¦' -> 0x7C) ++
    1.to(0x3F).map(i => (i + 0xff60).toChar -> (i + 0xA0)).toMap,
  // each of these kanji is encoded as the raw code 1 followed by another character
  Map(
    '月' -> "\u0001A",
    '火' -> "\u0001B",
    '水' -> "\u0001C",
    '木' -> "\u0001D",
    '金' -> "\u0001E",
    '土' -> "\u0001F",
    '日' -> "\u0001G",
    '年' -> "\u0001H",
    '円' -> "\u0001I",
    '時' -> "\u0001J",
    '分' -> "\u0001K",
    '秒' -> "\u0001L",
    '百' -> "\u0001M",
    '千' -> "\u0001N",
    '万' -> "\u0001O",
    '大' -> "\u0001]",
    '中' -> "\u0001^",
    '小' -> "\u0001_"
  ) ++
    StandardHiraganaDecompositions ++ StandardKatakanaDecompositions,
  MinimalEscapeSequencesWithBraces + ("n" -> List(13, 10))
)
/**
  * Replacement candidates for characters that a codec cannot encode, in order of preference.
  *
  * The map is assembled from several groups; when a character occurs in more than one group,
  * the group listed later in the final concatenation wins.
  *
  * Fixes relative to the previous definition:
  *  - the keys for the Latin ligatures (U+FB00..U+FB04, U+0132, U+0133) are written as Unicode
  *    escapes; they had been spelled out as several letters inside a character literal, which is
  *    not a valid `Char`;
  *  - repeated identical entries were removed.
  */
val lossyAlternatives: Map[Char, List[String]] = {
  val allowLowercase: Map[Char, List[String]] = ('A' to 'Z').map(c => c -> List(c.toString.toLowerCase(Locale.ROOT))).toMap
  val allowUppercase: Map[Char, List[String]] = ('a' to 'z').map(c => c -> List(c.toString.toUpperCase(Locale.ROOT))).toMap
  val ligaturesAndSymbols: Map[Char, List[String]] = Map(
    '¦' -> List("|"),
    '|' -> List("¦"),
    'ß' -> List("ss", "SS"),
    '\ufb00' -> List("ff", "FF"), // LATIN SMALL LIGATURE FF
    '\ufb02' -> List("fl", "FL"), // LATIN SMALL LIGATURE FL
    '\ufb01' -> List("fi", "FI"), // LATIN SMALL LIGATURE FI
    '\ufb03' -> List("ffi", "FFI"), // LATIN SMALL LIGATURE FFI
    '\ufb04' -> List("ffl", "FFL"), // LATIN SMALL LIGATURE FFL
    '½' -> List("1/2"),
    '¼' -> List("1/4"),
    '¾' -> List("3/4"),
    '¥' -> List("Y", "y"),
    '円' -> List("¥", "Y", "y"),
    '年' -> List("Y", "y"),
    '月' -> List("M", "m"),
    '日' -> List("D", "d"),
    '時' -> List("h", "H"),
    '分' -> List("m", "M"),
    '秒' -> List("s", "S"),
    '♥' -> List("H", "h"),
    '♠' -> List("S", "s"),
    '♡' -> List("H", "h"),
    '♢' -> List("D", "d"),
    '♣' -> List("C", "c"),
    '。' -> List("."),
    '、' -> List(","),
    '・' -> List("-"),
    '•' -> List("・", "*"),
    '「' -> List("[", "("),
    '」' -> List("]", ")"),
    '^' -> List("↑"),
    '↑' -> List("^"),
    '‾' -> List("~"),
    '¯' -> List("~"),
    '«' -> List("\""),
    '»' -> List("\""),
    '§' -> List("#"),
    '[' -> List("("),
    ']' -> List(")"),
    '{' -> List("("),
    '}' -> List(")"),
    '©' -> List("(C)"),
    'İ' -> List("I", "i"),
    'ª' -> List("a", "A"),
    'º' -> List("o", "O"),
    '‰' -> List("%."),
    '÷' -> List("/"),
    '\u0133' -> List("ij", "IJ"), // LATIN SMALL LIGATURE IJ
    '\u0132' -> List("IJ", "ij"), // LATIN CAPITAL LIGATURE IJ
  )
  // Every accented letter falls back to its plain spelling, preferring the same case;
  // the uppercase form of each listed letter is derived automatically.
  val accentedLetters: Map[Char, List[String]] = List(
    "áàäãåąāǎă" -> "a",
    "çčċćĉ" -> "c",
    "đď" -> "d",
    "ð" -> "dh",
    "éèêëęēėě" -> "e",
    "ğǧĝģġ" -> "g",
    "ħĥ" -> "h",
    "íıìîïįīǐĭĩ" -> "i",
    "ĵ" -> "j",
    "ķ" -> "k",
    "ĺľłļŀ" -> "l",
    "ñńňņŋ" -> "n",
    "óòöôőõøōǒ" -> "o",
    "řŗŕ" -> "r",
    "śšŝșşſ" -> "s",
    "ţțťŧ" -> "t",
    "þ" -> "th",
    "úùũûüűųūǔůǘǜǚǖ" -> "u",
    "ẃẁŵ" -> "w",
    "ýÿỳŷȳ" -> "y",
    "žźż" -> "z",
    "æ" -> "ae",
    "œ" -> "oe",
  ).flatMap { case (acc, plain) => acc.toList.flatMap(letter => List(
    letter -> List(plain, plain.toUpperCase(Locale.ROOT)),
    letter.toUpper -> List(plain.toUpperCase(Locale.ROOT), plain)
  )) }.toMap
  // U+3041..U+3096 are replaced by the characters 0x60 code points higher
  val hiragana: Map[Char, List[String]] = (0x3041 to 0x3096).map { kana => kana.toChar -> List(kana.+(0x60).toChar.toString) }.toMap
  // U+FF01..U+FF5E are replaced by the characters 0xFEE0 code points lower, letters also in the other case
  val fullWidth: Map[Char, List[String]] = (0xff01 to 0xff5e).map { i =>
    val fw = i.toChar
    val hw = i.-(0xfee0).toChar
    if (hw.isUpper) fw -> List(hw.toString, hw.toString.toLowerCase(Locale.ROOT))
    else if (hw.isLower) fw -> List(hw.toString, hw.toString.toUpperCase(Locale.ROOT))
    else fw -> List(hw.toString)
  }.toMap
  // U+FF61..U+FF9F are replaced by the character at the same offset in jisHalfwidthKatakanaOrder
  val halfWidth: Map[Char, List[String]] = (0xff61 to 0xff9f).map { c => c.toChar -> List(jisHalfwidthKatakanaOrder(c - 0xff60).toString) }.toMap
  allowLowercase ++ allowUppercase ++ ligaturesAndSymbols ++ accentedLetters ++ hiragana ++ fullWidth ++ halfWidth
}
}