1
0
mirror of https://github.com/KarolS/millfork.git synced 2025-01-10 20:29:35 +00:00

#9 Support astral characters in UTF-8

This commit is contained in:
Karol Stasiak 2019-10-18 11:01:31 +02:00
parent 3a6790e47a
commit f5b6d9999c
4 changed files with 38 additions and 26 deletions

View File

@ -55,7 +55,7 @@
* `vectrex` built-in Vectrex font * `vectrex` built-in Vectrex font
* `utf8` UTF-8 (BMP only) * `utf8` UTF-8
* `utf16be`, `utf16le` UTF-16BE and UTF-16LE * `utf16be`, `utf16le` UTF-16BE and UTF-16LE

View File

@ -68,7 +68,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
if (zt) { if (zt) {
log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p)) log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p))
} }
co.encode(options.log, Some(p), c.toList, options, lenient = lenient) match { co.encode(options.log, Some(p), c.codePoints().toArray.toList, options, lenient = lenient) match {
case List(value) => case List(value) =>
LiteralExpression(value, 1) LiteralExpression(value, 1)
case _ => case _ =>
@ -87,7 +87,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map { val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
case (p, s, ((co, zt), lenient)) => case (p, s, ((co, zt), lenient)) =>
val characters = co.encode(options.log, None, s, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p)) val characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1)) if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1))
else characters else characters
} }
@ -184,7 +184,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
optSlice <- ("," ~/ HWS ~/ literalAtom ~/ HWS ~/ "," ~/ HWS ~/ literalAtom ~/ HWS ~/ Pass).? optSlice <- ("," ~/ HWS ~/ literalAtom ~/ HWS ~/ "," ~/ HWS ~/ literalAtom ~/ HWS ~/ Pass).?
_ <- ")" ~/ Pass _ <- ")" ~/ Pass
} yield { } yield {
val data = Files.readAllBytes(Paths.get(currentDirectory, filePath.mkString)) val data = Files.readAllBytes(Paths.get(currentDirectory, filePath))
val slice = optSlice.fold(data) { val slice = optSlice.fold(data) {
case (start, length) => data.slice(start.value.toInt, start.value.toInt + length.value.toInt) case (start, length) => data.slice(start.value.toInt, start.value.toInt + length.value.toInt)
} }
@ -613,7 +613,7 @@ object MfParser {
val identifier: P[String] = P((letter ~ lettersOrDigits).map { case (a, b) => a + b }).opaque("<identifier>") val identifier: P[String] = P((letter ~ lettersOrDigits).map { case (a, b) => a + b }).opaque("<identifier>")
val doubleQuotedString: P[List[Char]] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"").map(_.toList) val doubleQuotedString: P[String] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"")
def size(value: Long, wordLiteral: Boolean, int24Literal: Boolean, int32Literal: Boolean): Int = { def size(value: Long, wordLiteral: Boolean, int24Literal: Boolean, int32Literal: Boolean): Int = {
val w = value > 255 || value < -0x80 || wordLiteral val w = value > 255 || value < -0x80 || wordLiteral

View File

@ -17,7 +17,7 @@ sealed trait TextCodec {
def stringTerminator: List[Int] def stringTerminator: List[Int]
def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int]
def decode(by: Int): Char def decode(by: Int): Char
@ -55,19 +55,19 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
} }
if (escSeq.length > 1 && (escSeq(0) == 'U' || escSeq(0) == 'u')) { if (escSeq.length > 1 && (escSeq(0) == 'U' || escSeq(0) == 'u')) {
try { try {
return encode(log, position, Character.toChars(Integer.parseInt(escSeq.tail, 16)).toList, options, lenient) return encode(log, position, List(Integer.parseInt(escSeq.tail, 16)), options, lenient)
} catch { } catch {
case _: NumberFormatException => case _: NumberFormatException =>
} }
} }
if (escSeq == "program_name_upper") { if (escSeq == "program_name_upper") {
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient) return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
} }
if (escSeq == "program_name") { if (escSeq == "program_name") {
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient) return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
} }
if (escSeq == "copyright_year") { if (escSeq == "copyright_year") {
return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient) return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
} }
if (escSeq == "null" || escSeq == "nullchar") { if (escSeq == "null" || escSeq == "nullchar") {
return stringTerminator return stringTerminator
@ -85,9 +85,10 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
} }
} }
override def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = { override def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
val LBRACE = '{'.toInt
s match { s match {
case '{' :: tail => case LBRACE :: tail =>
val (escSeq, closingBrace) = tail.span(_ != '}') val (escSeq, closingBrace) = tail.span(_ != '}')
closingBrace match { closingBrace match {
case '}' :: xs => case '}' :: xs =>
@ -97,7 +98,7 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
Nil Nil
} }
case head :: tail => case head :: tail =>
head.toString.getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient) Character.toChars(head).mkString("").getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient)
case Nil => Nil case Nil => Nil
} }
} }
@ -118,8 +119,8 @@ class TableTextCodec(override val name: String,
override val stringTerminator: List[Int] = List(stringTerminatorChar) override val stringTerminator: List[Int] = List(stringTerminatorChar)
private def isPrintable(c: Char) = { private def isPrintable(c: Int) = {
c.getType match { Character.getType(c) match {
case Character.LOWERCASE_LETTER => true case Character.LOWERCASE_LETTER => true
case Character.UPPERCASE_LETTER => true case Character.UPPERCASE_LETTER => true
case Character.TITLECASE_LETTER => true case Character.TITLECASE_LETTER => true
@ -148,15 +149,16 @@ class TableTextCodec(override val name: String,
} }
} }
private def format(c:Char):String = { private def format(c:Int):String = {
val u = f"U+${c.toInt}%04X" val u = f"U+${c.toInt}%04X"
if (isPrintable(c)) f"`$c%c` ($u%s)" if (isPrintable(c)) f"`${Character.toChars(c).mkString}%s` ($u%s)"
else u else u
} }
private def format(s:String) = { private def format(s:String) = {
val u = s.map(c => f"U+${c.toInt}%04X").mkString(",") val codePoints = s.codePoints().toArray
if (s.forall(isPrintable)) f"`$s%s` ($u%s)" val u = codePoints.map(c => f"U+${c}%04X").mkString(",")
if (codePoints.forall(isPrintable)) f"`$s%s` ($u%s)"
else u else u
} }
private def encodeChar(log: Logger, position: Option[Position], c: Char, options: CompilationOptions, lenient: Boolean): Option[List[Int]] = { private def encodeChar(log: Logger, position: Option[Position], c: Char, options: CompilationOptions, lenient: Boolean): Option[List[Int]] = {
@ -177,10 +179,11 @@ class TableTextCodec(override val name: String,
} }
def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = { def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
val LBRACE = '{'.toInt
val lenient = options.flag(CompilationFlag.LenientTextEncoding) val lenient = options.flag(CompilationFlag.LenientTextEncoding)
s match { s match {
case '{' :: tail => case LBRACE :: tail =>
val (escSeq, closingBrace) = tail.span(_ != '}') val (escSeq, closingBrace) = tail.span(_ != '}')
closingBrace match { closingBrace match {
case '}' :: xs => case '}' :: xs =>
@ -189,13 +192,16 @@ class TableTextCodec(override val name: String,
log.error(f"Unclosed escape sequence", position) log.error(f"Unclosed escape sequence", position)
Nil Nil
} }
case head :: tail => case head :: tail if head >= Char.MinValue && head <= Char.MaxValue =>
(encodeChar(log, position, head, options, lenient) match { (encodeChar(log, position, head.toChar, options, lenient) match {
case Some(x) => x case Some(x) => x
case None => case None =>
log.error(f"Invalid character ${format(head)} in string", position) log.error(f"Invalid character ${format(head)} in string", position)
Nil Nil
}) ++ encode(log, position, tail, options, lenient) }) ++ encode(log, position, tail, options, lenient)
case head :: tail =>
log.error(f"Invalid character ${format(head)} in string", position)
encode(log, position, tail, options, lenient)
case Nil => Nil case Nil => Nil
} }
} }
@ -209,13 +215,13 @@ class TableTextCodec(override val name: String,
} }
} }
if (escSeq == "program_name_upper") { if (escSeq == "program_name_upper") {
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient) return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
} }
if (escSeq == "program_name") { if (escSeq == "program_name") {
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient) return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
} }
if (escSeq == "copyright_year") { if (escSeq == "copyright_year") {
return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient) return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
} }
if (escSeq == "null" || escSeq == "nullchar") { if (escSeq == "null" || escSeq == "nullchar") {
return stringTerminator return stringTerminator

View File

@ -55,6 +55,12 @@ class TextCodecSuite extends FunSuite with Matchers {
| if p[1] != 0 { poke($bff8, 0) } | if p[1] != 0 { poke($bff8, 0) }
| if p[2] != 0 { poke($bff7, 0) } | if p[2] != 0 { poke($bff7, 0) }
| if p[3] != 0 { poke($bff6, 0) } | if p[3] != 0 { poke($bff6, 0) }
| p = "𓀀"utf8z
| if p[0] == 0 { poke($bff3, p[0]) }
| if p[1] == 0 { poke($bff2, p[1]) }
| if p[2] == 0 { poke($bff1, p[2]) }
| if p[3] == 0 { poke($bff0, p[3]) }
| if p[4] != 0 { poke($bfef, p[4]) }
| } | }
| macro asm void poke(word const addr, byte a) { | macro asm void poke(word const addr, byte a) {
| STA addr | STA addr