1
0
mirror of https://github.com/KarolS/millfork.git synced 2024-06-26 11:29:28 +00:00

#9 Support astral characters in UTF-8

This commit is contained in:
Karol Stasiak 2019-10-18 11:01:31 +02:00
parent 3a6790e47a
commit f5b6d9999c
4 changed files with 38 additions and 26 deletions

View File

@@ -55,7 +55,7 @@
 * `vectrex` built-in Vectrex font
-* `utf8` UTF-8 (BMP only)
+* `utf8` UTF-8
 * `utf16be`, `utf16le` UTF-16BE and UTF-16LE

View File

@@ -68,7 +68,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
 if (zt) {
 log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p))
 }
-co.encode(options.log, Some(p), c.toList, options, lenient = lenient) match {
+co.encode(options.log, Some(p), c.codePoints().toArray.toList, options, lenient = lenient) match {
 case List(value) =>
 LiteralExpression(value, 1)
 case _ =>
@@ -87,7 +87,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
 val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
 case (p, s, ((co, zt), lenient)) =>
-val characters = co.encode(options.log, None, s, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
+val characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
 if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1))
 else characters
 }
@@ -184,7 +184,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
 optSlice <- ("," ~/ HWS ~/ literalAtom ~/ HWS ~/ "," ~/ HWS ~/ literalAtom ~/ HWS ~/ Pass).?
 _ <- ")" ~/ Pass
 } yield {
-val data = Files.readAllBytes(Paths.get(currentDirectory, filePath.mkString))
+val data = Files.readAllBytes(Paths.get(currentDirectory, filePath))
 val slice = optSlice.fold(data) {
 case (start, length) => data.slice(start.value.toInt, start.value.toInt + length.value.toInt)
 }
@@ -613,7 +613,7 @@ object MfParser {
 val identifier: P[String] = P((letter ~ lettersOrDigits).map { case (a, b) => a + b }).opaque("<identifier>")
-val doubleQuotedString: P[List[Char]] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"").map(_.toList)
+val doubleQuotedString: P[String] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"")
 def size(value: Long, wordLiteral: Boolean, int24Literal: Boolean, int32Literal: Boolean): Int = {
 val w = value > 255 || value < -0x80 || wordLiteral

View File

@@ -17,7 +17,7 @@ sealed trait TextCodec {
 def stringTerminator: List[Int]
-def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int]
+def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int]
 def decode(by: Int): Char
@@ -55,19 +55,19 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
 }
 if (escSeq.length > 1 && (escSeq(0) == 'U' || escSeq(0) == 'u')) {
 try {
-return encode(log, position, Character.toChars(Integer.parseInt(escSeq.tail, 16)).toList, options, lenient)
+return encode(log, position, List(Integer.parseInt(escSeq.tail, 16)), options, lenient)
 } catch {
 case _: NumberFormatException =>
 }
 }
 if (escSeq == "program_name_upper") {
-return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient)
+return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
 }
 if (escSeq == "program_name") {
-return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient)
+return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
 }
 if (escSeq == "copyright_year") {
-return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient)
+return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
 }
 if (escSeq == "null" || escSeq == "nullchar") {
 return stringTerminator
@@ -85,9 +85,10 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
 }
 }
-override def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = {
+override def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
+val LBRACE = '{'.toInt
 s match {
-case '{' :: tail =>
+case LBRACE :: tail =>
 val (escSeq, closingBrace) = tail.span(_ != '}')
 closingBrace match {
 case '}' :: xs =>
@@ -97,7 +98,7 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
 Nil
 }
 case head :: tail =>
-head.toString.getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient)
+Character.toChars(head).mkString("").getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient)
 case Nil => Nil
 }
 }
@@ -118,8 +119,8 @@ class TableTextCodec(override val name: String,
 override val stringTerminator: List[Int] = List(stringTerminatorChar)
-private def isPrintable(c: Char) = {
-c.getType match {
+private def isPrintable(c: Int) = {
+Character.getType(c) match {
 case Character.LOWERCASE_LETTER => true
 case Character.UPPERCASE_LETTER => true
 case Character.TITLECASE_LETTER => true
@@ -148,15 +149,16 @@ class TableTextCodec(override val name: String,
 }
 }
-private def format(c:Char):String = {
+private def format(c:Int):String = {
 val u = f"U+${c.toInt}%04X"
-if (isPrintable(c)) f"`$c%c` ($u%s)"
+if (isPrintable(c)) f"`${Character.toChars(c).mkString}%s` ($u%s)"
 else u
 }
 private def format(s:String) = {
-val u = s.map(c => f"U+${c.toInt}%04X").mkString(",")
-if (s.forall(isPrintable)) f"`$s%s` ($u%s)"
+val codePoints = s.codePoints().toArray
+val u = codePoints.map(c => f"U+${c}%04X").mkString(",")
+if (codePoints.forall(isPrintable)) f"`$s%s` ($u%s)"
 else u
 }
 private def encodeChar(log: Logger, position: Option[Position], c: Char, options: CompilationOptions, lenient: Boolean): Option[List[Int]] = {
@@ -177,10 +179,11 @@ class TableTextCodec(override val name: String,
 }
-def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = {
+def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
+val LBRACE = '{'.toInt
 val lenient = options.flag(CompilationFlag.LenientTextEncoding)
 s match {
-case '{' :: tail =>
+case LBRACE :: tail =>
 val (escSeq, closingBrace) = tail.span(_ != '}')
 closingBrace match {
 case '}' :: xs =>
@@ -189,13 +192,16 @@ class TableTextCodec(override val name: String,
 log.error(f"Unclosed escape sequence", position)
 Nil
 }
-case head :: tail =>
-(encodeChar(log, position, head, options, lenient) match {
+case head :: tail if head >= Char.MinValue && head <= Char.MaxValue =>
+(encodeChar(log, position, head.toChar, options, lenient) match {
 case Some(x) => x
 case None =>
 log.error(f"Invalid character ${format(head)} in string", position)
 Nil
 }) ++ encode(log, position, tail, options, lenient)
+case head :: tail =>
+log.error(f"Invalid character ${format(head)} in string", position)
+encode(log, position, tail, options, lenient)
 case Nil => Nil
 }
 }
@@ -209,13 +215,13 @@ class TableTextCodec(override val name: String,
 }
 }
 if (escSeq == "program_name_upper") {
-return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient)
+return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
 }
 if (escSeq == "program_name") {
-return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient)
+return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
 }
 if (escSeq == "copyright_year") {
-return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient)
+return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
 }
 if (escSeq == "null" || escSeq == "nullchar") {
 return stringTerminator

View File

@@ -55,6 +55,12 @@ class TextCodecSuite extends FunSuite with Matchers {
 | if p[1] != 0 { poke($bff8, 0) }
 | if p[2] != 0 { poke($bff7, 0) }
 | if p[3] != 0 { poke($bff6, 0) }
+| p = "𓀀"utf8z
+| if p[0] == 0 { poke($bff3, p[0]) }
+| if p[1] == 0 { poke($bff2, p[1]) }
+| if p[2] == 0 { poke($bff1, p[2]) }
+| if p[3] == 0 { poke($bff0, p[3]) }
+| if p[4] != 0 { poke($bfef, p[4]) }
 | }
 | macro asm void poke(word const addr, byte a) {
 | STA addr