mirror of
https://github.com/KarolS/millfork.git
synced 2024-12-23 08:29:35 +00:00
#9 Support astral characters in UTF-8
This commit is contained in:
parent
3a6790e47a
commit
f5b6d9999c
@ -55,7 +55,7 @@
|
||||
|
||||
* `vectrex` – built-in Vectrex font
|
||||
|
||||
* `utf8` – UTF-8 (BMP only)
|
||||
* `utf8` – UTF-8
|
||||
|
||||
* `utf16be`, `utf16le` – UTF-16BE and UTF-16LE
|
||||
|
||||
|
@ -68,7 +68,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
|
||||
if (zt) {
|
||||
log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p))
|
||||
}
|
||||
co.encode(options.log, Some(p), c.toList, options, lenient = lenient) match {
|
||||
co.encode(options.log, Some(p), c.codePoints().toArray.toList, options, lenient = lenient) match {
|
||||
case List(value) =>
|
||||
LiteralExpression(value, 1)
|
||||
case _ =>
|
||||
@ -87,7 +87,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
|
||||
|
||||
val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
|
||||
case (p, s, ((co, zt), lenient)) =>
|
||||
val characters = co.encode(options.log, None, s, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
|
||||
val characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
|
||||
if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1))
|
||||
else characters
|
||||
}
|
||||
@ -184,7 +184,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
|
||||
optSlice <- ("," ~/ HWS ~/ literalAtom ~/ HWS ~/ "," ~/ HWS ~/ literalAtom ~/ HWS ~/ Pass).?
|
||||
_ <- ")" ~/ Pass
|
||||
} yield {
|
||||
val data = Files.readAllBytes(Paths.get(currentDirectory, filePath.mkString))
|
||||
val data = Files.readAllBytes(Paths.get(currentDirectory, filePath))
|
||||
val slice = optSlice.fold(data) {
|
||||
case (start, length) => data.slice(start.value.toInt, start.value.toInt + length.value.toInt)
|
||||
}
|
||||
@ -613,7 +613,7 @@ object MfParser {
|
||||
|
||||
val identifier: P[String] = P((letter ~ lettersOrDigits).map { case (a, b) => a + b }).opaque("<identifier>")
|
||||
|
||||
val doubleQuotedString: P[List[Char]] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"").map(_.toList)
|
||||
val doubleQuotedString: P[String] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"")
|
||||
|
||||
def size(value: Long, wordLiteral: Boolean, int24Literal: Boolean, int32Literal: Boolean): Int = {
|
||||
val w = value > 255 || value < -0x80 || wordLiteral
|
||||
|
@ -17,7 +17,7 @@ sealed trait TextCodec {
|
||||
|
||||
def stringTerminator: List[Int]
|
||||
|
||||
def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int]
|
||||
def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int]
|
||||
|
||||
def decode(by: Int): Char
|
||||
|
||||
@ -55,19 +55,19 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
|
||||
}
|
||||
if (escSeq.length > 1 && (escSeq(0) == 'U' || escSeq(0) == 'u')) {
|
||||
try {
|
||||
return encode(log, position, Character.toChars(Integer.parseInt(escSeq.tail, 16)).toList, options, lenient)
|
||||
return encode(log, position, List(Integer.parseInt(escSeq.tail, 16)), options, lenient)
|
||||
} catch {
|
||||
case _: NumberFormatException =>
|
||||
}
|
||||
}
|
||||
if (escSeq == "program_name_upper") {
|
||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient)
|
||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
|
||||
}
|
||||
if (escSeq == "program_name") {
|
||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient)
|
||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
|
||||
}
|
||||
if (escSeq == "copyright_year") {
|
||||
return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient)
|
||||
return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
|
||||
}
|
||||
if (escSeq == "null" || escSeq == "nullchar") {
|
||||
return stringTerminator
|
||||
@ -85,9 +85,10 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
|
||||
}
|
||||
}
|
||||
|
||||
override def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = {
|
||||
override def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
|
||||
val LBRACE = '{'.toInt
|
||||
s match {
|
||||
case '{' :: tail =>
|
||||
case LBRACE :: tail =>
|
||||
val (escSeq, closingBrace) = tail.span(_ != '}')
|
||||
closingBrace match {
|
||||
case '}' :: xs =>
|
||||
@ -97,7 +98,7 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
|
||||
Nil
|
||||
}
|
||||
case head :: tail =>
|
||||
head.toString.getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient)
|
||||
Character.toChars(head).mkString("").getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient)
|
||||
case Nil => Nil
|
||||
}
|
||||
}
|
||||
@ -118,8 +119,8 @@ class TableTextCodec(override val name: String,
|
||||
|
||||
override val stringTerminator: List[Int] = List(stringTerminatorChar)
|
||||
|
||||
private def isPrintable(c: Char) = {
|
||||
c.getType match {
|
||||
private def isPrintable(c: Int) = {
|
||||
Character.getType(c) match {
|
||||
case Character.LOWERCASE_LETTER => true
|
||||
case Character.UPPERCASE_LETTER => true
|
||||
case Character.TITLECASE_LETTER => true
|
||||
@ -148,15 +149,16 @@ class TableTextCodec(override val name: String,
|
||||
}
|
||||
}
|
||||
|
||||
private def format(c:Char):String = {
|
||||
private def format(c:Int):String = {
|
||||
val u = f"U+${c.toInt}%04X"
|
||||
if (isPrintable(c)) f"`$c%c` ($u%s)"
|
||||
if (isPrintable(c)) f"`${Character.toChars(c).mkString}%s` ($u%s)"
|
||||
else u
|
||||
}
|
||||
|
||||
private def format(s:String) = {
|
||||
val u = s.map(c => f"U+${c.toInt}%04X").mkString(",")
|
||||
if (s.forall(isPrintable)) f"`$s%s` ($u%s)"
|
||||
val codePoints = s.codePoints().toArray
|
||||
val u = codePoints.map(c => f"U+${c}%04X").mkString(",")
|
||||
if (codePoints.forall(isPrintable)) f"`$s%s` ($u%s)"
|
||||
else u
|
||||
}
|
||||
private def encodeChar(log: Logger, position: Option[Position], c: Char, options: CompilationOptions, lenient: Boolean): Option[List[Int]] = {
|
||||
@ -177,10 +179,11 @@ class TableTextCodec(override val name: String,
|
||||
}
|
||||
|
||||
|
||||
def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = {
|
||||
def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
|
||||
val LBRACE = '{'.toInt
|
||||
val lenient = options.flag(CompilationFlag.LenientTextEncoding)
|
||||
s match {
|
||||
case '{' :: tail =>
|
||||
case LBRACE :: tail =>
|
||||
val (escSeq, closingBrace) = tail.span(_ != '}')
|
||||
closingBrace match {
|
||||
case '}' :: xs =>
|
||||
@ -189,13 +192,16 @@ class TableTextCodec(override val name: String,
|
||||
log.error(f"Unclosed escape sequence", position)
|
||||
Nil
|
||||
}
|
||||
case head :: tail =>
|
||||
(encodeChar(log, position, head, options, lenient) match {
|
||||
case head :: tail if head >= Char.MinValue && head <= Char.MaxValue =>
|
||||
(encodeChar(log, position, head.toChar, options, lenient) match {
|
||||
case Some(x) => x
|
||||
case None =>
|
||||
log.error(f"Invalid character ${format(head)} in string", position)
|
||||
Nil
|
||||
}) ++ encode(log, position, tail, options, lenient)
|
||||
case head :: tail =>
|
||||
log.error(f"Invalid character ${format(head)} in string", position)
|
||||
encode(log, position, tail, options, lenient)
|
||||
case Nil => Nil
|
||||
}
|
||||
}
|
||||
@ -209,13 +215,13 @@ class TableTextCodec(override val name: String,
|
||||
}
|
||||
}
|
||||
if (escSeq == "program_name_upper") {
|
||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient)
|
||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
|
||||
}
|
||||
if (escSeq == "program_name") {
|
||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient)
|
||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
|
||||
}
|
||||
if (escSeq == "copyright_year") {
|
||||
return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient)
|
||||
return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
|
||||
}
|
||||
if (escSeq == "null" || escSeq == "nullchar") {
|
||||
return stringTerminator
|
||||
|
@ -55,6 +55,12 @@ class TextCodecSuite extends FunSuite with Matchers {
|
||||
| if p[1] != 0 { poke($bff8, 0) }
|
||||
| if p[2] != 0 { poke($bff7, 0) }
|
||||
| if p[3] != 0 { poke($bff6, 0) }
|
||||
| p = "𓀀"utf8z
|
||||
| if p[0] == 0 { poke($bff3, p[0]) }
|
||||
| if p[1] == 0 { poke($bff2, p[1]) }
|
||||
| if p[2] == 0 { poke($bff1, p[2]) }
|
||||
| if p[3] == 0 { poke($bff0, p[3]) }
|
||||
| if p[4] != 0 { poke($bfef, p[4]) }
|
||||
| }
|
||||
| macro asm void poke(word const addr, byte a) {
|
||||
| STA addr
|
||||
|
Loading…
Reference in New Issue
Block a user