mirror of
https://github.com/KarolS/millfork.git
synced 2025-01-10 20:29:35 +00:00
#9 Support astral characters in UTF-8
This commit is contained in:
parent
3a6790e47a
commit
f5b6d9999c
@ -55,7 +55,7 @@
|
|||||||
|
|
||||||
* `vectrex` – built-in Vectrex font
|
* `vectrex` – built-in Vectrex font
|
||||||
|
|
||||||
* `utf8` – UTF-8 (BMP only)
|
* `utf8` – UTF-8
|
||||||
|
|
||||||
* `utf16be`, `utf16le` – UTF-16BE and UTF-16LE
|
* `utf16be`, `utf16le` – UTF-16BE and UTF-16LE
|
||||||
|
|
||||||
|
@ -68,7 +68,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
|
|||||||
if (zt) {
|
if (zt) {
|
||||||
log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p))
|
log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p))
|
||||||
}
|
}
|
||||||
co.encode(options.log, Some(p), c.toList, options, lenient = lenient) match {
|
co.encode(options.log, Some(p), c.codePoints().toArray.toList, options, lenient = lenient) match {
|
||||||
case List(value) =>
|
case List(value) =>
|
||||||
LiteralExpression(value, 1)
|
LiteralExpression(value, 1)
|
||||||
case _ =>
|
case _ =>
|
||||||
@ -87,7 +87,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
|
|||||||
|
|
||||||
val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
|
val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
|
||||||
case (p, s, ((co, zt), lenient)) =>
|
case (p, s, ((co, zt), lenient)) =>
|
||||||
val characters = co.encode(options.log, None, s, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
|
val characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
|
||||||
if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1))
|
if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1))
|
||||||
else characters
|
else characters
|
||||||
}
|
}
|
||||||
@ -184,7 +184,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
|
|||||||
optSlice <- ("," ~/ HWS ~/ literalAtom ~/ HWS ~/ "," ~/ HWS ~/ literalAtom ~/ HWS ~/ Pass).?
|
optSlice <- ("," ~/ HWS ~/ literalAtom ~/ HWS ~/ "," ~/ HWS ~/ literalAtom ~/ HWS ~/ Pass).?
|
||||||
_ <- ")" ~/ Pass
|
_ <- ")" ~/ Pass
|
||||||
} yield {
|
} yield {
|
||||||
val data = Files.readAllBytes(Paths.get(currentDirectory, filePath.mkString))
|
val data = Files.readAllBytes(Paths.get(currentDirectory, filePath))
|
||||||
val slice = optSlice.fold(data) {
|
val slice = optSlice.fold(data) {
|
||||||
case (start, length) => data.slice(start.value.toInt, start.value.toInt + length.value.toInt)
|
case (start, length) => data.slice(start.value.toInt, start.value.toInt + length.value.toInt)
|
||||||
}
|
}
|
||||||
@ -613,7 +613,7 @@ object MfParser {
|
|||||||
|
|
||||||
val identifier: P[String] = P((letter ~ lettersOrDigits).map { case (a, b) => a + b }).opaque("<identifier>")
|
val identifier: P[String] = P((letter ~ lettersOrDigits).map { case (a, b) => a + b }).opaque("<identifier>")
|
||||||
|
|
||||||
val doubleQuotedString: P[List[Char]] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"").map(_.toList)
|
val doubleQuotedString: P[String] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"")
|
||||||
|
|
||||||
def size(value: Long, wordLiteral: Boolean, int24Literal: Boolean, int32Literal: Boolean): Int = {
|
def size(value: Long, wordLiteral: Boolean, int24Literal: Boolean, int32Literal: Boolean): Int = {
|
||||||
val w = value > 255 || value < -0x80 || wordLiteral
|
val w = value > 255 || value < -0x80 || wordLiteral
|
||||||
|
@ -17,7 +17,7 @@ sealed trait TextCodec {
|
|||||||
|
|
||||||
def stringTerminator: List[Int]
|
def stringTerminator: List[Int]
|
||||||
|
|
||||||
def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int]
|
def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int]
|
||||||
|
|
||||||
def decode(by: Int): Char
|
def decode(by: Int): Char
|
||||||
|
|
||||||
@ -55,19 +55,19 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
|
|||||||
}
|
}
|
||||||
if (escSeq.length > 1 && (escSeq(0) == 'U' || escSeq(0) == 'u')) {
|
if (escSeq.length > 1 && (escSeq(0) == 'U' || escSeq(0) == 'u')) {
|
||||||
try {
|
try {
|
||||||
return encode(log, position, Character.toChars(Integer.parseInt(escSeq.tail, 16)).toList, options, lenient)
|
return encode(log, position, List(Integer.parseInt(escSeq.tail, 16)), options, lenient)
|
||||||
} catch {
|
} catch {
|
||||||
case _: NumberFormatException =>
|
case _: NumberFormatException =>
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (escSeq == "program_name_upper") {
|
if (escSeq == "program_name_upper") {
|
||||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient)
|
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
|
||||||
}
|
}
|
||||||
if (escSeq == "program_name") {
|
if (escSeq == "program_name") {
|
||||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient)
|
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
|
||||||
}
|
}
|
||||||
if (escSeq == "copyright_year") {
|
if (escSeq == "copyright_year") {
|
||||||
return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient)
|
return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
|
||||||
}
|
}
|
||||||
if (escSeq == "null" || escSeq == "nullchar") {
|
if (escSeq == "null" || escSeq == "nullchar") {
|
||||||
return stringTerminator
|
return stringTerminator
|
||||||
@ -85,9 +85,10 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
override def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = {
|
override def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
|
||||||
|
val LBRACE = '{'.toInt
|
||||||
s match {
|
s match {
|
||||||
case '{' :: tail =>
|
case LBRACE :: tail =>
|
||||||
val (escSeq, closingBrace) = tail.span(_ != '}')
|
val (escSeq, closingBrace) = tail.span(_ != '}')
|
||||||
closingBrace match {
|
closingBrace match {
|
||||||
case '}' :: xs =>
|
case '}' :: xs =>
|
||||||
@ -97,7 +98,7 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
|
|||||||
Nil
|
Nil
|
||||||
}
|
}
|
||||||
case head :: tail =>
|
case head :: tail =>
|
||||||
head.toString.getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient)
|
Character.toChars(head).mkString("").getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient)
|
||||||
case Nil => Nil
|
case Nil => Nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -118,8 +119,8 @@ class TableTextCodec(override val name: String,
|
|||||||
|
|
||||||
override val stringTerminator: List[Int] = List(stringTerminatorChar)
|
override val stringTerminator: List[Int] = List(stringTerminatorChar)
|
||||||
|
|
||||||
private def isPrintable(c: Char) = {
|
private def isPrintable(c: Int) = {
|
||||||
c.getType match {
|
Character.getType(c) match {
|
||||||
case Character.LOWERCASE_LETTER => true
|
case Character.LOWERCASE_LETTER => true
|
||||||
case Character.UPPERCASE_LETTER => true
|
case Character.UPPERCASE_LETTER => true
|
||||||
case Character.TITLECASE_LETTER => true
|
case Character.TITLECASE_LETTER => true
|
||||||
@ -148,15 +149,16 @@ class TableTextCodec(override val name: String,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private def format(c:Char):String = {
|
private def format(c:Int):String = {
|
||||||
val u = f"U+${c.toInt}%04X"
|
val u = f"U+${c.toInt}%04X"
|
||||||
if (isPrintable(c)) f"`$c%c` ($u%s)"
|
if (isPrintable(c)) f"`${Character.toChars(c).mkString}%s` ($u%s)"
|
||||||
else u
|
else u
|
||||||
}
|
}
|
||||||
|
|
||||||
private def format(s:String) = {
|
private def format(s:String) = {
|
||||||
val u = s.map(c => f"U+${c.toInt}%04X").mkString(",")
|
val codePoints = s.codePoints().toArray
|
||||||
if (s.forall(isPrintable)) f"`$s%s` ($u%s)"
|
val u = codePoints.map(c => f"U+${c}%04X").mkString(",")
|
||||||
|
if (codePoints.forall(isPrintable)) f"`$s%s` ($u%s)"
|
||||||
else u
|
else u
|
||||||
}
|
}
|
||||||
private def encodeChar(log: Logger, position: Option[Position], c: Char, options: CompilationOptions, lenient: Boolean): Option[List[Int]] = {
|
private def encodeChar(log: Logger, position: Option[Position], c: Char, options: CompilationOptions, lenient: Boolean): Option[List[Int]] = {
|
||||||
@ -177,10 +179,11 @@ class TableTextCodec(override val name: String,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = {
|
def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
|
||||||
|
val LBRACE = '{'.toInt
|
||||||
val lenient = options.flag(CompilationFlag.LenientTextEncoding)
|
val lenient = options.flag(CompilationFlag.LenientTextEncoding)
|
||||||
s match {
|
s match {
|
||||||
case '{' :: tail =>
|
case LBRACE :: tail =>
|
||||||
val (escSeq, closingBrace) = tail.span(_ != '}')
|
val (escSeq, closingBrace) = tail.span(_ != '}')
|
||||||
closingBrace match {
|
closingBrace match {
|
||||||
case '}' :: xs =>
|
case '}' :: xs =>
|
||||||
@ -189,13 +192,16 @@ class TableTextCodec(override val name: String,
|
|||||||
log.error(f"Unclosed escape sequence", position)
|
log.error(f"Unclosed escape sequence", position)
|
||||||
Nil
|
Nil
|
||||||
}
|
}
|
||||||
case head :: tail =>
|
case head :: tail if head >= Char.MinValue && head <= Char.MaxValue =>
|
||||||
(encodeChar(log, position, head, options, lenient) match {
|
(encodeChar(log, position, head.toChar, options, lenient) match {
|
||||||
case Some(x) => x
|
case Some(x) => x
|
||||||
case None =>
|
case None =>
|
||||||
log.error(f"Invalid character ${format(head)} in string", position)
|
log.error(f"Invalid character ${format(head)} in string", position)
|
||||||
Nil
|
Nil
|
||||||
}) ++ encode(log, position, tail, options, lenient)
|
}) ++ encode(log, position, tail, options, lenient)
|
||||||
|
case head :: tail =>
|
||||||
|
log.error(f"Invalid character ${format(head)} in string", position)
|
||||||
|
encode(log, position, tail, options, lenient)
|
||||||
case Nil => Nil
|
case Nil => Nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -209,13 +215,13 @@ class TableTextCodec(override val name: String,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (escSeq == "program_name_upper") {
|
if (escSeq == "program_name_upper") {
|
||||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient)
|
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
|
||||||
}
|
}
|
||||||
if (escSeq == "program_name") {
|
if (escSeq == "program_name") {
|
||||||
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient)
|
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
|
||||||
}
|
}
|
||||||
if (escSeq == "copyright_year") {
|
if (escSeq == "copyright_year") {
|
||||||
return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient)
|
return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
|
||||||
}
|
}
|
||||||
if (escSeq == "null" || escSeq == "nullchar") {
|
if (escSeq == "null" || escSeq == "nullchar") {
|
||||||
return stringTerminator
|
return stringTerminator
|
||||||
|
@ -55,6 +55,12 @@ class TextCodecSuite extends FunSuite with Matchers {
|
|||||||
| if p[1] != 0 { poke($bff8, 0) }
|
| if p[1] != 0 { poke($bff8, 0) }
|
||||||
| if p[2] != 0 { poke($bff7, 0) }
|
| if p[2] != 0 { poke($bff7, 0) }
|
||||||
| if p[3] != 0 { poke($bff6, 0) }
|
| if p[3] != 0 { poke($bff6, 0) }
|
||||||
|
| p = "𓀀"utf8z
|
||||||
|
| if p[0] == 0 { poke($bff3, p[0]) }
|
||||||
|
| if p[1] == 0 { poke($bff2, p[1]) }
|
||||||
|
| if p[2] == 0 { poke($bff1, p[2]) }
|
||||||
|
| if p[3] == 0 { poke($bff0, p[3]) }
|
||||||
|
| if p[4] != 0 { poke($bfef, p[4]) }
|
||||||
| }
|
| }
|
||||||
| macro asm void poke(word const addr, byte a) {
|
| macro asm void poke(word const addr, byte a) {
|
||||||
| STA addr
|
| STA addr
|
||||||
|
Loading…
x
Reference in New Issue
Block a user