mirror of
https://github.com/KarolS/millfork.git
synced 2025-01-08 22:30:34 +00:00
Add support for Pascal-style strings
This commit is contained in:
parent
7ce088514f
commit
5df695f2c2
@ -70,6 +70,32 @@ Warning: If you define UTF-16 to be you default or screen encoding, you will enc
|
||||
* `nullchar` and `nullchar_scr` will still be bytes, equal to zero.
|
||||
* the `string` module in the Millfork standard library will not work correctly
|
||||
|
||||
## Length-prefixed strings (Pascal strings)
|
||||
|
||||
You can also prepend `p` to the name of the encoding to make the string length-prefixed.
|
||||
|
||||
The length is measured in bytes and doesn't include the zero terminator, if present.
|
||||
In all encodings except for UTF-16 the prefix takes one byte,
|
||||
which means that length-prefixed strings cannot be longer than 255 bytes.
|
||||
|
||||
In case of UTF-16, the length prefix contains the number of code units,
|
||||
i.e. the number of bytes divided by two,
|
||||
which allows for strings of practically unlimited length.
|
||||
The length is stored as two bytes and is always little endian,
|
||||
even in case of the `utf16be` encoding or a big-endian processor.
|
||||
|
||||
"this is a Pascal string" pascii
|
||||
"this is also a Pascal string"p
|
||||
"this is a zero-terminated Pascal string"pz
|
||||
|
||||
Note: A string that's both length-prefixed and zero-terminated does not count as a normal zero-terminated string!
|
||||
To pass it to a function that expects a zero-terminated string, skip the length prefix by adding 1 to the pointer (or, in case of UTF-16, 2):
|
||||
|
||||
pointer p
|
||||
p = "test"pz
|
||||
// putstrz(p) // won't work correctly
|
||||
putstrz(p + 1) // ok
|
||||
|
||||
## Escape sequences and miscellaneous compatibility issues
|
||||
|
||||
Most characters between the quotes are interpreted literally.
|
||||
|
@ -7,7 +7,7 @@ import java.util.Locale
|
||||
|
||||
import millfork.error.Logger
|
||||
import millfork.output._
|
||||
import millfork.parser.TextCodec
|
||||
import millfork.parser.{TextCodec, TextCodecWithFlags}
|
||||
import org.apache.commons.configuration2.INIConfiguration
|
||||
|
||||
/**
|
||||
@ -126,17 +126,23 @@ object Platform {
|
||||
|
||||
val codecName = cs.get(classOf[String], "encoding", "ascii")
|
||||
val srcCodecName = cs.get(classOf[String], "screen_encoding", codecName)
|
||||
val (codec, czt) = TextCodec.forName(codecName, None, log)
|
||||
val TextCodecWithFlags(codec, czt, clp, _) = TextCodec.forName(codecName, None, log)
|
||||
if (czt) {
|
||||
log.error("Default encoding cannot be zero-terminated")
|
||||
}
|
||||
if (clp) {
|
||||
log.error("Default encoding cannot be length-prefixed")
|
||||
}
|
||||
if (codec.stringTerminator.length != 1) {
|
||||
log.warn("Default encoding should be byte-based")
|
||||
}
|
||||
val (srcCodec, szt) = TextCodec.forName(srcCodecName, None, log)
|
||||
val TextCodecWithFlags(srcCodec, szt, slp, _) = TextCodec.forName(srcCodecName, None, log)
|
||||
if (szt) {
|
||||
log.error("Default screen encoding cannot be zero-terminated")
|
||||
}
|
||||
if (slp) {
|
||||
log.error("Default screen encoding cannot be length-prefixed")
|
||||
}
|
||||
|
||||
val as = conf.getSection("allocation")
|
||||
|
||||
|
@ -687,15 +687,25 @@ abstract class AbstractAssembler[T <: AbstractCode](private val program: Program
|
||||
|
||||
private def printArrayToAssemblyOutput(assembly: ArrayBuffer[String], name: String, elementType: Type, items: Seq[Expression]): Unit = {
|
||||
if (name.startsWith("textliteral$")) {
|
||||
val chars = items.lastOption match {
|
||||
case Some(LiteralExpression(0, _)) => items.init
|
||||
var suffix = ""
|
||||
var chars = items.lastOption match {
|
||||
case Some(LiteralExpression(0, _)) =>
|
||||
suffix = "z"
|
||||
items.init
|
||||
case _ => items
|
||||
}
|
||||
chars.headOption match {
|
||||
case Some(LiteralExpression(n, _)) if n + 1 == chars.size =>
|
||||
// length-prefixed
|
||||
suffix = "p" + suffix
|
||||
chars = chars.tail
|
||||
case _ =>
|
||||
}
|
||||
val text = chars.map {
|
||||
case LiteralExpression(i, _) if i >= 0 && i <= 255 => platform.defaultCodec.decode(i.toInt)
|
||||
case _ => TextCodec.NotAChar
|
||||
}.mkString("")
|
||||
if (!text.contains(TextCodec.NotAChar) && !text.exists(c => c.isControl)) assembly.append(" ; \"" + text + "\"")
|
||||
if (!text.contains(TextCodec.NotAChar) && !text.exists(c => c.isControl)) assembly.append(" ; \"" + text + "\"" + suffix)
|
||||
}
|
||||
items.flatMap(expr => env.eval(expr) match {
|
||||
case Some(c) => List.tabulate(elementType.size)(i => subbyte(c, i, elementType.size).quickSimplify.toString)
|
||||
|
@ -98,12 +98,19 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
|
||||
|
||||
val functionFlags: P[Set[String]] = flags_("asm", "inline", "interrupt", "macro", "noinline", "reentrant", "kernal_interrupt", "const")
|
||||
|
||||
val codec: P[((TextCodec, Boolean), Boolean)] = P(position("text codec identifier") ~ identifier.?.map(_.getOrElse(""))).map {
|
||||
case (_, "" | "default") => (options.platform.defaultCodec -> false) -> options.flag(CompilationFlag.LenientTextEncoding)
|
||||
case (_, "z" | "defaultz") => (options.platform.defaultCodec -> true) -> options.flag(CompilationFlag.LenientTextEncoding)
|
||||
case (_, "scr") => (options.platform.screenCodec -> false) -> options.flag(CompilationFlag.LenientTextEncoding)
|
||||
case (_, "scrz") => (options.platform.screenCodec -> true) -> options.flag(CompilationFlag.LenientTextEncoding)
|
||||
case (p, x) => TextCodec.forName(x, Some(p), log) -> false
|
||||
// Parses the optional encoding identifier that may follow a character or string literal.
// The platform-relative names (default/scr, with optional `p` prefix and `z` suffix) are
// resolved here and carry the LenientTextEncoding compilation flag; every other name is
// handed to TextCodec.forName, whose result is used as-is.
val codec: P[TextCodecWithFlags] = P(position("text codec identifier") ~ identifier.?.map(_.getOrElse(""))).map { case (pos, encodingName) =>
  val lenient = options.flag(CompilationFlag.LenientTextEncoding)

  // The platform's default encoding with the requested framing.
  def onDefault(zeroTerminated: Boolean, pascal: Boolean): TextCodecWithFlags =
    TextCodecWithFlags(options.platform.defaultCodec, nullTerminated = zeroTerminated, lengthPrefixed = pascal, lenient = lenient)

  // The platform's screen encoding with the requested framing.
  def onScreen(zeroTerminated: Boolean, pascal: Boolean): TextCodecWithFlags =
    TextCodecWithFlags(options.platform.screenCodec, nullTerminated = zeroTerminated, lengthPrefixed = pascal, lenient = lenient)

  encodingName match {
    case "" | "default" => onDefault(zeroTerminated = false, pascal = false)
    case "z" | "defaultz" => onDefault(zeroTerminated = true, pascal = false)
    case "p" | "pdefault" => onDefault(zeroTerminated = false, pascal = true)
    case "pz" | "pdefaultz" => onDefault(zeroTerminated = true, pascal = true)
    case "scr" => onScreen(zeroTerminated = false, pascal = false)
    case "scrz" => onScreen(zeroTerminated = true, pascal = false)
    case "pscr" => onScreen(zeroTerminated = false, pascal = true)
    case "pscrz" => onScreen(zeroTerminated = true, pascal = true)
    case explicitName => TextCodec.forName(explicitName, Some(pos), log)
  }
}
|
||||
|
||||
// def operator: P[String] = P(CharsWhileIn("!-+*/><=~|&^", min=1).!) // TODO: only valid operators
|
||||
@ -111,11 +118,14 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
|
||||
val charAtom: P[LiteralExpression] = for {
|
||||
p <- position()
|
||||
c <- "'" ~/ CharPred(c => c >= ' ' && c != '\'' && !invalidCharLiteralTypes(Character.getType(c))).rep.! ~/ "'"
|
||||
((co, zt), lenient) <- HWS ~ codec
|
||||
TextCodecWithFlags(co, zt, pascal, lenient) <- HWS ~ codec
|
||||
} yield {
|
||||
if (zt) {
|
||||
log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p))
|
||||
}
|
||||
if (pascal) {
|
||||
log.error("Length-prefixed encoding is not a valid encoding for a character literal", Some(p))
|
||||
}
|
||||
co.encode(options.log, Some(p), c.codePoints().toArray.toList, options, lenient = lenient) match {
|
||||
case List(value) =>
|
||||
LiteralExpression(value, 1)
|
||||
@ -134,10 +144,19 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
|
||||
}
|
||||
|
||||
// Parses a double-quoted string followed by an optional encoding identifier and produces
// the literal's contents as a list of byte-sized expressions:
// [length prefix, if requested] ++ encoded text ++ [string terminator, if requested].
val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
  case (p, s, TextCodecWithFlags(co, zt, lp, lenient)) =>
    val encoded = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))

    val lengthPrefix =
      if (lp) {
        // The prefix is as wide as the codec's terminator and counts units of that width.
        val sizeof = co.stringTerminator.length
        val codeUnitCount = encoded.length / sizeof
        val maxAllowed = (1 << (8 * sizeof)) - 1
        if (codeUnitCount > maxAllowed) {
          log.error(s"Length-prefixed string too long, the length is $codeUnitCount, maximum allowed is $maxAllowed", Some(p))
        }
        // Least significant byte first, regardless of the codec.
        List.tabulate(sizeof)(i => LiteralExpression((codeUnitCount >>> (8 * i)) & 0xff, 1))
      } else {
        Nil
      }

    val terminator =
      if (zt) co.stringTerminator.map(nul => LiteralExpression(nul, 1))
      else Nil

    lengthPrefix ++ encoded ++ terminator
}
|
||||
|
||||
val textLiteralAtom: P[TextLiteralExpression] = textLiteral.map(TextLiteralExpression)
|
||||
|
@ -12,6 +12,8 @@ import millfork.node.Position
|
||||
* @author Karol Stasiak
|
||||
*/
|
||||
|
||||
/**
  * A text codec together with the framing flags selected by an encoding name.
  *
  * @param code           the codec used to encode the text
  * @param nullTerminated whether the codec's string terminator is appended after the encoded text
  * @param lengthPrefixed whether the encoded text is preceded by its length (Pascal-style string)
  * @param lenient        passed on as the `lenient` argument when encoding
  */
final case class TextCodecWithFlags(code: TextCodec, nullTerminated: Boolean, lengthPrefixed: Boolean, lenient: Boolean)
|
||||
|
||||
sealed trait TextCodec {
|
||||
def name: String
|
||||
|
||||
@ -265,65 +267,76 @@ class TableTextCodec(override val name: String,
|
||||
}
|
||||
|
||||
object TextCodec {
|
||||
/**
  * All recognised encoding names (including aliases) mapped to their codecs.
  *
  * This has to be lazy: the codec instances it refers to (for example `Utf8`) are declared
  * further down in this object, and an object's members are initialised in declaration order.
  * An eager `val` would be evaluated first and store references that are not initialised yet.
  */
lazy val allCodecs: Map[String, TextCodec] = Map(
  "ascii" -> TextCodec.Ascii,
  "petscii" -> TextCodec.Petscii,
  "pet" -> TextCodec.Petscii,
  "petsciijp" -> TextCodec.PetsciiJp,
  "petjp" -> TextCodec.PetsciiJp,
  "oldpetscii" -> TextCodec.OldPetscii,
  "oldpet" -> TextCodec.OldPetscii,
  "origpetscii" -> TextCodec.OriginalPetscii,
  "origpet" -> TextCodec.OriginalPetscii,
  "cbmscr" -> TextCodec.CbmScreencodes,
  "petscr" -> TextCodec.CbmScreencodes,
  "cbmscrjp" -> TextCodec.CbmScreencodesJp,
  "petscrjp" -> TextCodec.CbmScreencodesJp,
  "atascii" -> TextCodec.Atascii,
  "atari" -> TextCodec.Atascii,
  "atasciiscr" -> TextCodec.AtasciiScreencodes,
  "atariscr" -> TextCodec.AtasciiScreencodes,
  "bbc" -> TextCodec.Bbc,
  "sinclair" -> TextCodec.Sinclair,
  "apple2" -> TextCodec.Apple2,
  "jis" -> TextCodec.Jis,
  "jisx" -> TextCodec.Jis,
  "iso_de" -> TextCodec.IsoIec646De,
  "iso_no" -> TextCodec.IsoIec646No,
  "iso_dk" -> TextCodec.IsoIec646No,
  "iso_se" -> TextCodec.IsoIec646Se,
  "iso_fi" -> TextCodec.IsoIec646Se,
  "iso_yu" -> TextCodec.IsoIec646Yu,
  "msx_intl" -> TextCodec.MsxWest,
  "msx_us" -> TextCodec.MsxWest,
  "msx_uk" -> TextCodec.MsxWest,
  "msx_de" -> TextCodec.MsxWest,
  "msx_fr" -> TextCodec.MsxWest,
  "msx_es" -> TextCodec.MsxWest,
  "msx_ru" -> TextCodec.MsxRu,
  "msx_jp" -> TextCodec.MsxJp,
  "msx_br" -> TextCodec.MsxBr,
  "vectrex" -> TextCodec.Vectrex,
  "koi7n2" -> TextCodec.Koi7N2,
  "short_koi" -> TextCodec.Koi7N2,
  "zx80" -> TextCodec.Zx80,
  "zx81" -> TextCodec.Zx81,
  "iso8859_15" -> TextCodec.Iso8859_15,
  "latin0" -> TextCodec.Iso8859_15,
  "latin9" -> TextCodec.Iso8859_15,
  "iso15" -> TextCodec.Iso8859_15,
  "utf8" -> TextCodec.Utf8,
  "utf16be" -> TextCodec.Utf16Be,
  "utf16le" -> TextCodec.Utf16Le,
)
|
||||
|
||||
def forName(name: String, position: Option[Position], log: Logger): (TextCodec, Boolean) = {
|
||||
val zeroTerminated = name.endsWith("z")
|
||||
val cleanName = name.stripSuffix("z")
|
||||
val codec = (position, cleanName) match {
|
||||
case (_, "ascii") => TextCodec.Ascii
|
||||
case (_, "petscii") => TextCodec.Petscii
|
||||
case (_, "pet") => TextCodec.Petscii
|
||||
case (_, "petsciijp") => TextCodec.PetsciiJp
|
||||
case (_, "petjp") => TextCodec.PetsciiJp
|
||||
case (_, "oldpetscii") => TextCodec.OldPetscii
|
||||
case (_, "oldpet") => TextCodec.OldPetscii
|
||||
case (_, "origpetscii") => TextCodec.OriginalPetscii
|
||||
case (_, "origpet") => TextCodec.OriginalPetscii
|
||||
case (_, "cbmscr") => TextCodec.CbmScreencodes
|
||||
case (_, "petscr") => TextCodec.CbmScreencodes
|
||||
case (_, "cbmscrjp") => TextCodec.CbmScreencodesJp
|
||||
case (_, "petscrjp") => TextCodec.CbmScreencodesJp
|
||||
case (_, "atascii") => TextCodec.Atascii
|
||||
case (_, "atari") => TextCodec.Atascii
|
||||
case (_, "atasciiscr") => TextCodec.AtasciiScreencodes
|
||||
case (_, "atariscr") => TextCodec.AtasciiScreencodes
|
||||
case (_, "bbc") => TextCodec.Bbc
|
||||
case (_, "sinclair") => TextCodec.Sinclair
|
||||
case (_, "apple2") => TextCodec.Apple2
|
||||
case (_, "jis") => TextCodec.Jis
|
||||
case (_, "jisx") => TextCodec.Jis
|
||||
case (_, "iso_de") => TextCodec.IsoIec646De
|
||||
case (_, "iso_no") => TextCodec.IsoIec646No
|
||||
case (_, "iso_dk") => TextCodec.IsoIec646No
|
||||
case (_, "iso_se") => TextCodec.IsoIec646Se
|
||||
case (_, "iso_fi") => TextCodec.IsoIec646Se
|
||||
case (_, "iso_yu") => TextCodec.IsoIec646Yu
|
||||
case (_, "msx_intl") => TextCodec.MsxWest
|
||||
case (_, "msx_us") => TextCodec.MsxWest
|
||||
case (_, "msx_uk") => TextCodec.MsxWest
|
||||
case (_, "msx_de") => TextCodec.MsxWest
|
||||
case (_, "msx_fr") => TextCodec.MsxWest
|
||||
case (_, "msx_es") => TextCodec.MsxWest
|
||||
case (_, "msx_ru") => TextCodec.MsxRu
|
||||
case (_, "msx_jp") => TextCodec.MsxJp
|
||||
case (_, "msx_br") => TextCodec.MsxBr
|
||||
case (_, "vectrex") => TextCodec.Vectrex
|
||||
case (_, "koi7n2") => TextCodec.Koi7N2
|
||||
case (_, "short_koi") => TextCodec.Koi7N2
|
||||
case (_, "zx80") => TextCodec.Zx80
|
||||
case (_, "zx81") => TextCodec.Zx81
|
||||
case (_, "iso8859_15") => TextCodec.Iso8859_15
|
||||
case (_, "latin0") => TextCodec.Iso8859_15
|
||||
case (_, "latin9") => TextCodec.Iso8859_15
|
||||
case (_, "iso15") => TextCodec.Iso8859_15
|
||||
case (_, "utf8") => TextCodec.Utf8
|
||||
case (_, "utf16be") => TextCodec.Utf16Be
|
||||
case (_, "utf16le") => TextCodec.Utf16Le
|
||||
case (p, _) =>
|
||||
log.error(s"Unknown string encoding: `$name`", p)
|
||||
TextCodec.Ascii
|
||||
/**
  * Resolves an encoding name to a codec plus framing flags.
  *
  * Candidates are tried in this order, and the first one found in `allCodecs` wins:
  *  1. the name as given (neither zero-terminated nor length-prefixed),
  *  1. the name without a trailing `z` (zero-terminated),
  *  1. the name without a leading `p` (length-prefixed),
  *  1. the name without both (length-prefixed and zero-terminated).
  *
  * Trying the exact name first keeps codecs whose own names start with `p` or end with `z`
  * from being misread as flagged variants of another codec.
  *
  * @param name     the encoding name to resolve
  * @param position source position attached to the error message, if any
  * @param log      logger that receives the "unknown encoding" error
  * @return the resolved codec with its flags; `lenient` is always false. If the name is
  *         unknown, an error is logged and plain ASCII without any flags is returned.
  */
def forName(name: String, position: Option[Position], log: Logger): TextCodecWithFlags = {
  // Looks up a bare codec name and tags the result with the given framing flags.
  def lookup(codecName: String, nullTerminated: Boolean, lengthPrefixed: Boolean): Option[TextCodecWithFlags] =
    allCodecs.get(codecName).map { codec =>
      TextCodecWithFlags(codec, nullTerminated = nullTerminated, lengthPrefixed = lengthPrefixed, lenient = false)
    }

  val withoutZ = if (name.endsWith("z")) Some(name.stripSuffix("z")) else None
  val withoutP = if (name.startsWith("p")) Some(name.stripPrefix("p")) else None
  val withoutPZ = withoutP.filter(_.endsWith("z")).map(_.stripSuffix("z"))

  lookup(name, nullTerminated = false, lengthPrefixed = false)
    .orElse(withoutZ.flatMap(n => lookup(n, nullTerminated = true, lengthPrefixed = false)))
    .orElse(withoutP.flatMap(n => lookup(n, nullTerminated = false, lengthPrefixed = true)))
    .orElse(withoutPZ.flatMap(n => lookup(n, nullTerminated = true, lengthPrefixed = true)))
    .getOrElse {
      log.error(s"Unknown string encoding: `$name`", position)
      TextCodecWithFlags(TextCodec.Ascii, nullTerminated = false, lengthPrefixed = false, lenient = false)
    }
}
|
||||
|
||||
private val Utf8 = new UnicodeTextCodec("UTF-8", StandardCharsets.UTF_8, List(0))
|
||||
@ -337,7 +350,7 @@ object TextCodec {
|
||||
private lazy val DefaultOverrides: Map[Char, Int] = ('\u2400' to '\u2420').map(c => c->(c.toInt - 0x2400)).toMap + ('\u2421' -> 127)
|
||||
|
||||
//noinspection ScalaUnusedSymbol
|
||||
private val AsciiEscapeSequences: Map[String, List[Int]] = Map(
|
||||
private lazy val AsciiEscapeSequences: Map[String, List[Int]] = Map(
|
||||
"n" -> List(13, 10),
|
||||
"t" -> List(9),
|
||||
"b" -> List(8),
|
||||
@ -347,12 +360,12 @@ object TextCodec {
|
||||
"rbrace" -> List('}'.toInt))
|
||||
|
||||
//noinspection ScalaUnusedSymbol
|
||||
private val MinimalEscapeSequencesWithoutBraces: Map[String, List[Int]] = Map(
|
||||
private lazy val MinimalEscapeSequencesWithoutBraces: Map[String, List[Int]] = Map(
|
||||
"apos" -> List('\''.toInt),
|
||||
"q" -> List('\"'.toInt))
|
||||
|
||||
//noinspection ScalaUnusedSymbol
|
||||
private val MinimalEscapeSequencesWithBraces: Map[String, List[Int]] = Map(
|
||||
private lazy val MinimalEscapeSequencesWithBraces: Map[String, List[Int]] = Map(
|
||||
"apos" -> List('\''.toInt),
|
||||
"q" -> List('\"'.toInt),
|
||||
"lbrace" -> List('{'.toInt),
|
||||
|
30
src/test/scala/millfork/test/PStringSuite.scala
Normal file
30
src/test/scala/millfork/test/PStringSuite.scala
Normal file
@ -0,0 +1,30 @@
|
||||
package millfork.test
|
||||
|
||||
import millfork.Cpu
|
||||
import millfork.test.emu.{EmuUnoptimizedCrossPlatformRun, ShouldNotCompile, ShouldNotParse}
|
||||
import org.scalatest.{AppendedClues, FunSuite, Matchers}
|
||||
|
||||
/**
|
||||
* @author Karol Stasiak
|
||||
*/
|
||||
// Tests for length-prefixed (Pascal-style) string literals.
class PStringSuite extends FunSuite with Matchers with AppendedClues {

  test("pstrlen") {
    // Reads the first byte of a "..."p literal, which holds its length.
    val program =
      """
        |byte pstrlen(pointer str) {
        | return str[0]
        |}
        |byte output @ $c000
        |void main() {
        | output = pstrlen("hello"p)
        |}
        |""".stripMargin

    EmuUnoptimizedCrossPlatformRun(Cpu.Mos, Cpu.Z80, Cpu.Motorola6809)(program) { m =>
      m.readByte(0xc000) should be (5)
    }
  }

  test("P-string literal too long") {
    // 256 characters is one more than fits in a single-byte length prefix.
    val oversized = "a" * 256
    ShouldNotParse(s"""const pointer p = "$oversized"p""")
  }
}
|
@ -39,7 +39,11 @@ object ShouldNotParse extends Matchers {
|
||||
}
|
||||
parserF.toAst match {
|
||||
case Success(program, _) =>
|
||||
fail("Parse succeeded")
|
||||
if (!log.hasErrors) {
|
||||
fail("Parse succeeded")
|
||||
} else {
|
||||
log.warn("Non-fatal parse errors encountered. OK.")
|
||||
}
|
||||
case f: Failure[_, _] =>
|
||||
println(f.extra.toString)
|
||||
log.warn("Last parser: " + f.lastParser, Some(parserF.indexToPosition(f.index, f.lastParser.toString)))
|
||||
|
Loading…
Reference in New Issue
Block a user