1
0
mirror of https://github.com/KarolS/millfork.git synced 2025-01-08 22:30:34 +00:00

Add support for Pascal-style strings

This commit is contained in:
Karol Stasiak 2020-04-04 00:45:09 +02:00
parent 7ce088514f
commit 5df695f2c2
7 changed files with 186 additions and 78 deletions

View File

@ -70,6 +70,32 @@ Warning: If you define UTF-16 to be you default or screen encoding, you will enc
* `nullchar` and `nullchar_scr` will still be bytes, equal to zero.
* the `string` module in the Millfork standard library will not work correctly
## Length-prefixed strings (Pascal strings)
You can also prepend `p` to the name of the encoding to make the string length-prefixed.
The length is measured in bytes and doesn't include the zero terminator, if present.
In all encodings except for UTF-16 the prefix takes one byte,
which means that length-prefixed strings cannot be longer than 255 bytes.
In case of UTF-16, the length prefix contains the number of code units,
i.e. the number of bytes divided by two,
which allows for strings of up to 65535 code units, a practically unlimited length.
The length is stored as two bytes and is always little endian,
even in case of the `utf16be` encoding or a big-endian processor.
"this is a Pascal string" pascii
"this is also a Pascal string"p
"this is a zero-terminated Pascal string"pz
Note: A string that's both length-prefixed and zero-terminated does not count as a normal zero-terminated string!
To pass it to a function that expects a zero-terminated string, add 1 (or, in case of UTF-16, 2):
pointer p
p = "test"pz
// putstrz(p) // won't work correctly
putstrz(p + 1) // ok
## Escape sequences and miscellaneous compatibility issues
Most characters between the quotes are interpreted literally.

View File

@ -7,7 +7,7 @@ import java.util.Locale
import millfork.error.Logger
import millfork.output._
import millfork.parser.TextCodec
import millfork.parser.{TextCodec, TextCodecWithFlags}
import org.apache.commons.configuration2.INIConfiguration
/**
@ -126,17 +126,23 @@ object Platform {
val codecName = cs.get(classOf[String], "encoding", "ascii")
val srcCodecName = cs.get(classOf[String], "screen_encoding", codecName)
val (codec, czt) = TextCodec.forName(codecName, None, log)
val TextCodecWithFlags(codec, czt, clp, _) = TextCodec.forName(codecName, None, log)
if (czt) {
log.error("Default encoding cannot be zero-terminated")
}
if (clp) {
log.error("Default encoding cannot be length-prefixed")
}
if (codec.stringTerminator.length != 1) {
log.warn("Default encoding should be byte-based")
}
val (srcCodec, szt) = TextCodec.forName(srcCodecName, None, log)
val TextCodecWithFlags(srcCodec, szt, slp, _) = TextCodec.forName(srcCodecName, None, log)
if (szt) {
log.error("Default screen encoding cannot be zero-terminated")
}
if (slp) {
log.error("Default screen encoding cannot be length-prefixed")
}
val as = conf.getSection("allocation")

View File

@ -687,15 +687,25 @@ abstract class AbstractAssembler[T <: AbstractCode](private val program: Program
private def printArrayToAssemblyOutput(assembly: ArrayBuffer[String], name: String, elementType: Type, items: Seq[Expression]): Unit = {
if (name.startsWith("textliteral$")) {
val chars = items.lastOption match {
case Some(LiteralExpression(0, _)) => items.init
var suffix = ""
var chars = items.lastOption match {
case Some(LiteralExpression(0, _)) =>
suffix = "z"
items.init
case _ => items
}
chars.headOption match {
case Some(LiteralExpression(n, _)) if n + 1 == chars.size =>
// length-prefixed
suffix = "p" + suffix
chars = chars.tail
case _ =>
}
val text = chars.map {
case LiteralExpression(i, _) if i >= 0 && i <= 255 => platform.defaultCodec.decode(i.toInt)
case _ => TextCodec.NotAChar
}.mkString("")
if (!text.contains(TextCodec.NotAChar) && !text.exists(c => c.isControl)) assembly.append(" ; \"" + text + "\"")
if (!text.contains(TextCodec.NotAChar) && !text.exists(c => c.isControl)) assembly.append(" ; \"" + text + "\"" + suffix)
}
items.flatMap(expr => env.eval(expr) match {
case Some(c) => List.tabulate(elementType.size)(i => subbyte(c, i, elementType.size).quickSimplify.toString)

View File

@ -98,12 +98,19 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
val functionFlags: P[Set[String]] = flags_("asm", "inline", "interrupt", "macro", "noinline", "reentrant", "kernal_interrupt", "const")
val codec: P[((TextCodec, Boolean), Boolean)] = P(position("text codec identifier") ~ identifier.?.map(_.getOrElse(""))).map {
case (_, "" | "default") => (options.platform.defaultCodec -> false) -> options.flag(CompilationFlag.LenientTextEncoding)
case (_, "z" | "defaultz") => (options.platform.defaultCodec -> true) -> options.flag(CompilationFlag.LenientTextEncoding)
case (_, "scr") => (options.platform.screenCodec -> false) -> options.flag(CompilationFlag.LenientTextEncoding)
case (_, "scrz") => (options.platform.screenCodec -> true) -> options.flag(CompilationFlag.LenientTextEncoding)
case (p, x) => TextCodec.forName(x, Some(p), log) -> false
/**
  * Parses the optional encoding identifier that may follow a character or string literal.
  *
  * The shorthands based on `default` and `scr` (optionally prefixed with `p` and/or suffixed with `z`)
  * resolve to the platform's default or screen codec and pick up the `LenientTextEncoding`
  * compilation flag. Any other identifier is handed to `TextCodec.forName` and its result is used as-is.
  * A missing identifier is treated as the empty string, i.e. the default codec with no flags.
  */
val codec: P[TextCodecWithFlags] = P(position("text codec identifier") ~ identifier.?.map(_.getOrElse(""))).map { case (pos, codecName) =>
  val lenient = options.flag(CompilationFlag.LenientTextEncoding)

  // Platform codecs are looked up only in the branch that actually needs them.
  def defaultCodec: TextCodec = options.platform.defaultCodec
  def screenCodec: TextCodec = options.platform.screenCodec

  // Wraps a platform codec with the requested flags and the current leniency setting.
  def shorthand(base: TextCodec, zt: Boolean, lp: Boolean): TextCodecWithFlags =
    TextCodecWithFlags(base, nullTerminated = zt, lengthPrefixed = lp, lenient = lenient)

  codecName match {
    case "" | "default" => shorthand(defaultCodec, zt = false, lp = false)
    case "z" | "defaultz" => shorthand(defaultCodec, zt = true, lp = false)
    case "p" | "pdefault" => shorthand(defaultCodec, zt = false, lp = true)
    case "pz" | "pdefaultz" => shorthand(defaultCodec, zt = true, lp = true)
    case "scr" => shorthand(screenCodec, zt = false, lp = false)
    case "scrz" => shorthand(screenCodec, zt = true, lp = false)
    case "pscr" => shorthand(screenCodec, zt = false, lp = true)
    case "pscrz" => shorthand(screenCodec, zt = true, lp = true)
    case _ => TextCodec.forName(codecName, Some(pos), log)
  }
}
// def operator: P[String] = P(CharsWhileIn("!-+*/><=~|&^", min=1).!) // TODO: only valid operators
@ -111,11 +118,14 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
val charAtom: P[LiteralExpression] = for {
p <- position()
c <- "'" ~/ CharPred(c => c >= ' ' && c != '\'' && !invalidCharLiteralTypes(Character.getType(c))).rep.! ~/ "'"
((co, zt), lenient) <- HWS ~ codec
TextCodecWithFlags(co, zt, pascal, lenient) <- HWS ~ codec
} yield {
if (zt) {
log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p))
}
if (pascal) {
log.error("Length-prefixed encoding is not a valid encoding for a character literal", Some(p))
}
co.encode(options.log, Some(p), c.codePoints().toArray.toList, options, lenient = lenient) match {
case List(value) =>
LiteralExpression(value, 1)
@ -134,10 +144,19 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
}
// Parses a double-quoted string followed by an optional codec identifier and yields the literal
// as a list of size-1 literal expressions: optional length prefix, encoded text, optional terminator.
val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
// NOTE(review): this arm destructures the former ((codec, zeroTerminated), lenient) tuple shape,
// while the arm further below matches TextCodecWithFlags — looks like a leftover; confirm.
case (p, s, ((co, zt), lenient)) =>
val characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1))
else characters
case (p, s, TextCodecWithFlags(co, zt, lp, lenient)) =>
var characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
if (lp) {
// The length of the codec's terminator doubles as the size of one code unit,
// and therefore also as the size of the length prefix.
val sizeof = co.stringTerminator.length
// The length is counted before the terminator is appended, so the terminator is not included.
val codeUnitCount = characters.length / sizeof
// Largest value that fits in `sizeof` bytes, i.e. (1 << (8 * sizeof)) - 1.
val maxAllowed = 1.<<(8*sizeof) - 1
if (codeUnitCount > maxAllowed) {
// Only an error is logged; the prefix below is still emitted, with the excess bits masked off.
log.error(s"Length-prefixed string too long, the length is $codeUnitCount, maximum allowed is $maxAllowed", Some(p))
}
// Prepend the length, least significant byte first.
characters = (0 until sizeof).map(i => LiteralExpression(codeUnitCount.>>>(8*i).&(0xff), 1)).toList ++ characters
}
if (zt) characters ++= co.stringTerminator.map(nul => LiteralExpression(nul, 1))
characters
}
val textLiteralAtom: P[TextLiteralExpression] = textLiteral.map(TextLiteralExpression)

View File

@ -12,6 +12,8 @@ import millfork.node.Position
* @author Karol Stasiak
*/
/**
  * A text codec together with the literal-format flags selected by an encoding identifier.
  *
  * @param code           the codec used to encode the characters
  * @param nullTerminated whether the codec's string terminator is appended after the text
  * @param lengthPrefixed whether the text is preceded by its length (Pascal-style string)
  * @param lenient        passed on as the `lenient` argument when encoding with this codec
  */
final case class TextCodecWithFlags(code: TextCodec, nullTerminated: Boolean, lengthPrefixed: Boolean, lenient: Boolean)
sealed trait TextCodec {
def name: String
@ -265,65 +267,76 @@ class TableTextCodec(override val name: String,
}
object TextCodec {
/**
  * All known encodings, keyed by the plain name used in source code and platform definitions,
  * i.e. without the `p` prefix or `z` suffix. Several names may alias the same codec.
  *
  * This is deliberately a `lazy val`: the map is declared before some of the codecs it refers to
  * (e.g. `Utf8`, a strict `val` defined further down in this object). A strict `val` here would be
  * evaluated during object initialisation, before those fields are assigned, and would capture
  * uninitialised references. Deferring evaluation to first use avoids that ordering hazard.
  */
lazy val allCodecs: Map[String, TextCodec] = Map(
  "ascii" -> TextCodec.Ascii,
  "petscii" -> TextCodec.Petscii,
  "pet" -> TextCodec.Petscii,
  "petsciijp" -> TextCodec.PetsciiJp,
  "petjp" -> TextCodec.PetsciiJp,
  "oldpetscii" -> TextCodec.OldPetscii,
  "oldpet" -> TextCodec.OldPetscii,
  "origpetscii" -> TextCodec.OriginalPetscii,
  "origpet" -> TextCodec.OriginalPetscii,
  "cbmscr" -> TextCodec.CbmScreencodes,
  "petscr" -> TextCodec.CbmScreencodes,
  "cbmscrjp" -> TextCodec.CbmScreencodesJp,
  "petscrjp" -> TextCodec.CbmScreencodesJp,
  "atascii" -> TextCodec.Atascii,
  "atari" -> TextCodec.Atascii,
  "atasciiscr" -> TextCodec.AtasciiScreencodes,
  "atariscr" -> TextCodec.AtasciiScreencodes,
  "bbc" -> TextCodec.Bbc,
  "sinclair" -> TextCodec.Sinclair,
  "apple2" -> TextCodec.Apple2,
  "jis" -> TextCodec.Jis,
  "jisx" -> TextCodec.Jis,
  "iso_de" -> TextCodec.IsoIec646De,
  "iso_no" -> TextCodec.IsoIec646No,
  "iso_dk" -> TextCodec.IsoIec646No,
  "iso_se" -> TextCodec.IsoIec646Se,
  "iso_fi" -> TextCodec.IsoIec646Se,
  "iso_yu" -> TextCodec.IsoIec646Yu,
  "msx_intl" -> TextCodec.MsxWest,
  "msx_us" -> TextCodec.MsxWest,
  "msx_uk" -> TextCodec.MsxWest,
  "msx_de" -> TextCodec.MsxWest,
  "msx_fr" -> TextCodec.MsxWest,
  "msx_es" -> TextCodec.MsxWest,
  "msx_ru" -> TextCodec.MsxRu,
  "msx_jp" -> TextCodec.MsxJp,
  "msx_br" -> TextCodec.MsxBr,
  "vectrex" -> TextCodec.Vectrex,
  "koi7n2" -> TextCodec.Koi7N2,
  "short_koi" -> TextCodec.Koi7N2,
  "zx80" -> TextCodec.Zx80,
  "zx81" -> TextCodec.Zx81,
  "iso8859_15" -> TextCodec.Iso8859_15,
  "latin0" -> TextCodec.Iso8859_15,
  "latin9" -> TextCodec.Iso8859_15,
  "iso15" -> TextCodec.Iso8859_15,
  "utf8" -> TextCodec.Utf8,
  "utf16be" -> TextCodec.Utf16Be,
  "utf16le" -> TextCodec.Utf16Le,
)
def forName(name: String, position: Option[Position], log: Logger): (TextCodec, Boolean) = {
val zeroTerminated = name.endsWith("z")
val cleanName = name.stripSuffix("z")
val codec = (position, cleanName) match {
case (_, "ascii") => TextCodec.Ascii
case (_, "petscii") => TextCodec.Petscii
case (_, "pet") => TextCodec.Petscii
case (_, "petsciijp") => TextCodec.PetsciiJp
case (_, "petjp") => TextCodec.PetsciiJp
case (_, "oldpetscii") => TextCodec.OldPetscii
case (_, "oldpet") => TextCodec.OldPetscii
case (_, "origpetscii") => TextCodec.OriginalPetscii
case (_, "origpet") => TextCodec.OriginalPetscii
case (_, "cbmscr") => TextCodec.CbmScreencodes
case (_, "petscr") => TextCodec.CbmScreencodes
case (_, "cbmscrjp") => TextCodec.CbmScreencodesJp
case (_, "petscrjp") => TextCodec.CbmScreencodesJp
case (_, "atascii") => TextCodec.Atascii
case (_, "atari") => TextCodec.Atascii
case (_, "atasciiscr") => TextCodec.AtasciiScreencodes
case (_, "atariscr") => TextCodec.AtasciiScreencodes
case (_, "bbc") => TextCodec.Bbc
case (_, "sinclair") => TextCodec.Sinclair
case (_, "apple2") => TextCodec.Apple2
case (_, "jis") => TextCodec.Jis
case (_, "jisx") => TextCodec.Jis
case (_, "iso_de") => TextCodec.IsoIec646De
case (_, "iso_no") => TextCodec.IsoIec646No
case (_, "iso_dk") => TextCodec.IsoIec646No
case (_, "iso_se") => TextCodec.IsoIec646Se
case (_, "iso_fi") => TextCodec.IsoIec646Se
case (_, "iso_yu") => TextCodec.IsoIec646Yu
case (_, "msx_intl") => TextCodec.MsxWest
case (_, "msx_us") => TextCodec.MsxWest
case (_, "msx_uk") => TextCodec.MsxWest
case (_, "msx_de") => TextCodec.MsxWest
case (_, "msx_fr") => TextCodec.MsxWest
case (_, "msx_es") => TextCodec.MsxWest
case (_, "msx_ru") => TextCodec.MsxRu
case (_, "msx_jp") => TextCodec.MsxJp
case (_, "msx_br") => TextCodec.MsxBr
case (_, "vectrex") => TextCodec.Vectrex
case (_, "koi7n2") => TextCodec.Koi7N2
case (_, "short_koi") => TextCodec.Koi7N2
case (_, "zx80") => TextCodec.Zx80
case (_, "zx81") => TextCodec.Zx81
case (_, "iso8859_15") => TextCodec.Iso8859_15
case (_, "latin0") => TextCodec.Iso8859_15
case (_, "latin9") => TextCodec.Iso8859_15
case (_, "iso15") => TextCodec.Iso8859_15
case (_, "utf8") => TextCodec.Utf8
case (_, "utf16be") => TextCodec.Utf16Be
case (_, "utf16le") => TextCodec.Utf16Le
case (p, _) =>
log.error(s"Unknown string encoding: `$name`", p)
TextCodec.Ascii
/**
  * Resolves an encoding identifier, optionally decorated with a `p` prefix and/or a `z` suffix,
  * to a codec and its flags.
  *
  * Candidates are tried in this order and the first one found in `allCodecs` wins:
  * the exact name; the name without a trailing `z` (zero-terminated); the name without a leading `p`
  * (length-prefixed); the name without both (length-prefixed and zero-terminated).
  * Because the exact name is tried first, an encoding whose own name starts with `p`
  * (such as `pet`) is not mistaken for a length-prefixed one.
  *
  * @param name     the identifier as written by the user
  * @param position source position reported with the error when the name is unknown
  * @param log      logger that receives the "Unknown string encoding" error
  * @return the resolved codec with its flags; `lenient` is always `false` here.
  *         For an unknown name an error is logged and plain ASCII without flags is returned.
  */
def forName(name: String, position: Option[Position], log: Logger): TextCodecWithFlags = {
// 1. Exact match: no flags.
if (allCodecs.contains(name)) return TextCodecWithFlags(allCodecs(name), nullTerminated = false, lengthPrefixed = false, lenient = false)
// 2. `<name>z`: zero-terminated.
if (name.endsWith("z")) {
val cleanName = name.stripSuffix("z")
if (allCodecs.contains(cleanName)) return TextCodecWithFlags(allCodecs(cleanName), nullTerminated = true, lengthPrefixed = false, lenient = false)
}
// NOTE(review): `codec` and `zeroTerminated` are not defined in this method as shown; this line
// looks like a leftover of the previous tuple-returning implementation — confirm and remove.
codec -> zeroTerminated
// NOTE(review): `lengthPrefixed` is not read anywhere below; the condition is re-evaluated instead.
val lengthPrefixed = name.startsWith("p")
if (name.startsWith("p")) {
// 3. `p<name>`: length-prefixed.
val cleanName = name.stripPrefix("p")
if (allCodecs.contains(cleanName)) return TextCodecWithFlags(allCodecs(cleanName), nullTerminated = false, lengthPrefixed = true, lenient = false)
// 4. `p<name>z`: length-prefixed and zero-terminated.
if (cleanName.endsWith("z")) {
val cleanName2 = cleanName.stripSuffix("z")
if (allCodecs.contains(cleanName2)) return TextCodecWithFlags(allCodecs(cleanName2), nullTerminated = true, lengthPrefixed = true, lenient = false)
}
}
// Nothing matched: report the original, undecorated-or-not name and fall back to ASCII.
log.error(s"Unknown string encoding: `$name`", position)
TextCodecWithFlags(TextCodec.Ascii, nullTerminated = false, lengthPrefixed = false, lenient = false)
}
private val Utf8 = new UnicodeTextCodec("UTF-8", StandardCharsets.UTF_8, List(0))
@ -337,7 +350,7 @@ object TextCodec {
private lazy val DefaultOverrides: Map[Char, Int] = ('\u2400' to '\u2420').map(c => c->(c.toInt - 0x2400)).toMap + ('\u2421' -> 127)
//noinspection ScalaUnusedSymbol
private val AsciiEscapeSequences: Map[String, List[Int]] = Map(
private lazy val AsciiEscapeSequences: Map[String, List[Int]] = Map(
"n" -> List(13, 10),
"t" -> List(9),
"b" -> List(8),
@ -347,12 +360,12 @@ object TextCodec {
"rbrace" -> List('}'.toInt))
//noinspection ScalaUnusedSymbol
private val MinimalEscapeSequencesWithoutBraces: Map[String, List[Int]] = Map(
private lazy val MinimalEscapeSequencesWithoutBraces: Map[String, List[Int]] = Map(
"apos" -> List('\''.toInt),
"q" -> List('\"'.toInt))
//noinspection ScalaUnusedSymbol
private val MinimalEscapeSequencesWithBraces: Map[String, List[Int]] = Map(
private lazy val MinimalEscapeSequencesWithBraces: Map[String, List[Int]] = Map(
"apos" -> List('\''.toInt),
"q" -> List('\"'.toInt),
"lbrace" -> List('{'.toInt),

View File

@ -0,0 +1,30 @@
package millfork.test
import millfork.Cpu
import millfork.test.emu.{EmuUnoptimizedCrossPlatformRun, ShouldNotCompile, ShouldNotParse}
import org.scalatest.{AppendedClues, FunSuite, Matchers}
/**
* @author Karol Stasiak
*/
class PStringSuite extends FunSuite with Matchers with AppendedClues {

  // Reading the first byte of a length-prefixed literal must yield its length.
  test("pstrlen") {
    val program =
      """
        |byte pstrlen(pointer str) {
        | return str[0]
        |}
        |byte output @ $c000
        |void main() {
        | output = pstrlen("hello"p)
        |}
        |""".stripMargin
    EmuUnoptimizedCrossPlatformRun(Cpu.Mos, Cpu.Z80, Cpu.Motorola6809)(program) { memory =>
      memory.readByte(0xc000) should be (5)
    }
  }

  // 256 characters is one more than a single-byte length prefix can express.
  test("P-string literal too long") {
    val oversized = "a" * 256
    ShouldNotParse(s"""const pointer p = "$oversized"p""")
  }
}

View File

@ -39,7 +39,11 @@ object ShouldNotParse extends Matchers {
}
parserF.toAst match {
case Success(program, _) =>
fail("Parse succeeded")
if (!log.hasErrors) {
fail("Parse succeeded")
} else {
log.warn("Non-fatal parse errors encountered. OK.")
}
case f: Failure[_, _] =>
println(f.extra.toString)
log.warn("Last parser: " + f.lastParser, Some(parserF.indexToPosition(f.index, f.lastParser.toString)))