Add support for Pascal-style strings

2025-04-22 16:40:03 +00:00 · 2020-04-04 00:45:09 +02:00 · 2020-04-04 00:45:09 +02:00 · 5df695f2c2
commit 5df695f2c2
parent 7ce088514f
7 changed files with 186 additions and 78 deletions
--- a/docs/lang/literals.md
+++ b/docs/lang/literals.md
@ -70,6 +70,32 @@ Warning: If you define UTF-16 to be you default or screen encoding, you will enc
 * `nullchar` and `nullchar_scr` will still be bytes, equal to zero.
 * the `string` module in the Millfork standard library will not work correctly

+## Length-prefixed strings (Pascal strings)
+
+You can also prepend `p` to the name of the encoding to make the string length-prefixed.
+
+The length is measured in bytes and doesn't include the zero terminator, if present.
+In all encodings except for UTF-16 the prefix takes one byte,
+which means that length-prefixed strings cannot be longer than 255 bytes.
+ 
+In case of UTF-16, the length prefix contains the number of code units,
+i.e. the number of bytes divided by two,
+which allows for strings of up to 65535 code units.
+The length is stored as two bytes and is always little-endian,
+even in case of the `utf16be` encoding or a big-endian processor.
+
+        "this is a Pascal string" pascii
+        "this is also a Pascal string"p
+        "this is a zero-terminated Pascal string"pz
+
+Note: A string that's both length-prefixed and zero-terminated does not count as a normal zero-terminated string!
+To pass it to a function that expects a zero-terminated string, add 1 (or, in case of UTF-16, 2):
+
+    pointer p
+    p = "test"pz
+    // putstrz(p)  // won't work correctly
+    putstrz(p + 1) // ok
+
 ## Escape sequences and miscellaneous compatibility issues

 Most characters between the quotes are interpreted literally.
--- a/src/main/scala/millfork/Platform.scala
+++ b/src/main/scala/millfork/Platform.scala
@ -7,7 +7,7 @@ import java.util.Locale

 import millfork.error.Logger
 import millfork.output._
-import millfork.parser.TextCodec
+import millfork.parser.{TextCodec, TextCodecWithFlags}
 import org.apache.commons.configuration2.INIConfiguration

 /**
@ -126,17 +126,23 @@ object Platform {

    val codecName = cs.get(classOf[String], "encoding", "ascii")
    val srcCodecName = cs.get(classOf[String], "screen_encoding", codecName)
-    val (codec, czt) = TextCodec.forName(codecName, None, log)
+    val TextCodecWithFlags(codec, czt, clp, _) = TextCodec.forName(codecName, None, log)
    if (czt) {
      log.error("Default encoding cannot be zero-terminated")
    }
+    if (clp) {
+      log.error("Default encoding cannot be length-prefixed")
+    }
    if (codec.stringTerminator.length != 1) {
      log.warn("Default encoding should be byte-based")
    }
-    val (srcCodec, szt) = TextCodec.forName(srcCodecName, None, log)
+    val TextCodecWithFlags(srcCodec, szt, slp, _) = TextCodec.forName(srcCodecName, None, log)
    if (szt) {
      log.error("Default screen encoding cannot be zero-terminated")
    }
+    if (slp) {
+      log.error("Default screen encoding cannot be length-prefixed")
+    }

    val as = conf.getSection("allocation")

--- a/src/main/scala/millfork/output/AbstractAssembler.scala
+++ b/src/main/scala/millfork/output/AbstractAssembler.scala
@ -687,15 +687,25 @@ abstract class AbstractAssembler[T <: AbstractCode](private val program: Program

  private def printArrayToAssemblyOutput(assembly: ArrayBuffer[String], name: String, elementType: Type, items: Seq[Expression]): Unit = {
    if (name.startsWith("textliteral$")) {
-      val chars = items.lastOption match {
-        case Some(LiteralExpression(0, _)) => items.init
+      var suffix = ""
+      var chars = items.lastOption match {
+        case Some(LiteralExpression(0, _)) =>
+          suffix = "z"
+          items.init
        case _ => items
      }
+      chars.headOption match {
+        case Some(LiteralExpression(n, _)) if n + 1 == chars.size  =>
+          // length-prefixed
+          suffix = "p" + suffix
+          chars = chars.tail
+        case _ =>
+      }
      val text = chars.map {
        case LiteralExpression(i, _) if i >= 0 && i <= 255 => platform.defaultCodec.decode(i.toInt)
        case _ => TextCodec.NotAChar
      }.mkString("")
-      if (!text.contains(TextCodec.NotAChar) && !text.exists(c => c.isControl)) assembly.append("    ; \"" + text + "\"")
+      if (!text.contains(TextCodec.NotAChar) && !text.exists(c => c.isControl)) assembly.append("    ; \"" + text + "\"" + suffix)
    }
    items.flatMap(expr => env.eval(expr) match {
      case Some(c) => List.tabulate(elementType.size)(i => subbyte(c, i, elementType.size).quickSimplify.toString)
--- a/src/main/scala/millfork/parser/MfParser.scala
+++ b/src/main/scala/millfork/parser/MfParser.scala
@ -98,12 +98,19 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri

  val functionFlags: P[Set[String]] = flags_("asm", "inline", "interrupt", "macro", "noinline", "reentrant", "kernal_interrupt", "const")

-  val codec: P[((TextCodec, Boolean), Boolean)] = P(position("text codec identifier") ~ identifier.?.map(_.getOrElse(""))).map {
-    case (_, "" | "default") => (options.platform.defaultCodec -> false) -> options.flag(CompilationFlag.LenientTextEncoding)
-    case (_, "z" | "defaultz") => (options.platform.defaultCodec -> true) -> options.flag(CompilationFlag.LenientTextEncoding)
-    case (_, "scr") => (options.platform.screenCodec -> false) -> options.flag(CompilationFlag.LenientTextEncoding)
-    case (_, "scrz") => (options.platform.screenCodec -> true) -> options.flag(CompilationFlag.LenientTextEncoding)
-    case (p, x) => TextCodec.forName(x, Some(p), log) -> false
+  // Parses the optional encoding identifier that may follow a character or string literal.
+  // Identifiers that refer to the platform's default or screen codec are resolved right here and
+  // carry the lenient-encoding compilation flag; every other identifier is handed to TextCodec.forName.
+  val codec: P[TextCodecWithFlags] = P(position("text codec identifier") ~ identifier.?.map(_.getOrElse(""))).map { case (where, codecName) =>
+    val lenient = options.flag(CompilationFlag.LenientTextEncoding)
+
+    def withFlags(base: TextCodec, zt: Boolean, lp: Boolean): TextCodecWithFlags =
+      TextCodecWithFlags(base, nullTerminated = zt, lengthPrefixed = lp, lenient = lenient)
+
+    def default(zt: Boolean, lp: Boolean): TextCodecWithFlags = withFlags(options.platform.defaultCodec, zt, lp)
+
+    def screen(zt: Boolean, lp: Boolean): TextCodecWithFlags = withFlags(options.platform.screenCodec, zt, lp)
+
+    codecName match {
+      case "" | "default" => default(zt = false, lp = false)
+      case "z" | "defaultz" => default(zt = true, lp = false)
+      case "p" | "pdefault" => default(zt = false, lp = true)
+      case "pz" | "pdefaultz" => default(zt = true, lp = true)
+      case "scr" => screen(zt = false, lp = false)
+      case "scrz" => screen(zt = true, lp = false)
+      case "pscr" => screen(zt = false, lp = true)
+      case "pscrz" => screen(zt = true, lp = true)
+      case other => TextCodec.forName(other, Some(where), log)
+    }
   }

  //  def operator: P[String] = P(CharsWhileIn("!-+*/><=~|&^", min=1).!) // TODO: only valid operators
@ -111,11 +118,14 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
  val charAtom: P[LiteralExpression] = for {
    p <- position()
    c <- "'" ~/ CharPred(c => c >= ' ' && c != '\'' && !invalidCharLiteralTypes(Character.getType(c))).rep.! ~/ "'"
-    ((co, zt), lenient) <- HWS ~ codec
+    TextCodecWithFlags(co, zt, pascal, lenient) <- HWS ~ codec
  } yield {
    if (zt) {
      log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p))
    }
+    if (pascal) {
+      log.error("Length-prefixed encoding is not a valid encoding for a character literal", Some(p))
+    }
    co.encode(options.log, Some(p), c.codePoints().toArray.toList, options, lenient = lenient) match {
      case List(value) =>
        LiteralExpression(value, 1)
@ -134,10 +144,19 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
  }

  val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
-      case (p, s, ((co, zt), lenient)) =>
-        val characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
-        if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1))
-        else characters
+      case (p, s, TextCodecWithFlags(co, zt, lp, lenient)) =>
+        var characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
+        if (lp) {
+          val sizeof = co.stringTerminator.length
+          val codeUnitCount = characters.length / sizeof
+          val maxAllowed = 1.<<(8*sizeof) - 1
+          if (codeUnitCount > maxAllowed) {
+            log.error(s"Length-prefixed string too long, the length is $codeUnitCount, maximum allowed is $maxAllowed", Some(p))
+          }
+          characters = (0 until sizeof).map(i => LiteralExpression(codeUnitCount.>>>(8*i).&(0xff), 1)).toList ++ characters
+        }
+        if (zt) characters ++= co.stringTerminator.map(nul => LiteralExpression(nul, 1))
+        characters
    }

  val textLiteralAtom: P[TextLiteralExpression] = textLiteral.map(TextLiteralExpression)
--- a/src/main/scala/millfork/parser/TextCodec.scala
+++ b/src/main/scala/millfork/parser/TextCodec.scala
@ -12,6 +12,8 @@ import millfork.node.Position
  * @author Karol Stasiak
  */

+/**
+  * A text codec bundled with the flags that accompany an encoding name.
+  *
+  * @param code           the codec itself
+  * @param nullTerminated whether the text should be followed by the codec's string terminator
+  * @param lengthPrefixed whether the text should be preceded by its length
+  * @param lenient        value the parser passes on as the `lenient` argument when encoding
+  */
+final case class TextCodecWithFlags(code: TextCodec, nullTerminated: Boolean, lengthPrefixed: Boolean, lenient: Boolean)
+
 sealed trait TextCodec {
  def name: String

@ -265,65 +267,76 @@ class TableTextCodec(override val name: String,
 }

 object TextCodec {
+  /**
+    * All known base encodings, keyed by every name (including aliases) they can be referred to by.
+    * The names here carry neither the `p` prefix nor the `z` suffix; those are handled by `forName`.
+    *
+    * This has to be lazy: it is declared before the codec values of this object, and a strict `val`
+    * would read the strict ones among them (e.g. `Utf8`) before they are initialized,
+    * storing `null` in the map.
+    */
+  lazy val allCodecs: Map[String, TextCodec] = Map(
+    "ascii" -> TextCodec.Ascii,
+    "petscii" -> TextCodec.Petscii,
+    "pet" -> TextCodec.Petscii,
+    "petsciijp" -> TextCodec.PetsciiJp,
+    "petjp" -> TextCodec.PetsciiJp,
+    "oldpetscii" -> TextCodec.OldPetscii,
+    "oldpet" -> TextCodec.OldPetscii,
+    "origpetscii" -> TextCodec.OriginalPetscii,
+    "origpet" -> TextCodec.OriginalPetscii,
+    "cbmscr" -> TextCodec.CbmScreencodes,
+    "petscr" -> TextCodec.CbmScreencodes,
+    "cbmscrjp" -> TextCodec.CbmScreencodesJp,
+    "petscrjp" -> TextCodec.CbmScreencodesJp,
+    "atascii" -> TextCodec.Atascii,
+    "atari" -> TextCodec.Atascii,
+    "atasciiscr" -> TextCodec.AtasciiScreencodes,
+    "atariscr" -> TextCodec.AtasciiScreencodes,
+    "bbc" -> TextCodec.Bbc,
+    "sinclair" -> TextCodec.Sinclair,
+    "apple2" -> TextCodec.Apple2,
+    "jis" -> TextCodec.Jis,
+    "jisx" -> TextCodec.Jis,
+    "iso_de" -> TextCodec.IsoIec646De,
+    "iso_no" -> TextCodec.IsoIec646No,
+    "iso_dk" -> TextCodec.IsoIec646No,
+    "iso_se" -> TextCodec.IsoIec646Se,
+    "iso_fi" -> TextCodec.IsoIec646Se,
+    "iso_yu" -> TextCodec.IsoIec646Yu,
+    "msx_intl" -> TextCodec.MsxWest,
+    "msx_us" -> TextCodec.MsxWest,
+    "msx_uk" -> TextCodec.MsxWest,
+    "msx_de" -> TextCodec.MsxWest,
+    "msx_fr" -> TextCodec.MsxWest,
+    "msx_es" -> TextCodec.MsxWest,
+    "msx_ru" -> TextCodec.MsxRu,
+    "msx_jp" -> TextCodec.MsxJp,
+    "msx_br" -> TextCodec.MsxBr,
+    "vectrex" -> TextCodec.Vectrex,
+    "koi7n2" -> TextCodec.Koi7N2,
+    "short_koi" -> TextCodec.Koi7N2,
+    "zx80" -> TextCodec.Zx80,
+    "zx81" -> TextCodec.Zx81,
+    "iso8859_15" -> TextCodec.Iso8859_15,
+    "latin0" -> TextCodec.Iso8859_15,
+    "latin9" -> TextCodec.Iso8859_15,
+    "iso15" -> TextCodec.Iso8859_15,
+    "utf8" -> TextCodec.Utf8,
+    "utf16be" -> TextCodec.Utf16Be,
+    "utf16le" -> TextCodec.Utf16Le,
+  )

-  def forName(name: String, position: Option[Position], log: Logger): (TextCodec, Boolean) = {
-    val zeroTerminated = name.endsWith("z")
-    val cleanName = name.stripSuffix("z")
-    val codec = (position, cleanName) match {
-      case (_, "ascii") => TextCodec.Ascii
-      case (_, "petscii") => TextCodec.Petscii
-      case (_, "pet") => TextCodec.Petscii
-      case (_, "petsciijp") => TextCodec.PetsciiJp
-      case (_, "petjp") => TextCodec.PetsciiJp
-      case (_, "oldpetscii") => TextCodec.OldPetscii
-      case (_, "oldpet") => TextCodec.OldPetscii
-      case (_, "origpetscii") => TextCodec.OriginalPetscii
-      case (_, "origpet") => TextCodec.OriginalPetscii
-      case (_, "cbmscr") => TextCodec.CbmScreencodes
-      case (_, "petscr") => TextCodec.CbmScreencodes
-      case (_, "cbmscrjp") => TextCodec.CbmScreencodesJp
-      case (_, "petscrjp") => TextCodec.CbmScreencodesJp
-      case (_, "atascii") => TextCodec.Atascii
-      case (_, "atari") => TextCodec.Atascii
-      case (_, "atasciiscr") => TextCodec.AtasciiScreencodes
-      case (_, "atariscr") => TextCodec.AtasciiScreencodes
-      case (_, "bbc") => TextCodec.Bbc
-      case (_, "sinclair") => TextCodec.Sinclair
-      case (_, "apple2") => TextCodec.Apple2
-      case (_, "jis") => TextCodec.Jis
-      case (_, "jisx") => TextCodec.Jis
-      case (_, "iso_de") => TextCodec.IsoIec646De
-      case (_, "iso_no") => TextCodec.IsoIec646No
-      case (_, "iso_dk") => TextCodec.IsoIec646No
-      case (_, "iso_se") => TextCodec.IsoIec646Se
-      case (_, "iso_fi") => TextCodec.IsoIec646Se
-      case (_, "iso_yu") => TextCodec.IsoIec646Yu
-      case (_, "msx_intl") => TextCodec.MsxWest
-      case (_, "msx_us") => TextCodec.MsxWest
-      case (_, "msx_uk") => TextCodec.MsxWest
-      case (_, "msx_de") => TextCodec.MsxWest
-      case (_, "msx_fr") => TextCodec.MsxWest
-      case (_, "msx_es") => TextCodec.MsxWest
-      case (_, "msx_ru") => TextCodec.MsxRu
-      case (_, "msx_jp") => TextCodec.MsxJp
-      case (_, "msx_br") => TextCodec.MsxBr
-      case (_, "vectrex") => TextCodec.Vectrex
-      case (_, "koi7n2") => TextCodec.Koi7N2
-      case (_, "short_koi") => TextCodec.Koi7N2
-      case (_, "zx80") => TextCodec.Zx80
-      case (_, "zx81") => TextCodec.Zx81
-      case (_, "iso8859_15") => TextCodec.Iso8859_15
-      case (_, "latin0") => TextCodec.Iso8859_15
-      case (_, "latin9") => TextCodec.Iso8859_15
-      case (_, "iso15") => TextCodec.Iso8859_15
-      case (_, "utf8") => TextCodec.Utf8
-      case (_, "utf16be") => TextCodec.Utf16Be
-      case (_, "utf16le") => TextCodec.Utf16Le
-      case (p, _) =>
-        log.error(s"Unknown string encoding: `$name`", p)
-        TextCodec.Ascii
+  /**
+    * Resolves an encoding name, optionally decorated with a `p` prefix (length-prefixed)
+    * and/or a `z` suffix (zero-terminated), to a codec and its flags.
+    *
+    * An exact match in `allCodecs` always wins, so codec names that themselves start with `p`
+    * (like `petscii`) are not mistaken for length-prefixed variants of another codec.
+    * If nothing matches, an error is logged and plain ASCII without any flags is returned.
+    * The `lenient` flag of the result is always false.
+    *
+    * @param name     encoding name as written by the user
+    * @param position position to attach to the error message, if any
+    * @param log      logger that receives the "unknown encoding" error
+    * @return the resolved codec with its flags
+    */
+  def forName(name: String, position: Option[Position], log: Logger): TextCodecWithFlags = {
+    val zeroTerminated = name.endsWith("z")
+    val lengthPrefixed = name.startsWith("p")
+    val withoutPrefix = name.stripPrefix("p")
+    // (applicable, base codec name, nullTerminated, lengthPrefixed), in order of precedence
+    val candidates = List(
+      (true, name, false, false),
+      (zeroTerminated, name.stripSuffix("z"), true, false),
+      (lengthPrefixed, withoutPrefix, false, true),
+      (lengthPrefixed && zeroTerminated, withoutPrefix.stripSuffix("z"), true, true))
+    val resolved = candidates.collectFirst {
+      case (true, baseName, zt, lp) if allCodecs.contains(baseName) =>
+        TextCodecWithFlags(allCodecs(baseName), nullTerminated = zt, lengthPrefixed = lp, lenient = false)
     }
-    codec -> zeroTerminated
+    resolved.getOrElse {
+      log.error(s"Unknown string encoding: `$name`", position)
+      TextCodecWithFlags(TextCodec.Ascii, nullTerminated = false, lengthPrefixed = false, lenient = false)
+    }
   }

  private val Utf8 = new UnicodeTextCodec("UTF-8", StandardCharsets.UTF_8, List(0))
@ -337,7 +350,7 @@ object TextCodec {
  private lazy val DefaultOverrides: Map[Char, Int] = ('\u2400' to '\u2420').map(c => c->(c.toInt - 0x2400)).toMap + ('\u2421' -> 127)

  //noinspection ScalaUnusedSymbol
-  private val AsciiEscapeSequences: Map[String, List[Int]] = Map(
+  private lazy val AsciiEscapeSequences: Map[String, List[Int]] = Map(
    "n" -> List(13, 10),
    "t" -> List(9),
    "b" -> List(8),
@ -347,12 +360,12 @@ object TextCodec {
    "rbrace" -> List('}'.toInt))

  //noinspection ScalaUnusedSymbol
-  private val MinimalEscapeSequencesWithoutBraces: Map[String, List[Int]] = Map(
+  private lazy val MinimalEscapeSequencesWithoutBraces: Map[String, List[Int]] = Map(
    "apos" -> List('\''.toInt),
    "q" -> List('\"'.toInt))

  //noinspection ScalaUnusedSymbol
-  private val MinimalEscapeSequencesWithBraces: Map[String, List[Int]] = Map(
+  private lazy val MinimalEscapeSequencesWithBraces: Map[String, List[Int]] = Map(
    "apos" -> List('\''.toInt),
    "q" -> List('\"'.toInt),
    "lbrace" -> List('{'.toInt),
--- a/src/test/scala/millfork/test/PStringSuite.scala
+++ b/src/test/scala/millfork/test/PStringSuite.scala
@ -0,0 +1,30 @@
+package millfork.test
+
+import millfork.Cpu
+import millfork.test.emu.{EmuUnoptimizedCrossPlatformRun, ShouldNotCompile, ShouldNotParse}
+import org.scalatest.{AppendedClues, FunSuite, Matchers}
+
+/**
+  * @author Karol Stasiak
+  */
+class PStringSuite extends FunSuite with Matchers with AppendedClues {
+
+  // The first byte of a length-prefixed literal is expected to hold its length.
+  test("pstrlen") {
+    val program =
+      """
+        |byte pstrlen(pointer str) {
+        |    return str[0]
+        |}
+        |byte output @ $c000
+        |void main() {
+        |    output = pstrlen("hello"p)
+        |}
+        |""".stripMargin
+    EmuUnoptimizedCrossPlatformRun(Cpu.Mos, Cpu.Z80, Cpu.Motorola6809)(program) { memory =>
+      memory.readByte(0xc000) should be (5)
+    }
+  }
+
+  // A 256-character literal is one character over what a one-byte length prefix can describe.
+  test("P-string literal too long") {
+    val tooLong = "a" * 256
+    ShouldNotParse(s"""const pointer p = "$tooLong"p""")
+  }
+}
--- a/src/test/scala/millfork/test/emu/ShouldNotParse.scala
+++ b/src/test/scala/millfork/test/emu/ShouldNotParse.scala
@ -39,7 +39,11 @@ object ShouldNotParse extends Matchers {
      }
    parserF.toAst match {
      case Success(program, _) =>
-        fail("Parse succeeded")
+        if (!log.hasErrors) {
+          fail("Parse succeeded")
+        } else {
+          log.warn("Non-fatal parse errors encountered. OK.")
+        }
      case f: Failure[_, _] =>
        println(f.extra.toString)
        log.warn("Last parser: " + f.lastParser, Some(parserF.indexToPosition(f.index, f.lastParser.toString)))