diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f5684c0..109cd4f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ * A very incomplete support for NEC PC-88. +* Automatic selection of text encoding based on target platform. + ## 0.3.0 * Finally faster than C. diff --git a/docs/api/command-line.md b/docs/api/command-line.md index 6ac8292a..3a77db53 100644 --- a/docs/api/command-line.md +++ b/docs/api/command-line.md @@ -81,14 +81,18 @@ Default: no if targeting Ricoh, yes otherwise. * `-fvariable-overlap`, `-fno-variable-overlap` – Whether variables should overlap if their scopes do not intersect. Default: yes. -* `-fbounds-checking`, `-fnobounds-checking` – Whether should insert bounds checking on array access. +* `-fbounds-checking`, `-fno-bounds-checking` – Whether should insert bounds checking on array access. Default: no. -* `-fcompact-dispatch-params`, `-fnocompact-dispatch-params` – +* `-fcompact-dispatch-params`, `-fno-compact-dispatch-params` – Whether parameter values in return dispatch statements may overlap other objects. This may cause problems if the parameter table is stored next to a hardware register that has side effects when reading. `.ini` equivalent: `compact_dispatch_params`. Default: yes. +* `-flenient-encoding`, `-fno-lenient-encoding` – +Whether the compiler should allow for invalid characters in string/character literals that use the default encodings and replace them with alternatives. +`.ini` equivalent: `lenient_encoding`. Default: no. + ## Optimization options * `-O0` – Disable all optimizations. diff --git a/docs/api/custom-platform.md b/docs/api/custom-platform.md index 0f6f75a3..3b8c0c9d 100644 --- a/docs/api/custom-platform.md +++ b/docs/api/custom-platform.md @@ -26,6 +26,13 @@ Every platform is defined in an `.ini` file with an appropriate name. 
* `z80` (Zilog Z80; experimental and very incomplete) +* `encoding` – default encoding for console I/O, one of +`ascii`, `pet`/`petscii`, `petscr`/`cbmscr`, `atascii`, `bbc`, `jis`/`jisx`, `apple2`, +`iso_de`, `iso_no`/`iso_dk`, `iso_se`/`iso_fi`, `iso_yu`. Default: `ascii` + +* `screen_encoding` – default encoding for screencodes (literals with encoding specified as `scr`). +Default: the same as `encoding`. + * `modules` – comma-separated list of modules that will be automatically imported * other compilation options (they can be overridden using commandline options): @@ -54,6 +61,8 @@ Every platform is defined in an `.ini` file with an appropriate name. * `inline` - inline functions automatically by default, default is `false`. * `ipo` - enable interprocedural optimization, default is `false`. + + * `lenient_encoding` - allow for automatic substitution of invalid characters in string literals using the default encodings, default is `false`. #### `[allocation]` section diff --git a/docs/lang/literals.md b/docs/lang/literals.md index 0800bdb4..2173e75a 100644 --- a/docs/lang/literals.md +++ b/docs/lang/literals.md @@ -16,9 +16,10 @@ Hexadecimal: `$D323`, `0x2a2` ## String literals -String literals are surrounded with double quotes and followed by the name of the encoding: +String literals are surrounded with double quotes and optionally followed by the name of the encoding: "this is a string" ascii + "this is also a string" Characters between the quotes are interpreted literally, there are no ways to escape special characters or quotes. @@ -28,11 +29,16 @@ for compatibility with multiple variants. 
Currently available encodings: +* `default` – default console encoding (can be omitted) + +* `scr` – default screencodes +(usually the same as `default`, a notable exception are the Commodore computers) + * `ascii` – standard ASCII * `pet` or `petscii` – PETSCII (ASCII-like character set used by Commodore machines) -* `scr` – Commodore screencodes +* `cbmscr` or `petscr` – Commodore screencodes * `apple2` – Apple II charset ($A0–$FE) @@ -46,16 +52,42 @@ Currently available encodings: When programming for Commodore, use `pet` for strings you're printing using standard I/O routines -and `scr` for strings you're copying to screen memory directly. +and `petscr` for strings you're copying to screen memory directly. + +If the characters in the literal cannot be encoded in particular encoding, an error is raised. +However, if the command-line option `-flenient-encoding` is used, +then literals using `default` and `scr` encodings replace unsupported characters with supported ones +and a warning is issued. +For example, if `-flenient-encoding` is enabled, then a literal `"£¥↑ž©ß"` is equivalent to: + +* `"£Y↑z(C)ss"` if the default encoding is `pet` + +* `"£Y↑z©ss"` if the default encoding is `bbc` + +* `"?Y^z(C)ss"` if the default encoding is `ascii` + +* `"?Y^ž(C)ss"` if the default encoding is `iso_yu` + +* `"?Y^z(C)ß"` if the default encoding is `iso_de` + +* `"?¥^z(C)ss"` if the default encoding is `jisx` + +Note that the final length of the string may vary. ## Character literals -Character literals are surrounded by single quotes and followed by the name of the encoding: +Character literals are surrounded by single quotes and optionally followed by the name of the encoding: 'x' ascii + 'W' From the type system point of view, they are constants of type byte. +If the characters in the literal cannot be encoded in particular encoding, an error is raised. 
+However, if the command-line option `-flenient-encoding` is used, +then literals using `default` and `scr` encodings replace unsupported characters with supported ones. +If the replacement is one character long, only a warning is issued, otherwise an error is raised. + ## Array initialisers An array is initialized with either: diff --git a/examples/c64/calculator.mfk b/examples/c64/calculator.mfk index c716fc94..c0c47d1e 100644 --- a/examples/c64/calculator.mfk +++ b/examples/c64/calculator.mfk @@ -1,10 +1,10 @@ import stdio import c64_basic -array text1 = [ "enter first number:" petscii, 13, 0 ] -array text2 = [ "enter second number:" petscii, 13, 0 ] -array text3 = [ "the sum is:" petscii, 13, 0 ] -array texte = [ "that wasn't a number, try again:" petscii, 13, 0 ] +array text1 = [ "enter first number:", 13, 0 ] +array text2 = [ "enter second number:", 13, 0 ] +array text3 = [ "the sum is:", 13, 0 ] +array texte = [ "that wasn't a number, try again:", 13, 0 ] void main() { word a diff --git a/examples/c64/text_encodings.mfk b/examples/c64/text_encodings.mfk index 62488482..7e84d3db 100644 --- a/examples/c64/text_encodings.mfk +++ b/examples/c64/text_encodings.mfk @@ -1,8 +1,8 @@ import stdio array p = [ - "this is an example" petscii, 13, - "of multiline petscii text" petscii + "this is an example", 13, + "of multiline petscii text" ] array s = [ diff --git a/examples/hello_world/hello_world.mfk b/examples/hello_world/hello_world.mfk index 4bbbf4df..8e2fbfe5 100644 --- a/examples/hello_world/hello_world.mfk +++ b/examples/hello_world/hello_world.mfk @@ -3,7 +3,7 @@ import stdio -array hello_world = "hello world" petscii +array hello_world = "hello world" void main(){ putstr(hello_world, hello_world.length) diff --git a/include/a8.ini b/include/a8.ini index 75c635fe..54c842f4 100644 --- a/include/a8.ini +++ b/include/a8.ini @@ -1,7 +1,7 @@ [compilation] arch=strict modules=a8_kernel,default_panic - +encoding=atascii [allocation] ; TODO diff --git 
a/include/apple2.ini b/include/apple2.ini index 7079822f..f4e53db1 100644 --- a/include/apple2.ini +++ b/include/apple2.ini @@ -1,5 +1,6 @@ [compilation] arch=strict +encoding=apple2 modules=apple2_kernel,default_panic diff --git a/include/bbcmicro.ini b/include/bbcmicro.ini index c0fc7959..95b4d3ec 100644 --- a/include/bbcmicro.ini +++ b/include/bbcmicro.ini @@ -1,6 +1,7 @@ [compilation] ; "strict" guarantees compatibility with Rockwell CPU's in some later Model B's arch=strict +encoding=bbc modules=bbc_kernal,bbc_hardware,default_panic diff --git a/include/c128.ini b/include/c128.ini index e719ed3d..d4117389 100644 --- a/include/c128.ini +++ b/include/c128.ini @@ -1,5 +1,7 @@ [compilation] arch=nmos +encoding=petscii +screen_encoding=petscr modules=c128_hardware,loader_1c01,c128_kernal,default_panic diff --git a/include/c16.ini b/include/c16.ini index 40b2011d..e0385222 100644 --- a/include/c16.ini +++ b/include/c16.ini @@ -1,5 +1,7 @@ [compilation] arch=nmos +encoding=petscii +screen_encoding=petscr modules=loader_1001,c264_kernal,c264_hardware,default_panic diff --git a/include/c64.ini b/include/c64.ini index 3a5e8fc3..56b8cdf2 100644 --- a/include/c64.ini +++ b/include/c64.ini @@ -4,6 +4,8 @@ [compilation] ; CPU architecture: nmos, strictnmos, ricoh, strictricoh, cmos arch=nmos +encoding=petscii +screen_encoding=petscr ; modules to load modules=c64_hardware,loader_0801,c64_kernal,c64_panic,stdlib ; optionally: default flags diff --git a/include/c64_scpu.ini b/include/c64_scpu.ini index 0be7553d..a96e35af 100644 --- a/include/c64_scpu.ini +++ b/include/c64_scpu.ini @@ -3,6 +3,8 @@ [compilation] arch=65816 +encoding=petscii +screen_encoding=petscr modules=c64_hardware,loader_0801,c64_kernal,c64_panic,stdlib emit_65816=emulation diff --git a/include/c64_scpu16.ini b/include/c64_scpu16.ini index f1e4582a..98a0b366 100644 --- a/include/c64_scpu16.ini +++ b/include/c64_scpu16.ini @@ -4,6 +4,8 @@ [compilation] arch=65816 +encoding=petscii +screen_encoding=petscr 
modules=c64_hardware,loader_0801_16bit,c64_kernal,c64_panic,stdlib emit_65816=native diff --git a/include/lunix.ini b/include/lunix.ini index 2c1246e4..cf38e3cf 100644 --- a/include/lunix.ini +++ b/include/lunix.ini @@ -2,6 +2,8 @@ [compilation] arch=nmos +encoding=petscii +screen_encoding=petscr modules=lunix lunix=true diff --git a/include/pc88.ini b/include/pc88.ini index c873a62c..bca7f6f6 100644 --- a/include/pc88.ini +++ b/include/pc88.ini @@ -2,6 +2,7 @@ ;a single-load PC-88 program [compilation] arch=z80 +encoding=jisx modules=default_panic [allocation] diff --git a/include/plus4.ini b/include/plus4.ini index eced39e2..86514327 100644 --- a/include/plus4.ini +++ b/include/plus4.ini @@ -1,5 +1,7 @@ [compilation] arch=nmos +encoding=petscii +screen_encoding=petscr modules=loader_1001,c264_kernal,c264_hardware,default_panic diff --git a/include/vic20.ini b/include/vic20.ini index 7f08a624..be3a3e4b 100644 --- a/include/vic20.ini +++ b/include/vic20.ini @@ -1,5 +1,7 @@ [compilation] arch=nmos +encoding=petscii +screen_encoding=petscr modules=loader_1001,vic20_kernal,default_panic diff --git a/include/vic20_3k.ini b/include/vic20_3k.ini index 1a78a95f..2d78a439 100644 --- a/include/vic20_3k.ini +++ b/include/vic20_3k.ini @@ -1,5 +1,7 @@ [compilation] arch=nmos +encoding=petscii +screen_encoding=petscr modules=loader_0401,vic20_kernal,default_panic diff --git a/include/vic20_8k.ini b/include/vic20_8k.ini index 4529c0f2..bfdf0492 100644 --- a/include/vic20_8k.ini +++ b/include/vic20_8k.ini @@ -1,5 +1,7 @@ [compilation] arch=nmos +encoding=petscii +screen_encoding=petscr modules=loader_1201,vic20_kernal,default_panic diff --git a/include/zxspectrum.ini b/include/zxspectrum.ini index 5925c23e..9812e78f 100644 --- a/include/zxspectrum.ini +++ b/include/zxspectrum.ini @@ -2,6 +2,7 @@ ;a single-load ZX Spectrum 48k program [compilation] arch=z80 +encoding=bbc modules=default_panic,zxspectrum [allocation] diff --git a/src/main/scala/millfork/CompilationOptions.scala 
b/src/main/scala/millfork/CompilationOptions.scala index 04f09a12..452dc603 100644 --- a/src/main/scala/millfork/CompilationOptions.scala +++ b/src/main/scala/millfork/CompilationOptions.scala @@ -203,7 +203,7 @@ object Cpu extends Enumeration { object CompilationFlag extends Enumeration { val // common compilation options: - EmitIllegals, DecimalMode, ReadOnlyArrays, + EmitIllegals, DecimalMode, ReadOnlyArrays, LenientTextEncoding, // compilation options for MOS: EmitCmosOpcodes, EmitCmosNopOpcodes, EmitHudsonOpcodes, Emit65CE02Opcodes, EmitEmulation65816Opcodes, EmitNative65816Opcodes, PreventJmpIndirectBug, LargeCode, ReturnWordsViaAccumulator, @@ -244,6 +244,7 @@ object CompilationFlag extends Enumeration { "ror_warn" -> RorWarning, "prevent_jmp_indirect_bug" -> PreventJmpIndirectBug, "compact_dispatch_params" -> CompactReturnDispatchParams, + "lenient_encoding" -> LenientTextEncoding, ) } \ No newline at end of file diff --git a/src/main/scala/millfork/Main.scala b/src/main/scala/millfork/Main.scala index a09a35ef..926511ec 100644 --- a/src/main/scala/millfork/Main.scala +++ b/src/main/scala/millfork/Main.scala @@ -323,6 +323,9 @@ object Main { boolean("-fbounds-checking", "-fno-bounds-checking").action { (c, v) => c.changeFlag(CompilationFlag.VariableOverlap, v) }.description("Whether should insert bounds checking on array access.") + boolean("-flenient-encoding", "-fno-lenient-encoding").action { (c, v) => + c.changeFlag(CompilationFlag.LenientTextEncoding, v) + }.description("Whether the compiler should replace invalid characters in string literals that use the default encodings.") fluff("", "Optimization options:", "") diff --git a/src/main/scala/millfork/Platform.scala b/src/main/scala/millfork/Platform.scala index 0a3e548a..c1ee36a0 100644 --- a/src/main/scala/millfork/Platform.scala +++ b/src/main/scala/millfork/Platform.scala @@ -6,6 +6,7 @@ import java.nio.file.{Files, Paths} import millfork.error.ErrorReporting import millfork.output._ +import 
millfork.parser.TextCodec import org.apache.commons.configuration2.INIConfiguration /** @@ -20,6 +21,8 @@ class Platform( val cpu: Cpu.Value, val flagOverrides: Map[CompilationFlag.Value, Boolean], val startingModules: List[String], + val defaultCodec: TextCodec, + val screenCodec: TextCodec, val outputPackager: OutputPackager, val codeAllocators: Map[String, UpwardByteAllocator], val variableAllocators: Map[String, VariableAllocator], @@ -98,6 +101,11 @@ object Platform { ErrorReporting.error("Invalid zeropage register size: " + zpRegisterSize) } + val codecName = cs.get(classOf[String], "encoding", "ascii") + val srcCodecName = cs.get(classOf[String], "screen_encoding", codecName) + val codec = TextCodec.forName(codecName, None) + val srcCodec = TextCodec.forName(srcCodecName, None) + val as = conf.getSection("allocation") val banks = as.get(classOf[String], "segments", "default").split("[, ]+").filter(_.nonEmpty).toList @@ -185,7 +193,13 @@ object Platform { case x => ErrorReporting.fatal(s"Invalid output style: `$x`") } - new Platform(cpu, flagOverrides, startingModules, outputPackager, + new Platform( + cpu, + flagOverrides, + startingModules, + codec, + srcCodec, + outputPackager, codeAllocators.toMap, variableAllocators.toMap, zpRegisterSize, diff --git a/src/main/scala/millfork/parser/MfParser.scala b/src/main/scala/millfork/parser/MfParser.scala index d1886bcc..0af1bd5e 100644 --- a/src/main/scala/millfork/parser/MfParser.scala +++ b/src/main/scala/millfork/parser/MfParser.scala @@ -7,7 +7,7 @@ import fastparse.all._ import millfork.env._ import millfork.error.ErrorReporting import millfork.node._ -import millfork.{CompilationOptions, SeparatedList} +import millfork.{CompilationFlag, CompilationOptions, SeparatedList} /** * @author Karol Stasiak @@ -40,26 +40,10 @@ abstract class MfParser[T](filename: String, input: String, currentDirectory: St newPosition } - val codec: P[TextCodec] = P(position("text codec identifier") ~ identifier).map { - case (_, 
"ascii") => TextCodec.Ascii - case (_, "petscii") => TextCodec.Petscii - case (_, "pet") => TextCodec.Petscii - case (_, "scr") => TextCodec.CbmScreencodes - case (_, "atascii") => TextCodec.Atascii - case (_, "atari") => TextCodec.Atascii - case (_, "bbc") => TextCodec.Bbc - case (_, "apple2") => TextCodec.Apple2 - case (_, "jis") => TextCodec.Jis - case (_, "jisx") => TextCodec.Jis - case (_, "iso_de") => TextCodec.IsoIec646De - case (_, "iso_no") => TextCodec.IsoIec646No - case (_, "iso_dk") => TextCodec.IsoIec646No - case (_, "iso_se") => TextCodec.IsoIec646Se - case (_, "iso_fi") => TextCodec.IsoIec646Se - case (_, "iso_yu") => TextCodec.IsoIec646Yu - case (p, x) => - ErrorReporting.error(s"Unknown string encoding: `$x`", Some(p)) - TextCodec.Ascii + val codec: P[(TextCodec, Boolean)] = P(position("text codec identifier") ~ identifier.?.map(_.getOrElse(""))).map { + case (_, "" | "default") => options.platform.defaultCodec -> options.flag(CompilationFlag.LenientTextEncoding) + case (_, "scr") => options.platform.screenCodec -> options.flag(CompilationFlag.LenientTextEncoding) + case (p, x) => TextCodec.forName(x, Some(p)) -> false } // def operator: P[String] = P(CharsWhileIn("!-+*/><=~|&^", min=1).!) // TODO: only valid operators @@ -67,9 +51,9 @@ abstract class MfParser[T](filename: String, input: String, currentDirectory: St val charAtom: P[LiteralExpression] = for { p <- position() c <- "'" ~/ CharPred(c => c >= ' ' && !invalidCharLiteralTypes(Character.getType(c))).! 
~/ "'" - co <- HWS ~ codec + (co, lenient) <- HWS ~ codec } yield { - co.encode(Some(p), c.charAt(0)) match { + co.encode(options, Some(p), c.charAt(0), lenient = lenient) match { case List(value) => LiteralExpression(value, 1) case _ => @@ -152,7 +136,7 @@ abstract class MfParser[T](filename: String, input: String, currentDirectory: St } def arrayStringContents: P[ArrayContents] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map { - case (p, s, co) => LiteralContents(s.flatMap(c => co.encode(None, c)).map(c => LiteralExpression(c, 1).pos(p))) + case (p, s, (co, lenient)) => LiteralContents(s.flatMap(c => co.encode(options, None, c, lenient = lenient)).map(c => LiteralExpression(c, 1).pos(p))) } def arrayLoopContents: P[ArrayContents] = for { diff --git a/src/main/scala/millfork/parser/TextCodec.scala b/src/main/scala/millfork/parser/TextCodec.scala index ecffd03f..cf2ccdb6 100644 --- a/src/main/scala/millfork/parser/TextCodec.scala +++ b/src/main/scala/millfork/parser/TextCodec.scala @@ -1,5 +1,8 @@ package millfork.parser +import java.util.Locale + +import millfork.CompilationOptions import millfork.error.ErrorReporting import millfork.node.Position @@ -7,19 +10,75 @@ import millfork.node.Position * @author Karol Stasiak */ class TextCodec(val name: String, private val map: String, private val extra: Map[Char, Int], private val decompositions: Map[Char, String]) { - def encode(position: Option[Position], c: Char): List[Int] = { + + private def isPrintable(c: Char) = { + c.getType match { + case Character.LOWERCASE_LETTER => true + case Character.UPPERCASE_LETTER => true + case Character.TITLECASE_LETTER => true + case Character.OTHER_LETTER => true + case Character.LETTER_NUMBER => true + case Character.DECIMAL_DIGIT_NUMBER => true + case Character.OTHER_NUMBER => true + case Character.DASH_PUNCTUATION => true + case Character.START_PUNCTUATION => true + case Character.END_PUNCTUATION => true + case Character.INITIAL_QUOTE_PUNCTUATION => true + case 
Character.FINAL_QUOTE_PUNCTUATION => true + case Character.OTHER_PUNCTUATION => true + case Character.CURRENCY_SYMBOL => true + case Character.OTHER_SYMBOL => true + case Character.MATH_SYMBOL => true + case Character.SPACE_SEPARATOR => true + case Character.PARAGRAPH_SEPARATOR => false + case Character.LINE_SEPARATOR => false + case Character.CONTROL => false + case Character.MODIFIER_SYMBOL => false + case Character.SURROGATE => false + case Character.NON_SPACING_MARK => false + case Character.COMBINING_SPACING_MARK => false + case _ => false + } + } + + private def format(c:Char):String = { + val u = f"U+${c.toInt}%04X" + if (isPrintable(c)) f"`$c%c` ($u%s)" + else u + } + + private def format(s:String) = { + val u = s.map(c => f"U+${c.toInt}%04X").mkString(",") + if (s.forall(isPrintable)) f"`$s%s` ($u%s)" + else u + } + + private def encodeImpl(options: CompilationOptions, position: Option[Position], c: Char, lenient: Boolean): Option[List[Int]] = { if (decompositions.contains(c)) { - decompositions(c).toList.flatMap(x => encode(position, x)) - } else if (extra.contains(c)) List(extra(c)) else { + Some(decompositions(c).toList.flatMap(x => encodeImpl(options, position, x, lenient).getOrElse(Nil))) + } else if (extra.contains(c)) Some(List(extra(c))) else { val index = map.indexOf(c) if (index >= 0) { - List(index) + Some(List(index)) + } else if (lenient) { + val alternative = TextCodec.lossyAlternatives.getOrElse(c, Nil).:+("?").find(alts => alts.forall(alt => encodeImpl(options, position, alt, lenient = false).isDefined)).getOrElse("") + ErrorReporting.warn(s"Cannot encode ${format(c)} in encoding `$name`, replaced it with ${format(alternative)}", options, position) + Some(alternative.toList.flatMap(encodeImpl(options, position, _, lenient = false).get)) } else { - ErrorReporting.fatal("Invalid character in string", position) + None } } } + def encode(options: CompilationOptions, position: Option[Position], c: Char, lenient: Boolean): List[Int] = { + 
encodeImpl(options, position, c, lenient) match { + case Some(x) => x + case None => + ErrorReporting.error(f"Invalid character ${format(c)} in string", position) + Nil + } + } + def decode(by: Int): Char = { val index = by & 0xff if (index < map.length) map(index) else TextCodec.NotAChar @@ -27,6 +86,30 @@ class TextCodec(val name: String, private val map: String, private val extra: Ma } object TextCodec { + + def forName(name: String, position: Option[Position]): TextCodec = (position, name) match { + case (_, "ascii") => TextCodec.Ascii + case (_, "petscii") => TextCodec.Petscii + case (_, "pet") => TextCodec.Petscii + case (_, "cbmscr") => TextCodec.CbmScreencodes + case (_, "petscr") => TextCodec.CbmScreencodes + case (_, "atascii") => TextCodec.Atascii + case (_, "atari") => TextCodec.Atascii + case (_, "bbc") => TextCodec.Bbc + case (_, "apple2") => TextCodec.Apple2 + case (_, "jis") => TextCodec.Jis + case (_, "jisx") => TextCodec.Jis + case (_, "iso_de") => TextCodec.IsoIec646De + case (_, "iso_no") => TextCodec.IsoIec646No + case (_, "iso_dk") => TextCodec.IsoIec646No + case (_, "iso_se") => TextCodec.IsoIec646Se + case (_, "iso_fi") => TextCodec.IsoIec646Se + case (_, "iso_yu") => TextCodec.IsoIec646Yu + case (p, x) => + ErrorReporting.error(s"Unknown string encoding: `$x`", p) + TextCodec.Ascii + } + val NotAChar = '\ufffd' val Ascii = new TextCodec("ASCII", 0.until(127).map { i => if (i < 32) NotAChar else i.toChar }.mkString, Map.empty, Map.empty) @@ -70,7 +153,7 @@ object TextCodec { 'Ü' -> '^'.toInt, 'ü' -> '~'.toInt, '«' -> '"'.toInt, - '»' -> '#'.toInt, + '»' -> '"'.toInt, '§' -> '#'.toInt), Map.empty ) @@ -116,6 +199,12 @@ object TextCodec { Map('↑' -> '^'.toInt), Map.empty ) + private val jisHalfwidthKatakanaOrder: String = + "\ufffd。「」、・ヲァィゥェォャュョッ" + + "ーアイウエオカキクケコサシスセソ" + + "タチツテトナニヌネノハヒフヘホマ" + + "ミムメモヤユヨラリルレロワン゛゜" + //noinspection ScalaUnnecessaryParentheses val Jis = new TextCodec("JIS-X-0201", "\ufffd" * 32 + @@ -123,10 +212,7 @@ object 
TextCodec { "[¥]^_" + "`" + 'a'.to('z').mkString + "{|}~\ufffd" + "\ufffd" * 32 + - "\ufffd。「」、・ヲァィゥェォャュョッ" + - "ーアイウエオカキクケコサシスセソ" + - "タチツテトナニヌネノハヒフヘホマ" + - "ミムメモヤユヨラリルレロワン゛゜" + + jisHalfwidthKatakanaOrder + "\ufffd" * 8 + "♠♡♢♣" + "\ufffd" * 4 + @@ -139,4 +225,102 @@ object TextCodec { "ハヒフヘホ".zip("パピプペポ").map { case (h, p) => p -> (h + "゜") }.toMap ) + val lossyAlternatives: Map[Char, List[String]] = { + val allowLowercase: Map[Char, List[String]] = ('A' to 'Z').map(c => c -> List(c.toString.toLowerCase(Locale.ROOT))).toMap + val allowUppercase: Map[Char, List[String]] = ('a' to 'z').map(c => c -> List(c.toString.toUpperCase(Locale.ROOT))).toMap + val ligaturesAndSymbols: Map[Char, List[String]] = Map( + '¦' -> List("|"), + '|' -> List("¦"), + 'ß' -> List("ss", "SS"), + 'ff' -> List("ff", "FF"), + 'fl' -> List("fl", "FL"), + 'fi' -> List("fi", "FI"), + 'ffi' -> List("ffi", "FFI"), + 'ffl' -> List("ffl", "FFL"), + '½' -> List("1/2"), + '¼' -> List("1/4"), + '¾' -> List("3/4"), + '¥' -> List("Y", "y"), + '円' -> List("¥", "Y", "y"), + '年' -> List("Y", "y"), + '月' -> List("M", "m"), + '日' -> List("D", "d"), + '時' -> List("h", "H"), + '分' -> List("m", "M"), + '秒' -> List("s", "S"), + '♥' -> List("H", "h"), + '♠' -> List("S", "s"), + '♡' -> List("H", "h"), + '♢' -> List("D", "d"), + '♣' -> List("C", "c"), + '。' -> List("."), + '、' -> List(","), + '・' -> List("-"), + '•' -> List("・", "*"), + '「' -> List("[", "("), + '」' -> List("]", ")"), + '。' -> List("."), + '。' -> List("."), + '^' -> List("↑"), + '↑' -> List("^"), + '‾' -> List("~"), + '¯' -> List("~"), + '«' -> List("\""), + '»' -> List("\""), + '§' -> List("#"), + '[' -> List("("), + ']' -> List(")"), + '{' -> List("("), + '}' -> List(")"), + '§' -> List("#"), + '§' -> List("#"), + '©' -> List("(C)"), + 'İ' -> List("I", "i"), + 'ª' -> List("a", "A"), + 'º' -> List("o", "O"), + '‰' -> List("%."), + '÷' -> List("/"), + 'ij' -> List("ij", "IJ"), + 'IJ' -> List("IJ", "ij"), + ) + val accentedLetters: Map[Char, 
List[String]] = List( + "áàäãåąāǎă" -> "a", + "çčċćĉ" -> "c", + "đď" -> "d", + "ð" -> "dh", + "éèêëęēėě" -> "e", + "ğǧĝģġ" -> "g", + "ħĥ" -> "h", + "íıìîïįīǐĭĩ" -> "i", + "ĵ" -> "j", + "ķ" -> "k", + "ĺľłļŀ" -> "l", + "ñńňņŋ" -> "n", + "óòöôőõøōǒ" -> "o", + "řŗŕ" -> "r", + "śšŝșşſ" -> "s", + "ţțťŧ" -> "t", + "þ" -> "th", + "úùũûüűųūǔůǘǜǚǖ" -> "u", + "ẃẁŵ" -> "w", + "ýÿỳŷȳ" -> "y", + "žźż" -> "z", + "æ" -> "ae", + "œ" -> "oe", + ).flatMap{case (acc, plain) => acc.toList.flatMap(letter => List( + letter -> List(plain, plain.toUpperCase(Locale.ROOT)), + letter.toUpper -> List(plain.toUpperCase(Locale.ROOT), plain) + ))}.toMap + val hiragana: Map[Char, List[String]] = (0x3041 to 0x3096).map{ kana => kana.toChar -> List(kana.+(0x60).toChar.toString)}.toMap + val fullWidth: Map[Char, List[String]] = (0xff01 to 0xff5e).map{ i => + val fw = i.toChar + val hw = i.-(0xfee0).toChar + if (hw.isUpper) fw -> List(hw.toString, hw.toString.toLowerCase(Locale.ROOT)) + else if (hw.isLower) fw -> List(hw.toString, hw.toString.toUpperCase(Locale.ROOT)) + else fw -> List(hw.toString) + }.toMap + val halfWidth = (0xff61 to 0xff9f).map{ c => c.toChar -> List(jisHalfwidthKatakanaOrder(c - 0xff60).toString)}.toMap + allowLowercase ++ allowUppercase ++ ligaturesAndSymbols ++ accentedLetters ++ hiragana ++ fullWidth ++ halfWidth + } + } diff --git a/src/test/scala/millfork/test/TextCodecSuite.scala b/src/test/scala/millfork/test/TextCodecSuite.scala index b5fdf0bc..4a31551b 100644 --- a/src/test/scala/millfork/test/TextCodecSuite.scala +++ b/src/test/scala/millfork/test/TextCodecSuite.scala @@ -22,4 +22,18 @@ class TextCodecSuite extends FunSuite with Matchers { | } """.stripMargin) } + + test("Lenient encoding") { + val m = EmuUnoptimizedRun( + """ + | void main() { + | if 'å' != 'a' { poke($bfff, 0) } + | if '÷' != '/' { poke($bffd, 0) } + | if 'π' != '?' 
{ poke($bffc, 0) } + | } + | macro asm void poke(word const addr, byte a) { + | STA addr + | } + """.stripMargin) + } } diff --git a/src/test/scala/millfork/test/emu/EmuPlatform.scala b/src/test/scala/millfork/test/emu/EmuPlatform.scala index a1a7a813..70e8904b 100644 --- a/src/test/scala/millfork/test/emu/EmuPlatform.scala +++ b/src/test/scala/millfork/test/emu/EmuPlatform.scala @@ -1,6 +1,7 @@ package millfork.test.emu import millfork.output.{AfterCodeByteAllocator, CurrentBankFragmentOutput, UpwardByteAllocator, VariableAllocator} +import millfork.parser.TextCodec import millfork.{Cpu, CpuFamily, OutputStyle, Platform} /** @@ -13,6 +14,8 @@ object EmuPlatform { cpu, Map(), Nil, + TextCodec.Ascii, + TextCodec.Ascii, CurrentBankFragmentOutput(0, 0xffff), Map("default" -> new UpwardByteAllocator(0x200, 0xb000)), Map("default" -> new VariableAllocator( diff --git a/src/test/scala/millfork/test/emu/EmuRun.scala b/src/test/scala/millfork/test/emu/EmuRun.scala index bd34326f..b44d1079 100644 --- a/src/test/scala/millfork/test/emu/EmuRun.scala +++ b/src/test/scala/millfork/test/emu/EmuRun.scala @@ -100,6 +100,7 @@ class EmuRun(cpu: millfork.Cpu.Value, nodeOptimizations: List[NodeOptimization], println(source) val platform = EmuPlatform.get(cpu) val options = CompilationOptions(platform, Map( + CompilationFlag.LenientTextEncoding -> true, CompilationFlag.EmitIllegals -> this.emitIllegals, CompilationFlag.InlineFunctions -> this.inline, CompilationFlag.InterproceduralOptimization -> true, diff --git a/src/test/scala/millfork/test/emu/EmuZ80Run.scala b/src/test/scala/millfork/test/emu/EmuZ80Run.scala index e25e45ca..ae14554f 100644 --- a/src/test/scala/millfork/test/emu/EmuZ80Run.scala +++ b/src/test/scala/millfork/test/emu/EmuZ80Run.scala @@ -5,14 +5,13 @@ import fastparse.core.Parsed.{Failure, Success} import millfork.assembly.AssemblyOptimization import millfork.assembly.z80.ZLine import millfork.compiler.CompilationContext -import millfork.compiler.mos.MosCompiler 
import millfork.env.{Environment, InitializedArray, InitializedMemoryVariable, NormalFunction} import millfork.error.ErrorReporting import millfork.node.StandardCallGraph import millfork.node.opt.NodeOptimization -import millfork.output.{MemoryBank, MosAssembler, Z80Assembler} +import millfork.output.{MemoryBank, Z80Assembler} import millfork.parser.Z80Parser -import millfork.{CompilationOptions, CpuFamily} +import millfork.{CompilationFlag, CompilationOptions, CpuFamily} import millfork.compiler.z80.Z80Compiler import org.scalatest.Matchers @@ -28,7 +27,8 @@ class EmuZ80Run(cpu: millfork.Cpu.Value, nodeOptimizations: List[NodeOptimizatio Console.err.flush() println(source) val platform = EmuPlatform.get(cpu) - val options = CompilationOptions(platform, millfork.Cpu.defaultFlags(cpu).map(_ -> true).toMap, None, 0) + val extraFlags = Map(CompilationFlag.LenientTextEncoding -> true) + val options = CompilationOptions(platform, millfork.Cpu.defaultFlags(cpu).map(_ -> true).toMap ++ extraFlags, None, 0) ErrorReporting.hasErrors = false ErrorReporting.verbosity = 999 var effectiveSource = source