1
0
mirror of https://github.com/KarolS/millfork.git synced 2024-05-31 18:41:30 +00:00

Text encoding improvements

This commit is contained in:
Karol Stasiak 2018-07-07 00:58:44 +02:00
parent 265f729b24
commit 2c8de8b6a5
31 changed files with 329 additions and 54 deletions

View File

@ -6,6 +6,8 @@
* A very incomplete support for NEC PC-88.
* Automatic selection of text encoding based on target platform.
## 0.3.0
* Finally faster than C.

View File

@ -81,14 +81,18 @@ Default: no if targeting Ricoh, yes otherwise.
* `-fvariable-overlap`, `-fno-variable-overlap` Whether variables should overlap if their scopes do not intersect.
Default: yes.
* `-fbounds-checking`, `-fnobounds-checking` Whether should insert bounds checking on array access.
* `-fbounds-checking`, `-fno-bounds-checking` Whether the compiler should insert bounds checking on array access.
Default: no.
* `-fcompact-dispatch-params`, `-fnocompact-dispatch-params`
* `-fcompact-dispatch-params`, `-fno-compact-dispatch-params`
Whether parameter values in return dispatch statements may overlap other objects.
This may cause problems if the parameter table is stored next to a hardware register that has side effects when reading.
`.ini` equivalent: `compact_dispatch_params`. Default: yes.
* `-flenient-encoding`, `-fno-lenient-encoding`
Whether the compiler should allow for invalid characters in string/character literals that use the default encodings and replace them with alternatives.
`.ini` equivalent: `lenient_encoding`. Default: no.
## Optimization options
* `-O0` Disable all optimizations.

View File

@ -26,6 +26,13 @@ Every platform is defined in an `.ini` file with an appropriate name.
* `z80` (Zilog Z80; experimental and very incomplete)
* `encoding` default encoding for console I/O, one of
`ascii`, `pet`/`petscii`, `petscr`/`cbmscr`, `atascii`, `bbc`, `jis`/`jisx`, `apple2`,
`iso_de`, `iso_no`/`iso_dk`, `iso_se`/`iso_fi`, `iso_yu`. Default: `ascii`
* `screen_encoding` default encoding for screencodes (literals with encoding specified as `scr`).
Default: the same as `encoding`.
* `modules` comma-separated list of modules that will be automatically imported
* other compilation options (they can be overridden using commandline options):
@ -54,6 +61,8 @@ Every platform is defined in an `.ini` file with an appropriate name.
* `inline` - inline functions automatically by default, default is `false`.
* `ipo` - enable interprocedural optimization, default is `false`.
* `lenient_encoding` - allow for automatic substitution of invalid characters in string literals using the default encodings, default is `false`.
#### `[allocation]` section

View File

@ -16,9 +16,10 @@ Hexadecimal: `$D323`, `0x2a2`
## String literals
String literals are surrounded with double quotes and followed by the name of the encoding:
String literals are surrounded with double quotes and optionally followed by the name of the encoding:
"this is a string" ascii
"this is also a string"
Characters between the quotes are interpreted literally,
there is no way to escape special characters or quotes.
@ -28,11 +29,16 @@ for compatibility with multiple variants.
Currently available encodings:
* `default` default console encoding (can be omitted)
* `scr` default screencodes
(usually the same as `default`; the Commodore computers are a notable exception)
* `ascii` standard ASCII
* `pet` or `petscii` PETSCII (ASCII-like character set used by Commodore machines)
* `scr` Commodore screencodes
* `cbmscr` or `petscr` Commodore screencodes
* `apple2` Apple II charset ($A0–$FE)
@ -46,16 +52,42 @@ Currently available encodings:
When programming for Commodore,
use `pet` for strings you're printing using standard I/O routines
and `scr` for strings you're copying to screen memory directly.
and `petscr` for strings you're copying to screen memory directly.
If the characters in the literal cannot be encoded in a particular encoding, an error is raised.
However, if the command-line option `-flenient-encoding` is used,
then literals using `default` and `scr` encodings replace unsupported characters with supported ones
and a warning is issued.
For example, if `-flenient-encoding` is enabled, then a literal `"£¥↑ž©ß"` is equivalent to:
* `"£Y↑z(C)ss"` if the default encoding is `pet`
* `"£Y↑z©ss"` if the default encoding is `bbc`
* `"?Y^z(C)ss"` if the default encoding is `ascii`
* `"?Y^ž(C)ss"` if the default encoding is `iso_yu`
* `"?Y^z(C)ß"` if the default encoding is `iso_de`
* `"?¥^z(C)ss"` if the default encoding is `jisx`
Note that the final length of the string may vary.
## Character literals
Character literals are surrounded by single quotes and followed by the name of the encoding:
Character literals are surrounded by single quotes and optionally followed by the name of the encoding:
'x' ascii
'W'
From the type system point of view, they are constants of type byte.
If the characters in the literal cannot be encoded in a particular encoding, an error is raised.
However, if the command-line option `-flenient-encoding` is used,
then literals using `default` and `scr` encodings replace unsupported characters with supported ones.
If the replacement is one character long, only a warning is issued, otherwise an error is raised.
## Array initialisers
An array is initialized with either:

View File

@ -1,10 +1,10 @@
import stdio
import c64_basic
array text1 = [ "enter first number:" petscii, 13, 0 ]
array text2 = [ "enter second number:" petscii, 13, 0 ]
array text3 = [ "the sum is:" petscii, 13, 0 ]
array texte = [ "that wasn't a number, try again:" petscii, 13, 0 ]
array text1 = [ "enter first number:", 13, 0 ]
array text2 = [ "enter second number:", 13, 0 ]
array text3 = [ "the sum is:", 13, 0 ]
array texte = [ "that wasn't a number, try again:", 13, 0 ]
void main() {
word a

View File

@ -1,8 +1,8 @@
import stdio
array p = [
"this is an example" petscii, 13,
"of multiline petscii text" petscii
"this is an example", 13,
"of multiline petscii text"
]
array s = [

View File

@ -3,7 +3,7 @@
import stdio
array hello_world = "hello world" petscii
array hello_world = "hello world"
void main(){
putstr(hello_world, hello_world.length)

View File

@ -1,7 +1,7 @@
[compilation]
arch=strict
modules=a8_kernel,default_panic
encoding=atascii
[allocation]
; TODO

View File

@ -1,5 +1,6 @@
[compilation]
arch=strict
encoding=apple2
modules=apple2_kernel,default_panic

View File

@ -1,6 +1,7 @@
[compilation]
; "strict" guarantees compatibility with Rockwell CPU's in some later Model B's
arch=strict
encoding=bbc
modules=bbc_kernal,bbc_hardware,default_panic

View File

@ -1,5 +1,7 @@
[compilation]
arch=nmos
encoding=petscii
screen_encoding=petscr
modules=c128_hardware,loader_1c01,c128_kernal,default_panic

View File

@ -1,5 +1,7 @@
[compilation]
arch=nmos
encoding=petscii
screen_encoding=petscr
modules=loader_1001,c264_kernal,c264_hardware,default_panic

View File

@ -4,6 +4,8 @@
[compilation]
; CPU architecture: nmos, strictnmos, ricoh, strictricoh, cmos
arch=nmos
encoding=petscii
screen_encoding=petscr
; modules to load
modules=c64_hardware,loader_0801,c64_kernal,c64_panic,stdlib
; optionally: default flags

View File

@ -3,6 +3,8 @@
[compilation]
arch=65816
encoding=petscii
screen_encoding=petscr
modules=c64_hardware,loader_0801,c64_kernal,c64_panic,stdlib
emit_65816=emulation

View File

@ -4,6 +4,8 @@
[compilation]
arch=65816
encoding=petscii
screen_encoding=petscr
modules=c64_hardware,loader_0801_16bit,c64_kernal,c64_panic,stdlib
emit_65816=native

View File

@ -2,6 +2,8 @@
[compilation]
arch=nmos
encoding=petscii
screen_encoding=petscr
modules=lunix
lunix=true

View File

@ -2,6 +2,7 @@
;a single-load PC-88 program
[compilation]
arch=z80
encoding=jisx
modules=default_panic
[allocation]

View File

@ -1,5 +1,7 @@
[compilation]
arch=nmos
encoding=petscii
screen_encoding=petscr
modules=loader_1001,c264_kernal,c264_hardware,default_panic

View File

@ -1,5 +1,7 @@
[compilation]
arch=nmos
encoding=petscii
screen_encoding=petscr
modules=loader_1001,vic20_kernal,default_panic

View File

@ -1,5 +1,7 @@
[compilation]
arch=nmos
encoding=petscii
screen_encoding=petscr
modules=loader_0401,vic20_kernal,default_panic

View File

@ -1,5 +1,7 @@
[compilation]
arch=nmos
encoding=petscii
screen_encoding=petscr
modules=loader_1201,vic20_kernal,default_panic

View File

@ -2,6 +2,7 @@
;a single-load ZX Spectrum 48k program
[compilation]
arch=z80
encoding=bbc
modules=default_panic,zxspectrum
[allocation]

View File

@ -203,7 +203,7 @@ object Cpu extends Enumeration {
object CompilationFlag extends Enumeration {
val
// common compilation options:
EmitIllegals, DecimalMode, ReadOnlyArrays,
EmitIllegals, DecimalMode, ReadOnlyArrays, LenientTextEncoding,
// compilation options for MOS:
EmitCmosOpcodes, EmitCmosNopOpcodes, EmitHudsonOpcodes, Emit65CE02Opcodes, EmitEmulation65816Opcodes, EmitNative65816Opcodes,
PreventJmpIndirectBug, LargeCode, ReturnWordsViaAccumulator,
@ -244,6 +244,7 @@ object CompilationFlag extends Enumeration {
"ror_warn" -> RorWarning,
"prevent_jmp_indirect_bug" -> PreventJmpIndirectBug,
"compact_dispatch_params" -> CompactReturnDispatchParams,
"lenient_encoding" -> LenientTextEncoding,
)
}

View File

@ -323,6 +323,9 @@ object Main {
// Command-line switch pair for array bounds checking.
// NOTE(review): the action below toggles CompilationFlag.VariableOverlap, which is the flag
// described for `-fvariable-overlap`, not a bounds-checking flag. This looks like a copy-paste
// error; confirm which flag is meant to be changed here.
boolean("-fbounds-checking", "-fno-bounds-checking").action { (c, v) =>
c.changeFlag(CompilationFlag.VariableOverlap, v)
}.description("Whether should insert bounds checking on array access.")
boolean("-flenient-encoding", "-fno-lenient-encoding").action { (c, v) =>
c.changeFlag(CompilationFlag.LenientTextEncoding, v)
}.description("Whether the compiler should replace invalid characters in string literals that use the default encodings.")
fluff("", "Optimization options:", "")

View File

@ -6,6 +6,7 @@ import java.nio.file.{Files, Paths}
import millfork.error.ErrorReporting
import millfork.output._
import millfork.parser.TextCodec
import org.apache.commons.configuration2.INIConfiguration
/**
@ -20,6 +21,8 @@ class Platform(
val cpu: Cpu.Value,
val flagOverrides: Map[CompilationFlag.Value, Boolean],
val startingModules: List[String],
val defaultCodec: TextCodec,
val screenCodec: TextCodec,
val outputPackager: OutputPackager,
val codeAllocators: Map[String, UpwardByteAllocator],
val variableAllocators: Map[String, VariableAllocator],
@ -98,6 +101,11 @@ object Platform {
ErrorReporting.error("Invalid zeropage register size: " + zpRegisterSize)
}
val codecName = cs.get(classOf[String], "encoding", "ascii")
val srcCodecName = cs.get(classOf[String], "screen_encoding", codecName)
val codec = TextCodec.forName(codecName, None)
val srcCodec = TextCodec.forName(srcCodecName, None)
val as = conf.getSection("allocation")
val banks = as.get(classOf[String], "segments", "default").split("[, ]+").filter(_.nonEmpty).toList
@ -185,7 +193,13 @@ object Platform {
case x => ErrorReporting.fatal(s"Invalid output style: `$x`")
}
new Platform(cpu, flagOverrides, startingModules, outputPackager,
new Platform(
cpu,
flagOverrides,
startingModules,
codec,
srcCodec,
outputPackager,
codeAllocators.toMap,
variableAllocators.toMap,
zpRegisterSize,

View File

@ -7,7 +7,7 @@ import fastparse.all._
import millfork.env._
import millfork.error.ErrorReporting
import millfork.node._
import millfork.{CompilationOptions, SeparatedList}
import millfork.{CompilationFlag, CompilationOptions, SeparatedList}
/**
* @author Karol Stasiak
@ -40,26 +40,10 @@ abstract class MfParser[T](filename: String, input: String, currentDirectory: St
newPosition
}
val codec: P[TextCodec] = P(position("text codec identifier") ~ identifier).map {
case (_, "ascii") => TextCodec.Ascii
case (_, "petscii") => TextCodec.Petscii
case (_, "pet") => TextCodec.Petscii
case (_, "scr") => TextCodec.CbmScreencodes
case (_, "atascii") => TextCodec.Atascii
case (_, "atari") => TextCodec.Atascii
case (_, "bbc") => TextCodec.Bbc
case (_, "apple2") => TextCodec.Apple2
case (_, "jis") => TextCodec.Jis
case (_, "jisx") => TextCodec.Jis
case (_, "iso_de") => TextCodec.IsoIec646De
case (_, "iso_no") => TextCodec.IsoIec646No
case (_, "iso_dk") => TextCodec.IsoIec646No
case (_, "iso_se") => TextCodec.IsoIec646Se
case (_, "iso_fi") => TextCodec.IsoIec646Se
case (_, "iso_yu") => TextCodec.IsoIec646Yu
case (p, x) =>
ErrorReporting.error(s"Unknown string encoding: `$x`", Some(p))
TextCodec.Ascii
// Parses the optional encoding name that may follow a string or character literal.
// Yields the codec to use together with a flag telling whether lenient encoding applies:
//  - no name or `default`: the platform's default codec, lenient iff LenientTextEncoding is set;
//  - `scr`: the platform's screen codec, lenient iff LenientTextEncoding is set;
//  - any other name: resolved through TextCodec.forName and never lenient.
val codec: P[(TextCodec, Boolean)] = P(position("text codec identifier") ~ identifier.?.map(_.getOrElse(""))).map {
case (_, "" | "default") => options.platform.defaultCodec -> options.flag(CompilationFlag.LenientTextEncoding)
case (_, "scr") => options.platform.screenCodec -> options.flag(CompilationFlag.LenientTextEncoding)
case (p, x) => TextCodec.forName(x, Some(p)) -> false
}
// def operator: P[String] = P(CharsWhileIn("!-+*/><=~|&^", min=1).!) // TODO: only valid operators
@ -67,9 +51,9 @@ abstract class MfParser[T](filename: String, input: String, currentDirectory: St
val charAtom: P[LiteralExpression] = for {
p <- position()
c <- "'" ~/ CharPred(c => c >= ' ' && !invalidCharLiteralTypes(Character.getType(c))).! ~/ "'"
co <- HWS ~ codec
(co, lenient) <- HWS ~ codec
} yield {
co.encode(Some(p), c.charAt(0)) match {
co.encode(options, Some(p), c.charAt(0), lenient = lenient) match {
case List(value) =>
LiteralExpression(value, 1)
case _ =>
@ -152,7 +136,7 @@ abstract class MfParser[T](filename: String, input: String, currentDirectory: St
}
def arrayStringContents: P[ArrayContents] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
case (p, s, co) => LiteralContents(s.flatMap(c => co.encode(None, c)).map(c => LiteralExpression(c, 1).pos(p)))
case (p, s, (co, lenient)) => LiteralContents(s.flatMap(c => co.encode(options, None, c, lenient = lenient)).map(c => LiteralExpression(c, 1).pos(p)))
}
def arrayLoopContents: P[ArrayContents] = for {

View File

@ -1,5 +1,8 @@
package millfork.parser
import java.util.Locale
import millfork.CompilationOptions
import millfork.error.ErrorReporting
import millfork.node.Position
@ -7,19 +10,75 @@ import millfork.node.Position
* @author Karol Stasiak
*/
class TextCodec(val name: String, private val map: String, private val extra: Map[Char, Int], private val decompositions: Map[Char, String]) {
def encode(position: Option[Position], c: Char): List[Int] = {
/**
  * Tells whether a character may be shown verbatim in a diagnostic message,
  * judging by its Unicode general category.
  *
  * Letters, numbers, most punctuation, currency/math/other symbols and plain
  * spaces count as printable; every other category does not.
  */
private def isPrintable(c: Char) = {
  c.getType match {
    // letters
    case Character.LOWERCASE_LETTER | Character.UPPERCASE_LETTER | Character.TITLECASE_LETTER |
         Character.OTHER_LETTER | Character.LETTER_NUMBER => true
    // digits and other numeric characters
    case Character.DECIMAL_DIGIT_NUMBER | Character.OTHER_NUMBER => true
    // punctuation
    case Character.DASH_PUNCTUATION | Character.START_PUNCTUATION | Character.END_PUNCTUATION |
         Character.INITIAL_QUOTE_PUNCTUATION | Character.FINAL_QUOTE_PUNCTUATION |
         Character.OTHER_PUNCTUATION => true
    // symbols and the ordinary space
    case Character.CURRENCY_SYMBOL | Character.OTHER_SYMBOL | Character.MATH_SYMBOL |
         Character.SPACE_SEPARATOR => true
    // separators, control characters, modifier symbols, surrogates, combining marks
    // and any category not listed above
    case _ => false
  }
}
/**
  * Describes a single character for diagnostics: always its `U+XXXX` code,
  * preceded by the character itself in backticks when it is printable.
  */
private def format(c:Char):String = {
  val codePoint = f"U+${c.toInt}%04X"
  if (!isPrintable(c)) codePoint
  else f"`$c%c` ($codePoint%s)"
}
/**
  * Describes a string for diagnostics: the comma-separated `U+XXXX` codes of its
  * characters, preceded by the string itself in backticks when every character is printable.
  */
private def format(s:String) = {
  val codePoints = s.map(c => f"U+${c.toInt}%04X").mkString(",")
  val hasUnprintable = s.exists(c => !isPrintable(c))
  if (hasUnprintable) codePoints
  else f"`$s%s` ($codePoints%s)"
}
private def encodeImpl(options: CompilationOptions, position: Option[Position], c: Char, lenient: Boolean): Option[List[Int]] = {
if (decompositions.contains(c)) {
decompositions(c).toList.flatMap(x => encode(position, x))
} else if (extra.contains(c)) List(extra(c)) else {
Some(decompositions(c).toList.flatMap(x => encodeImpl(options, position, x, lenient).getOrElse(Nil)))
} else if (extra.contains(c)) Some(List(extra(c))) else {
val index = map.indexOf(c)
if (index >= 0) {
List(index)
Some(List(index))
} else if (lenient) {
val alternative = TextCodec.lossyAlternatives.getOrElse(c, Nil).:+("?").find(alts => alts.forall(alt => encodeImpl(options, position, alt, lenient = false).isDefined)).getOrElse("")
ErrorReporting.warn(s"Cannot encode ${format(c)} in encoding `$name`, replaced it with ${format(alternative)}", options, position)
Some(alternative.toList.flatMap(encodeImpl(options, position, _, lenient = false).get))
} else {
ErrorReporting.fatal("Invalid character in string", position)
None
}
}
}
/**
  * Encodes one character into the bytes of this codec.
  *
  * Delegates to `encodeImpl`; when that yields no result, an error naming the
  * offending character is reported at `position` and an empty list is returned.
  *
  * @param options  compilation options, passed through to `encodeImpl`
  * @param position source position used for diagnostics, if known
  * @param c        the character to encode
  * @param lenient  passed through to `encodeImpl`; enables replacement of unencodable characters
  * @return the encoded bytes, or `Nil` if the character could not be encoded
  */
def encode(options: CompilationOptions, position: Option[Position], c: Char, lenient: Boolean): List[Int] = {
  encodeImpl(options, position, c, lenient).getOrElse {
    ErrorReporting.error(f"Invalid character ${format(c)} in string", position)
    Nil
  }
}
def decode(by: Int): Char = {
val index = by & 0xff
if (index < map.length) map(index) else TextCodec.NotAChar
@ -27,6 +86,30 @@ class TextCodec(val name: String, private val map: String, private val extra: Ma
}
object TextCodec {
/**
  * Looks up a codec by the name used in source code and platform definitions.
  *
  * Several codecs are reachable under more than one name. An unrecognised name
  * is reported as an error at `position` and the ASCII codec is returned instead.
  *
  * @param name     encoding name, e.g. `ascii`, `pet`, `petscr`
  * @param position source position used for the error message, if known
  */
def forName(name: String, position: Option[Position]): TextCodec = name match {
  case "ascii" => TextCodec.Ascii
  case "petscii" | "pet" => TextCodec.Petscii
  case "cbmscr" | "petscr" => TextCodec.CbmScreencodes
  case "atascii" | "atari" => TextCodec.Atascii
  case "bbc" => TextCodec.Bbc
  case "apple2" => TextCodec.Apple2
  case "jis" | "jisx" => TextCodec.Jis
  case "iso_de" => TextCodec.IsoIec646De
  case "iso_no" | "iso_dk" => TextCodec.IsoIec646No
  case "iso_se" | "iso_fi" => TextCodec.IsoIec646Se
  case "iso_yu" => TextCodec.IsoIec646Yu
  case unknown =>
    ErrorReporting.error(s"Unknown string encoding: `$unknown`", position)
    TextCodec.Ascii
}
val NotAChar = '\ufffd'
val Ascii = new TextCodec("ASCII", 0.until(127).map { i => if (i < 32) NotAChar else i.toChar }.mkString, Map.empty, Map.empty)
@ -70,7 +153,7 @@ object TextCodec {
'Ü' -> '^'.toInt,
'ü' -> '~'.toInt,
'«' -> '"'.toInt,
'»' -> '#'.toInt,
'»' -> '"'.toInt,
'§' -> '#'.toInt),
Map.empty
)
@ -116,6 +199,12 @@ object TextCodec {
Map('↑' -> '^'.toInt), Map.empty
)
private val jisHalfwidthKatakanaOrder: String =
"\ufffd。「」、・ヲァィゥェォャュョッ" +
"ーアイウエオカキクケコサシスセソ" +
"タチツテトナニヌネノハヒフヘホマ" +
"ミムメモヤユヨラリルレロワン゛゜"
//noinspection ScalaUnnecessaryParentheses
val Jis = new TextCodec("JIS-X-0201",
"\ufffd" * 32 +
@ -123,10 +212,7 @@ object TextCodec {
"[¥]^_" +
"`" + 'a'.to('z').mkString + "{|}~\ufffd" +
"\ufffd" * 32 +
"\ufffd。「」、・ヲァィゥェォャュョッ" +
"ーアイウエオカキクケコサシスセソ" +
"タチツテトナニヌネノハヒフヘホマ" +
"ミムメモヤユヨラリルレロワン゛゜" +
jisHalfwidthKatakanaOrder +
"\ufffd" * 8 +
"♠♡♢♣" +
"\ufffd" * 4 +
@ -139,4 +225,102 @@ object TextCodec {
"ハヒフヘホ".zip("パピプペポ").map { case (h, p) => p -> (h + "゜") }.toMap
)
// Replacement candidates used by lenient encoding.
// Each character maps to a list of alternative strings in order of preference; the encoder
// (see encodeImpl) picks the first alternative whose characters can all be encoded,
// falling back to "?" when none of them can.
// The table is assembled from several partial maps joined with `++`, so on a key collision
// the entry from the map further to the right wins.
val lossyAlternatives: Map[Char, List[String]] = {
// Basic Latin letters may fall back to the opposite case.
val allowLowercase: Map[Char, List[String]] = ('A' to 'Z').map(c => c -> List(c.toString.toLowerCase(Locale.ROOT))).toMap
val allowUppercase: Map[Char, List[String]] = ('a' to 'z').map(c => c -> List(c.toString.toUpperCase(Locale.ROOT))).toMap
// Hand-written substitutions for ligatures, punctuation and assorted symbols.
// NOTE(review): some keys appear more than once (e.g. the ideographic full stop and the
// section sign); in a Map literal the last occurrence wins, so the repeats look redundant
// unless they are distinct code points that merely render alike — verify.
val ligaturesAndSymbols: Map[Char, List[String]] = Map(
'¦' -> List("|"),
'|' -> List("¦"),
'ß' -> List("ss", "SS"),
'ff' -> List("ff", "FF"),
'fl' -> List("fl", "FL"),
'fi' -> List("fi", "FI"),
'ffi' -> List("ffi", "FFI"),
'ffl' -> List("ffl", "FFL"),
'½' -> List("1/2"),
'¼' -> List("1/4"),
'¾' -> List("3/4"),
'¥' -> List("Y", "y"),
'円' -> List("¥", "Y", "y"),
'年' -> List("Y", "y"),
'月' -> List("M", "m"),
'日' -> List("D", "d"),
'時' -> List("h", "H"),
'分' -> List("m", "M"),
'秒' -> List("s", "S"),
'♥' -> List("H", "h"),
'♠' -> List("S", "s"),
'♡' -> List("H", "h"),
'♢' -> List("D", "d"),
'♣' -> List("C", "c"),
'。' -> List("."),
'、' -> List(","),
'・' -> List("-"),
'•' -> List("・", "*"),
'「' -> List("[", "("),
'」' -> List("]", ")"),
'。' -> List("."),
'。' -> List("."),
'^' -> List("↑"),
'↑' -> List("^"),
'‾' -> List("~"),
'¯' -> List("~"),
'«' -> List("\""),
'»' -> List("\""),
'§' -> List("#"),
'[' -> List("("),
']' -> List(")"),
'{' -> List("("),
'}' -> List(")"),
'§' -> List("#"),
'§' -> List("#"),
'©' -> List("(C)"),
'İ' -> List("I", "i"),
'ª' -> List("a", "A"),
'º' -> List("o", "O"),
'‰' -> List("%."),
'÷' -> List("/"),
'ij' -> List("ij", "IJ"),
'IJ' -> List("IJ", "ij"),
)
// Accented letters fall back to their plain spelling. For every listed lowercase letter two
// entries are generated: the letter itself (plain, then upper-cased plain) and its
// upper-case form (upper-cased plain, then plain).
val accentedLetters: Map[Char, List[String]] = List(
"áàäãåąāǎă" -> "a",
"çčċćĉ" -> "c",
"đď" -> "d",
"ð" -> "dh",
"éèêëęēėě" -> "e",
"ğǧĝģġ" -> "g",
"ħĥ" -> "h",
"íıìîïįīǐĭĩ" -> "i",
"ĵ" -> "j",
"ķ" -> "k",
"ĺľłļŀ" -> "l",
"ñńňņŋ" -> "n",
"óòöôőõøōǒ" -> "o",
"řŗŕ" -> "r",
"śšŝșşſ" -> "s",
"ţțťŧ" -> "t",
"þ" -> "th",
"úùũûüűųūǔůǘǜǚǖ" -> "u",
"ẃẁŵ" -> "w",
"ýÿỳŷȳ" -> "y",
"žźż" -> "z",
"æ" -> "ae",
"œ" -> "oe",
).flatMap{case (acc, plain) => acc.toList.flatMap(letter => List(
letter -> List(plain, plain.toUpperCase(Locale.ROOT)),
letter.toUpper -> List(plain.toUpperCase(Locale.ROOT), plain)
))}.toMap
// Characters U+3041..U+3096 (hiragana) fall back to the character 0x60 code points higher.
val hiragana: Map[Char, List[String]] = (0x3041 to 0x3096).map{ kana => kana.toChar -> List(kana.+(0x60).toChar.toString)}.toMap
// Characters U+FF01..U+FF5E (fullwidth forms) fall back to the character 0xFEE0 code points
// lower; letters additionally get the opposite case as a second choice.
val fullWidth: Map[Char, List[String]] = (0xff01 to 0xff5e).map{ i =>
val fw = i.toChar
val hw = i.-(0xfee0).toChar
if (hw.isUpper) fw -> List(hw.toString, hw.toString.toLowerCase(Locale.ROOT))
else if (hw.isLower) fw -> List(hw.toString, hw.toString.toUpperCase(Locale.ROOT))
else fw -> List(hw.toString)
}.toMap
// Characters U+FF61..U+FF9F fall back to the character at the matching index (1..63)
// of jisHalfwidthKatakanaOrder.
val halfWidth = (0xff61 to 0xff9f).map{ c => c.toChar -> List(jisHalfwidthKatakanaOrder(c - 0xff60).toString)}.toMap
allowLowercase ++ allowUppercase ++ ligaturesAndSymbols ++ accentedLetters ++ hiragana ++ fullWidth ++ halfWidth
}
}

View File

@ -22,4 +22,18 @@ class TextCodecSuite extends FunSuite with Matchers {
| }
""".stripMargin)
}
test("Lenient encoding") {
val m = EmuUnoptimizedRun(
"""
| void main() {
| if 'å' != 'a' { poke($bfff, 0) }
| if '÷' != '/' { poke($bffd, 0) }
| if 'π' != '?' { poke($bffc, 0) }
| }
| macro asm void poke(word const addr, byte a) {
| STA addr
| }
""".stripMargin)
}
}

View File

@ -1,6 +1,7 @@
package millfork.test.emu
import millfork.output.{AfterCodeByteAllocator, CurrentBankFragmentOutput, UpwardByteAllocator, VariableAllocator}
import millfork.parser.TextCodec
import millfork.{Cpu, CpuFamily, OutputStyle, Platform}
/**
@ -13,6 +14,8 @@ object EmuPlatform {
cpu,
Map(),
Nil,
TextCodec.Ascii,
TextCodec.Ascii,
CurrentBankFragmentOutput(0, 0xffff),
Map("default" -> new UpwardByteAllocator(0x200, 0xb000)),
Map("default" -> new VariableAllocator(

View File

@ -100,6 +100,7 @@ class EmuRun(cpu: millfork.Cpu.Value, nodeOptimizations: List[NodeOptimization],
println(source)
val platform = EmuPlatform.get(cpu)
val options = CompilationOptions(platform, Map(
CompilationFlag.LenientTextEncoding -> true,
CompilationFlag.EmitIllegals -> this.emitIllegals,
CompilationFlag.InlineFunctions -> this.inline,
CompilationFlag.InterproceduralOptimization -> true,

View File

@ -5,14 +5,13 @@ import fastparse.core.Parsed.{Failure, Success}
import millfork.assembly.AssemblyOptimization
import millfork.assembly.z80.ZLine
import millfork.compiler.CompilationContext
import millfork.compiler.mos.MosCompiler
import millfork.env.{Environment, InitializedArray, InitializedMemoryVariable, NormalFunction}
import millfork.error.ErrorReporting
import millfork.node.StandardCallGraph
import millfork.node.opt.NodeOptimization
import millfork.output.{MemoryBank, MosAssembler, Z80Assembler}
import millfork.output.{MemoryBank, Z80Assembler}
import millfork.parser.Z80Parser
import millfork.{CompilationOptions, CpuFamily}
import millfork.{CompilationFlag, CompilationOptions, CpuFamily}
import millfork.compiler.z80.Z80Compiler
import org.scalatest.Matchers
@ -28,7 +27,8 @@ class EmuZ80Run(cpu: millfork.Cpu.Value, nodeOptimizations: List[NodeOptimizatio
Console.err.flush()
println(source)
val platform = EmuPlatform.get(cpu)
val options = CompilationOptions(platform, millfork.Cpu.defaultFlags(cpu).map(_ -> true).toMap, None, 0)
val extraFlags = Map(CompilationFlag.LenientTextEncoding -> true)
val options = CompilationOptions(platform, millfork.Cpu.defaultFlags(cpu).map(_ -> true).toMap ++ extraFlags, None, 0)
ErrorReporting.hasErrors = false
ErrorReporting.verbosity = 999
var effectiveSource = source