mirror of
https://github.com/KarolS/millfork.git
synced 2025-04-03 00:29:43 +00:00
Text encoding improvements
This commit is contained in:
parent
265f729b24
commit
2c8de8b6a5
@ -6,6 +6,8 @@
|
||||
|
||||
* A very incomplete support for NEC PC-88.
|
||||
|
||||
* Automatic selection of text encoding based on target platform.
|
||||
|
||||
## 0.3.0
|
||||
|
||||
* Finally faster than C.
|
||||
|
@ -81,14 +81,18 @@ Default: no if targeting Ricoh, yes otherwise.
|
||||
* `-fvariable-overlap`, `-fno-variable-overlap` – Whether variables should overlap if their scopes do not intersect.
|
||||
Default: yes.
|
||||
|
||||
* `-fbounds-checking`, `-fnobounds-checking` – Whether should insert bounds checking on array access.
|
||||
* `-fbounds-checking`, `-fno-bounds-checking` – Whether the compiler should insert bounds checking on array access.
|
||||
Default: no.
|
||||
|
||||
* `-fcompact-dispatch-params`, `-fnocompact-dispatch-params` –
|
||||
* `-fcompact-dispatch-params`, `-fno-compact-dispatch-params` –
|
||||
Whether parameter values in return dispatch statements may overlap other objects.
|
||||
This may cause problems if the parameter table is stored next to a hardware register that has side effects when reading.
|
||||
`.ini` equivalent: `compact_dispatch_params`. Default: yes.
|
||||
|
||||
* `-flenient-encoding`, `-fno-lenient-encoding` –
|
||||
Whether the compiler should allow for invalid characters in string/character literals that use the default encodings and replace them with alternatives.
|
||||
`.ini` equivalent: `lenient_encoding`. Default: no.
|
||||
|
||||
## Optimization options
|
||||
|
||||
* `-O0` – Disable all optimizations.
|
||||
|
@ -26,6 +26,13 @@ Every platform is defined in an `.ini` file with an appropriate name.
|
||||
|
||||
* `z80` (Zilog Z80; experimental and very incomplete)
|
||||
|
||||
* `encoding` – default encoding for console I/O, one of
|
||||
`ascii`, `pet`/`petscii`, `petscr`/`cbmscr`, `atascii`, `bbc`, `jis`/`jisx`, `apple2`,
|
||||
`iso_de`, `iso_no`/`iso_dk`, `iso_se`/`iso_fi`, `iso_yu`. Default: `ascii`
|
||||
|
||||
* `screen_encoding` – default encoding for screencodes (literals with encoding specified as `scr`).
|
||||
Default: the same as `encoding`.
|
||||
|
||||
* `modules` – comma-separated list of modules that will be automatically imported
|
||||
|
||||
* other compilation options (they can be overridden using commandline options):
|
||||
@ -54,6 +61,8 @@ Every platform is defined in an `.ini` file with an appropriate name.
|
||||
* `inline` - inline functions automatically by default, default is `false`.
|
||||
|
||||
* `ipo` - enable interprocedural optimization, default is `false`.
|
||||
|
||||
* `lenient_encoding` - allow for automatic substitution of invalid characters in string literals using the default encodings, default is `false`.
|
||||
|
||||
|
||||
#### `[allocation]` section
|
||||
|
@ -16,9 +16,10 @@ Hexadecimal: `$D323`, `0x2a2`
|
||||
|
||||
## String literals
|
||||
|
||||
String literals are surrounded with double quotes and followed by the name of the encoding:
|
||||
String literals are surrounded with double quotes and optionally followed by the name of the encoding:
|
||||
|
||||
"this is a string" ascii
|
||||
"this is also a string"
|
||||
|
||||
Characters between the quotes are interpreted literally,
|
||||
there are no ways to escape special characters or quotes.
|
||||
@ -28,11 +29,16 @@ for compatibility with multiple variants.
|
||||
|
||||
Currently available encodings:
|
||||
|
||||
* `default` – default console encoding (can be omitted)
|
||||
|
||||
* `scr` – default screencodes
|
||||
(usually the same as `default`; the Commodore computers are a notable exception)
|
||||
|
||||
* `ascii` – standard ASCII
|
||||
|
||||
* `pet` or `petscii` – PETSCII (ASCII-like character set used by Commodore machines)
|
||||
|
||||
* `scr` – Commodore screencodes
|
||||
* `cbmscr` or `petscr` – Commodore screencodes
|
||||
|
||||
* `apple2` – Apple II charset ($A0–$FE)
|
||||
|
||||
@ -46,16 +52,42 @@ Currently available encodings:
|
||||
|
||||
When programming for Commodore,
|
||||
use `pet` for strings you're printing using standard I/O routines
|
||||
and `scr` for strings you're copying to screen memory directly.
|
||||
and `petscr` for strings you're copying to screen memory directly.
|
||||
|
||||
If the characters in the literal cannot be encoded in a particular encoding, an error is raised.
|
||||
However, if the command-line option `-flenient-encoding` is used,
|
||||
then literals using `default` and `scr` encodings replace unsupported characters with supported ones
|
||||
and a warning is issued.
|
||||
For example, if `-flenient-encoding` is enabled, then a literal `"£¥↑ž©ß"` is equivalent to:
|
||||
|
||||
* `"£Y↑z(C)ss"` if the default encoding is `pet`
|
||||
|
||||
* `"£Y↑z©ss"` if the default encoding is `bbc`
|
||||
|
||||
* `"?Y^z(C)ss"` if the default encoding is `ascii`
|
||||
|
||||
* `"?Y^ž(C)ss"` if the default encoding is `iso_yu`
|
||||
|
||||
* `"?Y^z(C)ß"` if the default encoding is `iso_de`
|
||||
|
||||
* `"?¥^z(C)ss"` if the default encoding is `jisx`
|
||||
|
||||
Note that the final length of the string may vary.
|
||||
|
||||
## Character literals
|
||||
|
||||
Character literals are surrounded by single quotes and followed by the name of the encoding:
|
||||
Character literals are surrounded by single quotes and optionally followed by the name of the encoding:
|
||||
|
||||
'x' ascii
|
||||
'W'
|
||||
|
||||
From the type system point of view, they are constants of type byte.
|
||||
|
||||
If the characters in the literal cannot be encoded in a particular encoding, an error is raised.
|
||||
However, if the command-line option `-flenient-encoding` is used,
|
||||
then literals using `default` and `scr` encodings replace unsupported characters with supported ones.
|
||||
If the replacement is one character long, only a warning is issued; otherwise an error is raised.
|
||||
|
||||
## Array initialisers
|
||||
|
||||
An array is initialized with either:
|
||||
|
@ -1,10 +1,10 @@
|
||||
import stdio
|
||||
import c64_basic
|
||||
|
||||
array text1 = [ "enter first number:" petscii, 13, 0 ]
|
||||
array text2 = [ "enter second number:" petscii, 13, 0 ]
|
||||
array text3 = [ "the sum is:" petscii, 13, 0 ]
|
||||
array texte = [ "that wasn't a number, try again:" petscii, 13, 0 ]
|
||||
array text1 = [ "enter first number:", 13, 0 ]
|
||||
array text2 = [ "enter second number:", 13, 0 ]
|
||||
array text3 = [ "the sum is:", 13, 0 ]
|
||||
array texte = [ "that wasn't a number, try again:", 13, 0 ]
|
||||
|
||||
void main() {
|
||||
word a
|
||||
|
@ -1,8 +1,8 @@
|
||||
import stdio
|
||||
|
||||
array p = [
|
||||
"this is an example" petscii, 13,
|
||||
"of multiline petscii text" petscii
|
||||
"this is an example", 13,
|
||||
"of multiline petscii text"
|
||||
]
|
||||
|
||||
array s = [
|
||||
|
@ -3,7 +3,7 @@
|
||||
|
||||
import stdio
|
||||
|
||||
array hello_world = "hello world" petscii
|
||||
array hello_world = "hello world"
|
||||
|
||||
void main(){
|
||||
putstr(hello_world, hello_world.length)
|
||||
|
@ -1,7 +1,7 @@
|
||||
[compilation]
|
||||
arch=strict
|
||||
modules=a8_kernel,default_panic
|
||||
|
||||
encoding=atascii
|
||||
|
||||
[allocation]
|
||||
; TODO
|
||||
|
@ -1,5 +1,6 @@
|
||||
[compilation]
|
||||
arch=strict
|
||||
encoding=apple2
|
||||
modules=apple2_kernel,default_panic
|
||||
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
[compilation]
|
||||
; "strict" guarantees compatibility with Rockwell CPU's in some later Model B's
|
||||
arch=strict
|
||||
encoding=bbc
|
||||
modules=bbc_kernal,bbc_hardware,default_panic
|
||||
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
[compilation]
|
||||
arch=nmos
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
modules=c128_hardware,loader_1c01,c128_kernal,default_panic
|
||||
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
[compilation]
|
||||
arch=nmos
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
modules=loader_1001,c264_kernal,c264_hardware,default_panic
|
||||
|
||||
|
||||
|
@ -4,6 +4,8 @@
|
||||
[compilation]
|
||||
; CPU architecture: nmos, strictnmos, ricoh, strictricoh, cmos
|
||||
arch=nmos
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
; modules to load
|
||||
modules=c64_hardware,loader_0801,c64_kernal,c64_panic,stdlib
|
||||
; optionally: default flags
|
||||
|
@ -3,6 +3,8 @@
|
||||
|
||||
[compilation]
|
||||
arch=65816
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
modules=c64_hardware,loader_0801,c64_kernal,c64_panic,stdlib
|
||||
emit_65816=emulation
|
||||
|
||||
|
@ -4,6 +4,8 @@
|
||||
|
||||
[compilation]
|
||||
arch=65816
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
modules=c64_hardware,loader_0801_16bit,c64_kernal,c64_panic,stdlib
|
||||
emit_65816=native
|
||||
|
||||
|
@ -2,6 +2,8 @@
|
||||
|
||||
[compilation]
|
||||
arch=nmos
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
modules=lunix
|
||||
lunix=true
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
;a single-load PC-88 program
|
||||
[compilation]
|
||||
arch=z80
|
||||
encoding=jisx
|
||||
modules=default_panic
|
||||
|
||||
[allocation]
|
||||
|
@ -1,5 +1,7 @@
|
||||
[compilation]
|
||||
arch=nmos
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
modules=loader_1001,c264_kernal,c264_hardware,default_panic
|
||||
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
[compilation]
|
||||
arch=nmos
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
modules=loader_1001,vic20_kernal,default_panic
|
||||
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
[compilation]
|
||||
arch=nmos
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
modules=loader_0401,vic20_kernal,default_panic
|
||||
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
[compilation]
|
||||
arch=nmos
|
||||
encoding=petscii
|
||||
screen_encoding=petscr
|
||||
modules=loader_1201,vic20_kernal,default_panic
|
||||
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
;a single-load ZX Spectrum 48k program
|
||||
[compilation]
|
||||
arch=z80
|
||||
encoding=bbc
|
||||
modules=default_panic,zxspectrum
|
||||
|
||||
[allocation]
|
||||
|
@ -203,7 +203,7 @@ object Cpu extends Enumeration {
|
||||
object CompilationFlag extends Enumeration {
|
||||
val
|
||||
// common compilation options:
|
||||
EmitIllegals, DecimalMode, ReadOnlyArrays,
|
||||
EmitIllegals, DecimalMode, ReadOnlyArrays, LenientTextEncoding,
|
||||
// compilation options for MOS:
|
||||
EmitCmosOpcodes, EmitCmosNopOpcodes, EmitHudsonOpcodes, Emit65CE02Opcodes, EmitEmulation65816Opcodes, EmitNative65816Opcodes,
|
||||
PreventJmpIndirectBug, LargeCode, ReturnWordsViaAccumulator,
|
||||
@ -244,6 +244,7 @@ object CompilationFlag extends Enumeration {
|
||||
"ror_warn" -> RorWarning,
|
||||
"prevent_jmp_indirect_bug" -> PreventJmpIndirectBug,
|
||||
"compact_dispatch_params" -> CompactReturnDispatchParams,
|
||||
"lenient_encoding" -> LenientTextEncoding,
|
||||
)
|
||||
|
||||
}
|
@ -323,6 +323,9 @@ object Main {
|
||||
// Command-line switch controlling whether bounds checks are inserted on array access.
// The handler has to toggle the bounds-checking flag. It previously toggled
// CompilationFlag.VariableOverlap, so the switch changed variable overlapping instead
// and bounds checking could not be controlled from the command line at all.
// NOTE(review): CheckIndexOutOfBounds is assumed to be the enum member for this
// feature; confirm the exact name against the CompilationFlag enumeration.
boolean("-fbounds-checking", "-fno-bounds-checking").action { (c, v) =>
  c.changeFlag(CompilationFlag.CheckIndexOutOfBounds, v)
}.description("Whether should insert bounds checking on array access.")
|
||||
boolean("-flenient-encoding", "-fno-lenient-encoding").action { (c, v) =>
|
||||
c.changeFlag(CompilationFlag.LenientTextEncoding, v)
|
||||
}.description("Whether the compiler should replace invalid characters in string literals that use the default encodings.")
|
||||
|
||||
fluff("", "Optimization options:", "")
|
||||
|
||||
|
@ -6,6 +6,7 @@ import java.nio.file.{Files, Paths}
|
||||
|
||||
import millfork.error.ErrorReporting
|
||||
import millfork.output._
|
||||
import millfork.parser.TextCodec
|
||||
import org.apache.commons.configuration2.INIConfiguration
|
||||
|
||||
/**
|
||||
@ -20,6 +21,8 @@ class Platform(
|
||||
val cpu: Cpu.Value,
|
||||
val flagOverrides: Map[CompilationFlag.Value, Boolean],
|
||||
val startingModules: List[String],
|
||||
val defaultCodec: TextCodec,
|
||||
val screenCodec: TextCodec,
|
||||
val outputPackager: OutputPackager,
|
||||
val codeAllocators: Map[String, UpwardByteAllocator],
|
||||
val variableAllocators: Map[String, VariableAllocator],
|
||||
@ -98,6 +101,11 @@ object Platform {
|
||||
ErrorReporting.error("Invalid zeropage register size: " + zpRegisterSize)
|
||||
}
|
||||
|
||||
val codecName = cs.get(classOf[String], "encoding", "ascii")
|
||||
val srcCodecName = cs.get(classOf[String], "screen_encoding", codecName)
|
||||
val codec = TextCodec.forName(codecName, None)
|
||||
val srcCodec = TextCodec.forName(srcCodecName, None)
|
||||
|
||||
val as = conf.getSection("allocation")
|
||||
|
||||
val banks = as.get(classOf[String], "segments", "default").split("[, ]+").filter(_.nonEmpty).toList
|
||||
@ -185,7 +193,13 @@ object Platform {
|
||||
case x => ErrorReporting.fatal(s"Invalid output style: `$x`")
|
||||
}
|
||||
|
||||
new Platform(cpu, flagOverrides, startingModules, outputPackager,
|
||||
new Platform(
|
||||
cpu,
|
||||
flagOverrides,
|
||||
startingModules,
|
||||
codec,
|
||||
srcCodec,
|
||||
outputPackager,
|
||||
codeAllocators.toMap,
|
||||
variableAllocators.toMap,
|
||||
zpRegisterSize,
|
||||
|
@ -7,7 +7,7 @@ import fastparse.all._
|
||||
import millfork.env._
|
||||
import millfork.error.ErrorReporting
|
||||
import millfork.node._
|
||||
import millfork.{CompilationOptions, SeparatedList}
|
||||
import millfork.{CompilationFlag, CompilationOptions, SeparatedList}
|
||||
|
||||
/**
|
||||
* @author Karol Stasiak
|
||||
@ -40,26 +40,10 @@ abstract class MfParser[T](filename: String, input: String, currentDirectory: St
|
||||
newPosition
|
||||
}
|
||||
|
||||
val codec: P[TextCodec] = P(position("text codec identifier") ~ identifier).map {
|
||||
case (_, "ascii") => TextCodec.Ascii
|
||||
case (_, "petscii") => TextCodec.Petscii
|
||||
case (_, "pet") => TextCodec.Petscii
|
||||
case (_, "scr") => TextCodec.CbmScreencodes
|
||||
case (_, "atascii") => TextCodec.Atascii
|
||||
case (_, "atari") => TextCodec.Atascii
|
||||
case (_, "bbc") => TextCodec.Bbc
|
||||
case (_, "apple2") => TextCodec.Apple2
|
||||
case (_, "jis") => TextCodec.Jis
|
||||
case (_, "jisx") => TextCodec.Jis
|
||||
case (_, "iso_de") => TextCodec.IsoIec646De
|
||||
case (_, "iso_no") => TextCodec.IsoIec646No
|
||||
case (_, "iso_dk") => TextCodec.IsoIec646No
|
||||
case (_, "iso_se") => TextCodec.IsoIec646Se
|
||||
case (_, "iso_fi") => TextCodec.IsoIec646Se
|
||||
case (_, "iso_yu") => TextCodec.IsoIec646Yu
|
||||
case (p, x) =>
|
||||
ErrorReporting.error(s"Unknown string encoding: `$x`", Some(p))
|
||||
TextCodec.Ascii
|
||||
val codec: P[(TextCodec, Boolean)] = P(position("text codec identifier") ~ identifier.?.map(_.getOrElse(""))).map {
|
||||
case (_, "" | "default") => options.platform.defaultCodec -> options.flag(CompilationFlag.LenientTextEncoding)
|
||||
case (_, "scr") => options.platform.screenCodec -> options.flag(CompilationFlag.LenientTextEncoding)
|
||||
case (p, x) => TextCodec.forName(x, Some(p)) -> false
|
||||
}
|
||||
|
||||
// def operator: P[String] = P(CharsWhileIn("!-+*/><=~|&^", min=1).!) // TODO: only valid operators
|
||||
@ -67,9 +51,9 @@ abstract class MfParser[T](filename: String, input: String, currentDirectory: St
|
||||
val charAtom: P[LiteralExpression] = for {
|
||||
p <- position()
|
||||
c <- "'" ~/ CharPred(c => c >= ' ' && !invalidCharLiteralTypes(Character.getType(c))).! ~/ "'"
|
||||
co <- HWS ~ codec
|
||||
(co, lenient) <- HWS ~ codec
|
||||
} yield {
|
||||
co.encode(Some(p), c.charAt(0)) match {
|
||||
co.encode(options, Some(p), c.charAt(0), lenient = lenient) match {
|
||||
case List(value) =>
|
||||
LiteralExpression(value, 1)
|
||||
case _ =>
|
||||
@ -152,7 +136,7 @@ abstract class MfParser[T](filename: String, input: String, currentDirectory: St
|
||||
}
|
||||
|
||||
def arrayStringContents: P[ArrayContents] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
|
||||
case (p, s, co) => LiteralContents(s.flatMap(c => co.encode(None, c)).map(c => LiteralExpression(c, 1).pos(p)))
|
||||
case (p, s, (co, lenient)) => LiteralContents(s.flatMap(c => co.encode(options, None, c, lenient = lenient)).map(c => LiteralExpression(c, 1).pos(p)))
|
||||
}
|
||||
|
||||
def arrayLoopContents: P[ArrayContents] = for {
|
||||
|
@ -1,5 +1,8 @@
|
||||
package millfork.parser
|
||||
|
||||
import java.util.Locale
|
||||
|
||||
import millfork.CompilationOptions
|
||||
import millfork.error.ErrorReporting
|
||||
import millfork.node.Position
|
||||
|
||||
@ -7,19 +10,75 @@ import millfork.node.Position
|
||||
* @author Karol Stasiak
|
||||
*/
|
||||
class TextCodec(val name: String, private val map: String, private val extra: Map[Char, Int], private val decompositions: Map[Char, String]) {
|
||||
def encode(position: Option[Position], c: Char): List[Int] = {
|
||||
|
||||
/**
  * Decides whether a character may be echoed verbatim in a diagnostic message.
  *
  * Letters, numbers, the listed punctuation categories, currency/math/other symbols
  * and plain space separators are accepted. Every other Unicode general category
  * (control characters, line and paragraph separators, modifier symbols, surrogates,
  * combining marks and anything not listed) is rejected.
  */
private def isPrintable(c: Char) = {
  c.getType match {
    // letters and numbers
    case Character.LOWERCASE_LETTER | Character.UPPERCASE_LETTER | Character.TITLECASE_LETTER |
         Character.OTHER_LETTER | Character.LETTER_NUMBER |
         Character.DECIMAL_DIGIT_NUMBER | Character.OTHER_NUMBER => true
    // punctuation
    case Character.DASH_PUNCTUATION | Character.START_PUNCTUATION | Character.END_PUNCTUATION |
         Character.INITIAL_QUOTE_PUNCTUATION | Character.FINAL_QUOTE_PUNCTUATION |
         Character.OTHER_PUNCTUATION => true
    // symbols and ordinary spaces
    case Character.CURRENCY_SYMBOL | Character.OTHER_SYMBOL | Character.MATH_SYMBOL |
         Character.SPACE_SEPARATOR => true
    // everything else is shown only as a code point
    case _ => false
  }
}
|
||||
|
||||
/**
  * Renders a single character for use in diagnostics.
  *
  * @param c the character to describe
  * @return the code point as `U+XXXX`; printable characters are additionally
  *         shown verbatim in backticks in front of the code point
  */
private def format(c: Char): String = {
  val codePoint = "U+%04X".format(c.toInt)
  if (!isPrintable(c)) codePoint
  else s"`$c` ($codePoint)"
}
|
||||
|
||||
/**
  * Renders a whole string for use in diagnostics.
  *
  * @param s the text to describe
  * @return the comma-separated `U+XXXX` code points of `s`; if every character
  *         is printable, the text itself is shown in backticks in front of them
  */
private def format(s: String) = {
  val codePoints = s.map(ch => "U+%04X".format(ch.toInt)).mkString(",")
  val showVerbatim = s.forall(isPrintable)
  if (showVerbatim) s"`$s` ($codePoints)" else codePoints
}
|
||||
|
||||
private def encodeImpl(options: CompilationOptions, position: Option[Position], c: Char, lenient: Boolean): Option[List[Int]] = {
|
||||
if (decompositions.contains(c)) {
|
||||
decompositions(c).toList.flatMap(x => encode(position, x))
|
||||
} else if (extra.contains(c)) List(extra(c)) else {
|
||||
Some(decompositions(c).toList.flatMap(x => encodeImpl(options, position, x, lenient).getOrElse(Nil)))
|
||||
} else if (extra.contains(c)) Some(List(extra(c))) else {
|
||||
val index = map.indexOf(c)
|
||||
if (index >= 0) {
|
||||
List(index)
|
||||
Some(List(index))
|
||||
} else if (lenient) {
|
||||
val alternative = TextCodec.lossyAlternatives.getOrElse(c, Nil).:+("?").find(alts => alts.forall(alt => encodeImpl(options, position, alt, lenient = false).isDefined)).getOrElse("")
|
||||
ErrorReporting.warn(s"Cannot encode ${format(c)} in encoding `$name`, replaced it with ${format(alternative)}", options, position)
|
||||
Some(alternative.toList.flatMap(encodeImpl(options, position, _, lenient = false).get))
|
||||
} else {
|
||||
ErrorReporting.fatal("Invalid character in string", position)
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
  * Encodes one character with this codec.
  *
  * @param options  compilation options, passed through to the encoder
  * @param position source position used when reporting problems
  * @param c        the character to encode
  * @param lenient  whether unsupported characters may be replaced with alternatives
  * @return the encoded values; if the character cannot be encoded, an error is
  *         reported and an empty list is returned
  */
def encode(options: CompilationOptions, position: Option[Position], c: Char, lenient: Boolean): List[Int] =
  encodeImpl(options, position, c, lenient).getOrElse {
    ErrorReporting.error(f"Invalid character ${format(c)} in string", position)
    Nil
  }
|
||||
|
||||
def decode(by: Int): Char = {
|
||||
val index = by & 0xff
|
||||
if (index < map.length) map(index) else TextCodec.NotAChar
|
||||
@ -27,6 +86,30 @@ class TextCodec(val name: String, private val map: String, private val extra: Ma
|
||||
}
|
||||
|
||||
object TextCodec {
|
||||
|
||||
/**
  * Resolves an encoding name to its codec.
  *
  * Several encodings are reachable under more than one name (for example
  * `pet`/`petscii` or `iso_no`/`iso_dk`).
  *
  * @param name     the encoding name to look up
  * @param position source position attached to the error for an unknown name
  * @return the matching codec; for an unknown name an error is reported and
  *         the ASCII codec is returned as a fallback
  */
def forName(name: String, position: Option[Position]): TextCodec = name match {
  case "ascii" => TextCodec.Ascii
  case "petscii" | "pet" => TextCodec.Petscii
  case "cbmscr" | "petscr" => TextCodec.CbmScreencodes
  case "atascii" | "atari" => TextCodec.Atascii
  case "bbc" => TextCodec.Bbc
  case "apple2" => TextCodec.Apple2
  case "jis" | "jisx" => TextCodec.Jis
  case "iso_de" => TextCodec.IsoIec646De
  case "iso_no" | "iso_dk" => TextCodec.IsoIec646No
  case "iso_se" | "iso_fi" => TextCodec.IsoIec646Se
  case "iso_yu" => TextCodec.IsoIec646Yu
  case unknown =>
    ErrorReporting.error(s"Unknown string encoding: `$unknown`", position)
    TextCodec.Ascii
}
|
||||
|
||||
val NotAChar = '\ufffd'
|
||||
|
||||
val Ascii = new TextCodec("ASCII", 0.until(127).map { i => if (i < 32) NotAChar else i.toChar }.mkString, Map.empty, Map.empty)
|
||||
@ -70,7 +153,7 @@ object TextCodec {
|
||||
'Ü' -> '^'.toInt,
|
||||
'ü' -> '~'.toInt,
|
||||
'«' -> '"'.toInt,
|
||||
'»' -> '#'.toInt,
|
||||
'»' -> '"'.toInt,
|
||||
'§' -> '#'.toInt),
|
||||
Map.empty
|
||||
)
|
||||
@ -116,6 +199,12 @@ object TextCodec {
|
||||
Map('↑' -> '^'.toInt), Map.empty
|
||||
)
|
||||
|
||||
private val jisHalfwidthKatakanaOrder: String =
|
||||
"\ufffd。「」、・ヲァィゥェォャュョッ" +
|
||||
"ーアイウエオカキクケコサシスセソ" +
|
||||
"タチツテトナニヌネノハヒフヘホマ" +
|
||||
"ミムメモヤユヨラリルレロワン゛゜"
|
||||
|
||||
//noinspection ScalaUnnecessaryParentheses
|
||||
val Jis = new TextCodec("JIS-X-0201",
|
||||
"\ufffd" * 32 +
|
||||
@ -123,10 +212,7 @@ object TextCodec {
|
||||
"[¥]^_" +
|
||||
"`" + 'a'.to('z').mkString + "{|}~\ufffd" +
|
||||
"\ufffd" * 32 +
|
||||
"\ufffd。「」、・ヲァィゥェォャュョッ" +
|
||||
"ーアイウエオカキクケコサシスセソ" +
|
||||
"タチツテトナニヌネノハヒフヘホマ" +
|
||||
"ミムメモヤユヨラリルレロワン゛゜" +
|
||||
jisHalfwidthKatakanaOrder +
|
||||
"\ufffd" * 8 +
|
||||
"♠♡♢♣" +
|
||||
"\ufffd" * 4 +
|
||||
@ -139,4 +225,102 @@ object TextCodec {
|
||||
"ハヒフヘホ".zip("パピプペポ").map { case (h, p) => p -> (h + "゜") }.toMap
|
||||
)
|
||||
|
||||
/**
  * Replacement candidates for characters a codec cannot represent.
  *
  * Each character maps to a list of substitute strings in order of preference.
  * A substitute may be longer than one character, so the length of the encoded
  * text can change.
  *
  * Ligature keys are written as Unicode escapes: a char literal holds exactly one
  * character, and spelling a ligature out as its component letters does not compile.
  */
val lossyAlternatives: Map[Char, List[String]] = {
  // Case folding in both directions, for character sets that have only one case.
  val allowLowercase: Map[Char, List[String]] = ('A' to 'Z').map(c => c -> List(c.toString.toLowerCase(Locale.ROOT))).toMap
  val allowUppercase: Map[Char, List[String]] = ('a' to 'z').map(c => c -> List(c.toString.toUpperCase(Locale.ROOT))).toMap
  val ligaturesAndSymbols: Map[Char, List[String]] = Map(
    '¦' -> List("|"),
    '|' -> List("¦"),
    'ß' -> List("ss", "SS"),
    '\ufb00' -> List("ff", "FF"),   // LATIN SMALL LIGATURE FF
    '\ufb02' -> List("fl", "FL"),   // LATIN SMALL LIGATURE FL
    '\ufb01' -> List("fi", "FI"),   // LATIN SMALL LIGATURE FI
    '\ufb03' -> List("ffi", "FFI"), // LATIN SMALL LIGATURE FFI
    '\ufb04' -> List("ffl", "FFL"), // LATIN SMALL LIGATURE FFL
    '½' -> List("1/2"),
    '¼' -> List("1/4"),
    '¾' -> List("3/4"),
    '¥' -> List("Y", "y"),
    '円' -> List("¥", "Y", "y"),
    '年' -> List("Y", "y"),
    '月' -> List("M", "m"),
    '日' -> List("D", "d"),
    '時' -> List("h", "H"),
    '分' -> List("m", "M"),
    '秒' -> List("s", "S"),
    '♥' -> List("H", "h"),
    '♠' -> List("S", "s"),
    '♡' -> List("H", "h"),
    '♢' -> List("D", "d"),
    '♣' -> List("C", "c"),
    '。' -> List("."),
    '、' -> List(","),
    '・' -> List("-"),
    '•' -> List("・", "*"),
    '「' -> List("[", "("),
    '」' -> List("]", ")"),
    '^' -> List("↑"),
    '↑' -> List("^"),
    '‾' -> List("~"),
    '¯' -> List("~"),
    '«' -> List("\""),
    '»' -> List("\""),
    '§' -> List("#"),
    '[' -> List("("),
    ']' -> List(")"),
    '{' -> List("("),
    '}' -> List(")"),
    '©' -> List("(C)"),
    'İ' -> List("I", "i"),
    'ª' -> List("a", "A"),
    'º' -> List("o", "O"),
    '‰' -> List("%."),
    '÷' -> List("/"),
    '\u0133' -> List("ij", "IJ"),   // LATIN SMALL LIGATURE IJ
    '\u0132' -> List("IJ", "ij")    // LATIN CAPITAL LIGATURE IJ
  )
  // Each accented letter falls back to its plain form, preferring the same case;
  // the upper-case variant of every listed letter is derived automatically.
  val accentedLetters: Map[Char, List[String]] = List(
    "áàäãåąāǎă" -> "a",
    "çčċćĉ" -> "c",
    "đď" -> "d",
    "ð" -> "dh",
    "éèêëęēėě" -> "e",
    "ğǧĝģġ" -> "g",
    "ħĥ" -> "h",
    "íıìîïįīǐĭĩ" -> "i",
    "ĵ" -> "j",
    "ķ" -> "k",
    "ĺľłļŀ" -> "l",
    "ñńňņŋ" -> "n",
    "óòöôőõøōǒ" -> "o",
    "řŗŕ" -> "r",
    "śšŝșşſ" -> "s",
    "ţțťŧ" -> "t",
    "þ" -> "th",
    "úùũûüűųūǔůǘǜǚǖ" -> "u",
    "ẃẁŵ" -> "w",
    "ýÿỳŷȳ" -> "y",
    "žźż" -> "z",
    "æ" -> "ae",
    "œ" -> "oe"
  ).flatMap { case (acc, plain) =>
    val upperPlain = plain.toUpperCase(Locale.ROOT)
    acc.toList.flatMap(letter => List(
      letter -> List(plain, upperPlain),
      letter.toUpper -> List(upperPlain, plain)
    ))
  }.toMap
  // Hiragana U+3041..U+3096 map to the character 0x60 code points higher.
  val hiragana: Map[Char, List[String]] = (0x3041 to 0x3096).map { kana =>
    kana.toChar -> List((kana + 0x60).toChar.toString)
  }.toMap
  // Full-width forms U+FF01..U+FF5E map to the character 0xFEE0 code points lower,
  // with the opposite letter case as a second choice.
  val fullWidth: Map[Char, List[String]] = (0xff01 to 0xff5e).map { i =>
    val fw = i.toChar
    val hw = (i - 0xfee0).toChar
    if (hw.isUpper) fw -> List(hw.toString, hw.toString.toLowerCase(Locale.ROOT))
    else if (hw.isLower) fw -> List(hw.toString, hw.toString.toUpperCase(Locale.ROOT))
    else fw -> List(hw.toString)
  }.toMap
  // Half-width forms U+FF61..U+FF9F map to the character at the same offset
  // in jisHalfwidthKatakanaOrder.
  val halfWidth = (0xff61 to 0xff9f).map { c =>
    c.toChar -> List(jisHalfwidthKatakanaOrder(c - 0xff60).toString)
  }.toMap
  // Later maps win on key collisions.
  allowLowercase ++ allowUppercase ++ ligaturesAndSymbols ++ accentedLetters ++ hiragana ++ fullWidth ++ halfWidth
}
|
||||
|
||||
}
|
||||
|
@ -22,4 +22,18 @@ class TextCodecSuite extends FunSuite with Matchers {
|
||||
| }
|
||||
""".stripMargin)
|
||||
}
|
||||
|
||||
test("Lenient encoding") {
|
||||
val m = EmuUnoptimizedRun(
|
||||
"""
|
||||
| void main() {
|
||||
| if 'å' != 'a' { poke($bfff, 0) }
|
||||
| if '÷' != '/' { poke($bffd, 0) }
|
||||
| if 'π' != '?' { poke($bffc, 0) }
|
||||
| }
|
||||
| macro asm void poke(word const addr, byte a) {
|
||||
| STA addr
|
||||
| }
|
||||
""".stripMargin)
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
package millfork.test.emu
|
||||
|
||||
import millfork.output.{AfterCodeByteAllocator, CurrentBankFragmentOutput, UpwardByteAllocator, VariableAllocator}
|
||||
import millfork.parser.TextCodec
|
||||
import millfork.{Cpu, CpuFamily, OutputStyle, Platform}
|
||||
|
||||
/**
|
||||
@ -13,6 +14,8 @@ object EmuPlatform {
|
||||
cpu,
|
||||
Map(),
|
||||
Nil,
|
||||
TextCodec.Ascii,
|
||||
TextCodec.Ascii,
|
||||
CurrentBankFragmentOutput(0, 0xffff),
|
||||
Map("default" -> new UpwardByteAllocator(0x200, 0xb000)),
|
||||
Map("default" -> new VariableAllocator(
|
||||
|
@ -100,6 +100,7 @@ class EmuRun(cpu: millfork.Cpu.Value, nodeOptimizations: List[NodeOptimization],
|
||||
println(source)
|
||||
val platform = EmuPlatform.get(cpu)
|
||||
val options = CompilationOptions(platform, Map(
|
||||
CompilationFlag.LenientTextEncoding -> true,
|
||||
CompilationFlag.EmitIllegals -> this.emitIllegals,
|
||||
CompilationFlag.InlineFunctions -> this.inline,
|
||||
CompilationFlag.InterproceduralOptimization -> true,
|
||||
|
@ -5,14 +5,13 @@ import fastparse.core.Parsed.{Failure, Success}
|
||||
import millfork.assembly.AssemblyOptimization
|
||||
import millfork.assembly.z80.ZLine
|
||||
import millfork.compiler.CompilationContext
|
||||
import millfork.compiler.mos.MosCompiler
|
||||
import millfork.env.{Environment, InitializedArray, InitializedMemoryVariable, NormalFunction}
|
||||
import millfork.error.ErrorReporting
|
||||
import millfork.node.StandardCallGraph
|
||||
import millfork.node.opt.NodeOptimization
|
||||
import millfork.output.{MemoryBank, MosAssembler, Z80Assembler}
|
||||
import millfork.output.{MemoryBank, Z80Assembler}
|
||||
import millfork.parser.Z80Parser
|
||||
import millfork.{CompilationOptions, CpuFamily}
|
||||
import millfork.{CompilationFlag, CompilationOptions, CpuFamily}
|
||||
import millfork.compiler.z80.Z80Compiler
|
||||
import org.scalatest.Matchers
|
||||
|
||||
@ -28,7 +27,8 @@ class EmuZ80Run(cpu: millfork.Cpu.Value, nodeOptimizations: List[NodeOptimizatio
|
||||
Console.err.flush()
|
||||
println(source)
|
||||
val platform = EmuPlatform.get(cpu)
|
||||
val options = CompilationOptions(platform, millfork.Cpu.defaultFlags(cpu).map(_ -> true).toMap, None, 0)
|
||||
val extraFlags = Map(CompilationFlag.LenientTextEncoding -> true)
|
||||
val options = CompilationOptions(platform, millfork.Cpu.defaultFlags(cpu).map(_ -> true).toMap ++ extraFlags, None, 0)
|
||||
ErrorReporting.hasErrors = false
|
||||
ErrorReporting.verbosity = 999
|
||||
var effectiveSource = source
|
||||
|
Loading…
x
Reference in New Issue
Block a user