diff --git a/codeGeneration/src/prog8/codegen/target/cbm/IsoEncoding.kt b/codeGeneration/src/prog8/codegen/target/cbm/IsoEncoding.kt index fe007816f..854076924 100644 --- a/codeGeneration/src/prog8/codegen/target/cbm/IsoEncoding.kt +++ b/codeGeneration/src/prog8/codegen/target/cbm/IsoEncoding.kt @@ -4,11 +4,14 @@ import com.github.michaelbull.result.Result import com.github.michaelbull.result.Ok import com.github.michaelbull.result.Err import java.io.CharConversionException +import java.nio.charset.Charset object IsoEncoding { + val charset: Charset = Charset.forName("ISO-8859-15") + fun encode(str: String): Result, CharConversionException> { return try { - Ok(str.toByteArray(Charsets.ISO_8859_1).map { it.toUByte() }) + Ok(str.toByteArray(charset).map { it.toUByte() }) } catch (ce: CharConversionException) { Err(ce) } @@ -16,7 +19,7 @@ object IsoEncoding { fun decode(bytes: List): Result { return try { - Ok(String(bytes.map { it.toByte() }.toByteArray(), Charsets.ISO_8859_1)) + Ok(String(bytes.map { it.toByte() }.toByteArray(), charset)) } catch (ce: CharConversionException) { Err(ce) } diff --git a/docs/source/portingguide.rst b/docs/source/portingguide.rst index 7852ec21b..db09c72c9 100644 --- a/docs/source/portingguide.rst +++ b/docs/source/portingguide.rst @@ -36,10 +36,10 @@ RAM, ROM, I/O #. what part(s) of the address space is memory mapped I/O registers? #. is there a banking system? How does it work (how do you select Ram/Rom banks)? How is the default bank configuration set? -Screen and Character encodings ------------------------------- +Character encodings +------------------- #. provide the primary character encoding table that the system uses (i.e. how is text represented in memory) -#. provide alternate character encoding table (if any) +#. provide alternate character encodings (if any) #. what are the system's character screen dimensions? #. is there a screen matrix directly accessible in Ram? 
Provide addresses of the character matrix and color attributes matrix, if any. diff --git a/docs/source/programming.rst b/docs/source/programming.rst index ba5bcc86f..cc99f3752 100644 --- a/docs/source/programming.rst +++ b/docs/source/programming.rst @@ -193,8 +193,9 @@ Values will usually be part of an expression or assignment statement:: -33.456e52 ; floating point number "Hi, I am a string" ; text string, encoded with compiler target default encoding 'a' ; byte value (ubyte) for the letter a - @"Alternate" ; text string, encoded with alternate encoding - @'a' ; byte value of the letter a, using alternate encoding + @"Alternate" ; text string, encoded with alternate encoding (old deprecated syntax) + sc:"Alternate" ; text string, encoded with c64 screencode encoding (current syntax) + sc:'a' ; byte value of the letter a in c64 screencode encoding byte counter = 42 ; variable of size 8 bits, with initial value 42 @@ -314,12 +315,28 @@ Strings Strings are a sequence of characters enclosed in ``"`` quotes. The length is limited to 255 characters. They're stored and treated much the same as a byte array, but they have some special properties because they are considered to be *text*. -Strings in your source code files will be encoded (translated from ASCII/UTF-8) into bytes via the -default encoding that is used on the target platform. For the C-64, this is CBM PETSCII. -Alternate-encoding strings (prefixed with ``@``) will be encoded via the alternate encoding for the -platform (if defined). For the C-64, that is SCREEN CODES (also known as POKE codes). -This @-prefix can also be used for character byte values. +Strings (without encoding prefix) will be encoded (translated from ASCII/UTF-8) into bytes via the +*default encoding* for the target platform. On the CBM machines, this is CBM PETSCII. +.. sidebar:: + Deprecated ``@`` prefix + + In older versions of the language, the ``@`` prefix was used to specify the + CBM screencode encoding. 
This syntax is still supported for now, but will be removed + in a future language version. + +Alternative encodings can be specified with an ``encodingname:`` prefix to the string or character literal. +The following encodings are currently recognised: + + - ``petscii`` Petscii, the default encoding on CBM machines (c64, c128, cx16) + - ``sc`` CBM-screencodes aka 'poke' codes (c64, c128, cx16) + - ``iso`` iso-8859-15 text (supported on cx16) + +So the following is a string literal that will be encoded into memory bytes using the iso encoding. +It can be correctly displayed on the screen only if an iso-8859-15 charset has been activated first +(the Commander X16 has this feature built in):: + + iso:"Käse, Straße" You can concatenate two string literals using '+', which can be useful to split long strings over separate lines. But remember that the length diff --git a/docs/source/syntaxreference.rst b/docs/source/syntaxreference.rst index ffe04c4a3..21f524d66 100644 --- a/docs/source/syntaxreference.rst +++ b/docs/source/syntaxreference.rst @@ -282,8 +282,7 @@ Various examples:: byte counter = len([1, 2, 3]) * 20 byte age = 2018 - 1974 float wallet = 55.25 - str name = "my name is Irmen" - str name = @"my name is Irmen" ; string with alternative byte encoding + str name = "my name is Alice" uword address = &counter byte[] values = [11, 22, 33, 44, 55] byte[5] values ; array of 5 bytes, initially set to zero @@ -432,10 +431,23 @@ memory at the given index. See :ref:`pointervars` String ^^^^^^ +.. sidebar:: + Deprecated ``@`` prefix -``"hello"`` is a string translated into the default character encoding (PETSCII) + In older versions of the language, the ``@`` prefix was used to specify the + CBM screencode encoding. This syntax is still supported for now, but will be removed + in a future language version. + +A string literal can occur with or without an encoding prefix (encoding followed by ':' followed by the string itself). 
+When this is omitted, the string is stored in the machine's default character encoding (which is PETSCII on the CBM machines). +You can choose to store the string in other encodings such as ``sc`` (screencodes) or ``iso`` (iso-8859-15). +Here are several examples: + + - ``"hello"`` a string translated into the default character encoding (PETSCII) + - ``petscii:"hello"`` same as the above, on CBM machines. + - ``sc:"my name is Alice"`` string with screencode encoding (new syntax) + - ``iso:"Ich heiße François"`` string in iso encoding -``@"hello"`` is a string translated into the alternate character encoding (Screencodes/pokes) There are several escape sequences available to put special characters into your string value: diff --git a/docs/source/todo.rst b/docs/source/todo.rst index 3dd088a67..1a744392b 100644 --- a/docs/source/todo.rst +++ b/docs/source/todo.rst @@ -3,7 +3,7 @@ TODO For next compiler release (7.7) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- document new string encoding syntax +- make 'petscii' not hardcoded default but specified in machinedefinition Need help with diff --git a/examples/test.p8 b/examples/test.p8 index 37d4bbce0..ec450a81d 100644 --- a/examples/test.p8 +++ b/examples/test.p8 @@ -4,10 +4,12 @@ main { str s1 = "Irmen_" str s2 = @"IRMEN_" - ;str s3 = iso:"Irmen_~" + str s3 = sc:"IRMEN_" + str s4 = iso:"Käse, Straße" sub start() { - txt.lowercase() + txt.iso() + ; txt.lowercase() txt.nl() txt.nl() txt.nl() @@ -17,12 +19,15 @@ main { txt.nl() txt.print(s2) txt.nl() -; txt.print(s3) -; txt.nl() + txt.print(s3) + txt.nl() + txt.print(s4) + txt.nl() sc(1, s1) sc(2, s2) - ; sc(3, s3) + sc(3, s3) + sc(4, s4) } sub sc(ubyte row, str text) {