document new string encoding syntax

2024-09-07 19:54:26 +00:00 · 2022-01-19 20:45:24 +01:00 · 2022-01-19 20:45:24 +01:00 · 9ed7587e3e
commit 9ed7587e3e
parent 674295e800
6 changed files with 59 additions and 22 deletions
--- a/codeGeneration/src/prog8/codegen/target/cbm/IsoEncoding.kt
+++ b/codeGeneration/src/prog8/codegen/target/cbm/IsoEncoding.kt
@ -4,11 +4,14 @@ import com.github.michaelbull.result.Result
 import com.github.michaelbull.result.Ok
 import com.github.michaelbull.result.Err
 import java.io.CharConversionException
 import java.nio.charset.Charset
 object IsoEncoding {
    val charset: Charset = Charset.forName("ISO-8859-15")
    fun encode(str: String): Result<List<UByte>, CharConversionException> {
        return try {
-            Ok(str.toByteArray(Charsets.ISO_8859_1).map { it.toUByte() })
+            Ok(str.toByteArray(charset).map { it.toUByte() })
        } catch (ce: CharConversionException) {
            Err(ce)
        }
@ -16,7 +19,7 @@ object IsoEncoding {
    fun decode(bytes: List<UByte>): Result<String, CharConversionException> {
        return try {
-            Ok(String(bytes.map { it.toByte() }.toByteArray(), Charsets.ISO_8859_1))
+            Ok(String(bytes.map { it.toByte() }.toByteArray(), charset))
        } catch (ce: CharConversionException) {
            Err(ce)
        }
--- a/docs/source/portingguide.rst
+++ b/docs/source/portingguide.rst
@ -36,10 +36,10 @@ RAM, ROM, I/O
 #. what part(s) of the address space is memory mapped I/O registers?
 #. is there a banking system? How does it work (how do you select Ram/Rom banks)? How is the default bank configuration set?
-Screen and Character encodings
+Character encodings
------------------------------
+-------------------
 #. provide the primary character encoding table that the system uses (i.e. how is text represented in memory)
-#. provide alternate character encoding table (if any)
+#. provide alternate character encodings (if any)
 #. what are the system's character screen dimensions?
 #. is there a screen matrix directly accessible in Ram? Provide addresses of the character matrix and color attributes matrix, if any.
--- a/docs/source/programming.rst
+++ b/docs/source/programming.rst
@ -193,8 +193,9 @@ Values will usually be part of an expression or assignment statement::
    -33.456e52            ; floating point number
    "Hi, I am a string"   ; text string, encoded with compiler target default encoding
    'a'                   ; byte value (ubyte) for the letter a
-    @"Alternate"          ; text string, encoded with alternate encoding
+    @"Alternate"          ; text string, encoded with alternate encoding (old deprecated syntax)
-    @'a'                  ; byte value of the letter a, using alternate encoding
+    sc:"Alternate"        ; text string, encoded with c64 screencode encoding (current syntax)
    sc:'a'                ; byte value of the letter a in c64 screencode encoding
    byte  counter  = 42   ; variable of size 8 bits, with initial value 42
@ -314,12 +315,28 @@ Strings
 Strings are a sequence of characters enclosed in ``"`` quotes. The length is limited to 255 characters.
 They're stored and treated much the same as a byte array,
 but they have some special properties because they are considered to be *text*.
-Strings in your source code files will be encoded (translated from ASCII/UTF-8) into bytes via the
+Strings (without encoding prefix) will be encoded (translated from ASCII/UTF-8) into bytes via the
-default encoding that is used on the target platform. For the C-64, this is CBM PETSCII.
+*default encoding* for the target platform. On the CBM machines, this is CBM PETSCII.
 Alternate-encoding strings (prefixed with ``@``) will be encoded via the alternate encoding for the
 platform (if defined). For the C-64, that is SCREEN CODES (also known as POKE codes).
 This @-prefix can also be used for character byte values.
 .. sidebar::
    Deprecated ``@`` prefix
    In older versions of the language, the ``@`` prefix was used to specify the
    CBM screencode encoding. This syntax is still supported for now, but will be removed
    in a future language version.
 Alternative encodings can be specified with an ``encodingname:`` prefix to the string or character literal.
 The following encodings are currently recognised:
    - ``petscii``  PETSCII, the default encoding on CBM machines (c64, c128, cx16)
    - ``sc``  CBM-screencodes aka 'poke' codes (c64, c128, cx16)
    - ``iso``  iso-8859-15 text (supported on cx16)
 So the following is a string literal that will be encoded into memory bytes using the iso encoding.
 It can be correctly displayed on the screen only if an iso-8859-15 charset has been activated first
 (the Commander X16 has this feature built in)::
    iso:"Käse, Straße"
 You can concatenate two string literals using '+', which can be useful to
 split long strings over separate lines. But remember that the length
--- a/docs/source/syntaxreference.rst
+++ b/docs/source/syntaxreference.rst
@ -282,8 +282,7 @@ Various examples::
    byte        counter = len([1, 2, 3]) * 20
    byte        age     = 2018 - 1974
    float       wallet  = 55.25
-    str         name    = "my name is Irmen"
+    str         name    = "my name is Alice"
    str         name    = @"my name is Irmen"           ; string with alternative byte encoding
    uword       address = &counter
    byte[]      values  = [11, 22, 33, 44, 55]
    byte[5]     values                  ; array of 5 bytes, initially set to zero
@ -432,10 +431,23 @@ memory at the given index. See :ref:`pointervars`
 String
 ^^^^^^
 .. sidebar::
    Deprecated ``@`` prefix
-``"hello"``   is a string translated into the default character encoding (PETSCII)
+    In older versions of the language, the ``@`` prefix was used to specify the
    CBM screencode encoding. This syntax is still supported for now, but will be removed
    in a future language version.
 A string literal can occur with or without an encoding prefix (encoding followed by ':' followed by the string itself).
 When the prefix is omitted, the string is stored in the machine's default character encoding (which is PETSCII on the CBM machines).
 You can choose to store the string in other encodings such as ``sc`` (screencodes) or ``iso`` (iso-8859-15).
 Here are several examples:
    - ``"hello"``   a string translated into the default character encoding (PETSCII)
    - ``petscii:"hello"``   same as the above, on CBM machines.
    - ``sc:"my name is Alice"``      string with screencode encoding (new syntax)
    - ``iso:"Ich heiße François"``   string in iso encoding
 ``@"hello"``  is a string translated into the alternate character encoding (Screencodes/pokes)
 There are several escape sequences available to put special characters into your string value:
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@ -3,7 +3,7 @@ TODO
 For next compiler release (7.7)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- document new string encoding syntax
+- make 'petscii' not hardcoded default but specified in machinedefinition
 Need help with
--- a/examples/test.p8
+++ b/examples/test.p8
@ -4,10 +4,12 @@
 main {
    str s1 = "Irmen_"
    str s2 = @"IRMEN_"
-    ;str s3 = iso:"Irmen_~"
+    str s3 = sc:"IRMEN_"
    str s4 = iso:"Käse, Straße"
    sub start() {
-        txt.lowercase()
+        txt.iso()
        ; txt.lowercase()
        txt.nl()
        txt.nl()
        txt.nl()
@ -17,12 +19,15 @@ main {
        txt.nl()
        txt.print(s2)
        txt.nl()
-;        txt.print(s3)
+        txt.print(s3)
-;        txt.nl()
+        txt.nl()
        txt.print(s4)
        txt.nl()
        sc(1, s1)
        sc(2, s2)
-        ; sc(3, s3)
+        sc(3, s3)
        sc(4, s4)
    }
    sub sc(ubyte row, str text) {