document new string encoding syntax

2025-08-05 06:28:20 +00:00 · 2022-01-19 20:45:24 +01:00
parent 674295e800
commit 9ed7587e3e
6 changed files with 59 additions and 22 deletions
--- a/codeGeneration/src/prog8/codegen/target/cbm/IsoEncoding.kt
+++ b/codeGeneration/src/prog8/codegen/target/cbm/IsoEncoding.kt
@@ -4,11 +4,14 @@ import com.github.michaelbull.result.Result
 import com.github.michaelbull.result.Ok
 import com.github.michaelbull.result.Err
 import java.io.CharConversionException
+import java.nio.charset.Charset

 object IsoEncoding {
+    val charset: Charset = Charset.forName("ISO-8859-15")
+
    fun encode(str: String): Result<List<UByte>, CharConversionException> {
        return try {
-            Ok(str.toByteArray(Charsets.ISO_8859_1).map { it.toUByte() })
+            Ok(str.toByteArray(charset).map { it.toUByte() })
        } catch (ce: CharConversionException) {
            Err(ce)
        }
@@ -16,7 +19,7 @@ object IsoEncoding {

    fun decode(bytes: List<UByte>): Result<String, CharConversionException> {
        return try {
-            Ok(String(bytes.map { it.toByte() }.toByteArray(), Charsets.ISO_8859_1))
+            Ok(String(bytes.map { it.toByte() }.toByteArray(), charset))
        } catch (ce: CharConversionException) {
            Err(ce)
        }
--- a/docs/source/portingguide.rst
+++ b/docs/source/portingguide.rst
@@ -36,10 +36,10 @@ RAM, ROM, I/O
 #. what part(s) of the address space is memory mapped I/O registers?
 #. is there a banking system? How does it work (how do you select Ram/Rom banks)? How is the default bank configuration set?

-Screen and Character encodings
------------------------------
+Character encodings
+-------------------
 #. provide the primary character encoding table that the system uses (i.e. how is text represented in memory)
-#. provide alternate character encoding table (if any)
+#. provide alternate character encodings (if any)
 #. what are the system's character screen dimensions?
 #. is there a screen matrix directly accessible in Ram? Provide addresses of the character matrix and color attributes matrix, if any.

--- a/docs/source/programming.rst
+++ b/docs/source/programming.rst
@@ -193,8 +193,9 @@ Values will usually be part of an expression or assignment statement::
    -33.456e52            ; floating point number
    "Hi, I am a string"   ; text string, encoded with compiler target default encoding
    'a'                   ; byte value (ubyte) for the letter a
-    @"Alternate"          ; text string, encoded with alternate encoding
-    @'a'                  ; byte value of the letter a, using alternate encoding
+    @"Alternate"          ; text string, encoded with alternate encoding (old deprecated syntax)
+    sc:"Alternate"        ; text string, encoded with c64 screencode encoding (current syntax)
+    sc:'a'                ; byte value of the letter a in c64 screencode encoding

    byte  counter  = 42   ; variable of size 8 bits, with initial value 42

@@ -314,12 +315,28 @@ Strings
 Strings are a sequence of characters enclosed in ``"`` quotes. The length is limited to 255 characters.
 They're stored and treated much the same as a byte array,
 but they have some special properties because they are considered to be *text*.
-Strings in your source code files will be encoded (translated from ASCII/UTF-8) into bytes via the
-default encoding that is used on the target platform. For the C-64, this is CBM PETSCII.
-Alternate-encoding strings (prefixed with ``@``) will be encoded via the alternate encoding for the
-platform (if defined). For the C-64, that is SCREEN CODES (also known as POKE codes).
-This @-prefix can also be used for character byte values.
+Strings (without encoding prefix) will be encoded (translated from ASCII/UTF-8) into bytes via the
+*default encoding* for the target platform. On the CBM machines, this is CBM PETSCII.

+.. sidebar::
+    Deprecated ``@`` prefix
+
+    In older versions of the language, the ``@`` prefix was used to specify the
+    CBM screencode encoding. This syntax is still supported for now, but will be removed
+    in a future language version.
+
+Alternative encodings can be specified with an ``encodingname:`` prefix to the string or character literal.
+The following encodings are currently recognised:
+
+    - ``petscii``  PETSCII, the default encoding on CBM machines (c64, c128, cx16)
+    - ``sc``  CBM-screencodes aka 'poke' codes (c64, c128, cx16)
+    - ``iso``  iso-8859-15 text (supported on cx16)
+
+So the following is a string literal that will be encoded into memory bytes using the iso encoding.
+It can be correctly displayed on the screen only if an iso-8859-15 charset has been activated first
+(the Commander X16 has this feature built in)::
+
+    iso:"Käse, Straße"

 You can concatenate two string literals using '+', which can be useful to
 split long strings over separate lines. But remember that the length
--- a/docs/source/syntaxreference.rst
+++ b/docs/source/syntaxreference.rst
@@ -282,8 +282,7 @@ Various examples::
    byte        counter = len([1, 2, 3]) * 20
    byte        age     = 2018 - 1974
    float       wallet  = 55.25
-    str         name    = "my name is Irmen"
-    str         name    = @"my name is Irmen"           ; string with alternative byte encoding
+    str         name    = "my name is Alice"
    uword       address = &counter
    byte[]      values  = [11, 22, 33, 44, 55]
    byte[5]     values                  ; array of 5 bytes, initially set to zero
@@ -432,10 +431,23 @@ memory at the given index. See :ref:`pointervars`

 String
 ^^^^^^
+.. sidebar::
+    Deprecated ``@`` prefix

-``"hello"``   is a string translated into the default character encoding (PETSCII)
+    In older versions of the language, the ``@`` prefix was used to specify the
+    CBM screencode encoding. This syntax is still supported for now, but will be removed
+    in a future language version.
+
+A string literal can occur with or without an encoding prefix (encoding followed by ':' followed by the string itself).
+When this is omitted, the string is stored in the machine's default character encoding (which is PETSCII on the CBM machines).
+You can choose to store the string in other encodings such as ``sc`` (screencodes) or ``iso`` (iso-8859-15).
+Here are several examples:
+
+    - ``"hello"``   a string translated into the default character encoding (PETSCII)
+    - ``petscii:"hello"``   same as the above, on CBM machines.
+    - ``sc:"my name is Alice"``      string with screencode encoding (new syntax)
+    - ``iso:"Ich heiße François"``   string in iso encoding

-``@"hello"``  is a string translated into the alternate character encoding (Screencodes/pokes)

 There are several escape sequences available to put special characters into your string value:

--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -3,7 +3,7 @@ TODO

 For next compiler release (7.7)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- document new string encoding syntax
+- make 'petscii' not hardcoded default but specified in machinedefinition


 Need help with
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -4,10 +4,12 @@
 main {
    str s1 = "Irmen_"
    str s2 = @"IRMEN_"
-    ;str s3 = iso:"Irmen_~"
+    str s3 = sc:"IRMEN_"
+    str s4 = iso:"Käse, Straße"

    sub start() {
-        txt.lowercase()
+        txt.iso()
+        ; txt.lowercase()
        txt.nl()
        txt.nl()
        txt.nl()
@@ -17,12 +19,15 @@ main {
        txt.nl()
        txt.print(s2)
        txt.nl()
-;        txt.print(s3)
-;        txt.nl()
+        txt.print(s3)
+        txt.nl()
+        txt.print(s4)
+        txt.nl()

        sc(1, s1)
        sc(2, s2)
-        ; sc(3, s3)
+        sc(3, s3)
+        sc(4, s4)
    }

    sub sc(ubyte row, str text) {