From 656868a09512dc56f14b6127beabe908089b896c Mon Sep 17 00:00:00 2001
From: Stephen Heumann <stephenheumann@gmail.com>
Date: Mon, 20 Jan 2020 17:22:06 -0600
Subject: [PATCH] Implement support for universal character names in
 identifiers.

---
 CCommon.pas |  2 +-
 Charset.pas | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 Scanner.asm |  2 +-
 Scanner.pas | 64 +++++++++++++++++++++++++++++++++++++++++-----
 Table.asm   |  4 +--
 cc.notes    |  2 ++
 6 files changed, 136 insertions(+), 11 deletions(-)

diff --git a/CCommon.pas b/CCommon.pas
index 66bdecc..cb9a243 100644
--- a/CCommon.pas
+++ b/CCommon.pas
@@ -192,7 +192,7 @@ type
       (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc,
        ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string,
        ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon,
-       letter,digit);
+       ch_backslash,letter,digit);
 
    tokenSet = set of tokenEnum;
    tokenClass = (reservedWord,reservedSymbol,identifier,intConstant,longConstant,
diff --git a/Charset.pas b/Charset.pas
index 272d3ba..c976f81 100644
--- a/Charset.pas
+++ b/Charset.pas
@@ -43,6 +43,13 @@ function ConvertUCSToMacRoman(ch: ucsCodePoint): integer;
 { Returns ordinal value of the character, or -1 if it can't be   }
 { converted.                                                     }
 
+function ValidUCNForIdentifier(ch: ucsCodePoint; initial: boolean): boolean;
+
+{ Check if a code point is valid for a UCN in an identifier      }
+{                                                                }
+{ ch - the code point                                            }
+{ initial - is this UCN the initial element of the identifier?   }
+
 implementation
 
 function ConvertMacRomanToUCS{(ch: char): ucsCodePoint};
@@ -92,4 +99,70 @@ else begin
 1:
 end; {ConvertUCSToMacRoman}
 
+
+function ValidUCNForIdentifier{(ch: ucsCodePoint; initial: boolean): boolean};
+
+{ Check if a code point is valid for a UCN in an identifier      }
+{                                                                }
+{ ch - the code point                                            }
+{ initial - is this UCN the initial element of the identifier?   }
+
+begin {ValidUCNForIdentifier}
+{Allowed ranges follow C17 Annex D.1; any other code point is rejected}
+ValidUCNForIdentifier := false;         {assume invalid until a range matches}
+if    (ch = $0000A8)
+   or (ch = $0000AA)
+   or (ch = $0000AD)
+   or (ch = $0000AF)
+   or ((ch >= $0000B2) and (ch <= $0000B5))
+   or ((ch >= $0000B7) and (ch <= $0000BA))
+   or ((ch >= $0000BC) and (ch <= $0000BE))
+   or ((ch >= $0000C0) and (ch <= $0000D6))
+   or ((ch >= $0000D8) and (ch <= $0000F6))
+   or ((ch >= $0000F8) and (ch <= $0000FF))
+   or ((ch >= $000100) and (ch <= $00167F))
+   or ((ch >= $001681) and (ch <= $00180D))
+   or ((ch >= $00180F) and (ch <= $001FFF))
+   or ((ch >= $00200B) and (ch <= $00200D))
+   or ((ch >= $00202A) and (ch <= $00202E))
+   or ((ch >= $00203F) and (ch <= $002040))
+   or (ch = $002054)
+   or ((ch >= $002060) and (ch <= $00206F))
+   or ((ch >= $002070) and (ch <= $00218F))
+   or ((ch >= $002460) and (ch <= $0024FF))
+   or ((ch >= $002776) and (ch <= $002793))
+   or ((ch >= $002C00) and (ch <= $002DFF))
+   or ((ch >= $002E80) and (ch <= $002FFF))
+   or ((ch >= $003004) and (ch <= $003007))
+   or ((ch >= $003021) and (ch <= $00302F))
+   or ((ch >= $003031) and (ch <= $00303F))
+   or ((ch >= $003040) and (ch <= $00D7FF))
+   or ((ch >= $00F900) and (ch <= $00FD3D))  {$D800..$F8FF is never accepted}
+   or ((ch >= $00FD40) and (ch <= $00FDCF))
+   or ((ch >= $00FDF0) and (ch <= $00FE44))
+   or ((ch >= $00FE47) and (ch <= $00FFFD))
+   or ((ch >= $010000) and (ch <= $01FFFD))  {planes 1..14: the last two code}
+   or ((ch >= $020000) and (ch <= $02FFFD))  {points of each plane are excluded}
+   or ((ch >= $030000) and (ch <= $03FFFD))
+   or ((ch >= $040000) and (ch <= $04FFFD))
+   or ((ch >= $050000) and (ch <= $05FFFD))
+   or ((ch >= $060000) and (ch <= $06FFFD))
+   or ((ch >= $070000) and (ch <= $07FFFD))
+   or ((ch >= $080000) and (ch <= $08FFFD))
+   or ((ch >= $090000) and (ch <= $09FFFD))
+   or ((ch >= $0A0000) and (ch <= $0AFFFD))
+   or ((ch >= $0B0000) and (ch <= $0BFFFD))
+   or ((ch >= $0C0000) and (ch <= $0CFFFD))
+   or ((ch >= $0D0000) and (ch <= $0DFFFD))
+   or ((ch >= $0E0000) and (ch <= $0EFFFD))
+   then ValidUCNForIdentifier := true;
+
+if initial then                         {C17 Annex D.2: not allowed as first element}
+   if    ((ch >= $000300) and (ch <= $00036F))
+      or ((ch >= $001DC0) and (ch <= $001DFF))
+      or ((ch >= $0020D0) and (ch <= $0020FF))
+      or ((ch >= $00FE20) and (ch <= $00FE2F))
+      then ValidUCNForIdentifier := false;   {overrides the result set above}
+end; {ValidUCNForIdentifier}
+
 end.
diff --git a/Scanner.asm b/Scanner.asm
index 4b0d92c..757249e 100644
--- a/Scanner.asm
+++ b/Scanner.asm
@@ -116,7 +116,7 @@ cch      equ   13
          enum  (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
          enum  (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
          enum  (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
-         enum  (letter,digit)
+         enum  (ch_backslash,letter,digit)
 
          tsc                            create stack frame
          sec
diff --git a/Scanner.pas b/Scanner.pas
index 11e236c..be30a93 100644
--- a/Scanner.pas
+++ b/Scanner.pas
@@ -264,6 +264,7 @@ var
    tokenExpandEnabled: boolean;         {can token be macro expanded? (only for ident)}
    versionStrL: longStringPtr;          {macro version string}
    workString: pstring;                 {for building strings and identifiers}
+   ucnString: string[10];               {string of a UCN}
 
 {-- External procedures; see expresssion evaluator for notes ---}
 
@@ -670,6 +671,7 @@ if list or (numErr <> 0) then begin
         146: msg := @'Unicode character cannot be represented in execution character set';
         147: msg := @'lint: not all parameters were declared with a type';
         148: msg := @'all parameters must have a complete type';
+        149: msg := @'invalid universal character name for use in an identifier';
          otherwise: Error(57);
          end; {case}
        writeln(msg^);
@@ -3319,18 +3321,28 @@ function UniversalCharacterName : ucsCodePoint;
 {  The current character should be the 'u' or 'U'.            }
 {                                                             }
 {  Returns the code point value of the UCN.                   }
+{                                                             }
+{  Globals:                                                   }
+{     ucnString - string representation of this UCN           }
 
 var
    digits: integer;                     {number of hex digits (4 or 8)}
    codePoint: longint;                  {the code point specified by this UCN}
    dig: 0..15;                          {value of a hex digit}
+   i: integer;                          {index for recording UCN string}
 
 begin {UniversalCharacterName}
+i := 1;
+ucnString[i] := '\';
+i := i + 1;
+
 codePoint := 0;
 if ch = 'u' then
    digits := 4
 else {if ch = 'U' then}
    digits := 8;
+ucnString[i] := ch;
+i := i + 1;
 NextCh;
 
 while digits > 0 do begin
@@ -3342,25 +3354,39 @@ while digits > 0 do begin
          dig := ord(ch)-ord('A')+10;
          end; {else}
       codePoint := (codePoint << 4) | dig;
+      ucnString[i] := ch;
+      i := i + 1;
       NextCh;
       digits := digits - 1;
       end {while}
    else begin
       Error(145);
-      codePoint := ord('$');
+      codePoint := $0000C0;
       digits := 0;
       end; {else}
    end; {while}
 
+ucnString[0] := chr(i - 1);
+
 if (codePoint < 0) or (codePoint > maxUCSCodePoint)
    or ((codePoint >= $00D800) and (codePoint <= $00DFFF))
    or ((codePoint < $A0) and not (ord(codePoint) in [$24,$40,$60]))
    then begin
    Error(145);
-   UniversalCharacterName := ord('$');
+   UniversalCharacterName := $0000C0;
    end {if}
 else
    UniversalCharacterName := codePoint;
+
+{Normalize UCN string to shorter form for codepoints that fit in 16 bits}
+if (ord(ucnString[0]) = 10) and (codePoint <= $00FFFF) then begin
+   ucnString[2] := 'u';
+   ucnString[3] := ucnString[7];
+   ucnString[4] := ucnString[8];
+   ucnString[5] := ucnString[9];
+   ucnString[6] := ucnString[10];
+   ucnString[0] := chr(6);
+   end; {if}
 end; {UniversalCharacterName}
 
 
@@ -3816,7 +3842,7 @@ type
 var
    done: boolean;                       {loop termination}
    expandEnabled: boolean;              {can a token be expanded?}
-   i: 0..maxint;                        {loop/index counter}
+   i,j: 0..maxint;                      {loop/index counter}
    inhibit: boolean;                    {inhibit macro expansion?}
    lExpandMacros: boolean;              {local copy of expandMacros}
    lPrintMacroExpansions: boolean;      {local copy of printMacroExpansions}
@@ -3826,6 +3852,8 @@ var
    tToken: tokenType;                   {for merging tokens}
    sPtr,tsPtr: gstringPtr;              {for forming string constants}
    lLastWasReturn: boolean;             {local copy of lastWasReturn}
+   codePoint: ucsCodePoint;             {Unicode code point from UCN}
+   chFromUCN: integer;                  {character given by UCN (converted)}
 
 
    function EscapeCh: integer;
@@ -4361,16 +4389,38 @@ case charKinds[ord(ch)] of
       token.sval^.str[i+1] := chr(0);   {add null in case the string is extended}
       end;
 
-   letter: begin                         {reserved words and identifiers}
+   letter,ch_backslash: begin           {reserved words and identifiers}
       token.kind := ident;
       token.class := identifier;
       token.name := @workString;
       tokenExpandEnabled := true;
       i := 0;
-      while charKinds[ord(ch)] in [letter,digit] do begin
+      while charKinds[ord(ch)] in [letter,digit,ch_backslash] do begin
          i := i+1;
-         workString[i] := ch;
-         NextCh;
+         if ch = '\' then begin
+            NextCh;
+            if ch in ['u','U'] then begin
+               codePoint := UniversalCharacterName;
+               if not ValidUCNForIdentifier(codePoint, i=1) then
+                  Error(149);
+               chFromUCN := ConvertUCSToMacRoman(codePoint);
+               if chFromUCN >= 0 then
+                  workString[i] := chr(chFromUCN)
+               else begin
+                  for j := 1 to ord(ucnString[0]) do
+                     workString[i+j-1] := ucnString[j];
+                  i := i + ord(ucnString[0]) - 1;
+                  end; {else}
+               end {if}
+            else begin
+               Error(1);
+               workString[i] := '?';
+               end; {else}
+            end {if}
+         else begin
+            workString[i] := ch;
+            NextCh;
+            end; {if}
          end; {while}
       workString[0] := chr(i);
       CheckIdentifier;
diff --git a/Table.asm b/Table.asm
index 2e0e025..635f9e5 100644
--- a/Table.asm
+++ b/Table.asm
@@ -19,7 +19,7 @@ charKinds start                         character set
          enum  (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
          enum  (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
          enum  (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
-         enum  (letter,digit)
+         enum  (ch_backslash,letter,digit)
 
 ! STANDARD
          dc    i'ch_eof'                nul
@@ -114,7 +114,7 @@ charKinds start                         character set
          dc    i'letter'                Y
          dc    i'letter'                Z
          dc    i'ch_special'            [
-         dc    i'illegal'               \
+         dc    i'ch_backslash'          \
          dc    i'ch_special'            ]
          dc    i'ch_carot'              ^
          dc    i'letter'                _
diff --git a/cc.notes b/cc.notes
index 462acb1..c7e37b6 100644
--- a/cc.notes
+++ b/cc.notes
@@ -348,6 +348,8 @@ and may appear wherever a declaration can (including inside and outside function
 
 These behave the same as the existing tokens [, ], {, }, #, and ## (respectively), apart from their spelling.
 
+15. (C99) Universal character names are now supported in string literals, character constants, and identifiers.  These are sequences of the form \unnnn or \Unnnnnnnn, where the nnnn or nnnnnnnn is a hexadecimal representation of a Unicode code point.  These may be used to represent characters in a way that is independent of the source and execution character sets.  In a string literal or character constant, only characters that can be mapped to the execution character set may be represented.  There are also certain other restrictions on what characters can be used; see the C standards for details.  For ORCA/C the source and execution character sets are both considered to be Mac OS Roman, the character set used in the IIGS desktop environment.
+
 
 Multi-Character Character Constants
 -----------------------------------