Implement support for universal character names in identifiers.

2025-03-11 07:29:29 +00:00 · 2020-01-20 17:22:06 -06:00 · 2020-01-20 17:22:06 -06:00 · 656868a095
commit 656868a095
parent b1ad79737c
6 changed files with 136 additions and 11 deletions
--- a/CCommon.pas
+++ b/CCommon.pas
@ -192,7 +192,7 @@ type
      (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc,
       ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string,
       ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon,
-       letter,digit);
+       ch_backslash,letter,digit);

   tokenSet = set of tokenEnum;
   tokenClass = (reservedWord,reservedSymbol,identifier,intConstant,longConstant,
--- a/Charset.pas
+++ b/Charset.pas
@ -43,6 +43,13 @@ function ConvertUCSToMacRoman(ch: ucsCodePoint): integer;
 { Returns ordinal value of the character, or -1 if it can't be   }
 { converted.                                                     }

+function ValidUCNForIdentifier(ch: ucsCodePoint; initial: boolean): boolean;
+
+{ Check if a code point is valid for a UCN in an identifier      }
+{                                                                }
+{ ch - the code point                                            }
+{ initial - is this UCN the initial element of the identifier?   }
+
 implementation

 function ConvertMacRomanToUCS{(ch: char): ucsCodePoint};
@ -92,4 +99,70 @@ else begin
 1:
 end; {ConvertUCSToMacRoman}

+
+function ValidUCNForIdentifier{(ch: ucsCodePoint; initial: boolean): boolean};
+
+{ Tell whether a code point may be used as a UCN in an identifier}
+{                                                                }
+{ ch - code point designated by the UCN                          }
+{ initial - true if the UCN begins the identifier                }
+{ Returns true if the code point is permitted, false if not      }
+
+begin {ValidUCNForIdentifier}
+{See C17 Annex D for these ranges}
+if initial and                         {excluded as a first element}
+   (  (($000300 <= ch) and (ch <= $00036F))
+   or (($001DC0 <= ch) and (ch <= $001DFF))
+   or (($0020D0 <= ch) and (ch <= $0020FF))
+   or (($00FE20 <= ch) and (ch <= $00FE2F))) then
+   ValidUCNForIdentifier := false
+else
+   ValidUCNForIdentifier :=
+         (ch = $0000A8)
+      or (ch = $0000AA)
+      or (ch = $0000AD)
+      or (ch = $0000AF)
+      or (($0000B2 <= ch) and (ch <= $0000B5))
+      or (($0000B7 <= ch) and (ch <= $0000BA))
+      or (($0000BC <= ch) and (ch <= $0000BE))
+      or (($0000C0 <= ch) and (ch <= $0000D6))
+      or (($0000D8 <= ch) and (ch <= $0000F6))
+      or (($0000F8 <= ch) and (ch <= $0000FF))
+      or (($000100 <= ch) and (ch <= $00167F))
+      or (($001681 <= ch) and (ch <= $00180D))
+      or (($00180F <= ch) and (ch <= $001FFF))
+      or (($00200B <= ch) and (ch <= $00200D))
+      or (($00202A <= ch) and (ch <= $00202E))
+      or (($00203F <= ch) and (ch <= $002040))
+      or (ch = $002054)
+      or (($002060 <= ch) and (ch <= $00206F))
+      or (($002070 <= ch) and (ch <= $00218F))
+      or (($002460 <= ch) and (ch <= $0024FF))
+      or (($002776 <= ch) and (ch <= $002793))
+      or (($002C00 <= ch) and (ch <= $002DFF))
+      or (($002E80 <= ch) and (ch <= $002FFF))
+      or (($003004 <= ch) and (ch <= $003007))
+      or (($003021 <= ch) and (ch <= $00302F))
+      or (($003031 <= ch) and (ch <= $00303F))
+      or (($003040 <= ch) and (ch <= $00D7FF))
+      or (($00F900 <= ch) and (ch <= $00FD3D))
+      or (($00FD40 <= ch) and (ch <= $00FDCF))
+      or (($00FDF0 <= ch) and (ch <= $00FE44))
+      or (($00FE47 <= ch) and (ch <= $00FFFD))
+      or (($010000 <= ch) and (ch <= $01FFFD))
+      or (($020000 <= ch) and (ch <= $02FFFD))
+      or (($030000 <= ch) and (ch <= $03FFFD))
+      or (($040000 <= ch) and (ch <= $04FFFD))
+      or (($050000 <= ch) and (ch <= $05FFFD))
+      or (($060000 <= ch) and (ch <= $06FFFD))
+      or (($070000 <= ch) and (ch <= $07FFFD))
+      or (($080000 <= ch) and (ch <= $08FFFD))
+      or (($090000 <= ch) and (ch <= $09FFFD))
+      or (($0A0000 <= ch) and (ch <= $0AFFFD))
+      or (($0B0000 <= ch) and (ch <= $0BFFFD))
+      or (($0C0000 <= ch) and (ch <= $0CFFFD))
+      or (($0D0000 <= ch) and (ch <= $0DFFFD))
+      or (($0E0000 <= ch) and (ch <= $0EFFFD));
+end; {ValidUCNForIdentifier}
+
 end.
--- a/Scanner.asm
+++ b/Scanner.asm
@ -116,7 +116,7 @@ cch      equ   13
         enum  (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
         enum  (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
         enum  (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
-         enum  (letter,digit)
+         enum  (ch_backslash,letter,digit)

         tsc                            create stack frame
         sec
--- a/Scanner.pas
+++ b/Scanner.pas
@ -264,6 +264,7 @@ var
   tokenExpandEnabled: boolean;         {can token be macro expanded? (only for ident)}
   versionStrL: longStringPtr;          {macro version string}
   workString: pstring;                 {for building strings and identifiers}
+   ucnString: string[10];               {string of a UCN}

 {-- External procedures; see expresssion evaluator for notes ---}

@ -670,6 +671,7 @@ if list or (numErr <> 0) then begin
        146: msg := @'Unicode character cannot be represented in execution character set';
        147: msg := @'lint: not all parameters were declared with a type';
        148: msg := @'all parameters must have a complete type';
+        149: msg := @'invalid universal character name for use in an identifier';
         otherwise: Error(57);
         end; {case}
       writeln(msg^);
@ -3319,18 +3321,28 @@ function UniversalCharacterName : ucsCodePoint;
 {  The current character should be the 'u' or 'U'.            }
 {                                                             }
 {  Returns the code point value of the UCN.                   }
+{                                                             }
+{  Globals:                                                   }
+{     ucnString - string representation of this UCN           }

 var
   digits: integer;                     {number of hex digits (4 or 8)}
   codePoint: longint;                  {the code point specified by this UCN}
   dig: 0..15;                          {value of a hex digit}
+   i: integer;                          {index for recording UCN string}

 begin {UniversalCharacterName}
+i := 1;
+ucnString[i] := '\';
+i := i + 1;
+
 codePoint := 0;
 if ch = 'u' then
   digits := 4
 else {if ch = 'U' then}
   digits := 8;
+ucnString[i] := ch;
+i := i + 1;
 NextCh;

 while digits > 0 do begin
@ -3342,25 +3354,39 @@ while digits > 0 do begin
         dig := ord(ch)-ord('A')+10;
         end; {else}
      codePoint := (codePoint << 4) | dig;
+      ucnString[i] := ch;
+      i := i + 1;
      NextCh;
      digits := digits - 1;
      end {while}
   else begin
      Error(145);
-      codePoint := ord('$');
+      codePoint := $0000C0;
      digits := 0;
      end; {else}
   end; {while}

+ucnString[0] := chr(i - 1);
+
 if (codePoint < 0) or (codePoint > maxUCSCodePoint)
   or ((codePoint >= $00D800) and (codePoint <= $00DFFF))
   or ((codePoint < $A0) and not (ord(codePoint) in [$24,$40,$60]))
   then begin
   Error(145);
-   UniversalCharacterName := ord('$');
+   UniversalCharacterName := $0000C0;
   end {if}
 else
   UniversalCharacterName := codePoint;
+
+{Normalize UCN string to shorter form for codepoints that fit in 16 bits}
+if (ord(ucnString[0]) = 10) and (codePoint <= $00FFFF) then begin
+   ucnString[2] := 'u';
+   ucnString[3] := ucnString[7];
+   ucnString[4] := ucnString[8];
+   ucnString[5] := ucnString[9];
+   ucnString[6] := ucnString[10];
+   ucnString[0] := chr(6);
+   end; {if}
 end; {UniversalCharacterName}


@ -3816,7 +3842,7 @@ type
 var
   done: boolean;                       {loop termination}
   expandEnabled: boolean;              {can a token be expanded?}
-   i: 0..maxint;                        {loop/index counter}
+   i,j: 0..maxint;                      {loop/index counter}
   inhibit: boolean;                    {inhibit macro expansion?}
   lExpandMacros: boolean;              {local copy of expandMacros}
   lPrintMacroExpansions: boolean;      {local copy of printMacroExpansions}
@ -3826,6 +3852,8 @@ var
   tToken: tokenType;                   {for merging tokens}
   sPtr,tsPtr: gstringPtr;              {for forming string constants}
   lLastWasReturn: boolean;             {local copy of lastWasReturn}
+   codePoint: ucsCodePoint;             {Unicode code point from UCN}
+   chFromUCN: integer;                  {character given by UCN (converted)}


   function EscapeCh: integer;
@ -4361,16 +4389,38 @@ case charKinds[ord(ch)] of
      token.sval^.str[i+1] := chr(0);   {add null in case the string is extended}
      end;

-   letter: begin                         {reserved words and identifiers}
+   letter,ch_backslash: begin           {reserved words and identifiers}
      token.kind := ident;
      token.class := identifier;
      token.name := @workString;
      tokenExpandEnabled := true;
      i := 0;
-      while charKinds[ord(ch)] in [letter,digit] do begin
+      while charKinds[ord(ch)] in [letter,digit,ch_backslash] do begin
          i := i+1;
-         workString[i] := ch;
-         NextCh;
+         if ch = '\' then begin         {possible start of a UCN}
+            NextCh;
+            if ch in ['u','U'] then begin
+               codePoint := UniversalCharacterName;
+               if not ValidUCNForIdentifier(codePoint, i=1) then {i=1: first element}
+                  Error(149);
+               chFromUCN := ConvertUCSToMacRoman(codePoint);
+               if chFromUCN >= 0 then   {negative means it could not be converted}
+                  workString[i] := chr(chFromUCN)
+               else begin               {store the UCN spelling itself}
+                  for j := 1 to ord(ucnString[0]) do
+                     workString[i+j-1] := ucnString[j];
+                  i := i + ord(ucnString[0]) - 1; {i = index of last char stored}
+                  end; {else}
+               end {if}
+            else begin                  {backslash not followed by u or U}
+               Error(1);
+               workString[i] := '?';
+               end; {else}
+            end {if}
+         else begin                     {ordinary letter or digit}
+            workString[i] := ch;
+            NextCh;
+            end; {else}
          end; {while}
      workString[0] := chr(i);
      CheckIdentifier;
--- a/Table.asm
+++ b/Table.asm
@ -19,7 +19,7 @@ charKinds start                         character set
         enum  (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
         enum  (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
         enum  (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
-         enum  (letter,digit)
+         enum  (ch_backslash,letter,digit)

 ! STANDARD
         dc    i'ch_eof'                nul
@ -114,7 +114,7 @@ charKinds start                         character set
         dc    i'letter'                Y
         dc    i'letter'                Z
         dc    i'ch_special'            [
-         dc    i'illegal'               \
+         dc    i'ch_backslash'          \
         dc    i'ch_special'            ]
         dc    i'ch_carot'              ^
         dc    i'letter'                _
--- a/cc.notes
+++ b/cc.notes
@ -348,6 +348,8 @@ and may appear wherever a declaration can (including inside and outside function

 These behave the same as the existing tokens [, ], {, }, #, and ## (respectively), apart from their spelling.

+15. (C99) Universal character names are now supported in string literals, character constants, and identifiers.  These are sequences of the form \unnnn or \Unnnnnnnn, where the nnnn or nnnnnnnn is a hexadecimal representation of a Unicode code point.  These may be used to represent characters in a way that is independent of the source and execution character sets.  In a string literal or character constant, only characters that can be mapped to the execution character set may be represented.  There are also certain other restrictions on what characters can be used; see the C standards for details.  For ORCA/C the source and execution character sets are both considered to be Mac OS Roman, the character set used in the IIGS desktop environment.
+

 Multi-Character Character Constants
 -----------------------------------