Implement support for universal character names in identifiers.

This commit is contained in:
Stephen Heumann 2020-01-20 17:22:06 -06:00
parent b1ad79737c
commit 656868a095
6 changed files with 136 additions and 11 deletions

View File

@ -192,7 +192,7 @@ type
(illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc,
ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string,
ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon,
letter,digit);
ch_backslash,letter,digit);
tokenSet = set of tokenEnum;
tokenClass = (reservedWord,reservedSymbol,identifier,intConstant,longConstant,

View File

@ -43,6 +43,13 @@ function ConvertUCSToMacRoman(ch: ucsCodePoint): integer;
{ Returns ordinal value of the character, or -1 if it can't be }
{ converted. }
function ValidUCNForIdentifier(ch: ucsCodePoint; initial: boolean): boolean;
{ Check if a code point is valid for a UCN in an identifier }
{ }
{ ch - the code point }
{ initial - is this UCN the initial element of the identifier? }
{ }
{ Returns true if the code point may appear at that position }
{ of an identifier (ranges follow C17 Annex D), else false. }
implementation
function ConvertMacRomanToUCS{(ch: char): ucsCodePoint};
@ -92,4 +99,70 @@ else begin
1:
end; {ConvertUCSToMacRoman}
function ValidUCNForIdentifier{(ch: ucsCodePoint; initial: boolean): boolean};

{ Decide whether a code point named by a UCN may appear in an   }
{ identifier                                                    }
{                                                               }
{ Parameters:                                                   }
{    ch - the code point                                        }
{    initial - true if the UCN is the first element of the      }
{       identifier                                              }
{                                                               }
{ Returns: true if the code point is permitted at that          }
{    position, false otherwise                                  }

var
   allowed: boolean;                    {ch is in a permitted range}
   combining: boolean;                  {ch is barred from the initial position}
   planeBase: longint;                  {first code point of a supplementary plane}

   function Between (lo, hi: longint): boolean;

   { Test whether ch lies in the inclusive range lo..hi          }

   begin {Between}
   Between := (ch >= lo) and (ch <= hi);
   end; {Between}

begin {ValidUCNForIdentifier}
{See C17 Annex D}

{isolated code points}
allowed := (ch = $0000A8) or (ch = $0000AA) or (ch = $0000AD)
   or (ch = $0000AF) or (ch = $002054);

{Latin-1 supplement}
if not allowed then
   allowed := Between($0000B2,$0000B5) or Between($0000B7,$0000BA)
      or Between($0000BC,$0000BE) or Between($0000C0,$0000D6)
      or Between($0000D8,$0000F6) or Between($0000F8,$0000FF);

{U+0100 through U+1FFF, excluding U+1680 and U+180E}
if not allowed then
   allowed := Between($000100,$00167F) or Between($001681,$00180D)
      or Between($00180F,$001FFF);

{general punctuation through CJK radicals}
if not allowed then
   allowed := Between($00200B,$00200D) or Between($00202A,$00202E)
      or Between($00203F,$002040) or Between($002060,$00206F)
      or Between($002070,$00218F) or Between($002460,$0024FF)
      or Between($002776,$002793) or Between($002C00,$002DFF)
      or Between($002E80,$002FFF);

{CJK symbols through the end of the pre-surrogate area}
if not allowed then
   allowed := Between($003004,$003007) or Between($003021,$00302F)
      or Between($003031,$00303F) or Between($003040,$00D7FF);

{compatibility and presentation forms}
if not allowed then
   allowed := Between($00F900,$00FD3D) or Between($00FD40,$00FDCF)
      or Between($00FDF0,$00FE44) or Between($00FE47,$00FFFD);

{planes 1 through 14: everything except the last two code points}
planeBase := $010000;
while planeBase <= $0E0000 do begin
   if Between(planeBase, planeBase + $00FFFD) then
      allowed := true;
   planeBase := planeBase + $010000;
   end; {while}

{these ranges are not allowed as the first element of an identifier}
combining := Between($000300,$00036F) or Between($001DC0,$001DFF)
   or Between($0020D0,$0020FF) or Between($00FE20,$00FE2F);

ValidUCNForIdentifier := allowed and not (initial and combining);
end; {ValidUCNForIdentifier}
end.

View File

@ -116,7 +116,7 @@ cch equ 13
enum (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
enum (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
enum (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
enum (letter,digit)
enum (ch_backslash,letter,digit)
tsc create stack frame
sec

View File

@ -264,6 +264,7 @@ var
tokenExpandEnabled: boolean; {can token be macro expanded? (only for ident)}
versionStrL: longStringPtr; {macro version string}
workString: pstring; {for building strings and identifiers}
ucnString: string[10]; {string of a UCN}
{-- External procedures; see expression evaluator for notes ---}
@ -670,6 +671,7 @@ if list or (numErr <> 0) then begin
146: msg := @'Unicode character cannot be represented in execution character set';
147: msg := @'lint: not all parameters were declared with a type';
148: msg := @'all parameters must have a complete type';
149: msg := @'invalid universal character name for use in an identifier';
otherwise: Error(57);
end; {case}
writeln(msg^);
@ -3319,18 +3321,28 @@ function UniversalCharacterName : ucsCodePoint;
{ The current character should be the 'u' or 'U'. }
{ }
{ Returns the code point value of the UCN. }
{ }
{ Globals: }
{ ucnString - string representation of this UCN }
var
digits: integer; {number of hex digits (4 or 8)}
codePoint: longint; {the code point specified by this UCN}
dig: 0..15; {value of a hex digit}
i: integer; {index for recording UCN string}
begin {UniversalCharacterName}
i := 1;
ucnString[i] := '\';
i := i + 1;
codePoint := 0;
if ch = 'u' then
digits := 4
else {if ch = 'U' then}
digits := 8;
ucnString[i] := ch;
i := i + 1;
NextCh;
while digits > 0 do begin
@ -3342,25 +3354,39 @@ while digits > 0 do begin
dig := ord(ch)-ord('A')+10;
end; {else}
codePoint := (codePoint << 4) | dig;
ucnString[i] := ch;
i := i + 1;
NextCh;
digits := digits - 1;
end {while}
else begin
Error(145);
codePoint := ord('$');
codePoint := $0000C0;
digits := 0;
end; {else}
end; {while}
ucnString[0] := chr(i - 1);
if (codePoint < 0) or (codePoint > maxUCSCodePoint)
or ((codePoint >= $00D800) and (codePoint <= $00DFFF))
or ((codePoint < $A0) and not (ord(codePoint) in [$24,$40,$60]))
then begin
Error(145);
UniversalCharacterName := ord('$');
UniversalCharacterName := $0000C0;
end {if}
else
UniversalCharacterName := codePoint;
{Normalize UCN string to shorter form for codepoints that fit in 16 bits}
if (ord(ucnString[0]) = 10) and (codePoint <= $00FFFF) then begin
ucnString[2] := 'u';
ucnString[3] := ucnString[7];
ucnString[4] := ucnString[8];
ucnString[5] := ucnString[9];
ucnString[6] := ucnString[10];
ucnString[0] := chr(6);
end; {if}
end; {UniversalCharacterName}
@ -3816,7 +3842,7 @@ type
var
done: boolean; {loop termination}
expandEnabled: boolean; {can a token be expanded?}
i: 0..maxint; {loop/index counter}
i,j: 0..maxint; {loop/index counter}
inhibit: boolean; {inhibit macro expansion?}
lExpandMacros: boolean; {local copy of expandMacros}
lPrintMacroExpansions: boolean; {local copy of printMacroExpansions}
@ -3826,6 +3852,8 @@ var
tToken: tokenType; {for merging tokens}
sPtr,tsPtr: gstringPtr; {for forming string constants}
lLastWasReturn: boolean; {local copy of lastWasReturn}
codePoint: ucsCodePoint; {Unicode code point from UCN}
chFromUCN: integer; {character given by UCN (converted)}
function EscapeCh: integer;
@ -4361,16 +4389,38 @@ case charKinds[ord(ch)] of
token.sval^.str[i+1] := chr(0); {add null in case the string is extended}
end;
letter: begin {reserved words and identifiers}
letter,ch_backslash: begin {reserved words and identifiers}
token.kind := ident;
token.class := identifier;
token.name := @workString;
tokenExpandEnabled := true;
i := 0;
while charKinds[ord(ch)] in [letter,digit] do begin
while charKinds[ord(ch)] in [letter,digit,ch_backslash] do begin
i := i+1;
workString[i] := ch;
NextCh;
if ch = '\' then begin
NextCh;
if ch in ['u','U'] then begin
codePoint := UniversalCharacterName;
if not ValidUCNForIdentifier(codePoint, i=1) then
Error(149);
chFromUCN := ConvertUCSToMacRoman(codePoint);
if chFromUCN >= 0 then
workString[i] := chr(chFromUCN)
else begin
for j := 1 to ord(ucnString[0]) do
workString[i+j-1] := ucnString[j];
i := i + ord(ucnString[0]) - 1;
end; {else}
end {if}
else begin
Error(1);
workString[i] := '?';
end; {else}
end {if}
else begin
workString[i] := ch;
NextCh;
end; {if}
end; {while}
workString[0] := chr(i);
CheckIdentifier;

View File

@ -19,7 +19,7 @@ charKinds start character set
enum (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
enum (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
enum (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
enum (letter,digit)
enum (ch_backslash,letter,digit)
! STANDARD
dc i'ch_eof' nul
@ -114,7 +114,7 @@ charKinds start character set
dc i'letter' Y
dc i'letter' Z
dc i'ch_special' [
dc i'illegal' \
dc i'ch_backslash' \
dc i'ch_special' ]
dc i'ch_carot' ^
dc i'letter' _

View File

@ -348,6 +348,8 @@ and may appear wherever a declaration can (including inside and outside function
These behave the same as the existing tokens [, ], {, }, #, and ## (respectively), apart from their spelling.
15. (C99) Universal character names are now supported in string literals, character constants, and identifiers. These are sequences of the form \unnnn or \Unnnnnnnn, where the nnnn or nnnnnnnn is a hexadecimal representation of a Unicode code point. These may be used to represent characters in a way that is independent of the source and execution character sets. In a string literal or character constant, only characters that can be mapped to the execution character set may be represented. There are also certain other restrictions on what characters can be used; see the C standards for details. For ORCA/C the source and execution character sets are both considered to be Mac OS Roman, the character set used in the IIGS desktop environment.
Multi-Character Character Constants
-----------------------------------