mirror of
https://github.com/byteworksinc/ORCA-C.git
synced 2025-03-11 07:29:29 +00:00
Implement support for universal character names in identifiers.
This commit is contained in:
parent
b1ad79737c
commit
656868a095
@ -192,7 +192,7 @@ type
|
||||
(illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc,
|
||||
ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string,
|
||||
ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon,
|
||||
letter,digit);
|
||||
ch_backslash,letter,digit);
|
||||
|
||||
tokenSet = set of tokenEnum;
|
||||
tokenClass = (reservedWord,reservedSymbol,identifier,intConstant,longConstant,
|
||||
|
73
Charset.pas
73
Charset.pas
@ -43,6 +43,13 @@ function ConvertUCSToMacRoman(ch: ucsCodePoint): integer;
|
||||
{ Returns ordinal value of the character, or -1 if it can't be }
|
||||
{ converted. }
|
||||
|
||||
function ValidUCNForIdentifier(ch: ucsCodePoint; initial: boolean): boolean;
|
||||
|
||||
{ Check if a code point is valid for a UCN in an identifier }
|
||||
{ }
|
||||
{ ch - the code point }
|
||||
{ initial - is this UCN the initial element of the identifier? }
|
||||
|
||||
implementation
|
||||
|
||||
function ConvertMacRomanToUCS{(ch: char): ucsCodePoint};
|
||||
@ -92,4 +99,70 @@ else begin
|
||||
1:
|
||||
end; {ConvertUCSToMacRoman}
|
||||
|
||||
|
||||
function ValidUCNForIdentifier{(ch: ucsCodePoint; initial: boolean): boolean};

{ Decide whether a code point may be written as a universal     }
{ character name inside an identifier.                          }
{                                                               }
{ Parameters:                                                   }
{    ch - code point named by the UCN                           }
{    initial - true if the UCN is the first element of the      }
{       identifier, false if it appears later on                }
{                                                               }
{ Returns: true if the code point is allowed at that position,  }
{    false otherwise                                            }

var
   valid: boolean;                      {result under construction}
   plane: longint;                      {code point divided by 65536}

begin {ValidUCNForIdentifier}
{The permitted ranges are the ones listed in C17 Annex D.  They are   }
{tested in tiers by magnitude; every range lies entirely in one tier. }
if ch <= $0000FF then
   valid :=
      (ch = $0000A8)
      or (ch = $0000AA)
      or (ch = $0000AD)
      or (ch = $0000AF)
      or ((ch >= $0000B2) and (ch <= $0000B5))
      or ((ch >= $0000B7) and (ch <= $0000BA))
      or ((ch >= $0000BC) and (ch <= $0000BE))
      or ((ch >= $0000C0) and (ch <= $0000D6))
      or ((ch >= $0000D8) and (ch <= $0000F6))
      or ((ch >= $0000F8) and (ch <= $0000FF))
else if ch <= $001FFF then
   valid :=
      ((ch >= $000100) and (ch <= $00167F))
      or ((ch >= $001681) and (ch <= $00180D))
      or ((ch >= $00180F) and (ch <= $001FFF))
else if ch <= $002FFF then
   valid :=
      ((ch >= $00200B) and (ch <= $00200D))
      or ((ch >= $00202A) and (ch <= $00202E))
      or ((ch >= $00203F) and (ch <= $002040))
      or (ch = $002054)
      or ((ch >= $002060) and (ch <= $00206F))
      or ((ch >= $002070) and (ch <= $00218F))
      or ((ch >= $002460) and (ch <= $0024FF))
      or ((ch >= $002776) and (ch <= $002793))
      or ((ch >= $002C00) and (ch <= $002DFF))
      or ((ch >= $002E80) and (ch <= $002FFF))
else if ch <= $00FFFF then
   valid :=
      ((ch >= $003004) and (ch <= $003007))
      or ((ch >= $003021) and (ch <= $00302F))
      or ((ch >= $003031) and (ch <= $00303F))
      or ((ch >= $003040) and (ch <= $00D7FF))
      or ((ch >= $00F900) and (ch <= $00FD3D))
      or ((ch >= $00FD40) and (ch <= $00FDCF))
      or ((ch >= $00FDF0) and (ch <= $00FE44))
      or ((ch >= $00FE47) and (ch <= $00FFFD))
else begin
   {Planes 1 through 14 are allowed, except for the last two code     }
   {points of each plane (xxFFFE and xxFFFF).                         }
   plane := ch div $010000;
   valid := (plane <= 14) and ((ch mod $010000) <= $00FFFD);
   end; {else}

{These ranges are acceptable inside an identifier, but may not be its }
{first element.                                                       }
if initial then
   if ((ch >= $000300) and (ch <= $00036F))
      or ((ch >= $001DC0) and (ch <= $001DFF))
      or ((ch >= $0020D0) and (ch <= $0020FF))
      or ((ch >= $00FE20) and (ch <= $00FE2F))
      then valid := false;

ValidUCNForIdentifier := valid;
end; {ValidUCNForIdentifier}
|
||||
|
||||
end.
|
||||
|
@ -116,7 +116,7 @@ cch equ 13
|
||||
enum (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
|
||||
enum (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
|
||||
enum (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
|
||||
enum (letter,digit)
|
||||
enum (ch_backslash,letter,digit)
|
||||
|
||||
tsc create stack frame
|
||||
sec
|
||||
|
64
Scanner.pas
64
Scanner.pas
@ -264,6 +264,7 @@ var
|
||||
tokenExpandEnabled: boolean; {can token be macro expanded? (only for ident)}
|
||||
versionStrL: longStringPtr; {macro version string}
|
||||
workString: pstring; {for building strings and identifiers}
|
||||
ucnString: string[10]; {string of a UCN}
|
||||
|
||||
{-- External procedures; see expression evaluator for notes ----}
|
||||
|
||||
@ -670,6 +671,7 @@ if list or (numErr <> 0) then begin
|
||||
146: msg := @'Unicode character cannot be represented in execution character set';
|
||||
147: msg := @'lint: not all parameters were declared with a type';
|
||||
148: msg := @'all parameters must have a complete type';
|
||||
149: msg := @'invalid universal character name for use in an identifier';
|
||||
otherwise: Error(57);
|
||||
end; {case}
|
||||
writeln(msg^);
|
||||
@ -3319,18 +3321,28 @@ function UniversalCharacterName : ucsCodePoint;
|
||||
{ The current character should be the 'u' or 'U'. }
|
||||
{ }
|
||||
{ Returns the code point value of the UCN. }
|
||||
{ }
|
||||
{ Globals: }
|
||||
{ ucnString - string representation of this UCN }
|
||||
|
||||
var
|
||||
digits: integer; {number of hex digits (4 or 8)}
|
||||
codePoint: longint; {the code point specified by this UCN}
|
||||
dig: 0..15; {value of a hex digit}
|
||||
i: integer; {index for recording UCN string}
|
||||
|
||||
begin {UniversalCharacterName}
|
||||
i := 1;
|
||||
ucnString[i] := '\';
|
||||
i := i + 1;
|
||||
|
||||
codePoint := 0;
|
||||
if ch = 'u' then
|
||||
digits := 4
|
||||
else {if ch = 'U' then}
|
||||
digits := 8;
|
||||
ucnString[i] := ch;
|
||||
i := i + 1;
|
||||
NextCh;
|
||||
|
||||
while digits > 0 do begin
|
||||
@ -3342,25 +3354,39 @@ while digits > 0 do begin
|
||||
dig := ord(ch)-ord('A')+10;
|
||||
end; {else}
|
||||
codePoint := (codePoint << 4) | dig;
|
||||
ucnString[i] := ch;
|
||||
i := i + 1;
|
||||
NextCh;
|
||||
digits := digits - 1;
|
||||
end {while}
|
||||
else begin
|
||||
Error(145);
|
||||
codePoint := ord('$');
|
||||
codePoint := $0000C0;
|
||||
digits := 0;
|
||||
end; {else}
|
||||
end; {while}
|
||||
|
||||
ucnString[0] := chr(i - 1);
|
||||
|
||||
if (codePoint < 0) or (codePoint > maxUCSCodePoint)
|
||||
or ((codePoint >= $00D800) and (codePoint <= $00DFFF))
|
||||
or ((codePoint < $A0) and not (ord(codePoint) in [$24,$40,$60]))
|
||||
then begin
|
||||
Error(145);
|
||||
UniversalCharacterName := ord('$');
|
||||
UniversalCharacterName := $0000C0;
|
||||
end {if}
|
||||
else
|
||||
UniversalCharacterName := codePoint;
|
||||
|
||||
{Normalize UCN string to shorter form for codepoints that fit in 16 bits}
|
||||
if (ord(ucnString[0]) = 10) and (codePoint <= $00FFFF) then begin
|
||||
ucnString[2] := 'u';
|
||||
ucnString[3] := ucnString[7];
|
||||
ucnString[4] := ucnString[8];
|
||||
ucnString[5] := ucnString[9];
|
||||
ucnString[6] := ucnString[10];
|
||||
ucnString[0] := chr(6);
|
||||
end; {if}
|
||||
end; {UniversalCharacterName}
|
||||
|
||||
|
||||
@ -3816,7 +3842,7 @@ type
|
||||
var
|
||||
done: boolean; {loop termination}
|
||||
expandEnabled: boolean; {can a token be expanded?}
|
||||
i: 0..maxint; {loop/index counter}
|
||||
i,j: 0..maxint; {loop/index counter}
|
||||
inhibit: boolean; {inhibit macro expansion?}
|
||||
lExpandMacros: boolean; {local copy of expandMacros}
|
||||
lPrintMacroExpansions: boolean; {local copy of printMacroExpansions}
|
||||
@ -3826,6 +3852,8 @@ var
|
||||
tToken: tokenType; {for merging tokens}
|
||||
sPtr,tsPtr: gstringPtr; {for forming string constants}
|
||||
lLastWasReturn: boolean; {local copy of lastWasReturn}
|
||||
codePoint: ucsCodePoint; {Unicode code point from UCN}
|
||||
chFromUCN: integer; {character given by UCN (converted)}
|
||||
|
||||
|
||||
function EscapeCh: integer;
|
||||
@ -4361,16 +4389,38 @@ case charKinds[ord(ch)] of
|
||||
token.sval^.str[i+1] := chr(0); {add null in case the string is extended}
|
||||
end;
|
||||
|
||||
letter: begin {reserved words and identifiers}
|
||||
letter,ch_backslash: begin {reserved words and identifiers}
|
||||
token.kind := ident;
|
||||
token.class := identifier;
|
||||
token.name := @workString;
|
||||
tokenExpandEnabled := true;
|
||||
i := 0;
|
||||
while charKinds[ord(ch)] in [letter,digit] do begin
|
||||
while charKinds[ord(ch)] in [letter,digit,ch_backslash] do begin
|
||||
i := i+1;
|
||||
workString[i] := ch;
|
||||
NextCh;
|
||||
if ch = '\' then begin
|
||||
NextCh;
|
||||
if ch in ['u','U'] then begin
|
||||
codePoint := UniversalCharacterName;
|
||||
if not ValidUCNForIdentifier(codePoint, i=1) then
|
||||
Error(149);
|
||||
chFromUCN := ConvertUCSToMacRoman(codePoint);
|
||||
if chFromUCN >= 0 then
|
||||
workString[i] := chr(chFromUCN)
|
||||
else begin
|
||||
for j := 1 to ord(ucnString[0]) do
|
||||
workString[i+j-1] := ucnString[j];
|
||||
i := i + ord(ucnString[0]) - 1;
|
||||
end; {else}
|
||||
end {if}
|
||||
else begin
|
||||
Error(1);
|
||||
workString[i] := '?';
|
||||
end; {else}
|
||||
end {if}
|
||||
else begin
|
||||
workString[i] := ch;
|
||||
NextCh;
|
||||
end; {if}
|
||||
end; {while}
|
||||
workString[0] := chr(i);
|
||||
CheckIdentifier;
|
||||
|
@ -19,7 +19,7 @@ charKinds start character set
|
||||
enum (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
|
||||
enum (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
|
||||
enum (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
|
||||
enum (letter,digit)
|
||||
enum (ch_backslash,letter,digit)
|
||||
|
||||
! STANDARD
|
||||
dc i'ch_eof' nul
|
||||
@ -114,7 +114,7 @@ charKinds start character set
|
||||
dc i'letter' Y
|
||||
dc i'letter' Z
|
||||
dc i'ch_special' [
|
||||
dc i'illegal' \
|
||||
dc i'ch_backslash' \
|
||||
dc i'ch_special' ]
|
||||
dc i'ch_carot' ^
|
||||
dc i'letter' _
|
||||
|
2
cc.notes
2
cc.notes
@ -348,6 +348,8 @@ and may appear wherever a declaration can (including inside and outside function
|
||||
|
||||
These behave the same as the existing tokens [, ], {, }, #, and ## (respectively), apart from their spelling.
|
||||
|
||||
15. (C99) Universal character names are now supported in string literals, character constants, and identifiers. These are sequences of the form \unnnn or \Unnnnnnnn, where the nnnn or nnnnnnnn is a hexadecimal representation of a Unicode code point. These may be used to represent characters in a way that is independent of the source and execution character sets. In a string literal or character constant, only characters that can be mapped to the execution character set may be represented. There are also certain other restrictions on what characters can be used; see the C standards for details. For ORCA/C the source and execution character sets are both considered to be Mac OS Roman, the character set used in the IIGS desktop environment.
|
||||
|
||||
|
||||
Multi-Character Character Constants
|
||||
-----------------------------------
|
||||
|
Loading…
x
Reference in New Issue
Block a user