From 656868a09512dc56f14b6127beabe908089b896c Mon Sep 17 00:00:00 2001 From: Stephen Heumann Date: Mon, 20 Jan 2020 17:22:06 -0600 Subject: [PATCH] Implement support for universal character names in identifiers. --- CCommon.pas | 2 +- Charset.pas | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++ Scanner.asm | 2 +- Scanner.pas | 64 +++++++++++++++++++++++++++++++++++++++++----- Table.asm | 4 +-- cc.notes | 2 ++ 6 files changed, 136 insertions(+), 11 deletions(-) diff --git a/CCommon.pas b/CCommon.pas index 66bdecc..cb9a243 100644 --- a/CCommon.pas +++ b/CCommon.pas @@ -192,7 +192,7 @@ type (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc, ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string, ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon, - letter,digit); + ch_backslash,letter,digit); tokenSet = set of tokenEnum; tokenClass = (reservedWord,reservedSymbol,identifier,intConstant,longConstant, diff --git a/Charset.pas b/Charset.pas index 272d3ba..c976f81 100644 --- a/Charset.pas +++ b/Charset.pas @@ -43,6 +43,13 @@ function ConvertUCSToMacRoman(ch: ucsCodePoint): integer; { Returns ordinal value of the character, or -1 if it can't be } { converted. } +function ValidUCNForIdentifier(ch: ucsCodePoint; initial: boolean): boolean; + +{ Check if a code point is valid for a UCN in an identifier } +{ } +{ ch - the code point } +{ initial - is this UCN the initial element of the identifier? 
} + +begin {ValidUCNForIdentifier} +{See C17 Annex D} +ValidUCNForIdentifier := false; +if (ch = $0000A8) + or (ch = $0000AA) + or (ch = $0000AD) + or (ch = $0000AF) + or ((ch >= $0000B2) and (ch <= $0000B5)) + or ((ch >= $0000B7) and (ch <= $0000BA)) + or ((ch >= $0000BC) and (ch <= $0000BE)) + or ((ch >= $0000C0) and (ch <= $0000D6)) + or ((ch >= $0000D8) and (ch <= $0000F6)) + or ((ch >= $0000F8) and (ch <= $0000FF)) + or ((ch >= $000100) and (ch <= $00167F)) + or ((ch >= $001681) and (ch <= $00180D)) + or ((ch >= $00180F) and (ch <= $001FFF)) + or ((ch >= $00200B) and (ch <= $00200D)) + or ((ch >= $00202A) and (ch <= $00202E)) + or ((ch >= $00203F) and (ch <= $002040)) + or (ch = $002054) + or ((ch >= $002060) and (ch <= $00206F)) + or ((ch >= $002070) and (ch <= $00218F)) + or ((ch >= $002460) and (ch <= $0024FF)) + or ((ch >= $002776) and (ch <= $002793)) + or ((ch >= $002C00) and (ch <= $002DFF)) + or ((ch >= $002E80) and (ch <= $002FFF)) + or ((ch >= $003004) and (ch <= $003007)) + or ((ch >= $003021) and (ch <= $00302F)) + or ((ch >= $003031) and (ch <= $00303F)) + or ((ch >= $003040) and (ch <= $00D7FF)) + or ((ch >= $00F900) and (ch <= $00FD3D)) + or ((ch >= $00FD40) and (ch <= $00FDCF)) + or ((ch >= $00FDF0) and (ch <= $00FE44)) + or ((ch >= $00FE47) and (ch <= $00FFFD)) + or ((ch >= $010000) and (ch <= $01FFFD)) + or ((ch >= $020000) and (ch <= $02FFFD)) + or ((ch >= $030000) and (ch <= $03FFFD)) + or ((ch >= $040000) and (ch <= $04FFFD)) + or ((ch >= $050000) and (ch <= $05FFFD)) + or ((ch >= $060000) and (ch <= $06FFFD)) + or ((ch >= $070000) and (ch <= $07FFFD)) + or ((ch >= $080000) and (ch <= $08FFFD)) + or ((ch >= $090000) and (ch <= $09FFFD)) + or ((ch >= $0A0000) and (ch <= $0AFFFD)) + or ((ch >= $0B0000) and (ch <= $0BFFFD)) + or ((ch >= $0C0000) and (ch <= $0CFFFD)) + or ((ch >= $0D0000) and (ch <= $0DFFFD)) + or ((ch >= $0E0000) and (ch <= $0EFFFD)) + then ValidUCNForIdentifier := true; + +if initial then {these ranges may not begin an identifier} + if ((ch >= $000300) and (ch <= 
$00036F)) + or ((ch >= $001DC0) and (ch <= $001DFF)) + or ((ch >= $0020D0) and (ch <= $0020FF)) + or ((ch >= $00FE20) and (ch <= $00FE2F)) + then ValidUCNForIdentifier := false; +end; {ValidUCNForIdentifier} + end. diff --git a/Scanner.asm b/Scanner.asm index 4b0d92c..757249e 100644 --- a/Scanner.asm +++ b/Scanner.asm @@ -116,7 +116,7 @@ cch equ 13 enum (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0 enum (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string) enum (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon) - enum (letter,digit) + enum (ch_backslash,letter,digit) tsc create stack frame sec diff --git a/Scanner.pas b/Scanner.pas index 11e236c..be30a93 100644 --- a/Scanner.pas +++ b/Scanner.pas @@ -264,6 +264,7 @@ var tokenExpandEnabled: boolean; {can token be macro expanded? (only for ident)} versionStrL: longStringPtr; {macro version string} workString: pstring; {for building strings and identifiers} + ucnString: string[10]; {string of a UCN} {-- External procedures; see expresssion evaluator for notes ---} @@ -670,6 +671,7 @@ if list or (numErr <> 0) then begin 146: msg := @'Unicode character cannot be represented in execution character set'; 147: msg := @'lint: not all parameters were declared with a type'; 148: msg := @'all parameters must have a complete type'; + 149: msg := @'invalid universal character name for use in an identifier'; otherwise: Error(57); end; {case} writeln(msg^); @@ -3319,18 +3321,28 @@ function UniversalCharacterName : ucsCodePoint; { The current character should be the 'u' or 'U'. } { } { Returns the code point value of the UCN. 
} +{ } +{ Globals: } +{ ucnString - string representation of this UCN } var digits: integer; {number of hex digits (4 or 8)} codePoint: longint; {the code point specified by this UCN} dig: 0..15; {value of a hex digit} + i: integer; {index for recording UCN string} begin {UniversalCharacterName} +i := 1; +ucnString[i] := '\'; +i := i + 1; + codePoint := 0; if ch = 'u' then digits := 4 else {if ch = 'U' then} digits := 8; +ucnString[i] := ch; +i := i + 1; NextCh; while digits > 0 do begin @@ -3342,25 +3354,39 @@ while digits > 0 do begin dig := ord(ch)-ord('A')+10; end; {else} codePoint := (codePoint << 4) | dig; + ucnString[i] := ch; + i := i + 1; NextCh; digits := digits - 1; end {while} else begin Error(145); - codePoint := ord('$'); + codePoint := $0000C0; {placeholder; passes ValidUCNForIdentifier} digits := 0; end; {else} end; {while} +ucnString[0] := chr(i - 1); {store the string length} + if (codePoint < 0) or (codePoint > maxUCSCodePoint) or ((codePoint >= $00D800) and (codePoint <= $00DFFF)) or ((codePoint < $A0) and not (ord(codePoint) in [$24,$40,$60])) then begin Error(145); - UniversalCharacterName := ord('$'); + UniversalCharacterName := $0000C0; end {if} else UniversalCharacterName := codePoint; + +{Normalize UCN string to shorter form for codepoints that fit in 16 bits} +if (ord(ucnString[0]) = 10) and (codePoint <= $00FFFF) then begin + ucnString[2] := 'u'; + ucnString[3] := ucnString[7]; + ucnString[4] := ucnString[8]; + ucnString[5] := ucnString[9]; + ucnString[6] := ucnString[10]; + ucnString[0] := chr(6); + end; {if} end; {UniversalCharacterName} @@ -3816,7 +3842,7 @@ type var done: boolean; {loop termination} expandEnabled: boolean; {can a token be expanded?} - i: 0..maxint; {loop/index counter} + i,j: 0..maxint; {loop/index counter} inhibit: boolean; {inhibit macro expansion?} lExpandMacros: boolean; {local copy of expandMacros} lPrintMacroExpansions: boolean; {local copy of printMacroExpansions} @@ -3826,6 +3852,8 @@ var tToken: tokenType; {for merging tokens} sPtr,tsPtr: gstringPtr; {for forming string 
constants} lLastWasReturn: boolean; {local copy of lastWasReturn} + codePoint: ucsCodePoint; {Unicode code point from UCN} + chFromUCN: integer; {character given by UCN (converted)} function EscapeCh: integer; @@ -4361,16 +4389,38 @@ case charKinds[ord(ch)] of token.sval^.str[i+1] := chr(0); {add null in case the string is extended} end; - letter: begin {reserved words and identifiers} + letter,ch_backslash: begin {reserved words and identifiers} token.kind := ident; token.class := identifier; token.name := @workString; tokenExpandEnabled := true; i := 0; - while charKinds[ord(ch)] in [letter,digit] do begin + while charKinds[ord(ch)] in [letter,digit,ch_backslash] do begin i := i+1; - workString[i] := ch; - NextCh; + if ch = '\' then begin + NextCh; + if ch in ['u','U'] then begin + codePoint := UniversalCharacterName; + if not ValidUCNForIdentifier(codePoint, i=1) then + Error(149); + chFromUCN := ConvertUCSToMacRoman(codePoint); + if chFromUCN >= 0 then + workString[i] := chr(chFromUCN) + else begin + for j := 1 to ord(ucnString[0]) do + workString[i+j-1] := ucnString[j]; + i := i + ord(ucnString[0]) - 1; {one slot was already counted} + end; {else} + end {if} + else begin + Error(1); + workString[i] := '?'; + end; {else} + end {if} + else begin + workString[i] := ch; + NextCh; + end; {else} end; {while} workString[0] := chr(i); CheckIdentifier; diff --git a/Table.asm b/Table.asm index 2e0e025..635f9e5 100644 --- a/Table.asm +++ b/Table.asm @@ -19,7 +19,7 @@ charKinds start character set enum (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0 enum (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string) enum (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon) - enum (letter,digit) + enum (ch_backslash,letter,digit) ! 
STANDARD dc i'ch_eof' nul @@ -114,7 +114,7 @@ charKinds start character set dc i'letter' Y dc i'letter' Z dc i'ch_special' [ - dc i'illegal' \ + dc i'ch_backslash' \ dc i'ch_special' ] dc i'ch_carot' ^ dc i'letter' _ diff --git a/cc.notes b/cc.notes index 462acb1..c7e37b6 100644 --- a/cc.notes +++ b/cc.notes @@ -348,6 +348,8 @@ and may appear wherever a declaration can (including inside and outside function These behave the same as the existing tokens [, ], {, }, #, and ## (respectively), apart from their spelling. +15. (C99) Universal character names are now supported in string literals, character constants, and identifiers. These are sequences of the form \unnnn or \Unnnnnnnn, where the nnnn or nnnnnnnn is a hexadecimal representation of a Unicode code point. These may be used to represent characters in a way that is independent of the source and execution character sets. In a string literal or character constant, only characters that can be mapped to the execution character set may be represented. There are also certain other restrictions on what characters can be used; see the C standards for details. For ORCA/C the source and execution character sets are both considered to be Mac OS Roman, the character set used in the IIGS desktop environment. + Multi-Character Character Constants -----------------------------------