Add initial support for universal character names.
This currently only works in character constants or strings, not identifiers.
This commit is contained in:
parent
6f46078108
commit
d24dacf01a
|
@ -0,0 +1,95 @@
|
||||||
|
{$optimize 7}
|
||||||
|
{---------------------------------------------------------------}
|
||||||
|
{ }
|
||||||
|
{ Character set handling routines }
|
||||||
|
{ }
|
||||||
|
{ This module handles different character sets and performs }
|
||||||
|
{ conversions between them. }
|
||||||
|
{ }
|
||||||
|
{ Externally available procedures: }
|
||||||
|
{ }
|
||||||
|
{ ConvertMacRomanToUCS - convert MacRoman character to UCS }
|
||||||
|
{ ConvertUCSToMacRoman - convert UCS character to MacRoman }
|
||||||
|
{ }
|
||||||
|
{---------------------------------------------------------------}
|
||||||
|
|
||||||
|
unit Charset;
|
||||||
|
|
||||||
|
{$LibPrefix '0/obj/'}
|
||||||
|
|
||||||
|
interface
|
||||||
|
|
||||||
|
{$segment 'SCANNER'}
|
||||||
|
|
||||||
|
uses CCommon, Table;
|
||||||
|
|
||||||
|
const
|
||||||
|
maxUCSCodePoint = $10ffff;
|
||||||
|
|
||||||
|
type
|
||||||
|
ucsCodePoint = 0..maxUCSCodePoint;
|
||||||
|
|
||||||
|
|
||||||
|
function ConvertMacRomanToUCS(ch: char): ucsCodePoint;
|
||||||
|
|
||||||
|
{ convert a character from MacRoman charset to UCS (Unicode) }
|
||||||
|
{ }
|
||||||
|
{ Returns UCS code point value for the character. }
|
||||||
|
|
||||||
|
function ConvertUCSToMacRoman(ch: ucsCodePoint): integer;
|
||||||
|
|
||||||
|
{ convert a character from UCS (Unicode) to MacRoman charset }
|
||||||
|
{ }
|
||||||
|
{ Returns ordinal value of the character, or -1 if it can't be }
|
||||||
|
{ converted. }
|
||||||
|
|
||||||
|
implementation
|
||||||
|
|
||||||
|
function ConvertMacRomanToUCS{(ch: char): ucsCodePoint};
|
||||||
|
|
||||||
|
{ convert a character from MacRoman charset to UCS (Unicode) }
|
||||||
|
{ }
|
||||||
|
{ Returns UCS code point value for the character. }
|
||||||
|
|
||||||
|
begin {ConvertMacRomanToUCS}
|
||||||
|
if ord(ch) < $80 then
|
||||||
|
ConvertMacRomanToUCS := ord(ch)
|
||||||
|
else if ord(ch) <= $ff then
|
||||||
|
ConvertMacRomanToUCS := ord4(macRomanToUCS[ord(ch)]) & $0000ffff
|
||||||
|
else
|
||||||
|
ConvertMacRomanToUCS := $00fffd; {invalid input => REPLACEMENT CHARACTER}
|
||||||
|
end; {ConvertMacRomanToUCS}
|
||||||
|
|
||||||
|
|
||||||
|
function ConvertUCSToMacRoman{(ch: ucsCodePoint): integer};
|
||||||
|
|
||||||
|
{ convert a character from UCS (Unicode) to MacRoman charset }
|
||||||
|
{ }
|
||||||
|
{ Returns ordinal value of the character, or -1 if it can't be }
|
||||||
|
{ converted. }
|
||||||
|
|
||||||
|
label 1;
|
||||||
|
|
||||||
|
var
|
||||||
|
i: $80..$ff; {loop index}
|
||||||
|
ch16bit: integer; {16-bit version of ch (maybe negative)}
|
||||||
|
|
||||||
|
begin {ConvertUCSToMacRoman}
|
||||||
|
if ch < $80 then
|
||||||
|
ConvertUCSToMacRoman := ord(ch)
|
||||||
|
else begin
|
||||||
|
if ch <= $00ffff then begin
|
||||||
|
ch16bit := ord(ch);
|
||||||
|
for i := $80 to $ff do begin
|
||||||
|
if macRomanToUCS[i] = ch16Bit then begin
|
||||||
|
ConvertUCSToMacRoman := i;
|
||||||
|
goto 1;
|
||||||
|
end {if}
|
||||||
|
end; {for}
|
||||||
|
end; {if}
|
||||||
|
ConvertUCSToMacRoman := -1;
|
||||||
|
end; {else}
|
||||||
|
1:
|
||||||
|
end; {ConvertUCSToMacRoman}
|
||||||
|
|
||||||
|
end.
|
70
Scanner.pas
70
Scanner.pas
|
@ -29,7 +29,7 @@ interface
|
||||||
|
|
||||||
{$LibPrefix '0/obj/'}
|
{$LibPrefix '0/obj/'}
|
||||||
|
|
||||||
uses CCommon, Table, CGI, MM;
|
uses CCommon, Table, CGI, MM, Charset;
|
||||||
|
|
||||||
{$segment 'SCANNER'}
|
{$segment 'SCANNER'}
|
||||||
|
|
||||||
|
@ -666,6 +666,8 @@ if list or (numErr <> 0) then begin
|
||||||
142: msg := @'_Alignas may not be used in this declaration or type name';
|
142: msg := @'_Alignas may not be used in this declaration or type name';
|
||||||
143: msg := @'only object pointer types may be restrict-qualified';
|
143: msg := @'only object pointer types may be restrict-qualified';
|
||||||
144: msg := @'generic selection expressions are not supported by ORCA/C';
|
144: msg := @'generic selection expressions are not supported by ORCA/C';
|
||||||
|
145: msg := @'invalid universal character name';
|
||||||
|
146: msg := @'Unicode character cannot be represented in execution character set';
|
||||||
otherwise: Error(57);
|
otherwise: Error(57);
|
||||||
end; {case}
|
end; {case}
|
||||||
writeln(msg^);
|
writeln(msg^);
|
||||||
|
@ -3309,6 +3311,57 @@ if scanWork then {make sure we read all characters}
|
||||||
end; {DoNumber}
|
end; {DoNumber}
|
||||||
|
|
||||||
|
|
||||||
|
function UniversalCharacterName : ucsCodePoint;
|
||||||
|
|
||||||
|
{ Scan a universal character name. }
|
||||||
|
{ The current character should be the 'u' or 'U'. }
|
||||||
|
{ }
|
||||||
|
{ Returns the code point value of the UCN. }
|
||||||
|
|
||||||
|
var
|
||||||
|
digits: integer; {number of hex digits (4 or 8)}
|
||||||
|
codePoint: longint; {the code point specified by this UCN}
|
||||||
|
dig: 0..15; {value of a hex digit}
|
||||||
|
|
||||||
|
begin {UniversalCharacterName}
|
||||||
|
codePoint := 0;
|
||||||
|
if ch = 'u' then
|
||||||
|
digits := 4
|
||||||
|
else {if ch = 'U' then}
|
||||||
|
digits := 8;
|
||||||
|
NextCh;
|
||||||
|
|
||||||
|
while digits > 0 do begin
|
||||||
|
if ch in ['0'..'9','a'..'f','A'..'F'] then begin
|
||||||
|
if ch in ['0'..'9'] then
|
||||||
|
dig := ord(ch) & $0F
|
||||||
|
else begin
|
||||||
|
ch := chr(ord(ch)&$5F);
|
||||||
|
dig := ord(ch)-ord('A')+10;
|
||||||
|
end; {else}
|
||||||
|
codePoint := (codePoint << 4) | dig;
|
||||||
|
NextCh;
|
||||||
|
digits := digits - 1;
|
||||||
|
end {while}
|
||||||
|
else begin
|
||||||
|
Error(145);
|
||||||
|
codePoint := ord('$');
|
||||||
|
digits := 0;
|
||||||
|
end; {else}
|
||||||
|
end; {while}
|
||||||
|
|
||||||
|
if (codePoint < 0) or (codePoint > maxUCSCodePoint)
|
||||||
|
or ((codePoint >= $00D800) and (codePoint <= $00DFFF))
|
||||||
|
or ((codePoint < $A0) and not (ord(codePoint) in [$24,$40,$60]))
|
||||||
|
then begin
|
||||||
|
Error(145);
|
||||||
|
UniversalCharacterName := ord('$');
|
||||||
|
end {if}
|
||||||
|
else
|
||||||
|
UniversalCharacterName := codePoint;
|
||||||
|
end; {UniversalCharacterName}
|
||||||
|
|
||||||
|
|
||||||
procedure InitScanner {start, end: ptr};
|
procedure InitScanner {start, end: ptr};
|
||||||
|
|
||||||
{ initialize the scanner }
|
{ initialize the scanner }
|
||||||
|
@ -3790,12 +3843,14 @@ var
|
||||||
dig: 0..15; {value of a hex digit}
|
dig: 0..15; {value of a hex digit}
|
||||||
skipChar: boolean; {get next char when done?}
|
skipChar: boolean; {get next char when done?}
|
||||||
val: 0..4095; {hex escape code value (scaled to 0..255)}
|
val: 0..4095; {hex escape code value (scaled to 0..255)}
|
||||||
|
codePoint: ucsCodePoint; {code point given by UCN}
|
||||||
|
chFromUCN: integer; {character given by UCN (converted)}
|
||||||
|
|
||||||
begin {EscapeCh}
|
begin {EscapeCh}
|
||||||
1: skipChar := true;
|
1: skipChar := true;
|
||||||
if ch = '\' then begin
|
if ch = '\' then begin
|
||||||
NextCh;
|
NextCh;
|
||||||
if ch in ['0'..'7','a','b','t','n','v','f','p','r','x'] then
|
if ch in ['0'..'7','a','b','t','n','v','f','p','r','x','u','U'] then
|
||||||
case ch of
|
case ch of
|
||||||
'0','1','2','3','4','5','6','7': begin
|
'0','1','2','3','4','5','6','7': begin
|
||||||
val := 0;
|
val := 0;
|
||||||
|
@ -3835,6 +3890,17 @@ var
|
||||||
skipChar := false;
|
skipChar := false;
|
||||||
EscapeCh := val & $FF;
|
EscapeCh := val & $FF;
|
||||||
end;
|
end;
|
||||||
|
'u','U': begin
|
||||||
|
codePoint := UniversalCharacterName;
|
||||||
|
chFromUCN := ConvertUCSToMacRoman(codePoint);
|
||||||
|
skipChar := false;
|
||||||
|
if chFromUCN >= 0 then
|
||||||
|
EscapeCh := chFromUCN
|
||||||
|
else begin
|
||||||
|
EscapeCh := 0;
|
||||||
|
Error(146);
|
||||||
|
end; {else}
|
||||||
|
end;
|
||||||
otherwise: Error(57);
|
otherwise: Error(57);
|
||||||
end {case}
|
end {case}
|
||||||
else
|
else
|
||||||
|
|
19
Table.asm
19
Table.asm
|
@ -913,3 +913,22 @@ wordHash start reserved word hash table
|
||||||
dc i'pascalsy,pascalsy,pascalsy,pascalsy,registersy,registersy'
|
dc i'pascalsy,pascalsy,pascalsy,pascalsy,registersy,registersy'
|
||||||
dc i'shortsy,typedefsy,unionsy,voidsy,whilesy,succwhilesy'
|
dc i'shortsy,typedefsy,unionsy,voidsy,whilesy,succwhilesy'
|
||||||
end
|
end
|
||||||
|
|
||||||
|
macRomanToUCS start
|
||||||
|
dc i2'$00C4, $00C5, $00C7, $00C9, $00D1, $00D6, $00DC, $00E1'
|
||||||
|
dc i2'$00E0, $00E2, $00E4, $00E3, $00E5, $00E7, $00E9, $00E8'
|
||||||
|
dc i2'$00EA, $00EB, $00ED, $00EC, $00EE, $00EF, $00F1, $00F3'
|
||||||
|
dc i2'$00F2, $00F4, $00F6, $00F5, $00FA, $00F9, $00FB, $00FC'
|
||||||
|
dc i2'$2020, $00B0, $00A2, $00A3, $00A7, $2022, $00B6, $00DF'
|
||||||
|
dc i2'$00AE, $00A9, $2122, $00B4, $00A8, $2260, $00C6, $00D8'
|
||||||
|
dc i2'$221E, $00B1, $2264, $2265, $00A5, $00B5, $2202, $2211'
|
||||||
|
dc i2'$220F, $03C0, $222B, $00AA, $00BA, $03A9, $00E6, $00F8'
|
||||||
|
dc i2'$00BF, $00A1, $00AC, $221A, $0192, $2248, $2206, $00AB'
|
||||||
|
dc i2'$00BB, $2026, $00A0, $00C0, $00C3, $00D5, $0152, $0153'
|
||||||
|
dc i2'$2013, $2014, $201C, $201D, $2018, $2019, $00F7, $25CA'
|
||||||
|
dc i2'$00FF, $0178, $2044, $00A4, $2039, $203A, $FB01, $FB02'
|
||||||
|
dc i2'$2021, $00B7, $201A, $201E, $2030, $00C2, $00CA, $00C1'
|
||||||
|
dc i2'$00CB, $00C8, $00CD, $00CE, $00CF, $00CC, $00D3, $00D4'
|
||||||
|
dc i2'$F8FF, $00D2, $00DA, $00DB, $00D9, $0131, $02C6, $02DC'
|
||||||
|
dc i2'$00AF, $02D8, $02D9, $02DA, $00B8, $02DD, $02DB, $02C7'
|
||||||
|
end
|
||||||
|
|
|
@ -38,6 +38,9 @@ var
|
||||||
icp: array[tokenEnum] of byte; {in-commong priorities}
|
icp: array[tokenEnum] of byte; {in-commong priorities}
|
||||||
isp: array[tokenEnum] of byte; {in-stack priorities}
|
isp: array[tokenEnum] of byte; {in-stack priorities}
|
||||||
|
|
||||||
|
{from Charset.pas}
|
||||||
|
{----------------}
|
||||||
|
macRomanToUCS: array[$80..$FF] of integer; {mapping from MacRoman charset to UCS}
|
||||||
implementation
|
implementation
|
||||||
|
|
||||||
end.
|
end.
|
||||||
|
|
1
linkit
1
linkit
|
@ -15,5 +15,6 @@ obj/dag
|
||||||
obj/gen
|
obj/gen
|
||||||
obj/header
|
obj/header
|
||||||
obj/printf
|
obj/printf
|
||||||
|
obj/charset
|
||||||
|
|
||||||
keep=16/cc
|
keep=16/cc
|
||||||
|
|
15
make
15
make
|
@ -43,6 +43,7 @@ if {#} == 0
|
||||||
set gen gen
|
set gen gen
|
||||||
set header header
|
set header header
|
||||||
set printf printf
|
set printf printf
|
||||||
|
set charset charset
|
||||||
end
|
end
|
||||||
|
|
||||||
Newer obj/cgc.a cgc.pas cgc.asm
|
Newer obj/cgc.a cgc.pas cgc.asm
|
||||||
|
@ -134,6 +135,7 @@ if {#} == 0
|
||||||
set expression expression
|
set expression expression
|
||||||
set parser parser
|
set parser parser
|
||||||
set scanner scanner
|
set scanner scanner
|
||||||
|
set charset charset
|
||||||
end
|
end
|
||||||
|
|
||||||
Newer obj/dag.a dag.pas
|
Newer obj/dag.a dag.pas
|
||||||
|
@ -161,6 +163,13 @@ if {#} == 0
|
||||||
set expression expression
|
set expression expression
|
||||||
end
|
end
|
||||||
|
|
||||||
|
Newer obj/charset.a charset.pas
|
||||||
|
if {status} != 0
|
||||||
|
set cc cc
|
||||||
|
set scanner scanner
|
||||||
|
set charset charset
|
||||||
|
end
|
||||||
|
|
||||||
else
|
else
|
||||||
for i
|
for i
|
||||||
set {i} {i}
|
set {i} {i}
|
||||||
|
@ -184,9 +193,9 @@ if "{table}" == table
|
||||||
end
|
end
|
||||||
|
|
||||||
set list ""
|
set list ""
|
||||||
set list {ccommon} {mm} {cgi} {scanner} {symbol} {header} {printf}
|
set list {ccommon} {mm} {cgi} {charset} {scanner} {symbol} {header}
|
||||||
set list {list} {expression} {cgc} {asm} {parser} {cc} {objout} {native}
|
set list {list} {printf} {expression} {cgc} {asm} {parser} {cc}
|
||||||
set list {list} {gen} {dag}
|
set list {list} {objout} {native} {gen} {dag}
|
||||||
|
|
||||||
if "{list}" != ""
|
if "{list}" != ""
|
||||||
for i in {list}
|
for i in {list}
|
||||||
|
|
Loading…
Reference in New Issue