mirror of
https://github.com/byteworksinc/ORCA-C.git
synced 2024-12-27 10:32:35 +00:00
ead95bcb12
As of C23, UCNs within string literals or character constants can contain any valid Unicode code point, including ASCII characters or control characters. The validity of UCNs within identifiers is now defined based on the XID_Start and XID_Continue Unicode properties. A helper program is used to generate tables of the allowed characters based on a Unicode data file. These can be updated for future Unicode versions by re-running the helper program using the updated Unicode data files.
328 lines
10 KiB
ObjectPascal
328 lines
10 KiB
ObjectPascal
{$optimize 7}
|
|
{---------------------------------------------------------------}
|
|
{ }
|
|
{ Character set handling routines }
|
|
{ }
|
|
{ This module handles different character sets and performs }
|
|
{ conversions between them. }
|
|
{ }
|
|
{ Externally available procedures: }
|
|
{ }
|
|
{ ConvertMacRomanToUCS - convert MacRoman character to UCS }
|
|
{ ConvertUCSToMacRoman - convert UCS character to MacRoman }
|
|
{ }
|
|
{---------------------------------------------------------------}
|
|
|
|
unit Charset;
|
|
|
|
{$LibPrefix '0/obj/'}
|
|
|
|
interface
|
|
|
|
{$segment 'SCANNER'}
|
|
|
|
uses CCommon, Table;
|
|
|
|
const
|
|
maxUCSCodePoint = $10ffff; {Maximum Unicode code point}
|
|
maxPlane = 16; {Maximum Unicode plane}
|
|
|
|
type
|
|
ucsCodePoint = 0..maxUCSCodePoint;
|
|
utf8Rec = record
|
|
length: integer;
|
|
bytes: packed array [1..4] of byte;
|
|
end;
|
|
utf16Rec = record
|
|
length: integer;
|
|
codeUnits: packed array [1..2] of integer;
|
|
end;
|
|
|
|
|
|
function ConvertMacRomanToUCS(ch: char): ucsCodePoint;
|
|
|
|
{ convert a character from MacRoman charset to UCS (Unicode) }
|
|
{ }
|
|
{ Returns UCS code point value for the character. }
|
|
|
|
|
|
function ConvertUCSToMacRoman(ch: ucsCodePoint): integer;
|
|
|
|
{ convert a character from UCS (Unicode) to MacRoman charset }
|
|
{ }
|
|
{ Returns ordinal value of the character, or -1 if it can't be }
|
|
{ converted. }
|
|
|
|
|
|
procedure UTF16Encode(ch: ucsCodePoint; var utf16: utf16Rec);
|
|
|
|
{ Encode a UCS code point in UTF-16 }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ utf16 - set to the UTF-16 representation of the code point }
|
|
|
|
|
|
procedure UTF8Encode(ch: ucsCodePoint; var utf8: utf8Rec);
|
|
|
|
{ Encode a UCS code point in UTF-8 }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ utf16 - set to the UTF-8 representation of the code point }
|
|
|
|
|
|
function ValidUCNForIdentifier(ch: ucsCodePoint; initial: boolean): boolean;
|
|
|
|
{ Check if a code point is valid for a UCN in an identifier }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ initial - is this UCN the initial element of the identifier? }
|
|
|
|
{----------------------------------------------------------------}
|
|
|
|
implementation
|
|
|
|
function ConvertMacRomanToUCS{(ch: char): ucsCodePoint};
|
|
|
|
{ convert a character from MacRoman charset to UCS (Unicode) }
|
|
{ }
|
|
{ Returns UCS code point value for the character. }
|
|
|
|
begin {ConvertMacRomanToUCS}
|
|
if ord(ch) < $80 then
|
|
ConvertMacRomanToUCS := ord(ch)
|
|
else if ord(ch) <= $ff then
|
|
ConvertMacRomanToUCS := ord4(macRomanToUCS[ord(ch)]) & $0000ffff
|
|
else
|
|
ConvertMacRomanToUCS := $00fffd; {invalid input => REPLACEMENT CHARACTER}
|
|
end; {ConvertMacRomanToUCS}
|
|
|
|
|
|
function ConvertUCSToMacRoman{(ch: ucsCodePoint): integer};
|
|
|
|
{ convert a character from UCS (Unicode) to MacRoman charset }
|
|
{ }
|
|
{ Returns ordinal value of the character, or -1 if it can't be }
|
|
{ converted. }
|
|
|
|
label 1;
|
|
|
|
var
|
|
i: $80..$ff; {loop index}
|
|
ch16bit: integer; {16-bit version of ch (maybe negative)}
|
|
|
|
begin {ConvertUCSToMacRoman}
|
|
if ch < $80 then
|
|
ConvertUCSToMacRoman := ord(ch)
|
|
else begin
|
|
if ch <= $00ffff then begin
|
|
ch16bit := ord(ch);
|
|
for i := $80 to $ff do begin
|
|
if macRomanToUCS[i] = ch16Bit then begin
|
|
ConvertUCSToMacRoman := i;
|
|
goto 1;
|
|
end {if}
|
|
end; {for}
|
|
end; {if}
|
|
ConvertUCSToMacRoman := -1;
|
|
end; {else}
|
|
1:
|
|
end; {ConvertUCSToMacRoman}
|
|
|
|
|
|
procedure UTF16Encode{ch: ucsCodePoint; var utf16: utf16Rec};
|
|
|
|
{ Encode a UCS code point in UTF-16 }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ utf16 - set to the UTF-16 representation of the code point }
|
|
|
|
begin {UTF16Encode}
|
|
if ch <= $00ffff then begin
|
|
utf16.length := 1;
|
|
utf16.codeUnits[1] := ord(ch);
|
|
end {if}
|
|
else begin
|
|
utf16.length := 2;
|
|
ch := ch - $010000;
|
|
utf16.codeUnits[1] := $D800 | ord(ch >> 10);
|
|
utf16.codeUnits[2] := $DC00 | ord(ch & $03ff);
|
|
end; {else}
|
|
end; {UTF16Encode}
|
|
|
|
|
|
procedure UTF8Encode{ch: ucsCodePoint; var utf8: utf8Rec};
|
|
|
|
{ Encode a UCS code point in UTF-8 }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ utf16 - set to the UTF-8 representation of the code point }
|
|
|
|
begin {UTF8Encode}
|
|
if ch <= $00007f then begin
|
|
utf8.length := 1;
|
|
utf8.bytes[1] := ord(ch);
|
|
end {if}
|
|
else if ch <= $0007ff then begin
|
|
utf8.length := 2;
|
|
utf8.bytes[1] := $C0 | ord(ch >> 6);
|
|
utf8.bytes[2] := $80 | ord(ch & $3f)
|
|
end {else if}
|
|
else if ch <= $00ffff then begin
|
|
utf8.length := 3;
|
|
utf8.bytes[1] := $E0 | ord(ch >> 12);
|
|
utf8.bytes[2] := $80 | ord((ch >> 6) & $3f);
|
|
utf8.bytes[3] := $80 | ord(ch & $3f);
|
|
end {else if}
|
|
else begin
|
|
utf8.length := 4;
|
|
utf8.bytes[1] := $F0 | ord(ch >> 18);
|
|
utf8.bytes[2] := $80 | ord((ch >> 12) & $3f);
|
|
utf8.bytes[3] := $80 | ord((ch >> 6) & $3f);
|
|
utf8.bytes[4] := $80 | ord(ch & $3f);
|
|
end; {else}
|
|
end; {UTF8Encode}
|
|
|
|
|
|
function XID_Start(ch: ucsCodePoint): boolean;
|
|
|
|
{ Check if a Unicode code point has the XID_Start property. }
|
|
|
|
label 1;
|
|
|
|
var
|
|
plane: integer;
|
|
low16: longint;
|
|
index: integer;
|
|
|
|
begin {XID_Start}
|
|
XID_Start := false;
|
|
plane := ord(ch >> 16);
|
|
low16 := ch & $0000FFFF;
|
|
|
|
if (plane < 0) or (plane > maxPlane) then
|
|
goto 1;
|
|
|
|
for index := XID_Start_PlaneStart[plane] to XID_Start_PlaneStart[plane+1]-1 do
|
|
begin
|
|
if (low16 >= (XID_Start_Table[index].min & $0000FFFF))
|
|
and (low16 <= (XID_Start_Table[index].max & $0000FFFF)) then begin
|
|
XID_Start := true;
|
|
goto 1;
|
|
end; {if}
|
|
end; {for}
|
|
1:
|
|
end; {XID_Start}
|
|
|
|
|
|
function XID_Continue(ch: ucsCodePoint): boolean;
|
|
|
|
{ Check if a Unicode code point has the XID_Continue property. }
|
|
|
|
label 1;
|
|
|
|
var
|
|
plane: integer;
|
|
low16: longint;
|
|
index: integer;
|
|
|
|
begin {XID_Continue}
|
|
if XID_Start(ch) then begin
|
|
XID_Continue := true;
|
|
goto 1;
|
|
end; {if}
|
|
|
|
XID_Continue := false;
|
|
plane := ord(ch >> 16);
|
|
low16 := ch & $0000FFFF;
|
|
|
|
if (plane < 0) or (plane > maxPlane) then
|
|
goto 1;
|
|
|
|
for index := XID_Continue_PlaneStart[plane]
|
|
to XID_Continue_PlaneStart[plane+1]-1 do begin
|
|
if (low16 >= (XID_Continue_Table[index].min & $0000FFFF))
|
|
and (low16 <= (XID_Continue_Table[index].max & $0000FFFF)) then begin
|
|
XID_Continue := true;
|
|
goto 1;
|
|
end; {if}
|
|
end; {for}
|
|
1:
|
|
end; {XID_Continue}
|
|
|
|
|
|
function ValidUCNForIdentifier{(ch: ucsCodePoint; initial: boolean): boolean};
|
|
|
|
{ Check if a code point is valid for a UCN in an identifier }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ initial - is this UCN the initial element of the identifier? }
|
|
|
|
begin {ValidUCNForIdentifier}
|
|
if cStd < c23 then begin
|
|
{See C17 Annex D}
|
|
ValidUCNForIdentifier := false;
|
|
if (ch = $0000A8)
|
|
or (ch = $0000AA)
|
|
or (ch = $0000AD)
|
|
or (ch = $0000AF)
|
|
or ((ch >= $0000B2) and (ch <= $0000B5))
|
|
or ((ch >= $0000B7) and (ch <= $0000BA))
|
|
or ((ch >= $0000BC) and (ch <= $0000BE))
|
|
or ((ch >= $0000C0) and (ch <= $0000D6))
|
|
or ((ch >= $0000D8) and (ch <= $0000F6))
|
|
or ((ch >= $0000F8) and (ch <= $0000FF))
|
|
or ((ch >= $000100) and (ch <= $00167F))
|
|
or ((ch >= $001681) and (ch <= $00180D))
|
|
or ((ch >= $00180F) and (ch <= $001FFF))
|
|
or ((ch >= $00200B) and (ch <= $00200D))
|
|
or ((ch >= $00202A) and (ch <= $00202E))
|
|
or ((ch >= $00203F) and (ch <= $002040))
|
|
or (ch = $002054)
|
|
or ((ch >= $002060) and (ch <= $00206F))
|
|
or ((ch >= $002070) and (ch <= $00218F))
|
|
or ((ch >= $002460) and (ch <= $0024FF))
|
|
or ((ch >= $002776) and (ch <= $002793))
|
|
or ((ch >= $002C00) and (ch <= $002DFF))
|
|
or ((ch >= $002E80) and (ch <= $002FFF))
|
|
or ((ch >= $003004) and (ch <= $003007))
|
|
or ((ch >= $003021) and (ch <= $00302F))
|
|
or ((ch >= $003031) and (ch <= $00303F))
|
|
or ((ch >= $003040) and (ch <= $00D7FF))
|
|
or ((ch >= $00F900) and (ch <= $00FD3D))
|
|
or ((ch >= $00FD40) and (ch <= $00FDCF))
|
|
or ((ch >= $00FDF0) and (ch <= $00FE44))
|
|
or ((ch >= $00FE47) and (ch <= $00FFFD))
|
|
or ((ch >= $010000) and (ch <= $01FFFD))
|
|
or ((ch >= $020000) and (ch <= $02FFFD))
|
|
or ((ch >= $030000) and (ch <= $03FFFD))
|
|
or ((ch >= $040000) and (ch <= $04FFFD))
|
|
or ((ch >= $050000) and (ch <= $05FFFD))
|
|
or ((ch >= $060000) and (ch <= $06FFFD))
|
|
or ((ch >= $070000) and (ch <= $07FFFD))
|
|
or ((ch >= $080000) and (ch <= $08FFFD))
|
|
or ((ch >= $090000) and (ch <= $09FFFD))
|
|
or ((ch >= $0A0000) and (ch <= $0AFFFD))
|
|
or ((ch >= $0B0000) and (ch <= $0BFFFD))
|
|
or ((ch >= $0C0000) and (ch <= $0CFFFD))
|
|
or ((ch >= $0D0000) and (ch <= $0DFFFD))
|
|
or ((ch >= $0E0000) and (ch <= $0EFFFD))
|
|
then ValidUCNForIdentifier := true;
|
|
|
|
if initial then
|
|
if ((ch >= $000300) and (ch <= $00036F))
|
|
or ((ch >= $001DC0) and (ch <= $001DFF))
|
|
or ((ch >= $0020D0) and (ch <= $0020FF))
|
|
or ((ch >= $00FE20) and (ch <= $00FE2F))
|
|
then ValidUCNForIdentifier := false;
|
|
end {if}
|
|
else begin
|
|
{C23 rules}
|
|
ValidUCNForIdentifier := false;
|
|
if ch >= $0000A0 then
|
|
if XID_Start(ch) or (not initial and XID_Continue(ch)) then
|
|
ValidUCNForIdentifier := true;
|
|
end; {else}
|
|
end; {ValidUCNForIdentifier}
|
|
|
|
end.
|