ORCA-C/Charset.pas
Stephen Heumann ead95bcb12 Implement C23 changes to universal character names.
As of C23, UCNs within string literals or character constants can contain any valid Unicode code point, including ASCII characters or control characters.

The validity of UCNs within identifiers is now defined based on the XID_Start and XID_Continue Unicode properties. A helper program is used to generate tables of the allowed characters based on a Unicode data file. These can be updated for future Unicode versions by re-running the helper program using the updated Unicode data files.
2024-09-13 22:14:43 -05:00

328 lines
10 KiB
ObjectPascal

{$optimize 7}
{---------------------------------------------------------------}
{ }
{ Character set handling routines }
{ }
{ This module handles different character sets and performs }
{ conversions between them. }
{ }
{ Externally available procedures: }
{ }
{ ConvertMacRomanToUCS - convert MacRoman character to UCS }
{ ConvertUCSToMacRoman - convert UCS character to MacRoman }
{ }
{---------------------------------------------------------------}
unit Charset;
{$LibPrefix '0/obj/'}
interface
{$segment 'SCANNER'}
uses CCommon, Table;
const
maxUCSCodePoint = $10ffff; {Maximum Unicode code point}
maxPlane = 16; {Maximum Unicode plane}
type
ucsCodePoint = 0..maxUCSCodePoint;
utf8Rec = record
length: integer;
bytes: packed array [1..4] of byte;
end;
utf16Rec = record
length: integer;
codeUnits: packed array [1..2] of integer;
end;
function ConvertMacRomanToUCS(ch: char): ucsCodePoint;
{ convert a character from MacRoman charset to UCS (Unicode) }
{ }
{ Returns UCS code point value for the character. }
function ConvertUCSToMacRoman(ch: ucsCodePoint): integer;
{ convert a character from UCS (Unicode) to MacRoman charset }
{ }
{ Returns ordinal value of the character, or -1 if it can't be }
{ converted. }
procedure UTF16Encode(ch: ucsCodePoint; var utf16: utf16Rec);
{ Encode a UCS code point in UTF-16 }
{ }
{ ch - the code point }
{ utf16 - set to the UTF-16 representation of the code point }
procedure UTF8Encode(ch: ucsCodePoint; var utf8: utf8Rec);
{ Encode a UCS code point in UTF-8 }
{ }
{ ch - the code point }
{ utf16 - set to the UTF-8 representation of the code point }
function ValidUCNForIdentifier(ch: ucsCodePoint; initial: boolean): boolean;
{ Check if a code point is valid for a UCN in an identifier }
{ }
{ ch - the code point }
{ initial - is this UCN the initial element of the identifier? }
{----------------------------------------------------------------}
implementation
function ConvertMacRomanToUCS{(ch: char): ucsCodePoint};
{ convert a character from MacRoman charset to UCS (Unicode) }
{ }
{ Returns UCS code point value for the character. }
begin {ConvertMacRomanToUCS}
if ord(ch) < $80 then
ConvertMacRomanToUCS := ord(ch)
else if ord(ch) <= $ff then
ConvertMacRomanToUCS := ord4(macRomanToUCS[ord(ch)]) & $0000ffff
else
ConvertMacRomanToUCS := $00fffd; {invalid input => REPLACEMENT CHARACTER}
end; {ConvertMacRomanToUCS}
function ConvertUCSToMacRoman{(ch: ucsCodePoint): integer};
{ convert a character from UCS (Unicode) to MacRoman charset }
{ }
{ Returns ordinal value of the character, or -1 if it can't be }
{ converted. }
label 1;
var
i: $80..$ff; {loop index}
ch16bit: integer; {16-bit version of ch (maybe negative)}
begin {ConvertUCSToMacRoman}
if ch < $80 then
ConvertUCSToMacRoman := ord(ch)
else begin
if ch <= $00ffff then begin
ch16bit := ord(ch);
for i := $80 to $ff do begin
if macRomanToUCS[i] = ch16Bit then begin
ConvertUCSToMacRoman := i;
goto 1;
end {if}
end; {for}
end; {if}
ConvertUCSToMacRoman := -1;
end; {else}
1:
end; {ConvertUCSToMacRoman}
procedure UTF16Encode{ch: ucsCodePoint; var utf16: utf16Rec};
{ Encode a UCS code point in UTF-16 }
{ }
{ ch - the code point }
{ utf16 - set to the UTF-16 representation of the code point }
begin {UTF16Encode}
if ch <= $00ffff then begin
utf16.length := 1;
utf16.codeUnits[1] := ord(ch);
end {if}
else begin
utf16.length := 2;
ch := ch - $010000;
utf16.codeUnits[1] := $D800 | ord(ch >> 10);
utf16.codeUnits[2] := $DC00 | ord(ch & $03ff);
end; {else}
end; {UTF16Encode}
procedure UTF8Encode{ch: ucsCodePoint; var utf8: utf8Rec};
{ Encode a UCS code point in UTF-8 }
{ }
{ ch - the code point }
{ utf16 - set to the UTF-8 representation of the code point }
begin {UTF8Encode}
if ch <= $00007f then begin
utf8.length := 1;
utf8.bytes[1] := ord(ch);
end {if}
else if ch <= $0007ff then begin
utf8.length := 2;
utf8.bytes[1] := $C0 | ord(ch >> 6);
utf8.bytes[2] := $80 | ord(ch & $3f)
end {else if}
else if ch <= $00ffff then begin
utf8.length := 3;
utf8.bytes[1] := $E0 | ord(ch >> 12);
utf8.bytes[2] := $80 | ord((ch >> 6) & $3f);
utf8.bytes[3] := $80 | ord(ch & $3f);
end {else if}
else begin
utf8.length := 4;
utf8.bytes[1] := $F0 | ord(ch >> 18);
utf8.bytes[2] := $80 | ord((ch >> 12) & $3f);
utf8.bytes[3] := $80 | ord((ch >> 6) & $3f);
utf8.bytes[4] := $80 | ord(ch & $3f);
end; {else}
end; {UTF8Encode}
function XID_Start(ch: ucsCodePoint): boolean;
{ Check if a Unicode code point has the XID_Start property. }
label 1;
var
plane: integer;
low16: longint;
index: integer;
begin {XID_Start}
XID_Start := false;
plane := ord(ch >> 16);
low16 := ch & $0000FFFF;
if (plane < 0) or (plane > maxPlane) then
goto 1;
for index := XID_Start_PlaneStart[plane] to XID_Start_PlaneStart[plane+1]-1 do
begin
if (low16 >= (XID_Start_Table[index].min & $0000FFFF))
and (low16 <= (XID_Start_Table[index].max & $0000FFFF)) then begin
XID_Start := true;
goto 1;
end; {if}
end; {for}
1:
end; {XID_Start}
function XID_Continue(ch: ucsCodePoint): boolean;
{ Check if a Unicode code point has the XID_Continue property. }
label 1;
var
plane: integer;
low16: longint;
index: integer;
begin {XID_Continue}
if XID_Start(ch) then begin
XID_Continue := true;
goto 1;
end; {if}
XID_Continue := false;
plane := ord(ch >> 16);
low16 := ch & $0000FFFF;
if (plane < 0) or (plane > maxPlane) then
goto 1;
for index := XID_Continue_PlaneStart[plane]
to XID_Continue_PlaneStart[plane+1]-1 do begin
if (low16 >= (XID_Continue_Table[index].min & $0000FFFF))
and (low16 <= (XID_Continue_Table[index].max & $0000FFFF)) then begin
XID_Continue := true;
goto 1;
end; {if}
end; {for}
1:
end; {XID_Continue}
function ValidUCNForIdentifier{(ch: ucsCodePoint; initial: boolean): boolean};
{ Check if a code point is valid for a UCN in an identifier }
{ }
{ ch - the code point }
{ initial - is this UCN the initial element of the identifier? }
begin {ValidUCNForIdentifier}
if cStd < c23 then begin
{See C17 Annex D}
ValidUCNForIdentifier := false;
if (ch = $0000A8)
or (ch = $0000AA)
or (ch = $0000AD)
or (ch = $0000AF)
or ((ch >= $0000B2) and (ch <= $0000B5))
or ((ch >= $0000B7) and (ch <= $0000BA))
or ((ch >= $0000BC) and (ch <= $0000BE))
or ((ch >= $0000C0) and (ch <= $0000D6))
or ((ch >= $0000D8) and (ch <= $0000F6))
or ((ch >= $0000F8) and (ch <= $0000FF))
or ((ch >= $000100) and (ch <= $00167F))
or ((ch >= $001681) and (ch <= $00180D))
or ((ch >= $00180F) and (ch <= $001FFF))
or ((ch >= $00200B) and (ch <= $00200D))
or ((ch >= $00202A) and (ch <= $00202E))
or ((ch >= $00203F) and (ch <= $002040))
or (ch = $002054)
or ((ch >= $002060) and (ch <= $00206F))
or ((ch >= $002070) and (ch <= $00218F))
or ((ch >= $002460) and (ch <= $0024FF))
or ((ch >= $002776) and (ch <= $002793))
or ((ch >= $002C00) and (ch <= $002DFF))
or ((ch >= $002E80) and (ch <= $002FFF))
or ((ch >= $003004) and (ch <= $003007))
or ((ch >= $003021) and (ch <= $00302F))
or ((ch >= $003031) and (ch <= $00303F))
or ((ch >= $003040) and (ch <= $00D7FF))
or ((ch >= $00F900) and (ch <= $00FD3D))
or ((ch >= $00FD40) and (ch <= $00FDCF))
or ((ch >= $00FDF0) and (ch <= $00FE44))
or ((ch >= $00FE47) and (ch <= $00FFFD))
or ((ch >= $010000) and (ch <= $01FFFD))
or ((ch >= $020000) and (ch <= $02FFFD))
or ((ch >= $030000) and (ch <= $03FFFD))
or ((ch >= $040000) and (ch <= $04FFFD))
or ((ch >= $050000) and (ch <= $05FFFD))
or ((ch >= $060000) and (ch <= $06FFFD))
or ((ch >= $070000) and (ch <= $07FFFD))
or ((ch >= $080000) and (ch <= $08FFFD))
or ((ch >= $090000) and (ch <= $09FFFD))
or ((ch >= $0A0000) and (ch <= $0AFFFD))
or ((ch >= $0B0000) and (ch <= $0BFFFD))
or ((ch >= $0C0000) and (ch <= $0CFFFD))
or ((ch >= $0D0000) and (ch <= $0DFFFD))
or ((ch >= $0E0000) and (ch <= $0EFFFD))
then ValidUCNForIdentifier := true;
if initial then
if ((ch >= $000300) and (ch <= $00036F))
or ((ch >= $001DC0) and (ch <= $001DFF))
or ((ch >= $0020D0) and (ch <= $0020FF))
or ((ch >= $00FE20) and (ch <= $00FE2F))
then ValidUCNForIdentifier := false;
end {if}
else begin
{C23 rules}
ValidUCNForIdentifier := false;
if ch >= $0000A0 then
if XID_Start(ch) or (not initial and XID_Continue(ch)) then
ValidUCNForIdentifier := true;
end; {else}
end; {ValidUCNForIdentifier}
end.