mirror of
https://github.com/byteworksinc/ORCA-C.git
synced 2024-12-21 01:30:15 +00:00
5871820e0c
These have u8, u, or U prefixes, respectively. The types char16_t and char32_t (defined in <uchar.h>) are used for UTF-16 and UTF-32 code points.
251 lines
8.5 KiB
ObjectPascal
251 lines
8.5 KiB
ObjectPascal
{$optimize 7}
|
|
{---------------------------------------------------------------}
|
|
{ }
|
|
{ Character set handling routines }
|
|
{ }
|
|
{ This module handles different character sets and performs }
|
|
{ conversions between them. }
|
|
{ }
|
|
{ Externally available procedures: }
|
|
{ }
|
|
{ ConvertMacRomanToUCS - convert MacRoman character to UCS }
|
|
{ ConvertUCSToMacRoman - convert UCS character to MacRoman }
|
|
{ }
|
|
{---------------------------------------------------------------}
|
|
|
|
unit Charset;
|
|
|
|
{$LibPrefix '0/obj/'}
|
|
|
|
interface
|
|
|
|
{$segment 'SCANNER'}
|
|
|
|
uses CCommon, Table;
|
|
|
|
const
|
|
maxUCSCodePoint = $10ffff;
|
|
|
|
type
|
|
ucsCodePoint = 0..maxUCSCodePoint;
|
|
utf8Rec = record
|
|
length: integer;
|
|
bytes: packed array [1..4] of byte;
|
|
end;
|
|
utf16Rec = record
|
|
length: integer;
|
|
codeUnits: packed array [1..2] of integer;
|
|
end;
|
|
|
|
|
|
function ConvertMacRomanToUCS(ch: char): ucsCodePoint;
|
|
|
|
{ convert a character from MacRoman charset to UCS (Unicode) }
|
|
{ }
|
|
{ Returns UCS code point value for the character. }
|
|
|
|
|
|
function ConvertUCSToMacRoman(ch: ucsCodePoint): integer;
|
|
|
|
{ convert a character from UCS (Unicode) to MacRoman charset }
|
|
{ }
|
|
{ Returns ordinal value of the character, or -1 if it can't be }
|
|
{ converted. }
|
|
|
|
|
|
procedure UTF16Encode(ch: ucsCodePoint; var utf16: utf16Rec);
|
|
|
|
{ Encode a UCS code point in UTF-16 }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ utf16 - set to the UTF-16 representation of the code point }
|
|
|
|
|
|
procedure UTF8Encode(ch: ucsCodePoint; var utf8: utf8Rec);
|
|
|
|
{ Encode a UCS code point in UTF-8 }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ utf16 - set to the UTF-8 representation of the code point }
|
|
|
|
|
|
function ValidUCNForIdentifier(ch: ucsCodePoint; initial: boolean): boolean;
|
|
|
|
{ Check if a code point is valid for a UCN in an identifier }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ initial - is this UCN the initial element of the identifier? }
|
|
|
|
{----------------------------------------------------------------}
|
|
|
|
implementation
|
|
|
|
function ConvertMacRomanToUCS{(ch: char): ucsCodePoint};
|
|
|
|
{ convert a character from MacRoman charset to UCS (Unicode) }
|
|
{ }
|
|
{ Returns UCS code point value for the character. }
|
|
|
|
begin {ConvertMacRomanToUCS}
|
|
if ord(ch) < $80 then
|
|
ConvertMacRomanToUCS := ord(ch)
|
|
else if ord(ch) <= $ff then
|
|
ConvertMacRomanToUCS := ord4(macRomanToUCS[ord(ch)]) & $0000ffff
|
|
else
|
|
ConvertMacRomanToUCS := $00fffd; {invalid input => REPLACEMENT CHARACTER}
|
|
end; {ConvertMacRomanToUCS}
|
|
|
|
|
|
function ConvertUCSToMacRoman{(ch: ucsCodePoint): integer};
|
|
|
|
{ convert a character from UCS (Unicode) to MacRoman charset }
|
|
{ }
|
|
{ Returns ordinal value of the character, or -1 if it can't be }
|
|
{ converted. }
|
|
|
|
label 1;
|
|
|
|
var
|
|
i: $80..$ff; {loop index}
|
|
ch16bit: integer; {16-bit version of ch (maybe negative)}
|
|
|
|
begin {ConvertUCSToMacRoman}
|
|
if ch < $80 then
|
|
ConvertUCSToMacRoman := ord(ch)
|
|
else begin
|
|
if ch <= $00ffff then begin
|
|
ch16bit := ord(ch);
|
|
for i := $80 to $ff do begin
|
|
if macRomanToUCS[i] = ch16Bit then begin
|
|
ConvertUCSToMacRoman := i;
|
|
goto 1;
|
|
end {if}
|
|
end; {for}
|
|
end; {if}
|
|
ConvertUCSToMacRoman := -1;
|
|
end; {else}
|
|
1:
|
|
end; {ConvertUCSToMacRoman}
|
|
|
|
|
|
procedure UTF16Encode{ch: ucsCodePoint; var utf16: utf16Rec};
|
|
|
|
{ Encode a UCS code point in UTF-16 }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ utf16 - set to the UTF-16 representation of the code point }
|
|
|
|
begin {UTF16Encode}
|
|
if ch <= $00ffff then begin
|
|
utf16.length := 1;
|
|
utf16.codeUnits[1] := ord(ch);
|
|
end {if}
|
|
else begin
|
|
utf16.length := 2;
|
|
ch := ch - $010000;
|
|
utf16.codeUnits[1] := $D800 | ord(ch >> 10);
|
|
utf16.codeUnits[2] := $DC00 | ord(ch & $03ff);
|
|
end; {else}
|
|
end; {UTF16Encode}
|
|
|
|
|
|
procedure UTF8Encode{ch: ucsCodePoint; var utf8: utf8Rec};
|
|
|
|
{ Encode a UCS code point in UTF-8 }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ utf16 - set to the UTF-8 representation of the code point }
|
|
|
|
begin {UTF8Encode}
|
|
if ch <= $00007f then begin
|
|
utf8.length := 1;
|
|
utf8.bytes[1] := ord(ch);
|
|
end {if}
|
|
else if ch <= $0007ff then begin
|
|
utf8.length := 2;
|
|
utf8.bytes[1] := $C0 | ord(ch >> 6);
|
|
utf8.bytes[2] := $80 | ord(ch & $3f)
|
|
end {else if}
|
|
else if ch <= $00ffff then begin
|
|
utf8.length := 3;
|
|
utf8.bytes[1] := $E0 | ord(ch >> 12);
|
|
utf8.bytes[2] := $80 | ord((ch >> 6) & $3f);
|
|
utf8.bytes[3] := $80 | ord(ch & $3f);
|
|
end {else if}
|
|
else begin
|
|
utf8.length := 4;
|
|
utf8.bytes[1] := $F0 | ord(ch >> 18);
|
|
utf8.bytes[2] := $80 | ord((ch >> 12) & $3f);
|
|
utf8.bytes[3] := $80 | ord((ch >> 6) & $3f);
|
|
utf8.bytes[4] := $80 | ord(ch & $3f);
|
|
end; {else}
|
|
end; {UTF8Encode}
|
|
|
|
|
|
function ValidUCNForIdentifier{(ch: ucsCodePoint; initial: boolean): boolean};
|
|
|
|
{ Check if a code point is valid for a UCN in an identifier }
|
|
{ }
|
|
{ ch - the code point }
|
|
{ initial - is this UCN the initial element of the identifier? }
|
|
|
|
begin {ValidUCNForIdentifier}
|
|
{See C17 Annex D}
|
|
ValidUCNForIdentifier := false;
|
|
if (ch = $0000A8)
|
|
or (ch = $0000AA)
|
|
or (ch = $0000AD)
|
|
or (ch = $0000AF)
|
|
or ((ch >= $0000B2) and (ch <= $0000B5))
|
|
or ((ch >= $0000B7) and (ch <= $0000BA))
|
|
or ((ch >= $0000BC) and (ch <= $0000BE))
|
|
or ((ch >= $0000C0) and (ch <= $0000D6))
|
|
or ((ch >= $0000D8) and (ch <= $0000F6))
|
|
or ((ch >= $0000F8) and (ch <= $0000FF))
|
|
or ((ch >= $000100) and (ch <= $00167F))
|
|
or ((ch >= $001681) and (ch <= $00180D))
|
|
or ((ch >= $00180F) and (ch <= $001FFF))
|
|
or ((ch >= $00200B) and (ch <= $00200D))
|
|
or ((ch >= $00202A) and (ch <= $00202E))
|
|
or ((ch >= $00203F) and (ch <= $002040))
|
|
or (ch = $002054)
|
|
or ((ch >= $002060) and (ch <= $00206F))
|
|
or ((ch >= $002070) and (ch <= $00218F))
|
|
or ((ch >= $002460) and (ch <= $0024FF))
|
|
or ((ch >= $002776) and (ch <= $002793))
|
|
or ((ch >= $002C00) and (ch <= $002DFF))
|
|
or ((ch >= $002E80) and (ch <= $002FFF))
|
|
or ((ch >= $003004) and (ch <= $003007))
|
|
or ((ch >= $003021) and (ch <= $00302F))
|
|
or ((ch >= $003031) and (ch <= $00303F))
|
|
or ((ch >= $003040) and (ch <= $00D7FF))
|
|
or ((ch >= $00F900) and (ch <= $00FD3D))
|
|
or ((ch >= $00FD40) and (ch <= $00FDCF))
|
|
or ((ch >= $00FDF0) and (ch <= $00FE44))
|
|
or ((ch >= $00FE47) and (ch <= $00FFFD))
|
|
or ((ch >= $010000) and (ch <= $01FFFD))
|
|
or ((ch >= $020000) and (ch <= $02FFFD))
|
|
or ((ch >= $030000) and (ch <= $03FFFD))
|
|
or ((ch >= $040000) and (ch <= $04FFFD))
|
|
or ((ch >= $050000) and (ch <= $05FFFD))
|
|
or ((ch >= $060000) and (ch <= $06FFFD))
|
|
or ((ch >= $070000) and (ch <= $07FFFD))
|
|
or ((ch >= $080000) and (ch <= $08FFFD))
|
|
or ((ch >= $090000) and (ch <= $09FFFD))
|
|
or ((ch >= $0A0000) and (ch <= $0AFFFD))
|
|
or ((ch >= $0B0000) and (ch <= $0BFFFD))
|
|
or ((ch >= $0C0000) and (ch <= $0CFFFD))
|
|
or ((ch >= $0D0000) and (ch <= $0DFFFD))
|
|
or ((ch >= $0E0000) and (ch <= $0EFFFD))
|
|
then ValidUCNForIdentifier := true;
|
|
|
|
if initial then
|
|
if ((ch >= $000300) and (ch <= $00036F))
|
|
or ((ch >= $001DC0) and (ch <= $001DFF))
|
|
or ((ch >= $0020D0) and (ch <= $0020FF))
|
|
or ((ch >= $00FE20) and (ch <= $00FE2F))
|
|
then ValidUCNForIdentifier := false;
|
|
end; {ValidUCNForIdentifier}
|
|
|
|
end.
|