From d24dacf01aa01378d29f296920c9a70401f7045d Mon Sep 17 00:00:00 2001 From: Stephen Heumann Date: Sun, 19 Jan 2020 22:00:05 -0600 Subject: [PATCH] Add initial support for universal character names. This currently only works in character constants or strings, not identifiers. --- Charset.pas | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++ Scanner.pas | 70 +++++++++++++++++++++++++++++++++++++-- Table.asm | 19 +++++++++++ Table.pas | 3 ++ linkit | 1 + make | 15 +++++++-- 6 files changed, 198 insertions(+), 5 deletions(-) create mode 100644 Charset.pas diff --git a/Charset.pas b/Charset.pas new file mode 100644 index 0000000..272d3ba --- /dev/null +++ b/Charset.pas @@ -0,0 +1,95 @@ +{$optimize 7} +{---------------------------------------------------------------} +{ } +{ Character set handling routines } +{ } +{ This module handles different character sets and performs } +{ conversions between them. } +{ } +{ Externally available procedures: } +{ } +{ ConvertMacRomanToUCS - convert MacRoman character to UCS } +{ ConvertUCSToMacRoman - convert UCS character to MacRoman } +{ } +{---------------------------------------------------------------} + +unit Charset; + +{$LibPrefix '0/obj/'} + +interface + +{$segment 'SCANNER'} + +uses CCommon, Table; + +const + maxUCSCodePoint = $10ffff; + +type + ucsCodePoint = 0..maxUCSCodePoint; + + +function ConvertMacRomanToUCS(ch: char): ucsCodePoint; + +{ convert a character from MacRoman charset to UCS (Unicode) } +{ } +{ Returns UCS code point value for the character. } + +function ConvertUCSToMacRoman(ch: ucsCodePoint): integer; + +{ convert a character from UCS (Unicode) to MacRoman charset } +{ } +{ Returns ordinal value of the character, or -1 if it can't be } +{ converted. } + +implementation + +function ConvertMacRomanToUCS{(ch: char): ucsCodePoint}; + +{ convert a character from MacRoman charset to UCS (Unicode) } +{ } +{ Returns UCS code point value for the character. } + +begin {ConvertMacRomanToUCS} +if ord(ch) < $80 then + ConvertMacRomanToUCS := ord(ch) +else if ord(ch) <= $ff then + ConvertMacRomanToUCS := ord4(macRomanToUCS[ord(ch)]) & $0000ffff +else + ConvertMacRomanToUCS := $00fffd; {invalid input => REPLACEMENT CHARACTER} +end; {ConvertMacRomanToUCS} + + +function ConvertUCSToMacRoman{(ch: ucsCodePoint): integer}; + +{ convert a character from UCS (Unicode) to MacRoman charset } +{ } +{ Returns ordinal value of the character, or -1 if it can't be } +{ converted. } + +label 1; + +var + i: $80..$ff; {loop index} + ch16bit: integer; {16-bit version of ch (maybe negative)} + +begin {ConvertUCSToMacRoman} +if ch < $80 then + ConvertUCSToMacRoman := ord(ch) +else begin + if ch <= $00ffff then begin + ch16bit := ord(ch); + for i := $80 to $ff do begin + if macRomanToUCS[i] = ch16Bit then begin + ConvertUCSToMacRoman := i; + goto 1; + end {if} + end; {for} + end; {if} + ConvertUCSToMacRoman := -1; + end; {else} +1: +end; {ConvertUCSToMacRoman} + +end. diff --git a/Scanner.pas b/Scanner.pas index 08fcc53..d4d90c3 100644 --- a/Scanner.pas +++ b/Scanner.pas @@ -29,7 +29,7 @@ interface {$LibPrefix '0/obj/'} -uses CCommon, Table, CGI, MM; +uses CCommon, Table, CGI, MM, Charset; {$segment 'SCANNER'} @@ -666,6 +666,8 @@ if list or (numErr <> 0) then begin 142: msg := @'_Alignas may not be used in this declaration or type name'; 143: msg := @'only object pointer types may be restrict-qualified'; 144: msg := @'generic selection expressions are not supported by ORCA/C'; + 145: msg := @'invalid universal character name'; + 146: msg := @'Unicode character cannot be represented in execution character set'; otherwise: Error(57); end; {case} writeln(msg^); @@ -3309,6 +3311,57 @@ if scanWork then {make sure we read all characters} end; {DoNumber} +function UniversalCharacterName : ucsCodePoint; + +{ Scan a universal character name. } +{ The current character should be the 'u' or 'U'. } +{ } +{ Returns the code point value of the UCN. } + +var + digits: integer; {number of hex digits (4 or 8)} + codePoint: longint; {the code point specified by this UCN} + dig: 0..15; {value of a hex digit} + +begin {UniversalCharacterName} +codePoint := 0; +if ch = 'u' then + digits := 4 +else {if ch = 'U' then} + digits := 8; +NextCh; + +while digits > 0 do begin + if ch in ['0'..'9','a'..'f','A'..'F'] then begin + if ch in ['0'..'9'] then + dig := ord(ch) & $0F + else begin + ch := chr(ord(ch)&$5F); + dig := ord(ch)-ord('A')+10; + end; {else} + codePoint := (codePoint << 4) | dig; + NextCh; + digits := digits - 1; + end {while} + else begin + Error(145); + codePoint := ord('$'); + digits := 0; + end; {else} + end; {while} + +if (codePoint < 0) or (codePoint > maxUCSCodePoint) + or ((codePoint >= $00D800) and (codePoint <= $00DFFF)) + or ((codePoint < $A0) and not (ord(codePoint) in [$24,$40,$60])) + then begin + Error(145); + UniversalCharacterName := ord('$'); + end {if} +else + UniversalCharacterName := codePoint; +end; {UniversalCharacterName} + + procedure InitScanner {start, end: ptr}; { initialize the scanner } @@ -3790,12 +3843,14 @@ var dig: 0..15; {value of a hex digit} skipChar: boolean; {get next char when done?} val: 0..4095; {hex escape code value (scaled to 0..255)} + codePoint: ucsCodePoint; {code point given by UCN} + chFromUCN: integer; {character given by UCN (converted)} begin {EscapeCh} 1: skipChar := true; if ch = '\' then begin NextCh; - if ch in ['0'..'7','a','b','t','n','v','f','p','r','x'] then + if ch in ['0'..'7','a','b','t','n','v','f','p','r','x','u','U'] then case ch of '0','1','2','3','4','5','6','7': begin val := 0; @@ -3835,6 +3890,17 @@ var skipChar := false; EscapeCh := val & $FF; end; + 'u','U': begin + codePoint := UniversalCharacterName; + chFromUCN := ConvertUCSToMacRoman(codePoint); + skipChar := false; + if chFromUCN >= 0 then + EscapeCh := chFromUCN + else begin + EscapeCh := 0; + Error(146); + end; {else} + end; otherwise: Error(57); end {case} else diff --git a/Table.asm b/Table.asm index a8f71dd..2e0e025 100644 --- a/Table.asm +++ b/Table.asm @@ -913,3 +913,22 @@ wordHash start reserved word hash table dc i'pascalsy,pascalsy,pascalsy,pascalsy,registersy,registersy' dc i'shortsy,typedefsy,unionsy,voidsy,whilesy,succwhilesy' end + +macRomanToUCS start + dc i2'$00C4, $00C5, $00C7, $00C9, $00D1, $00D6, $00DC, $00E1' + dc i2'$00E0, $00E2, $00E4, $00E3, $00E5, $00E7, $00E9, $00E8' + dc i2'$00EA, $00EB, $00ED, $00EC, $00EE, $00EF, $00F1, $00F3' + dc i2'$00F2, $00F4, $00F6, $00F5, $00FA, $00F9, $00FB, $00FC' + dc i2'$2020, $00B0, $00A2, $00A3, $00A7, $2022, $00B6, $00DF' + dc i2'$00AE, $00A9, $2122, $00B4, $00A8, $2260, $00C6, $00D8' + dc i2'$221E, $00B1, $2264, $2265, $00A5, $00B5, $2202, $2211' + dc i2'$220F, $03C0, $222B, $00AA, $00BA, $03A9, $00E6, $00F8' + dc i2'$00BF, $00A1, $00AC, $221A, $0192, $2248, $2206, $00AB' + dc i2'$00BB, $2026, $00A0, $00C0, $00C3, $00D5, $0152, $0153' + dc i2'$2013, $2014, $201C, $201D, $2018, $2019, $00F7, $25CA' + dc i2'$00FF, $0178, $2044, $00A4, $2039, $203A, $FB01, $FB02' + dc i2'$2021, $00B7, $201A, $201E, $2030, $00C2, $00CA, $00C1' + dc i2'$00CB, $00C8, $00CD, $00CE, $00CF, $00CC, $00D3, $00D4' + dc i2'$F8FF, $00D2, $00DA, $00DB, $00D9, $0131, $02C6, $02DC' + dc i2'$00AF, $02D8, $02D9, $02DA, $00B8, $02DD, $02DB, $02C7' + end diff --git a/Table.pas b/Table.pas index 77418ea..e590bdb 100644 --- a/Table.pas +++ b/Table.pas @@ -38,6 +38,9 @@ var icp: array[tokenEnum] of byte; {in-commong priorities} isp: array[tokenEnum] of byte; {in-stack priorities} + {from Charset.pas} + {----------------} + macRomanToUCS: array[$80..$FF] of integer; {mapping from MacRoman charset to UCS} implementation end. diff --git a/linkit b/linkit index 4ca9b21..73a221a 100644 --- a/linkit +++ b/linkit @@ -15,5 +15,6 @@ obj/dag obj/gen obj/header obj/printf +obj/charset keep=16/cc diff --git a/make b/make index babad39..b568e13 100644 --- a/make +++ b/make @@ -43,6 +43,7 @@ if {#} == 0 set gen gen set header header set printf printf + set charset charset end Newer obj/cgc.a cgc.pas cgc.asm @@ -134,6 +135,7 @@ if {#} == 0 set expression expression set parser parser set scanner scanner + set charset charset end Newer obj/dag.a dag.pas @@ -161,6 +163,13 @@ if {#} == 0 set expression expression end + Newer obj/charset.a charset.pas + if {status} != 0 + set cc cc + set scanner scanner + set charset charset + end + else for i set {i} {i} @@ -184,9 +193,9 @@ if "{table}" == table end set list "" -set list {ccommon} {mm} {cgi} {scanner} {symbol} {header} {printf} -set list {list} {expression} {cgc} {asm} {parser} {cc} {objout} {native} -set list {list} {gen} {dag} +set list {ccommon} {mm} {cgi} {charset} {scanner} {symbol} {header} +set list {list} {printf} {expression} {cgc} {asm} {parser} {cc} +set list {list} {objout} {native} {gen} {dag} if "{list}" != "" for i in {list}