Implement C23 changes to universal character names.

As of C23, UCNs within string literals or character constants can contain any valid Unicode code point, including ASCII characters or control characters.

The validity of UCNs within identifiers is now defined based on the XID_Start and XID_Continue Unicode properties. A helper program is used to generate tables of the allowed characters based on a Unicode data file. These can be updated for future Unicode versions by re-running the helper program using the updated Unicode data files.
This commit is contained in:
Stephen Heumann 2024-09-13 22:14:43 -05:00
parent bae40bc615
commit ead95bcb12
7 changed files with 1877 additions and 58 deletions

1508
CharTables.asm Normal file

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,8 @@ interface
uses CCommon, Table;
const
maxUCSCodePoint = $10ffff;
maxUCSCodePoint = $10ffff; {Maximum Unicode code point}
maxPlane = 16; {Maximum Unicode plane}
type
ucsCodePoint = 0..maxUCSCodePoint;
@ -182,6 +183,73 @@ else begin
end; {UTF8Encode}
function XID_Start(ch: ucsCodePoint): boolean;

{ Determine whether a Unicode code point has the XID_Start       }
{ property.                                                      }
{                                                                }
{ parameters:                                                    }
{    ch - code point to test                                     }
{                                                                }
{ returns: true if ch falls within one of the ranges recorded    }
{    for its plane in XID_Start_Table, false otherwise           }

var
   found: boolean;                      {has a matching range been found?}
   i: integer;                          {current index in XID_Start_Table}
   last: integer;                       {last table index for this plane}
   lowWord: longint;                    {low-order 16 bits of ch}
   planeNum: integer;                   {Unicode plane containing ch}

begin {XID_Start}
found := false;
planeNum := ord(ch >> 16);
lowWord := ch & $0000FFFF;
if (planeNum >= 0) and (planeNum <= maxPlane) then begin
   {entries for a plane occupy PlaneStart[plane]..PlaneStart[plane+1]-1}
   i := XID_Start_PlaneStart[planeNum];
   last := XID_Start_PlaneStart[planeNum+1]-1;
   while (i <= last) and not found do begin
      {table entries are 16-bit values; mask so they compare as unsigned}
      if lowWord >= (XID_Start_Table[i].min & $0000FFFF) then
         if lowWord <= (XID_Start_Table[i].max & $0000FFFF) then
            found := true;
      i := i+1;
      end; {while}
   end; {if}
XID_Start := found;
end; {XID_Start}
function XID_Continue(ch: ucsCodePoint): boolean;

{ Determine whether a Unicode code point has the XID_Continue    }
{ property.                                                      }
{                                                                }
{ parameters:                                                    }
{    ch - code point to test                                     }
{                                                                }
{ returns: true if XID_Start(ch) is true or ch falls within one  }
{    of the ranges recorded for its plane in XID_Continue_Table, }
{    false otherwise                                             }

var
   found: boolean;                      {has a matching range been found?}
   i: integer;                          {current index in XID_Continue_Table}
   last: integer;                       {last table index for this plane}
   lowWord: longint;                    {low-order 16 bits of ch}
   planeNum: integer;                   {Unicode plane containing ch}

begin {XID_Continue}
{check the XID_Start ranges first; the XID_Continue table is consulted}
{only if that test fails}
found := XID_Start(ch);
if not found then begin
   planeNum := ord(ch >> 16);
   lowWord := ch & $0000FFFF;
   if (planeNum >= 0) and (planeNum <= maxPlane) then begin
      {entries for a plane occupy PlaneStart[plane]..PlaneStart[plane+1]-1}
      i := XID_Continue_PlaneStart[planeNum];
      last := XID_Continue_PlaneStart[planeNum+1]-1;
      while (i <= last) and not found do begin
         {table entries are 16-bit values; mask so they compare as unsigned}
         if lowWord >= (XID_Continue_Table[i].min & $0000FFFF) then
            if lowWord <= (XID_Continue_Table[i].max & $0000FFFF) then
               found := true;
         i := i+1;
         end; {while}
      end; {if}
   end; {if}
XID_Continue := found;
end; {XID_Continue}
function ValidUCNForIdentifier{(ch: ucsCodePoint; initial: boolean): boolean};
{ Check if a code point is valid for a UCN in an identifier }
@ -190,61 +258,70 @@ function ValidUCNForIdentifier{(ch: ucsCodePoint; initial: boolean): boolean};
{ initial - is this UCN the initial element of the identifier? }
begin {ValidUCNForIdentifier}
{See C17 Annex D}
ValidUCNForIdentifier := false;
if (ch = $0000A8)
or (ch = $0000AA)
or (ch = $0000AD)
or (ch = $0000AF)
or ((ch >= $0000B2) and (ch <= $0000B5))
or ((ch >= $0000B7) and (ch <= $0000BA))
or ((ch >= $0000BC) and (ch <= $0000BE))
or ((ch >= $0000C0) and (ch <= $0000D6))
or ((ch >= $0000D8) and (ch <= $0000F6))
or ((ch >= $0000F8) and (ch <= $0000FF))
or ((ch >= $000100) and (ch <= $00167F))
or ((ch >= $001681) and (ch <= $00180D))
or ((ch >= $00180F) and (ch <= $001FFF))
or ((ch >= $00200B) and (ch <= $00200D))
or ((ch >= $00202A) and (ch <= $00202E))
or ((ch >= $00203F) and (ch <= $002040))
or (ch = $002054)
or ((ch >= $002060) and (ch <= $00206F))
or ((ch >= $002070) and (ch <= $00218F))
or ((ch >= $002460) and (ch <= $0024FF))
or ((ch >= $002776) and (ch <= $002793))
or ((ch >= $002C00) and (ch <= $002DFF))
or ((ch >= $002E80) and (ch <= $002FFF))
or ((ch >= $003004) and (ch <= $003007))
or ((ch >= $003021) and (ch <= $00302F))
or ((ch >= $003031) and (ch <= $00303F))
or ((ch >= $003040) and (ch <= $00D7FF))
or ((ch >= $00F900) and (ch <= $00FD3D))
or ((ch >= $00FD40) and (ch <= $00FDCF))
or ((ch >= $00FDF0) and (ch <= $00FE44))
or ((ch >= $00FE47) and (ch <= $00FFFD))
or ((ch >= $010000) and (ch <= $01FFFD))
or ((ch >= $020000) and (ch <= $02FFFD))
or ((ch >= $030000) and (ch <= $03FFFD))
or ((ch >= $040000) and (ch <= $04FFFD))
or ((ch >= $050000) and (ch <= $05FFFD))
or ((ch >= $060000) and (ch <= $06FFFD))
or ((ch >= $070000) and (ch <= $07FFFD))
or ((ch >= $080000) and (ch <= $08FFFD))
or ((ch >= $090000) and (ch <= $09FFFD))
or ((ch >= $0A0000) and (ch <= $0AFFFD))
or ((ch >= $0B0000) and (ch <= $0BFFFD))
or ((ch >= $0C0000) and (ch <= $0CFFFD))
or ((ch >= $0D0000) and (ch <= $0DFFFD))
or ((ch >= $0E0000) and (ch <= $0EFFFD))
then ValidUCNForIdentifier := true;
if initial then
if ((ch >= $000300) and (ch <= $00036F))
or ((ch >= $001DC0) and (ch <= $001DFF))
or ((ch >= $0020D0) and (ch <= $0020FF))
or ((ch >= $00FE20) and (ch <= $00FE2F))
then ValidUCNForIdentifier := false;
if cStd < c23 then begin
{See C17 Annex D}
ValidUCNForIdentifier := false;
if (ch = $0000A8)
or (ch = $0000AA)
or (ch = $0000AD)
or (ch = $0000AF)
or ((ch >= $0000B2) and (ch <= $0000B5))
or ((ch >= $0000B7) and (ch <= $0000BA))
or ((ch >= $0000BC) and (ch <= $0000BE))
or ((ch >= $0000C0) and (ch <= $0000D6))
or ((ch >= $0000D8) and (ch <= $0000F6))
or ((ch >= $0000F8) and (ch <= $0000FF))
or ((ch >= $000100) and (ch <= $00167F))
or ((ch >= $001681) and (ch <= $00180D))
or ((ch >= $00180F) and (ch <= $001FFF))
or ((ch >= $00200B) and (ch <= $00200D))
or ((ch >= $00202A) and (ch <= $00202E))
or ((ch >= $00203F) and (ch <= $002040))
or (ch = $002054)
or ((ch >= $002060) and (ch <= $00206F))
or ((ch >= $002070) and (ch <= $00218F))
or ((ch >= $002460) and (ch <= $0024FF))
or ((ch >= $002776) and (ch <= $002793))
or ((ch >= $002C00) and (ch <= $002DFF))
or ((ch >= $002E80) and (ch <= $002FFF))
or ((ch >= $003004) and (ch <= $003007))
or ((ch >= $003021) and (ch <= $00302F))
or ((ch >= $003031) and (ch <= $00303F))
or ((ch >= $003040) and (ch <= $00D7FF))
or ((ch >= $00F900) and (ch <= $00FD3D))
or ((ch >= $00FD40) and (ch <= $00FDCF))
or ((ch >= $00FDF0) and (ch <= $00FE44))
or ((ch >= $00FE47) and (ch <= $00FFFD))
or ((ch >= $010000) and (ch <= $01FFFD))
or ((ch >= $020000) and (ch <= $02FFFD))
or ((ch >= $030000) and (ch <= $03FFFD))
or ((ch >= $040000) and (ch <= $04FFFD))
or ((ch >= $050000) and (ch <= $05FFFD))
or ((ch >= $060000) and (ch <= $06FFFD))
or ((ch >= $070000) and (ch <= $07FFFD))
or ((ch >= $080000) and (ch <= $08FFFD))
or ((ch >= $090000) and (ch <= $09FFFD))
or ((ch >= $0A0000) and (ch <= $0AFFFD))
or ((ch >= $0B0000) and (ch <= $0BFFFD))
or ((ch >= $0C0000) and (ch <= $0CFFFD))
or ((ch >= $0D0000) and (ch <= $0DFFFD))
or ((ch >= $0E0000) and (ch <= $0EFFFD))
then ValidUCNForIdentifier := true;
if initial then
if ((ch >= $000300) and (ch <= $00036F))
or ((ch >= $001DC0) and (ch <= $001DFF))
or ((ch >= $0020D0) and (ch <= $0020FF))
or ((ch >= $00FE20) and (ch <= $00FE2F))
then ValidUCNForIdentifier := false;
end {if}
else begin
{C23 rules}
ValidUCNForIdentifier := false;
if ch >= $0000A0 then
if XID_Start(ch) or (not initial and XID_Continue(ch)) then
ValidUCNForIdentifier := true;
end; {else}
end; {ValidUCNForIdentifier}
end.

219
GenCharTbl.c Normal file
View File

@ -0,0 +1,219 @@
/*
* This program is designed to parse the Unicode DerivedCoreProperties.txt
* file and produce tables indicating if a code point has the XID_Start or
* XID_Continue properties. This is needed to define the legal universal
* character names in identifiers under C23.
*
* The DerivedCoreProperties.txt file for the current Unicode version is at:
* https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Highest Unicode plane number; output tables get one label per plane,
 * plus one final label marking the end of the last plane's entries. */
#define MAX_PLANE 16
/* Maximum number of ranges supported -- increase if necessary. */
#define MAX_RANGES 2000
/* Size of the buffer used when reading lines from the input file. */
#define MAX_LINE 1000
/* Name of the generated assembly-language file. */
#define OUTPUT_FILE "CharTables.asm"
/* An inclusive range of code points; a single code point is stored
 * with start == end. */
typedef struct CharRange {
unsigned long start, end;
} CharRange;
/* Ranges read from the input file that carry the XID_Start property. */
static CharRange XID_Start_Ranges[MAX_RANGES];
/* XID_Continue ranges from the input file, excluding any range that is
 * identical to an entry already stored in XID_Start_Ranges. */
static CharRange XID_Continue_Ranges[MAX_RANGES];
/* Scratch buffer for lines read from the input file. */
static char line[MAX_LINE];
/*
 * Comparison callback (bsearch/qsort style) for CharRange records.
 * Orders by start code point first, then by end code point.
 * Returns -1, 0, or 1 when *a_ sorts before, equal to, or after *b_.
 */
static int cmp(const void *a_, const void *b_) {
    const CharRange *lhs = a_;
    const CharRange *rhs = b_;

    if (lhs->start != rhs->start)
        return lhs->start < rhs->start ? -1 : 1;
    if (lhs->end != rhs->end)
        return lhs->end < rhs->end ? -1 : 1;
    return 0;
}
/*
 * Close whichever files are open, delete the partially written output
 * file, and print an error message.  Returns EXIT_FAILURE so callers
 * can write `return fail(...)`.
 */
static int fail(FILE *infile, FILE *outfile, const char *message) {
    if (infile)
        fclose(infile);
    if (outfile) {
        fclose(outfile);
        remove(OUTPUT_FILE);
    }
    fprintf(stderr, "%s\n", message);
    return EXIT_FAILURE;
}

/*
 * Copy the next line of the input file into the output file's header
 * comment.  A missing line is written as an empty one.
 */
static void copy_header_line(FILE *infile, FILE *outfile) {
    if (!fgets(line, MAX_LINE, infile))
        line[0] = 0;
    /* Strip the line terminator only if one is actually present. */
    line[strcspn(line, "\r\n")] = 0;
    fprintf(outfile, "* %-61s\n", line);
}

/*
 * Write one range table plus its per-plane index table.
 *
 *   name   - table name prefix ("XID_Start" or "XID_Continue")
 *   ranges - ranges to emit, in ascending order
 *   count  - number of entries in ranges
 *
 * Each plane gets a "planeN" label in front of its first entry, and
 * labels for planes 0..MAX_PLANE+1 are always emitted so the index
 * table can refer to all of them.
 *
 * Returns NULL on success, or an error message if the ranges cannot be
 * represented (out of order, beyond the last plane, or spanning planes).
 */
static const char *emit_table(FILE *outfile, const char *name,
                              const CharRange *ranges, unsigned count) {
    int last_plane = -1;
    unsigned i;
    int p;

    fprintf(outfile, "%s_Table start\n", name);
    for (i = 0; i < count; i++) {
        unsigned long plane = ranges[i].start >> 16;

        /* Guard the label loop below against malformed input. */
        if (plane > MAX_PLANE || (int)plane < last_plane)
            return "Ranges out of order or beyond the last Unicode plane";
        while (last_plane < (int)plane)
            fprintf(outfile, "plane%d anop\n", ++last_plane);
        /* Only the low 16 bits are stored, so a range must stay in one plane. */
        if (ranges[i].end >> 16 != plane)
            return "Range spans multiple planes";
        fprintf(outfile, " dc i2'$%04lx,$%04lx'\n",
                ranges[i].start & 0xFFFF,
                ranges[i].end & 0xFFFF);
    }
    while (last_plane < MAX_PLANE + 1)
        fprintf(outfile, "plane%d anop\n", ++last_plane);
    fprintf(outfile, "\n");

    /* Index of each plane's first entry (every entry is 4 bytes long). */
    fprintf(outfile, "%s_PlaneStart entry\n", name);
    for (p = 0; p <= MAX_PLANE + 1; p++)
        fprintf(outfile, " dc i2'(plane%d-plane0)/4'\n", p);
    fprintf(outfile, " end\n");
    return NULL;
}

/*
 * Read the Unicode DerivedCoreProperties.txt file named on the command
 * line and generate OUTPUT_FILE, containing tables of the XID_Start and
 * XID_Continue ranges.
 *
 * Returns EXIT_SUCCESS on success.  On any error, a message is written
 * to stderr, a partially written output file is removed, and
 * EXIT_FAILURE is returned.
 */
int main(int argc, char *argv[]) {
    FILE *infile, *outfile;
    unsigned xid_start_idx = 0;
    unsigned xid_continue_idx = 0;
    CharRange range;
    char property[101];
    const char *error;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s DerivedCoreProperties.txt\n",
                argc > 0 ? argv[0] : "GenCharTbl");
        return EXIT_FAILURE;
    }

    infile = fopen(argv[1], "r");
    if (!infile) {
        fprintf(stderr, "Error opening %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    outfile = fopen(OUTPUT_FILE, "w");
    if (!outfile) {
        fclose(infile);
        fprintf(stderr, "Error opening %s\n", OUTPUT_FILE);
        return EXIT_FAILURE;
    }

    fprintf(outfile, "*****************************************************************\n");
    fprintf(outfile, "*\n");
    fprintf(outfile, "* %s\n", OUTPUT_FILE);
    fprintf(outfile, "*\n");
    fprintf(outfile, "* These tables record the Unicode code points that have the\n");
    fprintf(outfile, "* XID_Start or XID_Continue properties as defined in the\n");
    fprintf(outfile, "* Unicode Character Database. These define the legal\n");
    fprintf(outfile, "* universal character names in identifiers under C23.\n");
    fprintf(outfile, "*\n");
    fprintf(outfile, "* THIS FILE IS AUTO-GENERATED FROM UNICODE DATA BY GenCharTbl.\n");
    fprintf(outfile, "* DO NOT EDIT IT MANUALLY.\n");
    fprintf(outfile, "*\n");
    fprintf(outfile, "* Generated from:\n");
    /* The first two lines of the input file are copied into the header. */
    copy_header_line(infile, outfile);
    copy_header_line(infile, outfile);
    fprintf(outfile, "*\n");
    fprintf(outfile, "*****************************************************************\n");
    fprintf(outfile, "\n");

    fseek(infile, 0, SEEK_SET);

    /*
     * Each data line has the form "XXXX ; Property" or
     * "XXXX..YYYY ; Property".  Lines that do not begin with a hex number
     * are skipped; the fgets in the loop header discards the rest of the
     * current line.
     */
    for (; !feof(infile); fgets(line, MAX_LINE, infile)) {
        int count = fscanf(infile, "%lx..%lx", &range.start, &range.end);
        if (count == 1) {
            range.end = range.start;
        } else if (count != 2) {
            continue;
        }

        count = fscanf(infile, " ; %100s", property);
        if (count != 1)
            return fail(infile, outfile, "Unexpected file format");

        if (strcmp(property, "XID_Start") == 0) {
            XID_Start_Ranges[xid_start_idx++] = range;
            if (xid_start_idx == MAX_RANGES)
                return fail(infile, outfile, "Too many XID_Start ranges");
        } else if (strcmp(property, "XID_Continue") == 0) {
            /*
             * Ranges identical to an XID_Start range are left out of the
             * XID_Continue table.  The binary search relies on the
             * XID_Start ranges having been read in ascending order.
             */
            if (bsearch(&range, XID_Start_Ranges, xid_start_idx,
                        sizeof(CharRange), cmp)) {
                continue;
            }
            XID_Continue_Ranges[xid_continue_idx++] = range;
            if (xid_continue_idx == MAX_RANGES)
                return fail(infile, outfile, "Too many XID_Continue ranges");
        }
    }

    /* An empty table would make the array bounds below wrap around. */
    if (xid_start_idx == 0 || xid_continue_idx == 0)
        return fail(infile, outfile, "No XID_Start or XID_Continue ranges found");

    fprintf(outfile, "* Declarations (to be copied into Table.pas):\n");
    fprintf(outfile, "*\n");
    fprintf(outfile, "* XID_Start_Table: array[0..%u] of charRange;\n", xid_start_idx-1);
    fprintf(outfile, "* XID_Continue_Table: array[0..%u] of charRange;\n", xid_continue_idx-1);
    fprintf(outfile, "* XID_Start_PlaneStart: array[0..%d] of integer;\n", MAX_PLANE + 1);
    fprintf(outfile, "* XID_Continue_PlaneStart: array[0..%d] of integer;\n", MAX_PLANE + 1);
    fprintf(outfile, "\n");

    error = emit_table(outfile, "XID_Start", XID_Start_Ranges, xid_start_idx);
    if (error)
        return fail(infile, outfile, error);

    fprintf(outfile, "\n");
    fprintf(outfile, "\n");
    fprintf(outfile, "* This table only contains XID_Continue ranges that are not in XID_Start_Table.\n");
    fprintf(outfile, "* A code point has the XID_Continue property if it is in either table.\n");

    error = emit_table(outfile, "XID_Continue", XID_Continue_Ranges,
                       xid_continue_idx);
    if (error)
        return fail(infile, outfile, error);

    if (ferror(infile) || ferror(outfile))
        return fail(infile, outfile, "I/O error");

    fclose(infile);
    /* Buffered data is flushed on close, so a write error can surface here. */
    if (fclose(outfile) != 0) {
        remove(OUTPUT_FILE);
        fprintf(stderr, "I/O error\n");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}

View File

@ -4333,7 +4333,8 @@ ucnString[0] := chr(i - 1);
if (codePoint < 0) or (codePoint > maxUCSCodePoint)
or ((codePoint >= $00D800) and (codePoint <= $00DFFF))
or ((codePoint < $A0) and not (ord(codePoint) in [$24,$40,$60]))
or ((codePoint < $A0) and not (ord(codePoint) in [$24,$40,$60])
and (cStd < c23))
then begin
Error(145);
UniversalCharacterName := $0000C0;

View File

@ -1098,3 +1098,5 @@ macRomanToUCS start
dc i2'$F8FF, $00D2, $00DA, $00DB, $00D9, $0131, $02C6, $02DC'
dc i2'$00AF, $02D8, $02D9, $02DA, $00B8, $02DD, $02DB, $02C7'
end
copy chartables.asm

View File

@ -15,6 +15,12 @@ interface
uses CCommon;
type
charRange = record {Range of Unicode chars (low 16 bits)}
min: integer;
max: integer;
end;
var
{from scanner.pas}
{----------------}
@ -43,6 +49,12 @@ var
{from Charset.pas}
{----------------}
macRomanToUCS: array[$80..$FF] of integer; {mapping from MacRoman charset to UCS}
{Unicode data tables in CharTables.asm}
XID_Start_Table: array[0..765] of charRange;
XID_Continue_Table: array[0..632] of charRange;
XID_Start_PlaneStart: array[0..17] of integer;
XID_Continue_PlaneStart: array[0..17] of integer;
implementation
end.

2
make
View File

@ -128,7 +128,7 @@ if {#} == 0
set header header
end
Newer obj/table.a table.pas table.asm
Newer obj/table.a table.pas table.asm chartables.asm
if {status} != 0
set table table
set asm asm