From 9cc72c88452bf6b75a94ba28ddeb5ec2849a6667 Mon Sep 17 00:00:00 2001
From: Stephen Heumann <stephenheumann@gmail.com>
Date: Tue, 8 Nov 2022 18:47:03 -0600
Subject: [PATCH] Support "other character" preprocessing tokens.

This implements the catch-all category of preprocessing tokens, "each non-white-space character that cannot be one of the above" (C17 section 6.4). Such tokens may appear in skipped code, or in macros or macro parameters if they are never expanded or are stringized during macro processing. The affected characters are $, @, `, and many extended characters.

It is still an error if these tokens are used in contexts where they remain present after preprocessing. If #pragma ignore bit 0 is clear, these characters are also reported as errors in skipped code or preprocessor constructs.
---
 CCommon.pas |   7 +++-
 Header.pas  |   4 +-
 Scanner.asm |   2 +-
 Scanner.pas |  26 ++++++++++++-
 Table.asm   | 104 ++++++++++++++++++++++++++--------------------------
 5 files changed, 86 insertions(+), 57 deletions(-)

diff --git a/CCommon.pas b/CCommon.pas
index b4faa57..a7ea74c 100644
--- a/CCommon.pas
+++ b/CCommon.pas
@@ -196,6 +196,7 @@ type
                barbarop,pluseqop,minuseqop,asteriskeqop,slasheqop,
                percenteqop,ltlteqop,gtgteqop,andeqop,caroteqop,
                bareqop,poundpoundop,dotdotdotsy,
+               otherch,                 {other non-whitespace char (pp-token)}
                eolsy,eofsy,             {control characters}
                typedef,                 {user types}
                uminus,uand,uasterisk,   {converted operations}
@@ -209,14 +210,15 @@ type
       (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc,
        ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string,
        ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon,
-       ch_backslash,letter,digit);
+       ch_backslash,ch_other,letter,digit);
 
                                         {prefixes of a character/string literal}
    charStrPrefixEnum = (prefix_none,prefix_L,prefix_u16,prefix_U32,prefix_u8);
 
    tokenSet = set of tokenEnum;
    tokenClass = (reservedWord,reservedSymbol,identifier,intConstant,longConstant,
-                 longlongConstant,realConstant,stringConstant,macroParameter);
+                 longlongConstant,realConstant,stringConstant,otherCharacter,
+                 macroParameter);
    identPtr = ^identRecord;             {^ to a symbol table entry}
    tokenType = record                   {a token}
       kind: tokenEnum;                  {kind of token}
@@ -233,6 +235,7 @@ type
          stringConstant: (sval: longstringPtr;
                           ispstring: boolean;
                           prefix: charStrPrefixEnum);
+         otherCharacter: (ch: char);    {used for preprocessing tokens only}
          macroParameter: (pnum: integer);
      end;
  
diff --git a/Header.pas b/Header.pas
index 1507c63..7b53efb 100644
--- a/Header.pas
+++ b/Header.pas
@@ -18,7 +18,7 @@ uses CCommon, MM, Scanner, Symbol, CGI;
 {$segment 'HEADER'}
 
 const
-   symFileVersion = 31;                 {version number of .sym file format}
+   symFileVersion = 32;                 {version number of .sym file format}
 
 var
    inhibitHeader: boolean;		{should .sym includes be blocked?}
@@ -721,6 +721,7 @@ procedure EndInclude {chPtr: ptr};
                 		WriteByte(ord(token.ispstring));
                 		WriteByte(ord(token.prefix));
                 		end;
+            otherCharacter:	WriteByte(ord(token.ch));
             macroParameter:	WriteWord(token.pnum);
             reservedSymbol:	if token.kind in [lbracech,rbracech,lbrackch,
                                    rbrackch,poundch,poundpoundop] then 
@@ -1360,6 +1361,7 @@ var
                 	        token.ispstring := ReadByte <> 0;
                 	        token.prefix := charStrPrefixEnum(ReadByte);
                 	        end;
+         otherCharacter:	token.ch := chr(ReadByte);
          macroParameter:	token.pnum := ReadWord;
          reservedSymbol:	if token.kind in [lbracech,rbracech,lbrackch,
                                    rbrackch,poundch,poundpoundop] then 
diff --git a/Scanner.asm b/Scanner.asm
index 87584f3..515779d 100644
--- a/Scanner.asm
+++ b/Scanner.asm
@@ -465,7 +465,7 @@ cch      equ   13
          enum  (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
          enum  (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
          enum  (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
-         enum  (ch_backslash,letter,digit)
+         enum  (ch_backslash,ch_other,letter,digit)
 
 ! begin {NextCh}
          tsc                            create stack frame
diff --git a/Scanner.pas b/Scanner.pas
index e43b637..1b765ef 100644
--- a/Scanner.pas
+++ b/Scanner.pas
@@ -77,6 +77,7 @@ var
    macros: ^macroTable;                 {preprocessor macro list}
    pathList: pathRecordPtr;		{additional search paths}
    printMacroExpansions: boolean;       {print the token list?}
+   preprocessing: boolean;              {doing pp directive or macro params?}
    suppressMacroExpansions: boolean;    {suppress printing even if requested?}
    reportEOL: boolean;                  {report eolsy as a token?}
    token: tokenType;                    {next token to process}
@@ -1059,6 +1060,8 @@ case token.kind of
                         write('%:%:');
 
    dotdotdotsy:      write('...');
+   
+   otherch:          write(token.ch);
 
    macroParm:        write('$', token.pnum:1);
 
@@ -1838,6 +1841,7 @@ var
    i: integer;                          {loop counter}
    inhibit: boolean;                    {inhibit parameter expansion?}
    lexpandMacros: boolean;              {local copy of expandMacros}
+   lPreprocessing: boolean;             {local copy of preprocessing}
    lSuppressMacroExpansions: boolean;   {local copy of suppressMacroExpansions}
    mPtr: macroRecordPtr;                {for checking list of macros}
    newParm: parameterPtr;               {for building a new parameter entry}
@@ -1861,6 +1865,8 @@ parms := nil;                           {no parms so far}
 if macro^.parameters >= 0 then begin    {find the values of the parameters}
    NextToken;                           {get the '(' (we hope...)}
    if token.kind = lparench then begin
+      lPreprocessing := preprocessing;
+      preprocessing := true;
       NextToken;                        {skip the '('}
       paramCount := 0;                  {process the parameters}
       parmEnd := nil;
@@ -1912,6 +1918,7 @@ if macro^.parameters >= 0 then begin    {find the values of the parameters}
             PutBackToken(token, true);
          Error(12);
          end; {if}
+      preprocessing := lPreprocessing;
       end {if}
    else begin
       Error(13);
@@ -3294,6 +3301,7 @@ var
 
 
 begin {PreProcess}
+preprocessing := true;
 lSuppressMacroExpansions := suppressMacroExpansions; {inhibit token printing}
 suppressMacroExpansions := true;
 lReportEOL := reportEOL;                {we need to see eol's}
@@ -3693,6 +3701,7 @@ expandMacros := true;
 reportEOL := lReportEOL;                {restore flags}
 suppressMacroExpansions := lSuppressMacroExpansions;
 skipping := tskipping;
+preprocessing := false;
 if nextLineNumber >= 0 then
    lineNumber := nextLineNumber;
 end; {PreProcess}
@@ -4447,6 +4456,7 @@ customDefaultName := nil;               {no custom default name}
 pragmaKeepFile := nil;                  {no #pragma keep file so far}
 doingFakeFile := false;                 {not doing a fake file}
 doingDigitSequence := false;            {not expecting a digit sequence}
+preprocessing := false;                 {not preprocessing}
 
                                         {error codes for lint messages}
                                         {if changed, also change maxLint}
@@ -5681,9 +5691,19 @@ case charKinds[ord(ch)] of
       CheckIdentifier;
       end;
 
-   digit :                               {numeric constants}
+   digit :                              {numeric constants}
       DoNumber(false);
 
+   ch_other: begin                      {other non-whitespace char (pp-token)}
+      token.kind := otherch;
+      token.class := otherCharacter;
+      token.ch := ch;
+      NextCh;
+      if skipping or preprocessing then
+         if not skipIllegalTokens then
+            Error(1);
+      end;
+
    otherwise: Error(57);
    end; {case}
 tokenEnd := currentChPtr;               {record the end of the token}
@@ -5728,6 +5748,10 @@ if doingPPExpression then begin
    end; {if}
 if printMacroExpansions and not suppressMacroExpansions then
    PrintToken(token);                   {print the token stream}
+if token.kind = otherch then
+   if not (skipping or preprocessing or suppressMacroExpansions)
+      or doingPPExpression then
+      Error(1);
 end; {NextToken}
 
 
diff --git a/Table.asm b/Table.asm
index c74e460..b5c2ee2 100644
--- a/Table.asm
+++ b/Table.asm
@@ -19,7 +19,7 @@ charKinds start                         character set
          enum  (illegal,ch_special,ch_dash,ch_plus,ch_lt,ch_gt,ch_eq,ch_exc),0
          enum  (ch_and,ch_bar,ch_dot,ch_white,ch_eol,ch_eof,ch_char,ch_string)
          enum  (ch_asterisk,ch_slash,ch_percent,ch_carot,ch_pound,ch_colon)
-         enum  (ch_backslash,letter,digit)
+         enum  (ch_backslash,ch_other,letter,digit)
 
 ! STANDARD
          dc    i'ch_eof'                nul
@@ -58,7 +58,7 @@ charKinds start                         character set
          dc    i'ch_exc'                !
          dc    i'ch_string'             "
          dc    i'ch_pound'              #
-         dc    i'illegal'               $
+         dc    i'ch_other'              $
          dc    i'ch_percent'            %
          dc    i'ch_and'                &
          dc    i'ch_char'               '
@@ -86,7 +86,7 @@ charKinds start                         character set
          dc    i'ch_eq'                 =
          dc    i'ch_gt'                 >
          dc    i'ch_special'            ?
-         dc    i'illegal'               @
+         dc    i'ch_other'              @
          dc    i'letter'                A
          dc    i'letter'                B
          dc    i'letter'                C
@@ -118,7 +118,7 @@ charKinds start                         character set
          dc    i'ch_special'            ]
          dc    i'ch_carot'              ^
          dc    i'letter'                _
-         dc    i'illegal'               `
+         dc    i'ch_other'              `
          dc    i'letter'                a
          dc    i'letter'                b
          dc    i'letter'                c
@@ -183,24 +183,24 @@ charKinds start                         character set
          dc    i'letter'                gs
          dc    i'letter'                rs
          dc    i'letter'                us
-         dc    i'illegal'               space
-         dc    i'illegal'               !
-         dc    i'illegal'               "
-         dc    i'illegal'               #
-         dc    i'illegal'               $
-         dc    i'illegal'               %
-         dc    i'illegal'               &
+         dc    i'ch_other'              space
+         dc    i'ch_other'              !
+         dc    i'ch_other'              "
+         dc    i'ch_other'              #
+         dc    i'ch_other'              $
+         dc    i'ch_other'              %
+         dc    i'ch_other'              &
          dc    i'letter'                '
-         dc    i'illegal'               (
-         dc    i'illegal'               )
-         dc    i'illegal'               *
-         dc    i'illegal'               +
-         dc    i'illegal'               ,
+         dc    i'ch_other'              (
+         dc    i'ch_other'              )
+         dc    i'ch_other'              *
+         dc    i'ch_other'              +
+         dc    i'ch_other'              ,
          dc    i'ch_special'            -
          dc    i'letter'                .
          dc    i'letter'                /
-         dc    i'illegal'               0
-         dc    i'illegal'               1
+         dc    i'ch_other'              0
+         dc    i'ch_other'              1
          dc    i'ch_special'            2
          dc    i'ch_special'            3
          dc    i'letter'                4
@@ -209,49 +209,49 @@ charKinds start                         character set
          dc    i'letter'                7
          dc    i'letter'                8
          dc    i'letter'                9
-         dc    i'illegal'               :
+         dc    i'ch_other'              :
          dc    i'letter'                ;
          dc    i'letter'                <
          dc    i'letter'                =
          dc    i'letter'                >
          dc    i'letter'                ?
-         dc    i'illegal'               @
-         dc    i'illegal'               A
-         dc    i'illegal'               B
-         dc    i'illegal'               C
+         dc    i'ch_other'              @
+         dc    i'ch_other'              A
+         dc    i'ch_other'              B
+         dc    i'ch_other'              C
          dc    i'letter'                D
-         dc    i'illegal'               E
+         dc    i'ch_other'              E
          dc    i'letter'                F
          dc    i'ch_special'            G
          dc    i'ch_special'            H
-         dc    i'illegal'               I
+         dc    i'ch_other'              I
          dc    i'ch_white'              J
          dc    i'letter'                K
          dc    i'letter'                L
          dc    i'letter'                M
          dc    i'letter'                N
          dc    i'letter'                O
-         dc    i'illegal'               P
-         dc    i'illegal'               Q
-         dc    i'illegal'               R
-         dc    i'illegal'               S
-         dc    i'illegal'               T
-         dc    i'illegal'               U
+         dc    i'ch_other'              P
+         dc    i'ch_other'              Q
+         dc    i'ch_other'              R
+         dc    i'ch_other'              S
+         dc    i'ch_other'              T
+         dc    i'ch_other'              U
          dc    i'ch_special'            V
-         dc    i'illegal'               W
+         dc    i'ch_other'              W
          dc    i'letter'                X
          dc    i'letter'                Y
-         dc    i'illegal'               Z
-         dc    i'illegal'               [
-         dc    i'illegal'               \
-         dc    i'illegal'               ]
+         dc    i'ch_other'              Z
+         dc    i'ch_other'              [
+         dc    i'ch_other'              \
+         dc    i'ch_other'              ]
          dc    i'letter'                ^
          dc    i'letter'                _
-         dc    i'illegal'               `
-         dc    i'illegal'               a
-         dc    i'illegal'               b
-         dc    i'illegal'               c
-         dc    i'illegal'               d
+         dc    i'ch_other'              `
+         dc    i'ch_other'              a
+         dc    i'ch_other'              b
+         dc    i'ch_other'              c
+         dc    i'ch_other'              d
          dc    i'letter'                e
          dc    i'letter'                f
          dc    i'letter'                g
@@ -263,22 +263,22 @@ charKinds start                         character set
          dc    i'letter'                m
          dc    i'letter'                n
          dc    i'letter'                o
-         dc    i'illegal'               p
+         dc    i'ch_other'              p
          dc    i'letter'                q
          dc    i'letter'                r
          dc    i'letter'                s
          dc    i'letter'                t
          dc    i'letter'                u
-         dc    i'illegal'               v
-         dc    i'illegal'               w
-         dc    i'illegal'               x
-         dc    i'illegal'               y
-         dc    i'illegal'               z
-         dc    i'illegal'               {
-         dc    i'illegal'               |
-         dc    i'illegal'               }
-         dc    i'illegal'               ~
-         dc    i'illegal'               rub
+         dc    i'ch_other'              v
+         dc    i'ch_other'              w
+         dc    i'ch_other'              x
+         dc    i'ch_other'              y
+         dc    i'ch_other'              z
+         dc    i'ch_other'              {
+         dc    i'ch_other'              |
+         dc    i'ch_other'              }
+         dc    i'ch_other'              ~
+         dc    i'ch_other'              rub
          end
 
 charSym  start                          single character symbols