syn68k/syngen/token.c
2008-09-26 08:25:10 -06:00

694 lines
17 KiB
C

/*
* token.c
*/
#include <stdio.h>
#include <string.h>
#include <sys/param.h>
#if defined (__MINGW32__)
#include <stddef.h> /* needed for ctype.h */
#endif
#include <ctype.h>
#include <stdlib.h>
#include "token.h"
#include "error.h"
#include "common.h"
#include "hash.h"
#include "tokenlist.h"
#include "uniquestring.h"
/* Stack of input streams currently being read.  open_stream pushes an entry
 * and close_file pops one.
 */
static InputFile file_stack[MAX_INCLUDE_DEPTH];
/* Number of entries in use on file_stack; the current file, if any, is
 * file_stack[file_stack_ptr - 1].
 */
static int file_stack_ptr = 0;
/* Symbol table mapping the names in token_list to their token values.
 * Built by init_tokenizer.
 */
static SymbolTable *tok_sym_table = NULL;
/* NULL-terminated list of directories searched when an include directive
 * opens a file.
 */
static const char *include_dirs[] = { ".", NULL };
/* Nonzero for characters that end a token.  Indexed by (c + 1) rather than
 * c so that EOF can be looked up too (this relies on EOF being -1).
 */
static unsigned char break_char_table[257];
/* ASCII -> digit conversion table.  The low four bits of an entry hold the
 * digit's numeric value; the *_MASK bits below record in which bases the
 * character is a legal digit.  Entries for non-digits stay zero.
 */
static unsigned char num_conv_table[256];
#define BINARY_MASK 16
#define OCTAL_MASK 32
#define DECIMAL_MASK 64
#define HEX_MASK 128
#define ALL_MASKS (BINARY_MASK | OCTAL_MASK | DECIMAL_MASK | HEX_MASK)
/* Private routines. */
static long parse_number (const char *num);
static BOOL raw_fetch_next_token (Token *t);
/* Initializes the tokenizer.  Call it once before any calls to any other
 * token routines.  Do not call it more than once; the only way to reset
 * everything is to keep fetching tokens until there are none left.
 *
 * It empties the input file stack, builds the token symbol table from
 * token_list, and fills in the break-character and digit-conversion
 * lookup tables.
 */
void
init_tokenizer ()
{
  static const unsigned char break_chars[] = " \t\n\r();";
  const unsigned char *p;
  SymbolInfo sym;
  size_t i;
  int c;

  /* Start out with 0 open streams. */
  file_stack_ptr = 0;

  /* Initialize the symbol table, discarding any previous one. */
  if (tok_sym_table != NULL)
    free_symbol_table (tok_sym_table);
  tok_sym_table = make_symbol_table ();
  for (i = 0; i < (sizeof token_list / sizeof token_list[0]); i++)
    {
      sym.n = token_list[i].value;
      insert_symbol (tok_sym_table, token_list[i].name, sym);
    }

  /* Initialize break character table.  Entries are offset by one so that
   * EOF has a slot of its own.
   */
  for (p = &break_chars[0]; *p; p++)
    break_char_table[*p + 1] = TRUE;
  break_char_table[EOF + 1] = TRUE;

  /* Initialize ascii -> number conversion table.  Each entry is the digit
   * value in the low four bits plus a mask bit for every base in which the
   * character is a legal digit.
   */
  num_conv_table['0'] = (ALL_MASKS + 0);
  num_conv_table['1'] = (ALL_MASKS + 1);
  for (c = '2'; c < '8'; c++)
    num_conv_table[c] = (ALL_MASKS - BINARY_MASK) + c - '0';
  num_conv_table['8'] = (DECIMAL_MASK + HEX_MASK + 8);
  num_conv_table['9'] = (DECIMAL_MASK + HEX_MASK + 9);

  /* Hex letters are A-F only (six of them).  Going one further would give
   * 'G' the value 16, which spills out of the four value bits into
   * BINARY_MASK and makes 'g' look like a legal zero digit.
   */
  for (c = 0; c < 6; c++)
    num_conv_table[c + 'A'] = num_conv_table[c + 'a'] = (HEX_MASK + c + 10);
}
/* Advances the current input stream past whitespace and ';' comments
 * (which run to the end of the line), counting newlines as it goes.  When
 * a file is exhausted it is closed and scanning resumes in the file below
 * it on the stack.  Returns the first character that starts a token, or
 * EOF once no input files remain.
 */
int
skip_to_next_token ()
{
  while (file_stack_ptr > 0)
    {
      InputFile *cur = &file_stack[file_stack_ptr - 1];
      FILE *in = cur->fp;
      int ch;

      for (;;)
        {
          ch = fgetc (in);	/* Linux dies if you use getc. */
          if (ch == '\n')
            {
              cur->lineno++;
              continue;
            }
          if (isspace (ch))
            continue;
          if (ch != ';')
            break;

          /* Comment: throw away everything up to the end of the line. */
          do
            ch = getc (in);
          while (ch != '\n' && ch != EOF);
          if (ch == '\n')
            cur->lineno++;
        }

      if (ch != EOF)
        return ch;

      /* This file is done; pop back to whoever included it. */
      close_file ();
    }
  return EOF;
}
/* Same as raw_fetch_next_token, but processes include directives.
 *
 * An include directive is a left paren followed by a TOK_INCLUDE token, a
 * filename and a right paren.  When one is seen, the named file is pushed
 * onto the input stack and the first token of that file is returned
 * instead.  Recognizing the directive needs one token of lookahead after a
 * left paren; if that token turns out not to be TOK_INCLUDE it is kept in
 * saved_token and handed out by the next call.
 *
 * Returns YES and fills in *t if a token was fetched, NO if none are left.
 */
BOOL
fetch_next_token (Token *t)
{
  static Token saved_token = { TOK_EMPTY };
  Token temp, temp2;

  /* Hand out the lookahead token left over from the previous call. */
  if (saved_token.type != TOK_EMPTY)
    {
      *t = saved_token;
      saved_token.type = TOK_EMPTY;
      return YES;
    }

  /* Read in the next token.  If it's not an open paren, pass it on. */
  if (!raw_fetch_next_token (t))
    return NO;
  if (t->type != TOK_LEFT_PAREN)
    return YES;

  /* Check to see if this is an #include directive.  If the lookahead fails
   * saved_token is left untouched (still TOK_EMPTY) and only the paren is
   * returned.
   */
  if (!raw_fetch_next_token (&saved_token))
    return YES;
  if (saved_token.type != TOK_INCLUDE)
    return YES;
  saved_token.type = TOK_EMPTY;

  if (!raw_fetch_next_token (&temp))
    {
      fatal_input_error ("include directive is missing filename!\n");
    }
  if (temp.type != TOK_QUOTED_STRING)
    {
      /* Punt unless it's an identifier; if it is, include the file anyway. */
      if (temp.type != TOK_IDENTIFIER)
        {
          fatal_input_error ("Filename for include must be in \"\" quotes.\n");
        }
      input_error ("Filename for include must be in \"\" quotes.\n");
    }

  /* Eat the close paren.  The fetch can fail at end of input, in which case
   * temp2 is never filled in and must not be examined.
   */
  if (!raw_fetch_next_token (&temp2))
    {
      fatal_input_error ("include directive is missing closing paren!\n");
    }
  else if (temp2.type != TOK_RIGHT_PAREN)
    {
      fatal_input_error ("Too many arguments to include directive.\n");
    }

  open_file (temp.u.string, include_dirs);
  return fetch_next_token (t);
}
/* Fetches the next token from the stack of input streams, without doing any
 * include processing.  On success fills in *t (type, value, filename and
 * line number) and returns YES; returns NO if there are no tokens left.
 *
 * Tokens are classified in this order: parentheses, quoted strings, names
 * found in the token symbol table, numbers, a/d registers, and finally
 * plain identifiers.
 */
static BOOL
raw_fetch_next_token (Token *t)
{
  char buf[MAX_TOKEN_SIZE], *p;
  FILE *fp;
  InputFile *f;
  SymbolInfo sym;
  int c, i;

  c = skip_to_next_token ();
  if (c == EOF)
    return NO;

  /* Set up convenience variables pointing to the current file. */
  f = &file_stack[file_stack_ptr - 1];
  fp = f->fp;

  /* Set up file and line number fields of the token. */
  t->filename = f->filename;
  t->lineno = f->lineno;

  /* Special case for paren's, as they are both legal tokens and break chars */
  if (c == '(')
    {
      t->type = TOK_LEFT_PAREN;
      t->u.string = "(";
      return YES;
    }
  else if (c == ')')
    {
      t->type = TOK_RIGHT_PAREN;
      t->u.string = ")";
      return YES;
    }

  /* Loop and grab all the characters in this token, putting them in buf.
   * i counts down the room left, keeping one byte for the terminator.
   */
  p = buf, i = MAX_TOKEN_SIZE - 1;

  /* Special case for quoted strings. */
  if (c == '\"')
    {
      int backslash = 0;	/* Set while an escape sequence is pending. */

      /* The first pass stores the opening quote in buf[0]; it is skipped
       * again below by handing buf + 1 to unique_string.  Note that the
       * loop condition sees a quote before the escape handling does, so a
       * quote that follows a backslash still ends the string.
       */
      do
        {
          if (c == EOF)
            {
              /* The file ended inside the string.  Report it rather than
               * storing EOF bytes until the buffer fills up.
               */
              input_error ("Unterminated string.\n");
              return raw_fetch_next_token (t);
            }
          if (c == '\\')
            {
              if (!backslash)
                {
                  backslash = 1;
                  c = getc (fp);
                  continue;
                }
              /* Second backslash of "\\": store one literal backslash. */
              backslash = 0;
            }
          else if (backslash)
            {
              backslash = 0;
              switch (c) {
              case '\n':
                /* Backslash-newline is a line continuation: drop the
                 * newline, count the line, and move on to the next
                 * character (continue goes straight to the loop test).
                 */
                f->lineno++;
                c = getc (fp);
                continue;
              case 'n':
                c = '\n';
                break;
              case 't':
                c = '\t';
                break;
              case '\\':
                break;
              default:
                input_error ("Unknown escape sequence '\\%c'.\n", c);
                break;
              }
            }
          else if (c == '\n')
            {
              /* Report the error against the line the string is on, then
               * account for the newline that was just consumed.
               */
              input_error ("Unterminated string.\n");
              f->lineno++;
              return raw_fetch_next_token (t);
            }
          *p++ = c;
          c = getc (fp);
        }
      while (c != '\"' && --i);
      *p = '\0';
      t->type = TOK_QUOTED_STRING;
      t->u.string = unique_string (buf + 1);
      return YES;
    }
  else  /* Not a quoted string... */
    {
      do
        {
          *p++ = c;
          c = getc (fp);
        }
      while (!break_char_table[c + 1] && --i);
      /* The character that stopped the loop belongs to the next token. */
      ungetc (c, fp);
      *p = '\0';
    }

  /* Is it a normal token we recognize? */
  if (lookup_symbol (tok_sym_table, buf, &sym, &t->u.string) == HASH_NOERR)
    {
      int buflen = strlen (buf);

      t->type = sym.n;

      /* Several token families encode signedness and size in their last
       * two characters: 's' in the second-to-last position means signed,
       * and the final 'b', 'w' or 'l' selects a size of 1, 2 or 4.
       */
      if (t->type == TOK_TEMP_REGISTER)
        {
          /* An optional register number follows the three-letter prefix. */
          if (isdigit ((unsigned char) buf[3]))
            t->u.reginfo.which = atoi (buf + 3);
          else
            t->u.reginfo.which = 1;
          t->u.reginfo.sgnd = (buf[buflen - 2] == 's');
          switch (buf[buflen - 1]) {
          case 'b': t->u.reginfo.size = 1; break;
          case 'w': t->u.reginfo.size = 2; break;
          case 'l': t->u.reginfo.size = 4; break;
          default:
            fatal_error ("Internal error, token.c: impossible register "
                         "size '%c'\n", buf[buflen - 1]);
            break;
          }
        }
      else if (t->type == TOK_DEREF)
        {
          if (!strcmp (buf, "deref"))
            {
              t->u.derefinfo.sgnd = FALSE;
              t->u.derefinfo.size = 0;  /* untyped deref. */
            }
          else
            {
              t->u.derefinfo.sgnd = (buf[buflen - 2] == 's');
              switch (buf[buflen - 1]) {
              case 'b': t->u.derefinfo.size = 1; break;
              case 'w': t->u.derefinfo.size = 2; break;
              case 'l': t->u.derefinfo.size = 4; break;
              default:
                fatal_error ("Internal error, token.c: impossible deref "
                             "size '%c'\n", buf[buflen - 1]);
                break;
              }
            }
        }
      else if (t->type == TOK_SWAP)
        {
          /* Swap tokens reuse the derefinfo member for sign and size. */
          t->u.derefinfo.sgnd = (buf[buflen - 2] == 's');
          switch (buf[buflen - 1]) {
          case 'b': t->u.derefinfo.size = 1; break;
          case 'w': t->u.derefinfo.size = 2; break;
          case 'l': t->u.derefinfo.size = 4; break;
          default:
            fatal_error ("Internal error, token.c: impossible swap "
                         "size '%c'\n", buf[buflen - 1]);
            break;
          }
        }
      else if (IS_DOLLAR_TOKEN (t->type))
        {
          /* The operand number follows the first character of the token. */
          t->u.dollarinfo.which = atoi (buf + 1);
          switch (t->type) {
          case TOK_DOLLAR_DATA_REGISTER:
          case TOK_DOLLAR_ADDRESS_REGISTER:
          case TOK_DOLLAR_GENERAL_REGISTER:
          case TOK_DOLLAR_REVERSED_AMODE:
          case TOK_DOLLAR_AMODE:
          case TOK_DOLLAR_NUMBER:
          case TOK_DOLLAR_AMODE_PTR:
          case TOK_DOLLAR_REVERSED_AMODE_PTR:
            t->u.dollarinfo.sgnd = (buf[buflen - 2] == 's');
            switch (buf[buflen - 1]) {
            case 'b': t->u.dollarinfo.size = 1; break;
            case 'w': t->u.dollarinfo.size = 2; break;
            case 'l': t->u.dollarinfo.size = 4; break;
            default:
              fatal_error ("Internal error, token.c: impossible dollar "
                           "size '%c'\n", buf[buflen - 1]);
              break;
            }
            break;
          default:
            fatal_error ("Internal error, token.c: IS_DOLLAR_IDENTIFIER must "
                         "be invalid.\n");
            break;
          }
        }
      return YES;
    }

  /* Is it a number?  The casts keep isdigit away from negative char
   * values, which it is not defined for.
   */
  else if (isdigit ((unsigned char) buf[0])
           || (buf[0] == '-' && isdigit ((unsigned char) buf[1])))
    {
      t->type = TOK_NUMBER;
      t->u.n = parse_number (buf);
      return YES;
    }

  /* Is it a register?  eg d0.ub, d3.w, d7.l, a2.b, a4.w, a1.uw, etc. */
  else if ((buf[0] == 'a' || buf[0] == 'd')
           && buf[1] >= '0' && buf[1] <= '7')
    {
      if (buf[2] != '.')
        input_error ("Missing size/signedness specifier for register.\n");
      t->type = (buf[0] == 'a') ? TOK_ADDRESS_REGISTER : TOK_DATA_REGISTER;

      /* Signed unless the specifier starts with 'u'.  When the token is
       * only two characters long buf[2] is the terminator and buf[3] was
       * never written, so it must not be looked at.
       */
      t->u.reginfo.sgnd = (buf[2] == '\0' || buf[3] != 'u');
      switch (buf[strlen (buf) - 1]) {
      case 'b':
        t->u.reginfo.size = 1;
        break;
      case 's':
      case 'w':
        t->u.reginfo.size = 2;
        break;
      case 'l':
      default:
        t->u.reginfo.size = 4;
        break;
      }
      t->u.reginfo.which = buf[1] - '0';
      return YES;
    }

  /* Must be a label. */
  t->type = TOK_IDENTIFIER;
  t->u.string = unique_string (buf);
  return YES;
}
/* Parses an ASCII number held in buf.  Recognizes 0x prefix as hexadecimal,
 * 0b as binary and 0 followed by more digits as octal.  Other numbers
 * are interpreted as decimal.  A leading '-' negates the result.
 *
 * Returns the value, or 0 after reporting an input error if the text has
 * no digits or contains a character that is not a digit of its base.
 * The value is accumulated as an unsigned long so that large constants
 * wrap around instead of overflowing a signed long.
 */
static long
parse_number (const char *buf)
{
  int negative = 0, base;
  unsigned long n = 0;
  unsigned char mask, v;

  /* Check for sign. */
  if (buf[0] == '-')
    negative = 1, buf++;

  /* Figure out which base the number is. */
  if (buf[0] == '0')  /* Either octal, hexadecimal, or binary. */
    {
      if (buf[1] == 'x')
        base = 16, mask = HEX_MASK, buf += 2;
      else if (buf[1] == 'b')
        base = 2, mask = BINARY_MASK, buf += 2;
      else
        base = 8, mask = OCTAL_MASK;
    }
  else
    base = 10, mask = DECIMAL_MASK;

  /* A bare "0x" or "0b" prefix has nothing left to convert; say so rather
   * than quietly treating it as zero.
   */
  if (*buf == '\0')
    {
      input_error ("Missing digits in numeric constant.\n");
      return 0;
    }

  /* Convert it.  A table entry's mask bits say whether the character is a
   * digit in this base; its low four bits are the digit's value.
   */
  while ((v = *buf++))
    {
      if (num_conv_table[v] & mask)
        n = (n * base) + (num_conv_table[v] & 15);
      else
        {
          input_error ("Illegal character in numeric constant.\n");
          return 0;
        }
    }

  return (long) (negative ? 0UL - n : n);
}
/* Looks up one of the files currently being parsed.  levels_back counts
 * #include levels outward from the current file: 0 is the file being read
 * now, 1 is the file that included it, and so on.  Returns NULL if
 * levels_back is negative or reaches past the bottom of the file stack.
 */
const InputFile *
get_input_file (int levels_back)
{
  int slot;

  if (levels_back < 0)
    return NULL;

  slot = file_stack_ptr - levels_back - 1;
  return (slot >= 0) ? &file_stack[slot] : NULL;
}
/* Opens a file and pushes it onto the stack of files being parsed.  "file"
 * is the filename of the file to be #include'd, search_dirs is a
 * NULL-terminated list of all directories to check.  These directories will
 * not be checked if file has a leading '/'.
 *
 * Reports a fatal input error if the include stack is already full or if
 * the file cannot be opened.  Directory/file combinations too long to fit
 * in a MAXPATHLEN buffer are skipped.
 */
void
open_file (const char *file, const char *search_dirs[])
{
  FILE *fp = NULL;
  const char **dir;
  char buf[MAXPATHLEN];

  /* See if we've opened too many files already. */
  if (file_stack_ptr >= MAX_INCLUDE_DEPTH)
    fatal_input_error ("Too many levels of nested #include's.\n");

  /* If the filename has a leading slash, don't check directories.
   * Otherwise, check all the directories in the search path.
   */
  if (file[0] == '/')
    fp = fopen (file, "r");
  else
    for (dir = &search_dirs[0]; *dir != NULL; dir++)
      {
        /* Directory, '/', filename and the terminator must all fit in
         * buf; a candidate that doesn't fit can't be opened anyway.
         */
        if (strlen (*dir) + strlen (file) + 2 > sizeof buf)
          continue;
        sprintf (buf, "%s/%s", *dir, file);
        fp = fopen (buf, "r");
        if (fp != NULL)
          break;
      }

  if (fp == NULL)
    fatal_input_error ("%s: No such file or directory.\n", file);
  open_stream (file, fp);
}
/* Pushes an already-open stream onto the stack of input files, recording
 * its name (truncated to fit if necessary) and starting its line count at
 * 1.  Kept separate from open_file so that a stream such as stdin can be
 * pushed directly.  If the stack is already full the request is silently
 * ignored and the stream is left open.
 */
void
open_stream (const char *name, FILE *fp)
{
  InputFile *slot;

  if (file_stack_ptr >= MAX_INCLUDE_DEPTH)
    return;

  /* Claim the next free stack entry and fill it in. */
  slot = &file_stack[file_stack_ptr++];
  slot->lineno = 1;
  slot->fp = fp;
  strncpy (slot->filename, name, MAX_FILENAME_LENGTH - 1);
  slot->filename[MAX_FILENAME_LENGTH - 1] = '\0';
}
/* Returns the stream of the file currently being read, or NULL if no
 * input file is open.
 */
FILE *
current_stream ()
{
  return (file_stack_ptr > 0) ? file_stack[file_stack_ptr - 1].fp : NULL;
}
/* Pops the current file off the input stack and closes its stream, so that
 * reading resumes in the file that #included it, if any.  Does nothing
 * when no file is open.
 */
void
close_file ()
{
  if (file_stack_ptr <= 0)
    return;

  file_stack_ptr--;
  fclose (file_stack[file_stack_ptr].fp);
}
BOOL
tokens_equal (const Token *t1, const Token *t2)
{
if (t1->type != t2->type)
return FALSE;
switch (t1->type) {
case TOK_IDENTIFIER:
case TOK_QUOTED_STRING:
return !strcmp (t1->u.string, t2->u.string);
case TOK_NUMBER:
return (t1->u.n == t2->u.n);
case TOK_DOLLAR_AMODE:
case TOK_DOLLAR_REVERSED_AMODE:
case TOK_DOLLAR_AMODE_PTR:
case TOK_DOLLAR_REVERSED_AMODE_PTR:
case TOK_DOLLAR_NUMBER:
case TOK_DOLLAR_DATA_REGISTER:
case TOK_DOLLAR_ADDRESS_REGISTER:
case TOK_DOLLAR_GENERAL_REGISTER:
return (t1->u.dollarinfo.which == t2->u.dollarinfo.which
&& t1->u.dollarinfo.size == t2->u.dollarinfo.size
&& t1->u.dollarinfo.sgnd == t2->u.dollarinfo.sgnd);
case TOK_DEREF:
return (t1->u.derefinfo.sgnd == t2->u.derefinfo.sgnd
&& t1->u.derefinfo.size == t2->u.derefinfo.size);
case TOK_AMODE:
case TOK_REVERSED_AMODE:
case TOK_AMODE_PTR:
case TOK_REVERSED_AMODE_PTR:
return (t1->u.amodeinfo.sgnd == t2->u.amodeinfo.sgnd
&& t1->u.amodeinfo.size == t2->u.amodeinfo.size
&& t1->u.amodeinfo.which == t2->u.amodeinfo.which);
case TOK_DATA_REGISTER:
case TOK_ADDRESS_REGISTER:
case TOK_TEMP_REGISTER:
return (t1->u.reginfo.sgnd == t2->u.reginfo.sgnd
&& t1->u.reginfo.size == t2->u.reginfo.size
&& t1->u.reginfo.which == t2->u.reginfo.which);
default:
return TRUE;
}
}
/* Prints one line describing a token to stdout: its type code, its value,
 * and the file and line it came from.  For debugging purposes only.
 * Number tokens show u.n; every other type shows u.string.
 * NOTE(review): u.string looks meaningful only for token types that store
 * a string in the union -- confirm before dumping register-style tokens.
 */
void
dump_token (const Token *t)
{
  if (t->type != TOK_NUMBER)
    {
      printf ("type = %d,\tstring = \"%s\", \tfilename = \"%s\",\t"
              "lineno = %lu\n", t->type, t->u.string, t->filename, t->lineno);
      return;
    }

  printf ("type = %d,\tn = %ld, \tfilename = \"%s\",\tlineno = %lu\n",
          t->type, t->u.n, t->filename, t->lineno);
}
/* Copies a human-readable version of the token to buf and returns buf. */
char *
unparse_token (const Token *t, char *buf)
{
static const char *regdesc[2][5] = {
{ "", "ub", "uw", "", "ul" },
{ "", "sb", "sw", "", "sl" }
};
switch (t->type) {
case TOK_NUMBER:
sprintf (buf, "%ld", t->u.n);
break;
case TOK_QUOTED_STRING:
sprintf (buf, "\"%s\"", t->u.string);
break;
case TOK_EMPTY:
strcpy (buf, "[EMPTY]");
break;
case TOK_DEREF:
if (t->u.derefinfo.size == 0)
strcpy (buf, "deref");
else sprintf (buf, "deref%s",
regdesc[t->u.derefinfo.sgnd][t->u.derefinfo.size]);
break;
case TOK_SWAP:
sprintf (buf, "swap%s", regdesc[t->u.derefinfo.sgnd][t->u.derefinfo.size]);
break;
case TOK_DATA_REGISTER:
sprintf (buf, "d%d.%s", t->u.reginfo.which,
regdesc[t->u.reginfo.sgnd][t->u.reginfo.size]);
break;
case TOK_ADDRESS_REGISTER:
sprintf (buf, "a%d.%s", t->u.reginfo.which,
regdesc[t->u.reginfo.sgnd][t->u.reginfo.size]);
break;
case TOK_TEMP_REGISTER:
sprintf (buf, "tmp%d.%s", t->u.reginfo.which,
regdesc[t->u.reginfo.sgnd][t->u.reginfo.size]);
break;
case TOK_DOLLAR_DATA_REGISTER:
sprintf (buf, "$%d.d%s", t->u.dollarinfo.which,
regdesc[t->u.dollarinfo.sgnd][t->u.dollarinfo.size]);
break;
case TOK_DOLLAR_ADDRESS_REGISTER:
sprintf (buf, "$%d.a%s", t->u.dollarinfo.which,
regdesc[t->u.dollarinfo.sgnd][t->u.dollarinfo.size]);
break;
case TOK_DOLLAR_GENERAL_REGISTER:
sprintf (buf, "$%d.g%s", t->u.dollarinfo.which,
regdesc[t->u.dollarinfo.sgnd][t->u.dollarinfo.size]);
break;
case TOK_DOLLAR_AMODE:
sprintf (buf, "$%d.m%s", t->u.dollarinfo.which,
regdesc[t->u.dollarinfo.sgnd][t->u.dollarinfo.size]);
break;
case TOK_DOLLAR_REVERSED_AMODE:
sprintf (buf, "$%d.r%s", t->u.dollarinfo.which,
regdesc[t->u.dollarinfo.sgnd][t->u.dollarinfo.size]);
break;
case TOK_DOLLAR_AMODE_PTR:
sprintf (buf, "$%d.p%s", t->u.dollarinfo.which,
regdesc[t->u.dollarinfo.sgnd][t->u.dollarinfo.size]);
break;
case TOK_DOLLAR_REVERSED_AMODE_PTR:
sprintf (buf, "$%d.q%s", t->u.dollarinfo.which,
regdesc[t->u.dollarinfo.sgnd][t->u.dollarinfo.size]);
break;
case TOK_DOLLAR_NUMBER:
sprintf (buf, "$%d.%s", t->u.dollarinfo.which,
regdesc[t->u.dollarinfo.sgnd][t->u.dollarinfo.size]);
break;
default:
strcpy (buf, t->u.string);
break;
}
return buf;
}