Retro68/Rez/RezLexerNextToken.cc

#include "RezLexer.h"
#include "RezLexerWaveToken.h"
#include "RezParser.generated.hh"
#include <unordered_map>

#include <boost/regex.hpp>

using namespace boost::wave;

// Parse an integer from the characters starting at `str`.
//
// Scanning stops at `end`, or at the terminating NUL when `end` is null
// (a NUL always stops the scan, even before `end`).
//
// When `baseOverride` is non-zero it is used as the radix and no prefix is
// recognised. Otherwise the radix is chosen from the prefix:
//   "0x" / "0X"  -> 16
//   "0b" / "0B"  -> 2
//   "0"          -> 8
//   "b"  / "B"   -> 2   (bare binary suffix, without the leading zero)
//   anything else -> 10
//
// Digits are not validated against the radix: letters count as 10..35, and
// any other character contributes 0 while still shifting the value.
// The value is accumulated as unsigned so that literals such as 0xFFFFFFFF
// wrap around instead of overflowing a signed int.
static int readInt(const char *str, const char *end = nullptr, int baseOverride = 0)
{
	int base = 10;

	if(baseOverride)
		base = baseOverride;
	else if(*str == '0')
	{
		base = 8;
		++str;
		if(*str == 'x' || *str == 'X')
		{
			base = 16;
			++str;
		}
		// Must be "else if": after a hex prefix a leading 'b'/'B' is a hex
		// digit (0xBEEF), not a binary marker.
		else if(*str == 'b' || *str == 'B')
		{
			base = 2;
			++str;
		}
	}
	else if(*str == 'b' || *str == 'B')
	{
		base = 2;
		++str;
	}

	unsigned value = 0;
	for(; str != end && *str; ++str)
	{
		value *= static_cast<unsigned>(base);
		if(*str >= 'a' && *str <= 'z')
			value += static_cast<unsigned>(*str - 'a' + 10);
		else if(*str >= 'A' && *str <= 'Z')
			value += static_cast<unsigned>(*str - 'A' + 10);
		else if(*str >= '0' && *str <= '9')
			value += static_cast<unsigned>(*str - '0');
	}

	return static_cast<int>(value);
}

// Convert the text of a quoted character literal into an integer.
//
// `str` is the literal including its opening and closing quote character.
// The bytes between the quotes are packed big-endian, first character in the
// most significant position. A warning is printed to std::cout when the
// literal does not contain exactly four characters.
//
// Text too short to contain both quotes yields 0 (after the warning) instead
// of reading outside the buffer. Bytes are accumulated as unsigned so that a
// four-byte literal with the top bit set does not overflow a signed int.
static int readCharLit(const char *str)
{
	const auto len = strlen(str);
	const bool hasQuotes = len >= 2;

	if(!hasQuotes || len - 2 != 4)
		std::cout << "warning: CHAR LITERAL " << str << "\n";

	if(!hasQuotes)
		return 0;

	unsigned packed = 0;
	const char *last = str + len - 1;	// closing quote
	for(const char *p = str + 1; p != last; ++p)
		packed = (packed << 8) | static_cast<unsigned char>(*p);

	return static_cast<int>(packed);
}

// Decode the text of a quoted string literal.
//
// `str` is the literal including its opening and closing quote character;
// the quotes are stripped and backslash escapes are expanded:
//   \n \r \t      newline, carriage return, tab
//   \NNN          three octal digits (first digit 0-3)
//   \0xHH, \0XHH  two hex digits
//   \$HH          two hex digits
//   \<other>      the character itself (so \\ is one backslash, \" a quote)
//
// A numeric escape that is cut short by the end of the literal is not
// decoded: the backslash is dropped and the remaining characters are copied
// through literally. A lone backslash right before the closing quote is
// dropped. Text too short to contain both quotes yields an empty string.
static std::string readStringLit(const char *str)
{
	const auto len = strlen(str);
	if(len < 2)
		return std::string();

	const char *p = str + 1;
	const char *e = str + len - 1;	// closing quote

	std::string out;

	while(p != e)
	{
		if(*p != '\\')
		{
			out += *p++;
			continue;
		}

		++p;	// skip the backslash
		if(p == e)
			break;

		switch(*p)
		{
			case 'n':
				out += '\n'; ++p;
				break;
			case 'r':
				out += '\r'; ++p;
				break;
			case 't':
				out += '\t'; ++p;
				break;
			case '0':
			case '1':
			case '2':
			case '3':
				if(e - p < 3)
					continue;	// truncated: copy the rest literally
				if(p[0] == '0' && (p[1] == 'x' || p[1] == 'X'))
				{
					if(e - p < 4)
						continue;
					out += static_cast<char>(readInt(p+2, p+4, 16));
					p += 4;
				}
				else
				{
					out += static_cast<char>(readInt(p, p+3, 8));
					p += 3;
				}
				break;
			case '$':
				if(e - p < 3)
					continue;
				out += static_cast<char>(readInt(p+1, p+3, 16));
				p += 3;
				break;
			default:
				// Consume the escaped character here. Without this, an
				// escaped backslash would be re-read as the start of a new
				// escape sequence on the next iteration.
				out += *p++;
				break;
		}
	}

	return out;
}

// Pull tokens from the Wave preprocessor stream until one can be handed to
// the parser, and return it as a parser symbol.
//
// - Whitespace and end-of-line tokens are skipped.
// - A T_PP_LINE token and everything up to the end of its line is skipped.
// - Identifiers, keywords and boolean literals are lower-cased and looked up
//   in the keyword table; anything not found is returned as an IDENTIFIER
//   carrying the lower-cased text.
// - Integer, character and string literals are converted with readInt,
//   readCharLit and readStringLit.
// - Punctuation and operators map one-to-one onto value-less parser tokens.
// - Any other token becomes a BADTOKEN carrying its text.
//
// A default-constructed RezSymbol is returned when the input is exhausted.
// Updates curFile to the file name of the token being returned.
RezSymbol RezLexer::nextToken()
{
	for(auto tok = nextWave(); tok != T_EOI && tok != T_EOF; tok = nextWave())
	{
		if(IS_CATEGORY(tok, WhiteSpaceTokenType))
			continue;
		else if(IS_CATEGORY(tok, EOLTokenType))
			continue;
		else if(tok == T_PP_LINE)
		{
			// Throw away the rest of the directive's line.
			while(tok != T_EOI && tok != T_EOF && !IS_CATEGORY(tok, EOLTokenType))
				tok = nextWave();
			// Input ended inside the directive: stop here rather than letting
			// the loop increment ask for another token past the end of input.
			if(tok == T_EOI || tok == T_EOF)
				break;
			continue;
		}
		else
		{
			auto pos = tok.get_position();
			curFile = pos.get_file().c_str();
			auto yypos = yy::position(&curFile, pos.get_line(), pos.get_column());
			yy::location loc(yypos);

			if(tok == (UnknownTokenType | '"'))
			{
				// NOTE(review): a stray '"' token is turned into a fixed
				// placeholder string; this looks like leftover scaffolding
				// and probably ought to be reported as an error - confirm.
				return RezParser::make_STRINGLIT("Hello, world.", loc);
			}
			else if(IS_CATEGORY(tok, IdentifierTokenType) || IS_CATEGORY(tok, KeywordTokenType) || IS_CATEGORY(tok, BoolLiteralTokenType))
			{
				using memfun = decltype(&RezParser::make_TYPE);
#define KEYWORD(upper, lower) \
{ lower, &RezParser::make_ ## upper }

				// Lower-case spelling -> factory for the matching parser token.
				static const std::unordered_map<std::string, memfun> keywords = {
					KEYWORD(TYPE, "type"),
					KEYWORD(RESOURCE, "resource"),
					KEYWORD(DATA, "data"),
					KEYWORD(READ, "read"),
					KEYWORD(INCLUDE, "include"),
					KEYWORD(CHANGE, "change"),
					KEYWORD(DELETE, "delete"),

					KEYWORD(ARRAY,"array"),
					KEYWORD(SWITCH, "switch"),
					KEYWORD(CASE, "case"),
					KEYWORD(AS, "as"),
					KEYWORD(FILL,"fill"),
					KEYWORD(ALIGN, "align"),
					KEYWORD(HEX,"hex"),
					KEYWORD(KEY, "key"),
					KEYWORD(WIDE,"wide"),
					KEYWORD(UNSIGNED, "unsigned"),
					KEYWORD(LITERAL, "literal"),
					KEYWORD(BOOLEAN, "boolean"),
					KEYWORD(BIT, "bit"),
					KEYWORD(NIBBLE, "nibble"),
					KEYWORD(BYTE, "byte"),
					KEYWORD(CHAR, "char"),
					KEYWORD(WORD, "word"),
					KEYWORD(INTEGER, "integer"),
					KEYWORD(LONG, "long"),
					KEYWORD(LONGINT, "longint"),
					KEYWORD(PSTRING, "pstring"),
					// NOTE(review): "wstring" yields the PSTRING token - confirm
					// this is intentional.
					KEYWORD(PSTRING, "wstring"),
					KEYWORD(STRING, "string"),
					KEYWORD(POINT, "point"),
					KEYWORD(RECT, "rect"),
					KEYWORD(BITSTRING, "bitstring"),

					KEYWORD(INTEGER, "int"),
					KEYWORD(DOLLAR, "$"),

					KEYWORD(FUN_COUNTOF, "$$countof"),
					KEYWORD(FUN_ARRAYINDEX, "$$arrayindex"),
					KEYWORD(FUN_READ, "$$read"),
					KEYWORD(FUN_BITFIELD, "$$bitfield"),
					KEYWORD(FUN_WORD, "$$word"),
					KEYWORD(FUN_BYTE, "$$byte"),
					KEYWORD(FUN_LONG, "$$long"),
				};

				std::string lower = tok.get_value().c_str();
				// tolower() is only defined for values representable as
				// unsigned char, so convert before calling it.
				std::transform(lower.begin(), lower.end(), lower.begin(),
					[](unsigned char c) { return static_cast<char>(::tolower(c)); });

				auto entry = keywords.find(lower);
				if(entry == keywords.end())
					return RezParser::make_IDENTIFIER(lower, loc);
				else
					return (*entry->second)(loc);
			}
			else if(tok == T_INTLIT)
			{
				if(tok.get_value() == "0")
				{
					// A binary literal such as 0b1010 arrives as the integer
					// "0" followed by an identifier "b1010". Look past any
					// whitespace; if such an identifier follows, consume it and
					// parse it in place of the "0".
					auto tok2 = peekWave();
					while(tok2 != T_EOI && tok2 != T_EOF && IS_CATEGORY(tok2, WhiteSpaceTokenType))
						nextWave(), tok2 = peekWave();

					static boost::regex binlit("[bB][01]+");
					if(tok2 == T_IDENTIFIER && boost::regex_match(tok2.get_value().c_str(), binlit))
						tok = nextWave();
				}
				return RezParser::make_INTLIT(readInt(tok.get_value().c_str()), loc);
			}
			else
			{
#define NOVAL_TOK(name)	\
case T_ ## name: return RezParser::make_ ## name(loc)
				// T_INTLIT is handled by the branch above and cannot reach
				// this switch.
				switch(token_id(tok))
				{
					case T_CHARLIT: return RezParser::make_CHARLIT(readCharLit(tok.get_value().c_str()), loc);
					case T_STRINGLIT: return RezParser::make_STRINGLIT(readStringLit(tok.get_value().c_str()), loc);

					NOVAL_TOK(LEFTBRACE);
					NOVAL_TOK(RIGHTBRACE);
					NOVAL_TOK(LEFTBRACKET);
					NOVAL_TOK(RIGHTBRACKET);
					NOVAL_TOK(LEFTPAREN);
					NOVAL_TOK(RIGHTPAREN);
					NOVAL_TOK(SEMICOLON);
					NOVAL_TOK(COMMA);
					NOVAL_TOK(PLUS);
					NOVAL_TOK(MINUS);
					NOVAL_TOK(DIVIDE);
					NOVAL_TOK(STAR);
					NOVAL_TOK(ASSIGN);
					NOVAL_TOK(COLON);
					NOVAL_TOK(SHIFTLEFT);
					NOVAL_TOK(SHIFTRIGHT);
					NOVAL_TOK(EQUAL);
					NOVAL_TOK(NOTEQUAL);
					NOVAL_TOK(AND);
					NOVAL_TOK(OR);
					NOVAL_TOK(XOR);
					NOVAL_TOK(COMPL);

					default:
						return RezParser::make_BADTOKEN(tok.get_value().c_str(), loc);
				}
			}
		}
	}
	return RezSymbol();
}
Rez: successful parse. 2014-10-05 21:52:34 +00:00			`#include "RezLexer.h"`
			`#include "RezLexerWaveToken.h"`
			`#include "RezParser.generated.hh"`
			`#include <unordered_map>`

			`#include <boost/regex.hpp>`

			`using namespace boost::wave;`

handle strings properly (remove quotes, handle escapes) 2014-10-08 22:52:17 +00:00			`static int readInt(const char str, const char end = NULL, int baseOverride = 0)`
Rez: successful parse. 2014-10-05 21:52:34 +00:00			`{`
			`int x = 0;`

			`int base = 10;`

handle strings properly (remove quotes, handle escapes) 2014-10-08 22:52:17 +00:00			`if(baseOverride)`
			`base = baseOverride;`
			`else if(*str == '0')`
Rez: successful parse. 2014-10-05 21:52:34 +00:00			`{`
			`base = 8;`
			`++str;`
			`if(str == 'x' \|\| str == 'X')`
			`{`
			`base = 16;`
			`++str;`
			`}`
			`if(str == 'b' \|\| str == 'B')`
			`{`
			`base = 2;`
			`++str;`
			`}`
			`}`
			`else if(str == 'b' \|\| str == 'B')`
			`{`
			`base = 2;`
			`++str;`
			`}`

handle strings properly (remove quotes, handle escapes) 2014-10-08 22:52:17 +00:00			`while(str != end && *str)`
Rez: successful parse. 2014-10-05 21:52:34 +00:00			`{`
			`x *= base;`
			`if(str >= 'a' && str <= 'z')`
fix bug in hex numbers 2014-10-13 20:59:14 +00:00			`x += *str - 'a' + 10;`
Rez: successful parse. 2014-10-05 21:52:34 +00:00			`else if(str >= 'A' && str <= 'Z')`
			`x += *str - 'A' + 10;`
			`else if(str >= '0' && str <= '9')`
			`x += *str - '0';`
fix handling of \n, \r, \t escape sequences in strings 2014-10-16 00:29:41 +00:00			`str++;`
Rez: successful parse. 2014-10-05 21:52:34 +00:00			`}`

			`return x;`
			`}`

start with semantics 2014-10-06 15:03:25 +00:00			`static int readCharLit(const char *str)`
			`{`
			`const char *p = str + 1;`
			`const char *e = str + strlen(str) - 1;`

			`if(e - p != 4)`
			`std::cout << "warning: CHAR LITERAL " << str << "\n";`

			`int x = 0;`
			`while(p != e)`
			`{`
			`x <<= 8;`
			`x \|= (*p) & 0xFF;`
			`++p;`
			`}`
			`return x;`
			`}`

handle strings properly (remove quotes, handle escapes) 2014-10-08 22:52:17 +00:00			`static std::string readStringLit(const char *str)`
			`{`
			`const char *p = str + 1;`
			`const char *e = str + strlen(str) - 1;`

			`std::ostringstream out;`

			`while(p != e)`
			`{`
			`if(*p == '\\')`
			`{`
			`++p;`
			`if(p != e)`
			`{`
			`switch(*p)`
			`{`
			`case 'n':`
fix handling of \n, \r, \t escape sequences in strings 2014-10-16 00:29:41 +00:00			`out << '\n'; ++p;`
handle strings properly (remove quotes, handle escapes) 2014-10-08 22:52:17 +00:00			`break;`
			`case 'r':`
fix handling of \n, \r, \t escape sequences in strings 2014-10-16 00:29:41 +00:00			`out << '\r'; ++p;`
handle strings properly (remove quotes, handle escapes) 2014-10-08 22:52:17 +00:00			`break;`
			`case 't':`
fix handling of \n, \r, \t escape sequences in strings 2014-10-16 00:29:41 +00:00			`out << '\t'; ++p;`
handle strings properly (remove quotes, handle escapes) 2014-10-08 22:52:17 +00:00			`break;`
			`case '0':`
			`case '1':`
			`case '2':`
			`case '3':`
			`if(p + 3 > e)`
			`continue;`
			`if(p[0] == '0' && (p[1] == 'x' \|\| p[1] == 'X'))`
			`{`
			`if(p + 4 > e)`
			`continue;`
			`out << (char)readInt(p+2, p+4, 16);`
			`p += 4;`
			`}`
			`else`
			`{`
			`out << (char)readInt(p, p+3, 8);`
			`p += 3;`
			`}`
			`break;`
			`case '$':`
			`{`
			`if(p + 3 > e)`
			`continue;`
			`out << (char)readInt(p+1, p+3, 16);`
			`p += 3;`
			`}`
			`break;`
			`}`
			`}`
			`}`
			`else`
			`{`
			`out << *p++;`
			`}`
			`}`

			`return out.str();`
			`}`

Rez: successful parse. 2014-10-05 21:52:34 +00:00			`RezSymbol RezLexer::nextToken()`
			`{`
			`for(auto tok = nextWave(); tok != T_EOI && tok != T_EOF; tok = nextWave())`
			`{`
			`if(IS_CATEGORY(tok, WhiteSpaceTokenType))`
			`continue;`
			`else if(IS_CATEGORY(tok, EOLTokenType))`
			`continue;`
			`else if(tok == T_PP_LINE)`
			`{`
			`while(tok != T_EOI && tok != T_EOF && !IS_CATEGORY(tok, EOLTokenType))`
			`tok = nextWave();`
			`continue;`
			`}`
			`else`
			`{`
			`//std::cout << "{" << std::hex << (token_id)tok << std::dec << "\|" << tok.get_value() << "}\n";`

			`auto pos = tok.get_position();`
			`curFile = pos.get_file().c_str();`
			`auto yypos = yy::position(&curFile, pos.get_line(), pos.get_column());`
			`yy::location loc(yypos);`

			`if(tok == (UnknownTokenType \| '"'))`
			`{`
			`return RezParser::make_STRINGLIT("Hello, world.", loc);`
			`}`
			`else if(IS_CATEGORY(tok, IdentifierTokenType) \|\| IS_CATEGORY(tok, KeywordTokenType) \|\| IS_CATEGORY(tok, BoolLiteralTokenType))`
			`{`
			`typedef decltype(&RezParser::make_TYPE) memfun;`
			`#define KEYWORD(upper, lower) \`
			`{ lower, &RezParser::make_ ## upper }`

			`static std::unordered_map<std::string, memfun> keywords = {`
			`KEYWORD(TYPE, "type"),`
			`KEYWORD(RESOURCE, "resource"),`
Rez: "data" declarations 2014-10-14 00:19:26 +00:00			`KEYWORD(DATA, "data"),`
			`KEYWORD(READ, "read"),`
			`KEYWORD(INCLUDE, "include"),`
			`KEYWORD(CHANGE, "change"),`
			`KEYWORD(DELETE, "delete"),`
Rez: successful parse. 2014-10-05 21:52:34 +00:00
			`KEYWORD(ARRAY,"array"),`
			`KEYWORD(SWITCH, "switch"),`
			`KEYWORD(CASE, "case"),`
			`KEYWORD(AS, "as"),`
			`KEYWORD(FILL,"fill"),`
			`KEYWORD(ALIGN, "align"),`
			`KEYWORD(HEX,"hex"),`
			`KEYWORD(KEY, "key"),`
			`KEYWORD(WIDE,"wide"),`
			`KEYWORD(UNSIGNED, "unsigned"),`
			`KEYWORD(LITERAL, "literal"),`
			`KEYWORD(BOOLEAN, "boolean"),`
			`KEYWORD(BIT, "bit"),`
Rez: fill and align 2014-10-07 23:37:28 +00:00			`KEYWORD(NIBBLE, "nibble"),`
Rez: successful parse. 2014-10-05 21:52:34 +00:00			`KEYWORD(BYTE, "byte"),`
			`KEYWORD(CHAR, "char"),`
			`KEYWORD(WORD, "word"),`
			`KEYWORD(INTEGER, "integer"),`
			`KEYWORD(LONG, "long"),`
			`KEYWORD(LONGINT, "longint"),`
			`KEYWORD(PSTRING, "pstring"),`
			`KEYWORD(PSTRING, "wstring"),`
			`KEYWORD(STRING, "string"),`
			`KEYWORD(POINT, "point"),`
			`KEYWORD(RECT, "rect"),`
			`KEYWORD(BITSTRING, "bitstring"),`

			`KEYWORD(INTEGER, "int"),`
string concat, $$read function; make function names keywords 2014-10-12 17:16:02 +00:00			`KEYWORD(DOLLAR, "$"),`

			`KEYWORD(FUN_COUNTOF, "$$countof"),`
			`KEYWORD(FUN_ARRAYINDEX, "$$arrayindex"),`
			`KEYWORD(FUN_READ, "$$read"),`
			`KEYWORD(FUN_BITFIELD, "$$bitfield"),`
			`KEYWORD(FUN_WORD, "$$word"),`
			`KEYWORD(FUN_BYTE, "$$byte"),`
			`KEYWORD(FUN_LONG, "$$long"),`
Rez: successful parse. 2014-10-05 21:52:34 +00:00			`};`

			`std::string s = tok.get_value().c_str();`
			`std::string lower = s;`
			`std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);`
			`auto p = keywords.find(lower);`
			`if(p == keywords.end())`
			`{`
			`//std::cout << "id: " << s << std::endl;`
			`return RezParser::make_IDENTIFIER(lower, loc);`
			`}`
			`else`
			`{`
			`//std::cout << "key: " << s << std::endl;`
			`return (*p->second)(loc);`
			`}`
			`}`
			`else if(tok == T_INTLIT)`
			`{`
			`if(tok.get_value() == "0")`
			`{`
			`auto tok2 = peekWave();`
			`while(tok2 != T_EOI && tok2 != T_EOF && IS_CATEGORY(tok2, WhiteSpaceTokenType))`
			`nextWave(), tok2 = peekWave();`

			`//std::cout << "!" << std::hex << (token_id)tok2 << std::dec << "\|" << tok2.get_value() << "!\n";`
			`static boost::regex binlit("[bB][01]+");`
			`if(tok2 == T_IDENTIFIER && boost::regex_match(tok2.get_value().c_str(), binlit))`
			`tok = nextWave();`
			`}`
			`return RezParser::make_INTLIT(readInt(tok.get_value().c_str()), loc);`
			`}`
			`else`
			`{`
			`#define NOVAL_TOK(name) \`
			`case T_ ## name: /std::cout << #name << std::endl;/ return RezParser::make_ ## name(loc)`
			`switch(token_id(tok))`
			`{`
			`case T_INTLIT: return RezParser::make_INTLIT(readInt(tok.get_value().c_str()), loc);`

start with semantics 2014-10-06 15:03:25 +00:00			`case T_CHARLIT: return RezParser::make_CHARLIT(readCharLit(tok.get_value().c_str()), loc);`
handle strings properly (remove quotes, handle escapes) 2014-10-08 22:52:17 +00:00			`case T_STRINGLIT: return RezParser::make_STRINGLIT(readStringLit(tok.get_value().c_str()), loc);`
Rez: successful parse. 2014-10-05 21:52:34 +00:00
			`NOVAL_TOK(LEFTBRACE);`
			`NOVAL_TOK(RIGHTBRACE);`
			`NOVAL_TOK(LEFTBRACKET);`
			`NOVAL_TOK(RIGHTBRACKET);`
			`NOVAL_TOK(LEFTPAREN);`
			`NOVAL_TOK(RIGHTPAREN);`
			`NOVAL_TOK(SEMICOLON);`
			`NOVAL_TOK(COMMA);`
			`NOVAL_TOK(PLUS);`
			`NOVAL_TOK(MINUS);`
			`NOVAL_TOK(DIVIDE);`
			`NOVAL_TOK(STAR);`
			`NOVAL_TOK(ASSIGN);`
			`NOVAL_TOK(COLON);`
			`NOVAL_TOK(SHIFTLEFT);`
			`NOVAL_TOK(SHIFTRIGHT);`
			`NOVAL_TOK(EQUAL);`
			`NOVAL_TOK(NOTEQUAL);`
			`NOVAL_TOK(AND);`
			`NOVAL_TOK(OR);`
			`NOVAL_TOK(XOR);`
			`NOVAL_TOK(COMPL);`

			`default:`

			`return RezParser::make_BADTOKEN(tok.get_value().c_str(), loc);`
			`}`

			`}`
			`}`
			`}`
			`return RezSymbol();`
			`}`