mirror of https://github.com/sehugg/8bitworkshop.git

tokenizer is no longer line-based

Steven Hugg
2022-02-01 09:13:37 -06:00
parent 40cc5ee118
commit 4e5beb6c74
5 changed files with 245 additions and 229 deletions

View File

@@ -38,7 +38,7 @@
var numbers = /^(0x[\da-f]+|[\da-f]+h|[0-7]+o|[01]+b|\d+d?)\b/i;
var tags = /^\{\{.*\}\}/;
var comment = /\/\/.*/;
var mlcomment = /^---.+?---\b/i;
var mlcomment = /\/\*.*?\*\//s; // TODO
return {
startState: function () {
@@ -56,6 +56,9 @@
if (stream.match(comment)) {
return 'comment';
}
if (stream.match(mlcomment)) {
return 'comment';
}
var w;
if (stream.eatWhile(/\w/)) {

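Note: the mode's block-comment rule now targets C-style /* ... */ comments, and the s (dotAll) flag lets . match newlines, so the lazy .*? stops at the first closing */. A minimal sketch of the regex's behavior in isolation (a CodeMirror stream mode only sees one line at a time, which is presumably what the TODO refers to):

// Lazy dotAll matching of C-style block comments (sketch, not the mode itself).
const mlcomment = /\/\*.*?\*\//s;
const src = "/* first\ncomment */ code /* second */";
console.log(src.match(mlcomment)?.[0]); // "/* first\ncomment */" (stops at the first "*/")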
View File

@@ -8,6 +8,7 @@ export enum ECSTokenType {
Operator = 'delimiter',
QuotedString = 'quoted-string',
Integer = 'integer',
CodeFragment = 'code-fragment',
}
export class ECSCompiler extends Tokenizer {
@@ -19,15 +20,16 @@ export class ECSCompiler extends Tokenizer {
super();
//this.includeEOL = true;
this.setTokenRules([
{ type: TokenType.CodeFragment, regex: /---/ },
{ type: ECSTokenType.Ellipsis, regex: /\.\./ },
{ type: ECSTokenType.Operator, regex: /[#=,:(){}\[\]]/ },
{ type: ECSTokenType.QuotedString, regex: /".*?"/ },
{ type: ECSTokenType.CodeFragment, regex: /---.*?---/ },
{ type: ECSTokenType.Integer, regex: /[-]?0x[A-Fa-f0-9]+/ },
{ type: ECSTokenType.Integer, regex: /[-]?\$[A-Fa-f0-9]+/ },
{ type: ECSTokenType.Integer, regex: /[-]?\d+/ },
{ type: TokenType.Ident, regex: /[A-Za-z_][A-Za-z0-9_]*/ },
{ type: TokenType.Ignore, regex: /\/\/.*/ },
{ type: TokenType.Ignore, regex: /\/\/.*?[\n\r]/ },
{ type: TokenType.Ignore, regex: /\/\*.*?\*\// },
{ type: TokenType.Ignore, regex: /\s+/ },
]);
this.errorOnCatchAll = true;
@@ -43,7 +45,11 @@ export class ECSCompiler extends Tokenizer {
parseFile(text: string, path: string) {
this.tokenizeFile(text, path);
while (!this.isEOF()) {
this.annotate(() => this.parseTopLevel());
let top = this.parseTopLevel();
if (top) {
let t = top;
this.annotate(() => t); // TODO? typescript bug?
}
}
}
@@ -63,7 +69,7 @@ export class ECSCompiler extends Tokenizer {
return this.em.defineSystem(this.parseResource());
}
if (tok.str == 'comment') {
this.expectTokenTypes([TokenType.CodeFragment]);
this.expectTokenTypes([ECSTokenType.CodeFragment]);
return;
}
this.compileError(`Unexpected top-level keyword: ${tok.str}`);
@@ -97,7 +103,7 @@ export class ECSCompiler extends Tokenizer {
return { dtype: 'ref', query: this.parseQuery() } as RefType;
}
if (this.peekToken().str == 'array') {
let index : IntType;
let index : IntType | undefined = undefined;
this.expectToken('array');
if (this.peekToken().type == ECSTokenType.Integer) {
index = this.parseDataType() as IntType;
@@ -105,7 +111,7 @@ export class ECSCompiler extends Tokenizer {
this.expectToken('of');
return { dtype: 'array', index, elem: this.parseDataType() } as ArrayType;
}
this.compileError(`Unknown data type`); // TODO
this.internalError(); throw new Error();
}
parseDataValue(field: DataField) : DataValue {
@@ -123,6 +129,9 @@ export class ECSCompiler extends Tokenizer {
let id = e.id;
if (reftype) {
// TODO: make this a function? elo ehi etc?
if (!this.currentScope) {
this.compileError("This type can only exist inside of a scope."); throw new Error()
};
let atypes = this.em.archetypesMatching(reftype.query);
let entities = this.currentScope.entitiesMatching(atypes);
if (entities.length == 0) this.compileError(`This entity doesn't seem to fit the reference type.`);
@@ -130,7 +139,7 @@ export class ECSCompiler extends Tokenizer {
}
return id;
}
this.compileError(`Unknown data value`); // TODO
this.internalError(); throw new Error();
}
parseDataArray() {
@@ -186,7 +195,8 @@ export class ECSCompiler extends Tokenizer {
this.expectToken('do');
let select = this.expectTokens(['once', 'foreach', 'source', 'join']).str as SelectType; // TODO: type check?
let query = this.parseQuery();
let join = select == 'join' && this.parseQuery();
let join = undefined;
if (select == 'join') join = this.parseQuery();
let emits;
let limit;
if (this.peekToken().str == 'limit') {
@@ -225,8 +235,8 @@ export class ECSCompiler extends Tokenizer {
parseCode(): string {
// TODO: add $loc
let tok = this.expectTokenTypes([TokenType.CodeFragment]);
let code = tok.str;
let tok = this.expectTokenTypes([ECSTokenType.CodeFragment]);
let code = tok.str.substring(3, tok.str.length-3);
let lines = code.split('\n');
for (let i=0; i<lines.length; i++) {
lines[i] = ` .dbg line, "${this.path}", ${tok.$loc.line+i}\n` + lines[i];
@@ -236,7 +246,7 @@ export class ECSCompiler extends Tokenizer {
parseScope() : EntityScope {
let name = this.expectIdent().str;
let scope = this.em.newScope(name, this.currentScope);
let scope = this.em.newScope(name, this.currentScope || undefined);
this.currentScope = scope;
let cmd;
while ((cmd = this.expectTokens(['entity', 'comment', 'end']).str) != 'end') {
@@ -244,14 +254,15 @@ export class ECSCompiler extends Tokenizer {
this.annotate(() => this.parseEntity());
}
if (cmd == 'comment') {
this.expectTokenTypes([TokenType.CodeFragment]);
this.expectTokenTypes([ECSTokenType.CodeFragment]);
}
}
this.currentScope = scope.parent;
this.currentScope = scope.parent || null;
return scope;
}
parseEntity() : Entity {
if (!this.currentScope) { this.internalError(); throw new Error(); }
let name = '';
if (this.peekToken().type == TokenType.Ident) {
name = this.expectIdent().str;
@@ -267,7 +278,7 @@ export class ECSCompiler extends Tokenizer {
if (comps.length == 0) this.compileError(`I couldn't find a field named "${name}" for this entity.`)
if (comps.length > 1) this.compileError(`I found more than one field named "${name}" for this entity.`)
let field = comps[0].fields.find(f => f.name == name);
if (!field) this.internalError();
if (!field) { this.internalError(); throw new Error(); }
this.expectToken('=');
let value = this.parseDataValue(field);
if (cmd == 'const') this.currentScope.setConstValue(e, comps[0], name, value);
@@ -291,10 +302,14 @@ export class ECSCompiler extends Tokenizer {
}
parseEntityRef(reftype?: RefType) : Entity {
if (!this.currentScope) { this.internalError(); throw new Error(); }
this.expectToken('#');
let name = this.expectIdent().str;
let eref = this.currentScope.entities.find(e => e.name == name);
if (!eref) this.compileError(`I couldn't find an entity named "${name}" in this scope.`)
if (!eref) {
this.compileError(`I couldn't find an entity named "${name}" in this scope.`)
throw new Error();
}
return eref;
}

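Compiler-side effect of the new tokenizer: a code fragment now arrives as a single ---...--- token with its delimiters included, so parseCode() trims three characters from each end and prefixes every line with a .dbg directive that maps assembler output back to the ECS source. A hedged sketch of that transformation, with path and startLine standing in for this.path and tok.$loc.line:

// Sketch: strip the "---" delimiters, then tag each line for debug info.
function extractCodeFragment(tokenStr: string, path: string, startLine: number): string {
    const code = tokenStr.substring(3, tokenStr.length - 3); // drop leading/trailing "---"
    return code
        .split('\n')
        .map((line, i) => `  .dbg line, "${path}", ${startLine + i}\n` + line)
        .join('\n');
}

// e.g. extractCodeFragment('---\n lda #0\n sta $80\n---', 'game.ecs', 10)
// emits a .dbg directive for source lines 10..13 ahead of each fragment line.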
View File

@@ -26,7 +26,6 @@ export enum TokenType {
Ident = 'ident',
Comment = 'comment',
Ignore = 'ignore',
CodeFragment = 'code-fragment',
CatchAll = 'catch-all',
}
@@ -49,79 +48,78 @@ function re_escape(rule: TokenRule): string {
return `(${rule.regex.source})`;
}
export class Tokenizer {
export class TokenizerRuleSet {
rules: TokenRule[];
regex: RegExp;
constructor(rules: TokenRule[]) {
this.rules = rules.concat(CATCH_ALL_RULES);
var pattern = this.rules.map(re_escape).join('|');
this.regex = new RegExp(pattern, "gs"); // global, dotall
}
}
export class Tokenizer {
ruleset: TokenizerRuleSet;
lineindex: number[];
path: string;
lineno: number;
tokens: Token[];
lasttoken: Token;
errors: WorkerError[];
curlabel: string;
eol: Token;
includeEOL = false;
eof: Token;
errorOnCatchAll = false;
codeFragment : string | null = null;
codeFragmentStart : SourceLocation | null = null;
constructor() {
this.lineno = 0;
this.errors = [];
this.lineno = 0;
this.lineindex = [];
this.tokens = [];
}
setTokenRuleSet(ruleset: TokenizerRuleSet) {
this.ruleset = ruleset;
}
setTokenRules(rules: TokenRule[]) {
this.rules = rules.concat(CATCH_ALL_RULES);
var pattern = this.rules.map(re_escape).join('|');
this.regex = new RegExp(pattern, "g");
this.setTokenRuleSet(new TokenizerRuleSet(rules));
}
tokenizeFile(contents: string, path: string) {
this.path = path;
this.tokens = []; // can't have errors until this is set
let txtlines = contents.split(/\n|\r\n?/);
txtlines.forEach((line) => this._tokenize(line));
this._pushToken({ type: TokenType.EOF, str: "", $loc: { path: this.path, line: this.lineno } });
let m;
let re = /\n|\r\n?/g;
this.lineindex.push(0);
while (m = re.exec(contents)) {
this.lineindex.push(m.index);
}
tokenizeLine(line: string) : void {
this.lineno++;
this._tokenize(line);
this._tokenize(contents);
this.eof = { type: TokenType.EOF, str: "", $loc: { path: this.path, line: this.lineno } };
this.pushToken(this.eof);
}
_tokenize(line: string): void {
this.lineno++;
this.eol = { type: TokenType.EOL, str: "", $loc: { path: this.path, line: this.lineno, start: line.length } };
_tokenize(text: string): void {
// iterate over each token via re_toks regex
let m: RegExpMatchArray;
while (m = this.regex.exec(line)) {
this.lineno = 0;
while (m = this.ruleset.regex.exec(text)) {
let found = false;
// find line #
while (m.index >= this.lineindex[this.lineno]) {
this.lineno++;
}
// find out which capture group was matched, and thus token type
for (let i = 0; i < this.rules.length; i++) {
let rules = this.ruleset.rules;
for (let i = 0; i < rules.length; i++) {
let s: string = m[i + 1];
if (s != null) {
found = true;
let loc = { path: this.path, line: this.lineno, start: m.index, end: m.index + s.length };
let rule = this.rules[i];
let rule = rules[i];
// add token to list
switch (rule.type) {
case TokenType.CodeFragment:
// TODO: empty code fragment doesn't work
if (this.codeFragment != null) {
let codeLoc = mergeLocs(this.codeFragmentStart, loc);
this._pushToken({ str: this.codeFragment, type: rule.type, $loc: codeLoc });
this.codeFragmentStart = null;
this.codeFragment = null;
} else {
loc.line++;
this.codeFragmentStart = loc;
this.codeFragment = '';
return; // don't add any more tokens (TODO: check for trash?)
}
break;
case TokenType.CatchAll:
if (this.errorOnCatchAll && this.codeFragment == null) {
if (this.errorOnCatchAll) {
this.compileError(`I didn't expect the character "${m[0]}" here.`);
}
default:
if (this.codeFragment == null) {
this._pushToken({ str: s, type: rule.type, $loc: loc });
}
this.pushToken({ str: s, type: rule.type, $loc: loc });
case TokenType.Comment:
case TokenType.Ignore:
break;
@@ -133,14 +131,8 @@ export class Tokenizer {
this.compileError(`Could not parse token: <<${m[0]}>>`)
}
}
if (this.includeEOL) {
this._pushToken(this.eol);
}
if (this.codeFragment != null) {
this.codeFragment += line + '\n';
}
}
_pushToken(token: Token) {
pushToken(token: Token) {
this.tokens.push(token);
}
addError(msg: string, loc?: SourceLocation) {
@@ -161,10 +153,10 @@ export class Tokenizer {
}
peekToken(lookahead?: number): Token {
let tok = this.tokens[lookahead || 0];
return tok ? tok : this.eol;
return tok ? tok : this.eof;
}
consumeToken(): Token {
let tok = this.lasttoken = (this.tokens.shift() || this.eol);
let tok = this.lasttoken = (this.tokens.shift() || this.eof);
return tok;
}
expectToken(str: string, msg?: string): Token {

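This file carries the change named in the commit message. Rule compilation moves into TokenizerRuleSet, which joins all rules into one alternation regex compiled with the g and s flags, and tokenizeFile() records the offset of every newline so that a match's character index can be converted to a line number during a single full-text pass. A condensed, self-contained sketch of the scheme (names mirror the diff, but this is an illustration rather than the exact implementation, and it assumes the rules contain no capture groups of their own):

interface Rule { type: string; regex: RegExp; }
interface Tok { type: string; str: string; line: number; }

function tokenize(text: string, rules: Rule[]): Tok[] {
    // Each rule becomes one capture group; "s" (dotAll) lets a rule such as
    // /---.*?---/ span newlines, which the old per-line scan never could.
    const combined = new RegExp(rules.map(r => `(${r.regex.source})`).join('|'), 'gs');
    // lineindex[n] is the offset of the newline ending line n, with a 0
    // sentinel in front (mirrors this.lineindex in the diff).
    const lineindex: number[] = [0];
    for (const nl of text.matchAll(/\n|\r\n?/g)) lineindex.push(nl.index!);
    const toks: Tok[] = [];
    let line = 0;
    let m: RegExpExecArray | null;
    while ((m = combined.exec(text))) {
        // advance the line counter past every newline that precedes this match
        while (line < lineindex.length && m.index >= lineindex[line]) line++;
        const i = m.findIndex((g, k) => k > 0 && g != null) - 1; // which rule fired
        if (rules[i].type !== 'ignore') toks.push({ type: rules[i].type, str: m[0], line });
    }
    return toks;
}

const toks = tokenize('a b\n---\nfrag\n--- c', [
    { type: 'code-fragment', regex: /---.*?---/ },
    { type: 'ident', regex: /\w+/ },
    { type: 'ignore', regex: /\s+/ },
]);
// 'a' and 'b' sit on line 1, the fragment token starts on line 2, 'c' is on line 4

This also removes the old stateful codeFragment accumulation: a fragment is now just another token matched by the combined regex, so the CodeFragment case in the switch above disappears entirely.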
View File

@@ -293,7 +293,10 @@ function testCompiler() {
let c = new ECSCompiler();
try {
c.parseFile(`
// comment
/*
mju,fjeqowfjqewiofjqe
*/
component Kernel
lines: 0..255
bgcolor: 0..255

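The multi-line /* ... */ comment in this test only tokenizes because the ruleset's combined regex is compiled with the s flag (see the tokenizer diff above), so the Ignore rule /\/\*.*?\*\// matches across the embedded newlines. A quick check of that behavior:

// The combined regex gets "gs", so dotAll applies to every rule in the set:
const ignore = new RegExp(/\/\*.*?\*\//.source, 's');
console.log(ignore.test('/*\nmju,fjeqowfjqewiofjqe\n*/')); // true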
View File

@@ -144,26 +144,29 @@ describe('Tokenizer', function() {
{ type: 'qstring', regex: /".*?"/ },
{ type: 'integer', regex: /[-]?\d+/ },
{ type: 'ignore', regex: /\s+/ },
{ type: TokenType.CodeFragment, regex: /---/ },
]);
t.tokenizeFile("\n{\"key\" value\n \"number\" 531\n \"f\" (fn [x] (+ x 2))}\n", "test.file");
t.tokenizeFile("a\n{\"key\" value\n \"number\" 531\n\n \"f\" (fn [x] (+ x 2))}\n", "test.file");
assert.strictEqual(t.tokens.map(t => t.type).join(' '),
'delim qstring ident qstring integer qstring delim ident delim ident delim delim catch-all ident integer delim delim delim eof');
'ident delim qstring ident qstring integer qstring delim ident delim ident delim delim catch-all ident integer delim delim delim eof');
assert.strictEqual(t.tokens.map(t => t.str).join(' '),
'{ "key" value "number" 531 "f" ( fn [ x ] ( + x 2 ) ) } ');
assert.strictEqual(19, t.tokens.length);
assert.strictEqual('{', t.peekToken().str);
assert.strictEqual('{', t.expectToken('{').str);
'a { "key" value "number" 531 "f" ( fn [ x ] ( + x 2 ) ) } ');
assert.strictEqual(t.tokens.map(t => t.$loc.line).join(' '),
'1 2 2 2 3 3 5 5 5 5 5 5 5 5 5 5 5 5 5 6');
assert.strictEqual(20, t.tokens.length);
assert.strictEqual('a', t.peekToken().str);
assert.strictEqual('a', t.expectToken('a').str);
t.expectTokens(['foo', '{']);
t.pushbackToken(t.consumeToken());
assert.strictEqual('"key"', t.consumeToken().str);
assert.deepStrictEqual({ 'value': true }, t.parseModifiers(['foo', 'value', 'bar']));
assert.deepStrictEqual([], t.errors);
t.includeEOL = true;
t.tokenizeFile("\n{\"key\" value\n \"number\" 531\n \"f\" (fn [x] (+ x 2))}\n", "test.file");
assert.strictEqual(24, t.tokens.length);
assert.strictEqual(t.tokens.map(t => t.type).join(' '),
'eol delim qstring ident eol qstring integer eol qstring delim ident delim ident delim delim catch-all ident integer delim delim delim eol eol eof');
t.includeEOL = false;
t = new Tokenizer();
t.setTokenRules([
{ type: 'ident', regex: /[$A-Za-z_][A-Za-z0-9_-]*/ },
{ type: 'delim', regex: /[\(\)\{\}\[\]]/ },
{ type: 'code-fragment', regex: /---[^-]*---/ },
{ type: 'ignore', regex: /\s+/ },
]);
t.tokenizeFile("key value ---\nthis is\na fragment\n--- foo", "test.file");
assert.strictEqual(t.tokens.map(t => t.type).join(' '),
'ident ident code-fragment ident eof');
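A side benefit of splitting TokenizerRuleSet out of Tokenizer is that the combined regex is compiled once and can be shared across tokenizer instances. A hedged usage sketch, assuming Tokenizer and TokenizerRuleSet are imported from the tokenizer module diffed above:

// Compile the rules once, then reuse the ruleset for many files.
const ruleset = new TokenizerRuleSet([
    { type: 'ident', regex: /[$A-Za-z_][A-Za-z0-9_-]*/ },
    { type: 'code-fragment', regex: /---[^-]*---/ },
    { type: 'ignore', regex: /\s+/ },
]);
const files: Record<string, string> = {   // hypothetical inputs
    'a.ecs': 'key value',
    'b.ecs': 'x ---\nsome fragment\n--- y',
};
for (const [path, text] of Object.entries(files)) {
    const t = new Tokenizer();
    t.setTokenRuleSet(ruleset); // setTokenRules() would recompile the regex per instance
    t.tokenizeFile(text, path);
}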