6502 stream parser -> Lezer grammar parser

2026-03-11 13:41:43 +00:00 · 2026-02-15 22:07:03 -08:00
parent 367cf6a09d
commit b4505dacc7
4 changed files with 267 additions and 92 deletions
--- a/16
+++ b/16
@@ -1,8 +1,21 @@

 TSC=./node_modules/typescript/bin/tsc --build
+LEZER=./node_modules/.bin/lezer-generator
 TMP=./tmp/dist

-buildtsc:
+buildgrammars: # regenerate the JS parser from the Lezer grammar. NOTE(review): lezer-generator writes an ES module unless --cjs is passed; confirm that suits the require()-based mocha test
+	mkdir -p gen/parser
+	$(LEZER) src/parser/lang-6502.grammar -o gen/parser/lang-6502.grammar.js
+
+watchgrammars: # poll once a second; rebuild when the grammar is newer than the generated parser
+	while true; do \
+		if [ src/parser/lang-6502.grammar -nt gen/parser/lang-6502.grammar.js ]; then \
+			$(MAKE) buildgrammars; \
+		fi; \
+		sleep 1; \
+	done
+
+buildtsc: buildgrammars # the generated parser must exist first: src/parser/lang-6502.ts imports it
 	npm run esbuild-clean
 	$(TSC) tsconfig.json
 	npm run esbuild
@@ -30,6 +43,7 @@ tsweb:
 	npm run esbuild-clean
 	(ip addr || ifconfig) | grep inet
 	$(TSC) -w --preserveWatchOutput &
+	$(MAKE) watchgrammars &
 	sleep 9999999 | npm run esbuild-worker -- --watch &
 	sleep 9999999 | npm run esbuild-ui -- --watch &
 	python3 scripts/serveit.py 2>> /dev/null #http.out
--- a/src/parser/lang-6502.grammar
+++ b/src/parser/lang-6502.grammar
@@ -0,0 +1,162 @@
+@top Program { Line* } // root node: the file as a sequence of lines
+
+@skip { space | Comment } // spaces/tabs and `;` comments may appear between any two tokens
+
+Line { // one source line: optional label, optional statement, then the line terminator
+  Label? Statement? eol // NOTE(review): a final line with no trailing newline has no `eol` to match — confirm the resulting error recovery is acceptable
+}
+
+Statement { // what may follow the optional label on a line
+  Instruction |
+  Directive |
+  MacroDef |
+  MacEnd |
+  ControlOp Expression? | // `if` carries a condition; without the optional operand it was a parse error
+  ErrorOp
+}
+
+Label { Identifier ":" | Identifier } // a label is an identifier, with or without a trailing colon
+
+Instruction { // a mnemonic, optionally followed by its operand
+  Opcode Operand?
+}
+
+Register { // the names A, X and Y (either case) are registers rather than ordinary identifiers
+  @specialize<Identifier, "X" | "Y" | "A" | "x" | "y" | "a">
+}
+
+Directive { // a directive keyword and its operand list, e.g. `.byte $01,$02` or `seg.u vars`
+  PseudoOp (Expression | Comma)* // operands may be comma- or space-separated; a comma used to be a parse error
+}
+
+PseudoOp { // assembler directive keywords; includes the names the replaced stream parser highlighted
+  @specialize<Identifier,
+    "ORG" | "EQU" | "END" | "org" | "equ" | "end" | "rorg" | "rend" | "eqm" |
+    "ds" | "ds.b" | "ds.w" | "dc" | "dc.b" | "dc.w" | "dv" | "seg" | "seg.u" |
+    "subroutine" | "include" | "hex" | "align" | "mexit" |
+    "echo" | "repeat" | "repend" | "set" |
+    "processor" | "byte" | "word" | "long" |
+    ".WORD" | ".word" | ".BYTE" | ".byte" | ".END" | ".end"
+  >
+}
+
+Mac { @specialize<Identifier, "mac"> } // keyword that starts a MacroDef
+MacEnd { @specialize<Identifier, "endm"> } // the `endm` keyword, a statement on its own
+
+ControlOp { @specialize<Identifier, "if" | "ifconst" | "ifnconst" | "else" | "endif" | "eif"> } // conditional-assembly keywords, including those the replaced stream parser highlighted
+ErrorOp { @specialize<Identifier, "err"> } // the `err` keyword, a statement on its own
+
+MacroDef { // `mac` followed by the macro's name
+  Mac Identifier
+}
+
+CurrentAddress { // a lone `.` used as a value inside expressions
+  @specialize<Identifier, ".">
+}
+
+Opcode { // the 56 mnemonics listed below, matched in all-uppercase or all-lowercase spelling only
+  @specialize<Identifier,
+    "ADC" | "AND" | "ASL" | "BCC" | "BCS" | "BEQ" | "BIT" | "BMI" |
+    "BNE" | "BPL" | "BRK" | "BVC" | "BVS" | "CLC" | "CLD" | "CLI" |
+    "CLV" | "CMP" | "CPX" | "CPY" | "DEC" | "DEX" | "DEY" | "EOR" |
+    "INC" | "INX" | "INY" | "JMP" | "JSR" | "LDA" | "LDX" | "LDY" |
+    "LSR" | "NOP" | "ORA" | "PHA" | "PHP" | "PLA" | "PLP" | "ROL" |
+    "ROR" | "RTI" | "RTS" | "SBC" | "SEC" | "SED" | "SEI" | "STA" |
+    "STX" | "STY" | "TAX" | "TAY" | "TSX" | "TXA" | "TXS" | "TYA" |
+    "adc" | "and" | "asl" | "bcc" | "bcs" | "beq" | "bit" | "bmi" |
+    "bne" | "bpl" | "brk" | "bvc" | "bvs" | "clc" | "cld" | "cli" |
+    "clv" | "cmp" | "cpx" | "cpy" | "dec" | "dex" | "dey" | "eor" |
+    "inc" | "inx" | "iny" | "jmp" | "jsr" | "lda" | "ldx" | "ldy" |
+    "lsr" | "nop" | "ora" | "pha" | "php" | "pla" | "plp" | "rol" |
+    "ror" | "rti" | "rts" | "sbc" | "sec" | "sed" | "sei" | "sta" |
+    "stx" | "sty" | "tax" | "tay" | "tsx" | "txa" | "txs" | "tya"
+  >
+}
+
+Expression { // binary operators are disambiguated by the `!` markers declared in the @precedence block
+  Expression !logic LogicOp Expression |
+  Expression !bit BitOp Expression |
+  Expression !compare (CompareOp | BinaryLt | BinaryGt) Expression |
+  Expression !term (ArithOp | Plus | Minus | Percent) Expression | // NOTE(review): `*` `/` `%` `+` `-` share one left-associative level, so `1 + 2 * 3` would group as `(1 + 2) * 3` — confirm that is acceptable
+  UnaryExpression |
+  Value |
+  "(" Expression ")"
+}
+
+UnaryExpression { // a prefix operator applied to an expression
+  (Plus | Minus | Not | Tilde | UnaryLt | UnaryGt) Expression
+}
+
+BinaryLt { lt !bin } // `<` / `>` between two expressions (used in the !compare alternative)
+BinaryGt { gt !bin }
+UnaryLt { lt !un } // `<` / `>` as prefix operators (used in UnaryExpression)
+UnaryGt { gt !un }
+
+Value { // the atomic operands of an expression
+  Number |
+  Identifier |
+  CurrentAddress |
+  String |
+  Char
+}
+
+Operand { // operand forms accepted after an opcode
+  "#" Expression | // e.g. `#$10`
+  "(" Expression Comma Register ")" | // e.g. `($20,X)`
+  Expression (Comma Register)? | // e.g. `$1234`, `$1234,Y`, or `($20),Y` via a parenthesized Expression
+  Register // e.g. `A`
+}
+
+@tokens {
+  Identifier { $[a-zA-Z_.] $[a-zA-Z0-9_.]* } // `.` is an identifier character, so `dc.b` and `.byte` are single tokens
+
+  Number { // `$` + hex digits, `%` + binary digits, or plain decimal digits
+    "$" $[0-9a-fA-F]+ |
+    "%" $[01]+ |
+    $[0-9]+
+  }
+
+  String { '"' (!["\\\n] | "\\" _)* '"' } // double-quoted, single line; a backslash escapes the next character
+
+  Char { "'" ![\n] "'"? } // `'` plus one character; the closing quote is optional
+
+  Comment { ";" ![\n]* } // from `;` to the end of the line
+
+  space { $[ \t]+ }
+  eol { $[\n\r]+ } // consecutive newline characters form a single token
+
+  Comma { "," }
+  "#"
+  "(" ")"
+
+  ArithOp { "*" | "/" }
+  Percent { "%" }
+  Plus { "+" }
+  Minus { "-" }
+
+  BitOp { "&" | "|" | "^" | "<<" | ">>" }
+  Tilde { "~" }
+
+  LogicOp { "&&" | "||" }
+  Not { "!" }
+
+  CompareOp { "==" | "!=" | "<=" | ">=" }
+  lt { "<" } // lowercase name, so it gets no tree node of its own; wrapped by BinaryLt / UnaryLt
+  gt { ">" }
+
+  @precedence { String, Char, Number, Percent, Identifier } // earlier entries win when several tokens match at the same position
+}
+
+@precedence { // listed from highest to lowest
+  un,
+  term @left,
+  compare @left,
+  bit @left,
+  logic @left,
+  bin @left,
+  PseudoOp, // NOTE(review): no `!PseudoOp`, `!Opcode` or `!Label` marker appears in the rules — these three entries look unused; confirm
+  Opcode,
+  Label
+}
+
+@detectDelim
--- a/src/parser/lang-6502.ts
+++ b/src/parser/lang-6502.ts
@@ -1,96 +1,55 @@
-// CodeMirror 6 language support for 6502 assembly
-// Migrated from CodeMirror 5 mode
-// Original copyright (c) by Marijn Haverbeke and others
-// Distributed under an MIT license: https://codemirror.net/5/LICENSE
+import { LRLanguage, LanguageSupport, delimitedIndent, foldInside, foldNodeProp, indentNodeProp } from "@codemirror/language"
+import { styleTags, tags as t } from "@lezer/highlight"
+import { parser } from "../../gen/parser/lang-6502.grammar.js"

-import { StreamLanguage, StreamParser } from "@codemirror/language";
-import { LanguageSupport } from "@codemirror/language";
-
-// TODO: Migrate to CodeMirror 6 Lezer parser.
-const asm6502Parser: StreamParser<{ context: number }> = {
-  startState() {
-    return {
-      context: 0
-    };
-  },
-
-  token(stream, state) {
-    // Labels at start of line
-    if (!stream.column()) {
-      state.context = 0;
-      if (stream.eatWhile(/[\w.]/))
-        return 'labelName';
+export const Lezer6502: LRLanguage = LRLanguage.define({ // language definition backed by the parser generated from lang-6502.grammar
+    parser: parser.configure({
+        props: [
+            indentNodeProp.add({
+                Application: delimitedIndent({ closing: ")", align: false }) // NOTE(review): lang-6502.grammar in this commit defines no `Application` node, so this looks like a template leftover that matches nothing — confirm and remove
+            }),
+            foldNodeProp.add({
+                Application: foldInside // NOTE(review): same as above — there is no `Application` node to fold
+            }),
+            styleTags({ // maps grammar node names to highlighting tags
+                Identifier: t.variableName,
+                CurrentAddress: t.self,
+                PseudoOp: t.definition(t.variableName),
+                Opcode: t.keyword,
+                Label: t.labelName,
+                String: t.string,
+                Char: t.number, // character literals share the number tag
+                Number: t.number,
+                Register: t.typeName,
+                Comment: t.lineComment,
+                ArithOp: t.arithmeticOperator,
+                Plus: t.arithmeticOperator,
+                Minus: t.arithmeticOperator,
+                Percent: t.arithmeticOperator,
+                BitOp: t.bitwiseOperator,
+                Tilde: t.bitwiseOperator,
+                LogicOp: t.logicOperator,
+                Not: t.logicOperator,
+                CompareOp: t.compareOperator,
+                BinaryLt: t.compareOperator,
+                BinaryGt: t.compareOperator,
+                UnaryLt: t.arithmeticOperator, // prefix `<` / `>` are tagged as arithmetic, unlike the binary comparison forms above
+                UnaryGt: t.arithmeticOperator,
+                Mac: t.definitionKeyword,
+                MacEnd: t.definitionKeyword,
+                "MacroDef/Identifier": t.macroName, // only an Identifier directly inside a MacroDef
+                ControlOp: t.controlKeyword,
+                ErrorOp: t.keyword,
+                Comma: t.separator,
+                "( )": t.paren
+            })
+        ]
+    }),
+    languageData: {
+        commentTokens: { line: ";" } // line-comment marker exposed as language data
    }
+})

-    if (stream.eatSpace())
-      return null;
-
-    var w;
-    if (stream.eatWhile(/\w/)) {
-      w = stream.current();
-      var cur = w.toLowerCase();
-
-      // Check for directives
-      var style = directives.get(cur);
-      if (style)
-        return style;
-
-      // Check for opcodes (3-letter mnemonics)
-      if (opcodes.test(w)) {
-        state.context = 4;
-        return 'keyword';
-      } else if (state.context == 4 && numbers.test(w)) {
-        return 'number';
-      } else if (stream.match(numbers)) {
-        return 'number';
-      } else {
-        return null;
-      }
-    } else if (stream.eat(';')) {
-      stream.skipToEnd();
-      return 'comment';
-    } else if (stream.eat('"')) {
-      while (w = stream.next()) {
-        if (w == '"')
-          break;
-
-        if (w == '\\')
-          stream.next();
-      }
-      return 'string';
-    } else if (stream.eat('\'')) {
-      if (stream.match(/\\?.'/) || stream.match(/\\?.'/))
-        return 'number';
-    } else if (stream.eat('$') || stream.eat('#')) {
-      if (stream.eatWhile(/[^;]/i))
-        return 'number';
-    } else if (stream.eat('%')) {
-      if (stream.eatWhile(/[01]/))
-        return 'number';
-    } else {
-      stream.next();
-    }
-    return null;
-  }
-};
-
-// Directive keywords
-const directives_list = [
-  'processor',
-  'byte', 'word', 'long',
-  'include', 'seg', 'dc', 'ds', 'dv', 'hex', 'err', 'org', 'rorg', 'echo', 'rend',
-  'align', 'subroutine', 'equ', 'eqm', 'set', 'mac', 'endm', 'mexit', 'ifconst',
-  'ifnconst', 'if', 'else', 'endif', 'eif', 'repeat', 'repend'
-];
-const directives = new Map<string, string>();
-directives_list.forEach(function (s) { directives.set(s, 'keyword'); });
-
-const opcodes = /^[a-z][a-z][a-z]\b/i;
-const numbers = /^([\da-f]+h|[0-7]+o|[01]+b|\d+d?)\b/i;
-
-/**
- * Language support for 6502 assembly language
- */
 export function asm6502(): LanguageSupport {
-  return new LanguageSupport(StreamLanguage.define(asm6502Parser));
+    return new LanguageSupport(Lezer6502) // wraps the Lezer-based language so it can be passed as an editor extension
 }
--- a/test/parsers/testparser6502.js
+++ b/test/parsers/testparser6502.js
@@ -0,0 +1,40 @@
+
+const assert = require('assert');
+const { EditorState } = require("@codemirror/state");
+const { ensureSyntaxTree } = require("@codemirror/language");
+const { asm6502 } = require("../../gen/parser/lang-6502.js");
+
+// Parses `code` fully and returns every node name; fails on error nodes (a bare tree truthiness check cannot fail).
+function parseNodeNames(code) {
+    const state = EditorState.create({ doc: code, extensions: [asm6502()] });
+    const tree = ensureSyntaxTree(state, state.doc.length, 5000);
+    assert.ok(tree, "Parse should finish within the timeout");
+    const names = [];
+    tree.iterate({
+        enter(node) {
+            assert.ok(!node.type.isError, `Parse error at offset ${node.from}`);
+            names.push(node.name);
+        }
+    });
+    return names;
+}
+
+describe('6502 Parser', function () {
+    it('Should parse basic instructions', function () {
+        const names = parseNodeNames(`
+      lda #$00
+      sta $1234
+      rts
+    `);
+        assert.strictEqual(names.filter(n => n === 'Opcode').length, 3);
+        assert.ok(names.includes('Number'), "Operands should parse as numbers");
+    });
+
+    it('Should handle labels', function () {
+        const names = parseNodeNames(`
+    start:
+      jmp start
+    `);
+        assert.ok(names.includes('Label'), "Line-start identifier should be a Label");
+    });
+});