mirror of https://github.com/michaelcmartin/Ophis.git
synced 2024-05-29 00:41:29 +00:00
41bf01d035: Most of the work is handled by 2to3, but there are a few extra tricks needed to finish the job, mostly about picking the right bits to be Unicode and the right bits to be bytes.
446 lines
15 KiB
Python
"""Lexer and Parser
|
|
|
|
Constructs a list of IR nodes from a list of input strings."""
|
|
|
|

import Ophis.Errors as Err
import Ophis.Opcodes as Ops
import Ophis.IR as IR
import Ophis.CmdLine as Cmd
import sys
import os
import os.path

# Copyright 2002-2014 Michael C. Martin and additional contributors.
# You may use, modify, and distribute this file under the MIT
# license: See README for details.


loadedfiles = {}
templabelcount = 0


class Lexeme(object):
    "Class for lexer tokens. Used by lexer and parser."
    def __init__(self, type="UNKNOWN", value=None):
        self.type = type.upper()
        self.value = value

    def __str__(self):
        if self.value is None:
            return self.type
        else:
            return self.type + ":" + str(self.value)

    def __repr__(self):
        return "Lexeme(" + repr(self.type) + ", " + repr(self.value) + ")"

    def matches(self, other):
        "True if this lexeme and the other have the same type."
        return self.type == other.type
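

# Illustrative example (not part of the original source): types are
# normalized to upper case, and matches() compares types only, ignoring
# values:
#
#   >>> repr(Lexeme("num", 3))
#   "Lexeme('NUM', 3)"
#   >>> Lexeme("num", 3).matches(Lexeme("NUM", 99))
#   True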


bases = {"$": ("hexadecimal", 16),
         "%": ("binary", 2),
         "0": ("octal", 8)}


punctuation = "#,`<>():.+-*/&|^[]"
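

# Illustrative example (not part of the original source): the prefix picks
# the base, so "$2a" lexes to NUM:42, "%101" to NUM:5, and a leading zero
# marks octal, so "017" lexes to NUM:15 (a bare "0" is special-cased to
# NUM:0 before the base lookup).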


def lex(point, line):
    """Turns a line of source into a sequence of lexemes."""
    Err.currentpoint = point
    result = []

    def is_opcode(op):
        "Tests whether a string is an opcode or an identifier"
        return op in Ops.opcodes

    def add_token(token):
        "Converts a substring into a single lexeme"
        if token == "":
            return
        if token == "0":
            result.append(Lexeme("NUM", 0))
            return
        firstchar = token[0]
        rest = token[1:]
        if firstchar == '"':
            result.append(Lexeme("STRING", rest))
            return
        elif firstchar in bases:
            try:
                result.append(Lexeme("NUM", int(rest, bases[firstchar][1])))
                return
            except ValueError:
                Err.log('Invalid ' + bases[firstchar][0] + ' constant: ' +
                        rest)
                result.append(Lexeme("NUM", 0))
                return
        elif firstchar.isdigit():
            try:
                result.append(Lexeme("NUM", int(token)))
            except ValueError:
                Err.log('Identifiers may not begin with a number')
                result.append(Lexeme("LABEL", "ERROR"))
            return
        elif firstchar == "'":
            if len(rest) == 1:
                result.append(Lexeme("NUM", ord(rest)))
            else:
                Err.log("Invalid character constant '" + rest + "'")
                result.append(Lexeme("NUM", 0))
            return
        elif firstchar in punctuation:
            if rest != "":
                Err.log("Internal lexer error! '" + token + "' can't happen!")
            result.append(Lexeme(firstchar))
            return
        else:  # Label, opcode, or index register
            id = token.lower()
            if is_opcode(id):
                result.append(Lexeme("OPCODE", id))
            elif id == "x":
                result.append(Lexeme("X"))
            elif id == "y":
                result.append(Lexeme("Y"))
            elif id == "z":
                result.append(Lexeme("Z"))
            elif id == "sp":
                result.append(Lexeme("SP"))
            else:
                result.append(Lexeme("LABEL", id))
            return
        # should never reach here
        Err.log("Internal lexer error: add_token fall-through")

    def add_EOL():
        "Adds an end-of-line lexeme"
        result.append(Lexeme("EOL"))

    # Actual routine begins here
    value = ""
    quotemode = False
    backslashmode = False
    for c in line.strip():
        if backslashmode:
            backslashmode = False
            value = value + c
        elif c == "\\":
            backslashmode = True
        elif quotemode:
            if c == '"':
                quotemode = False
            else:
                value = value + c
        elif c == ';':
            add_token(value)
            value = ""
            break
        elif c == '.' and value != "":
            value = value + c
        elif c.isspace():
            add_token(value)
            value = ""
        elif c in punctuation:
            add_token(value)
            add_token(c)
            value = ""
        elif c == '"':
            add_token(value)
            value = '"'
            quotemode = True
        else:
            value = value + c
    if backslashmode:
        Err.log("Backslashed newline")
    if quotemode:
        Err.log("Unterminated string constant")
    add_token(value)
    add_EOL()
    return result
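

# Illustrative example (not part of the original source): lexing one line
# of assembly produces a token list that always ends in EOL; anything after
# ";" is dropped as a comment. Assuming "lda" is in Ops.opcodes:
#
#   >>> [str(t) for t in lex("<example>:1", "  lda #$10  ; load constant")]
#   ['OPCODE:lda', '#', 'NUM:16', 'EOL']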


class ParseLine(object):
    "Maintains the parse state of a line of code. Enables arbitrary lookahead."

    def __init__(self, lexemes):
        self.lexemes = lexemes
        self.location = 0

    def lookahead(self, i):
        """Returns the token i units ahead in the parse.
        lookahead(0) returns the next token; trying to read off the end of
        the sequence returns the last token in the sequence (usually EOL)."""
        target = self.location + i
        if target >= len(self.lexemes):
            target = -1
        return self.lexemes[target]

    def pop(self):
        "Returns and removes the next element in the line."
        old = self.location
        if self.location < len(self.lexemes) - 1:
            self.location += 1
        return self.lexemes[old]

    def expect(self, *tokens):
        """Reads a token from the ParseLine line and returns it if it's of a
        type in the sequence tokens. Otherwise, it logs an error."""
        token = self.pop()
        if token.type in tokens:
            return token
        if 'LABEL' in tokens:
            if token.type in ['X', 'Y', 'Z', 'SP']:
                token.value = token.type.lower()
                token.type = 'LABEL'
                return token
            elif token.type == 'OPCODE':
                token.type = 'LABEL'
                return token
        Err.log('Expected: "' + '", "'.join(tokens) + '"')
        return token
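

# Illustrative example (not part of the original source): lookahead peeks
# without consuming, pop consumes, and reading past the end keeps returning
# the final EOL lexeme:
#
#   >>> l = ParseLine(lex("<example>:1", "lda #$10"))
#   >>> l.lookahead(0).type
#   'OPCODE'
#   >>> l.pop().type
#   'OPCODE'
#   >>> l.lookahead(0).type
#   '#'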


pragma_modules = []
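

# Illustrative note (not part of the original source): parse_line dispatches
# pragmas to handler functions found on the modules in this list, so the
# driver must register them before parsing. A hypothetical registration
# would look like:
#
#   pragma_modules.append(Ophis.CorePragmas)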


def parse_expr(line):
    "Parses an Ophis arithmetic expression."

    def atom():
        "Parses lowest-priority expression components."
        global templabelcount
        next = line.lookahead(0).type
        if next == "NUM":
            return IR.ConstantExpr(line.expect("NUM").value)
        elif next in ["LABEL", "X", "Y", "Z", "SP", "OPCODE"]:
            return IR.LabelExpr(line.expect("LABEL").value)
        elif next == "^":
            line.expect("^")
            return IR.PCExpr()
        elif next == "[":
            line.expect("[")
            result = parse_expr(line)
            line.expect("]")
            return result
        elif next == "+":
            # Each "+" steps one anonymous label forward: "+" is the next
            # temporary label to be defined, "++" the one after it, etc.
            offset = 0
            while next == "+":
                offset += 1
                line.expect("+")
                next = line.lookahead(0).type
            return IR.LabelExpr("*" + str(templabelcount + offset))
        elif next == "-":
            # Starting at 1 and subtracting per "-" makes a single "-" refer
            # to the most recently defined anonymous label (offset 0).
            offset = 1
            while next == "-":
                offset -= 1
                line.expect("-")
                next = line.lookahead(0).type
            return IR.LabelExpr("*" + str(templabelcount + offset))
        elif next == ">":
            line.expect(">")
            return IR.HighByteExpr(atom())
        elif next == "<":
            line.expect("<")
            return IR.LowByteExpr(atom())
        else:
            Err.log('Expected: expression')

    def precedence_read(constructor, reader, separators):
        """Handles precedence. The reader argument is a function that returns
        expressions that bind more tightly than these; separators is a list
        of strings naming the operators at this precedence level. The
        constructor argument is a class, indicating what node type holds
        objects of this precedence level.

        Returns a list of Expr objects with separator strings between them."""
        result = [reader()]  # first object
        nextop = line.lookahead(0).type
        while (nextop in separators):
            line.expect(nextop)
            result.append(nextop)
            result.append(reader())
            nextop = line.lookahead(0).type
        if len(result) == 1:
            return result[0]
        return constructor(result)

    def term():
        "Parses * and /"
        return precedence_read(IR.SequenceExpr, atom, ["*", "/"])

    def arith():
        "Parses + and -"
        return precedence_read(IR.SequenceExpr, term, ["+", "-"])

    def bits():
        "Parses &, |, and ^"
        return precedence_read(IR.SequenceExpr, arith, ["&", "|", "^"])

    return bits()
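

# Illustrative example (not part of the original source): each precedence
# level collects its operands into one flat SequenceExpr, with tighter
# levels nested inside, so parsing "2+3*4" yields roughly:
#
#   SequenceExpr([ConstantExpr(2), "+",
#                 SequenceExpr([ConstantExpr(3), "*", ConstantExpr(4)])])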


def parse_line(ppt, lexemelist):
    "Turn a line of source into an IR Node."
    Err.currentpoint = ppt
    result = []
    line = ParseLine(lexemelist)

    def aux():
        "Accumulates all IR nodes defined by this line."
        if line.lookahead(0).type == "EOL":
            pass
        elif line.lookahead(1).type == ":":
            newlabel = line.expect("LABEL").value
            line.expect(":")
            result.append(IR.Node(ppt, "Label", newlabel, IR.PCExpr()))
            aux()
        elif line.lookahead(0).type == "*":
            global templabelcount
            templabelcount = templabelcount + 1
            result.append(IR.Node(ppt, "Label", "*" + str(templabelcount),
                                  IR.PCExpr()))
            line.expect("*")
            aux()
        elif line.lookahead(0).type == "." or line.lookahead(0).type == "`":
            which = line.expect(".", "`").type
            if (which == "."):
                pragma = line.expect("LABEL").value
            else:
                pragma = "invoke"
            pragmaFunction = "pragma" + pragma.title()
            for mod in pragma_modules:
                if hasattr(mod, pragmaFunction):
                    getattr(mod, pragmaFunction)(ppt, line, result)
                    break
            else:
                Err.log("Unknown pragma " + pragma)
        else:  # Instruction
            opcode = line.expect("OPCODE").value
            arg2 = None
            if line.lookahead(0).type == "#":
                mode = "Immediate"
                line.expect("#")
                arg = parse_expr(line)
                line.expect("EOL")
            elif line.lookahead(0).type == "(":
                line.expect("(")
                arg = parse_expr(line)
                if line.lookahead(0).type == ",":
                    line.expect(",")
                    if line.lookahead(0).type == "X":
                        mode = "PointerX"
                        line.expect("X")
                        line.expect(")")
                        line.expect("EOL")
                    else:
                        mode = "PointerSPY"
                        line.expect("SP")
                        line.expect(")")
                        line.expect(",")
                        line.expect("Y")
                        line.expect("EOL")
                else:
                    line.expect(")")
                    tok = line.expect(",", "EOL").type
                    if tok == "EOL":
                        mode = "Pointer"
                    else:
                        if line.lookahead(0).type == "Y":
                            mode = "PointerY"
                            line.expect("Y")
                            line.expect("EOL")
                        else:
                            mode = "PointerZ"
                            line.expect("Z")
                            line.expect("EOL")
            elif line.lookahead(0).type == "EOL":
                mode = "Implied"
                arg = None
            else:
                arg = parse_expr(line)
                tok = line.expect("EOL", ",").type
                if tok == ",":
                    # Parser has to special-case the BBXn instructions,
                    # which uniquely take two addresses
                    if opcode[:3] in ["bbs", "bbr"]:
                        arg2 = parse_expr(line)
                        mode = "Memory2"
                    else:
                        tok = line.expect("X", "Y", "Z").type
                        if tok == "X":
                            mode = "MemoryX"
                        elif tok == "Y":
                            mode = "MemoryY"
                        else:
                            mode = "MemoryZ"
                    line.expect("EOL")
                else:
                    mode = "Memory"
            result.append(IR.Node(ppt, mode, opcode, arg, arg2))

    aux()
    result = [node for node in result if node is not IR.NullNode]
    if len(result) == 0:
        return IR.NullNode
    if len(result) == 1:
        return result[0]
    return IR.SequenceNode(ppt, result)
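

# Illustrative example (not part of the original source): a label and an
# instruction on the same line each produce an IR node, so
#
#   >>> parse_line("<example>:1", lex("<example>:1", "loop: lda #$10"))
#
# returns a sequence node holding a "Label" node binding loop to the current
# PC and an "Immediate" mode node for the lda instruction.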


context_directory = None


def parse_file(ppt, filename, load_once=False):
    "Loads an Ophis source file and returns an IR list."
    global context_directory, loadedfiles
    Err.currentpoint = ppt
    old_context = context_directory
    if filename != '-':
        if context_directory is not None:
            filename = os.path.abspath(os.path.join(context_directory,
                                                    filename))
        if load_once and filename in loadedfiles:
            if Cmd.print_loaded_files:
                print("Skipping " + filename, file=sys.stderr)
            return IR.NullNode
        loadedfiles[filename] = True
    if Cmd.print_loaded_files:
        if filename != '-':
            print("Loading " + filename, file=sys.stderr)
        else:
            print("Loading from standard input", file=sys.stderr)
    try:
        if filename != '-':
            if context_directory is not None:
                # filename was already made absolute above, so this join is
                # a no-op (os.path.join discards its first argument when the
                # second is already absolute).
                filename = os.path.join(context_directory, filename)
            f = open(filename, "rt")
            linelist = f.readlines()
            f.close()
            context_directory = os.path.abspath(os.path.dirname(filename))
        else:
            context_directory = os.getcwd()
            linelist = sys.stdin.readlines()
        pptlist = ["%s:%d" % (filename, i + 1) for i in range(len(linelist))]
        lexlist = list(map(lex, pptlist, linelist))
        IRlist = list(map(parse_line, pptlist, lexlist))
        IRlist = [node for node in IRlist if node is not IR.NullNode]
        context_directory = old_context
        return IR.SequenceNode(ppt, IRlist)
    except IOError:
        Err.log("Could not read " + filename)
        context_directory = old_context
        return IR.NullNode
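

# Illustrative example (not part of the original source): passing
# load_once=True gives include-once behavior, since a file already recorded
# in loadedfiles is skipped ("macros.oph" is a hypothetical filename):
#
#   parse_file("<Top Level>", "macros.oph", load_once=True)   # parsed
#   parse_file("<Top Level>", "macros.oph", load_once=True)   # skipped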


def parse(filenames):
    """Top level parsing routine, taking a source file name
    list and returning an IR list."""
    global templabelcount
    templabelcount = 0
    nodes = [parse_file("<Top Level>", f) for f in filenames]
    if len(nodes) == 1:
        return nodes[0]
    return IR.SequenceNode("<Top level>", nodes)
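

# Illustrative example (not part of the original source): the assembler
# driver would call this once per run with every source file named on the
# command line ("main.oph" is a hypothetical filename):
#
#   ir = parse(["main.oph"])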