"""Lexer and Parser
Constructs a list of IR nodes from a list of input strings."""
import Ophis.Errors as Err
import Ophis.Opcodes as Ops
import Ophis.IR as IR
import Ophis.CmdLine as Cmd
import sys
import os
import os.path
# Copyright 2002-2014 Michael C. Martin and additional contributors.
# You may use, modify, and distribute this file under the MIT
# license: See README for details.
loadedfiles = {}
templabelcount = 0
class Lexeme(object):
    "Class for lexer tokens. Used by lexer and parser."

    def __init__(self, type="UNKNOWN", value=None):
        self.type = type.upper()
        self.value = value

    def __str__(self):
        if self.value is None:
            return self.type
        else:
            return self.type + ":" + str(self.value)

    def __repr__(self):
        return "Lexeme(" + repr(self.type) + ", " + repr(self.value) + ")"

    def matches(self, other):
        "True if this lexeme and other have the same type."
        return self.type == other.type
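
# A quick illustration (not executed) of Lexeme behaviour, given the class
# above:
#
#     Lexeme("num", 16)   -> str() gives "NUM:16"
#     Lexeme("EOL")       -> str() gives "EOL"
#     Lexeme("NUM", 1).matches(Lexeme("NUM", 2))   -> True
#
# matches() compares types only; values are ignored.
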
bases = {"$": ("hexadecimal", 16),
         "%": ("binary", 2),
         "0": ("octal", 8)}
punctuation = "#,`<>():.+-*/&|^[]"
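
# For reference, the prefixes in `bases` give, for example:
#
#     $10   -> hexadecimal 16
#     %101  -> binary 5
#     0777  -> octal 511
#
# A bare "0" is special-cased in the lexer below and becomes NUM 0.
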
def lex(point, line):
    """Turns a line of source into a sequence of lexemes."""
    Err.currentpoint = point
    result = []

    def is_opcode(op):
        "Tests whether a string is an opcode or an identifier"
        return op in Ops.opcodes

    def add_token(token):
        "Converts a substring into a single lexeme"
        if token == "":
            return
        if token == "0":
            result.append(Lexeme("NUM", 0))
            return
        firstchar = token[0]
        rest = token[1:]
        if firstchar == '"':
            result.append(Lexeme("STRING", rest))
            return
        elif firstchar in bases:
            try:
                result.append(Lexeme("NUM", int(rest, bases[firstchar][1])))
                return
            except ValueError:
                Err.log('Invalid ' + bases[firstchar][0] + ' constant: ' +
                        rest)
                result.append(Lexeme("NUM", 0))
                return
        elif firstchar.isdigit():
            try:
                result.append(Lexeme("NUM", int(token)))
            except ValueError:
                Err.log('Identifiers may not begin with a number')
                result.append(Lexeme("LABEL", "ERROR"))
            return
        elif firstchar == "'":
            if len(rest) == 1:
                result.append(Lexeme("NUM", ord(rest)))
            else:
                Err.log("Invalid character constant '" + rest + "'")
                result.append(Lexeme("NUM", 0))
            return
        elif firstchar in punctuation:
            if rest != "":
                Err.log("Internal lexer error! '" + token + "' can't happen!")
            result.append(Lexeme(firstchar))
            return
        else:  # Label, opcode, or index register
            id = token.lower()
            if is_opcode(id):
                result.append(Lexeme("OPCODE", id))
            elif id == "x":
                result.append(Lexeme("X"))
            elif id == "y":
                result.append(Lexeme("Y"))
            elif id == "z":
                result.append(Lexeme("Z"))
            elif id == "sp":
                result.append(Lexeme("SP"))
            else:
                result.append(Lexeme("LABEL", id))
            return
        # should never reach here
        Err.log("Internal lexer error: add_token fall-through")

    def add_EOL():
        "Adds an end-of-line lexeme"
        result.append(Lexeme("EOL"))

    # Actual routine begins here
    value = ""
    quotemode = False
    backslashmode = False
    for c in line.strip():
        if backslashmode:
            backslashmode = False
            value = value + c
        elif c == "\\":
            backslashmode = True
        elif quotemode:
            if c == '"':
                quotemode = False
            else:
                value = value + c
        elif c == ';':
            add_token(value)
            value = ""
            break
        elif c == '.' and value != "":
            value = value + c
        elif c.isspace():
            add_token(value)
            value = ""
        elif c in punctuation:
            add_token(value)
            add_token(c)
            value = ""
        elif c == '"':
            add_token(value)
            value = '"'
            quotemode = True
        else:
            value = value + c
    if backslashmode:
        Err.log("Backslashed newline")
    if quotemode:
        Err.log("Unterminated string constant")
    add_token(value)
    add_EOL()
    return result
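
# A sketch of lex() in action (the program-point string is made up):
#
#     lex("example.oph:1", "start: lda #$10  ; comment")
#
# yields lexemes roughly equivalent to
#
#     [LABEL:start, :, OPCODE:lda, #, NUM:16, EOL]
#
# with the trailing comment discarded.
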
class ParseLine(object):
    "Maintains the parse state of a line of code. Enables arbitrary lookahead."

    def __init__(self, lexemes):
        self.lexemes = lexemes
        self.location = 0

    def lookahead(self, i):
        """Returns the token i units ahead in the parse.
        lookahead(0) returns the next token; trying to read off the end of
        the sequence returns the last token in the sequence (usually EOL)."""
        target = self.location + i
        if target >= len(self.lexemes):
            target = -1
        return self.lexemes[target]

    def pop(self):
        "Returns and removes the next element in the line."
        old = self.location
        if self.location < len(self.lexemes) - 1:
            self.location += 1
        return self.lexemes[old]

    def expect(self, *tokens):
        """Reads a token from the ParseLine and returns it if its type is in
        the sequence tokens. Otherwise, it logs an error."""
        token = self.pop()
        if token.type in tokens:
            return token
        if 'LABEL' in tokens:
            if token.type in ['X', 'Y', 'Z', 'SP']:
                token.value = token.type.lower()
                token.type = 'LABEL'
                return token
            elif token.type == 'OPCODE':
                token.type = 'LABEL'
                return token
        Err.log('Expected: "' + '", "'.join(tokens) + '"')
        return token
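
# Example of ParseLine behaviour, continuing the lex() sketch above:
#
#     line = ParseLine(lex("example.oph:1", "lda #$10"))
#     line.lookahead(0)       -> OPCODE:lda   (not consumed)
#     line.expect("OPCODE")   -> OPCODE:lda   (consumed)
#     line.expect("#")        -> #
#     line.expect("NUM")      -> NUM:16
#     line.lookahead(5)       -> EOL          (reads past the end safely)
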
pragma_modules = []
def parse_expr(line):
    "Parses an Ophis arithmetic expression."

    def atom():
        "Parses lowest-priority expression components."
        global templabelcount
        next = line.lookahead(0).type
        if next == "NUM":
            return IR.ConstantExpr(line.expect("NUM").value)
        elif next in ["LABEL", "X", "Y", "Z", "SP", "OPCODE"]:
            return IR.LabelExpr(line.expect("LABEL").value)
        elif next == "^":
            line.expect("^")
            return IR.PCExpr()
        elif next == "[":
            line.expect("[")
            result = parse_expr(line)
            line.expect("]")
            return result
        elif next == "+":
            offset = 0
            while next == "+":
                offset += 1
                line.expect("+")
                next = line.lookahead(0).type
            return IR.LabelExpr("*" + str(templabelcount + offset))
        elif next == "-":
            offset = 1
            while next == "-":
                offset -= 1
                line.expect("-")
                next = line.lookahead(0).type
            return IR.LabelExpr("*" + str(templabelcount + offset))
        elif next == ">":
            line.expect(">")
            return IR.HighByteExpr(atom())
        elif next == "<":
            line.expect("<")
            return IR.LowByteExpr(atom())
        else:
            Err.log('Expected: expression')

    def precedence_read(constructor, reader, separators):
        """Handles precedence. The reader argument is a function that returns
        expressions that bind more tightly than these; separators is a list
        of strings naming the operators at this precedence level. The
        constructor argument is a class, indicating what node type holds
        objects of this precedence level.

        Returns a single expression if no operator at this level is present;
        otherwise, a constructor node built from the list of expressions with
        separator strings between them."""
        result = [reader()]  # first object
        nextop = line.lookahead(0).type
        while nextop in separators:
            line.expect(nextop)
            result.append(nextop)
            result.append(reader())
            nextop = line.lookahead(0).type
        if len(result) == 1:
            return result[0]
        return constructor(result)

    def term():
        "Parses * and /"
        return precedence_read(IR.SequenceExpr, atom, ["*", "/"])

    def arith():
        "Parses + and -"
        return precedence_read(IR.SequenceExpr, term, ["+", "-"])

    def bits():
        "Parses &, |, and ^"
        return precedence_read(IR.SequenceExpr, arith, ["&", "|", "^"])

    return bits()
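
# A sketch of how parse_expr() groups operands, given the precedence chain
# above (bits -> arith -> term -> atom); ConstantExpr/LabelExpr wrappers are
# elided for brevity:
#
#     2+3*4     -> SequenceExpr([2, "+", SequenceExpr([3, "*", 4])])
#     [2+3]*4   -> SequenceExpr([SequenceExpr([2, "+", 3]), "*", 4])
#     <label    -> LowByteExpr(label)
#     ^         -> PCExpr()   (the current program counter)
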
def parse_line(ppt, lexemelist):
    "Turn a line of source into an IR Node."
    Err.currentpoint = ppt
    result = []
    line = ParseLine(lexemelist)

    def aux():
        "Accumulates all IR nodes defined by this line."
        if line.lookahead(0).type == "EOL":
            pass
        elif line.lookahead(1).type == ":":
            newlabel = line.expect("LABEL").value
            line.expect(":")
            result.append(IR.Node(ppt, "Label", newlabel, IR.PCExpr()))
            aux()
        elif line.lookahead(0).type == "*":
            global templabelcount
            templabelcount = templabelcount + 1
            result.append(IR.Node(ppt, "Label", "*" + str(templabelcount),
                                  IR.PCExpr()))
            line.expect("*")
            aux()
        elif line.lookahead(0).type == "." or line.lookahead(0).type == "`":
            which = line.expect(".", "`").type
            if which == ".":
                pragma = line.expect("LABEL").value
            else:
                pragma = "invoke"
            pragmaFunction = "pragma" + pragma.title()
            for mod in pragma_modules:
                if hasattr(mod, pragmaFunction):
                    getattr(mod, pragmaFunction)(ppt, line, result)
                    break
            else:
                Err.log("Unknown pragma " + pragma)
        else:  # Instruction
            opcode = line.expect("OPCODE").value
            arg2 = None
            if line.lookahead(0).type == "#":
                mode = "Immediate"
                line.expect("#")
                arg = parse_expr(line)
                line.expect("EOL")
            elif line.lookahead(0).type == "(":
                line.expect("(")
                arg = parse_expr(line)
                if line.lookahead(0).type == ",":
                    line.expect(",")
                    if line.lookahead(0).type == "X":
                        mode = "PointerX"
                        line.expect("X")
                        line.expect(")")
                        line.expect("EOL")
                    else:
                        mode = "PointerSPY"
                        line.expect("SP")
                        line.expect(")")
                        line.expect(",")
                        line.expect("Y")
                        line.expect("EOL")
                else:
                    line.expect(")")
                    tok = line.expect(",", "EOL").type
                    if tok == "EOL":
                        mode = "Pointer"
                    else:
                        if line.lookahead(0).type == "Y":
                            mode = "PointerY"
                            line.expect("Y")
                            line.expect("EOL")
                        else:
                            mode = "PointerZ"
                            line.expect("Z")
                            line.expect("EOL")
            elif line.lookahead(0).type == "EOL":
                mode = "Implied"
                arg = None
            else:
                arg = parse_expr(line)
                tok = line.expect("EOL", ",").type
                if tok == ",":
                    # The parser has to special-case the BBSn/BBRn
                    # instructions, which uniquely take two addresses.
                    if opcode[:3] in ["bbs", "bbr"]:
                        arg2 = parse_expr(line)
                        mode = "Memory2"
                    else:
                        tok = line.expect("X", "Y", "Z").type
                        if tok == "X":
                            mode = "MemoryX"
                        elif tok == "Y":
                            mode = "MemoryY"
                        else:
                            mode = "MemoryZ"
                    line.expect("EOL")
                else:
                    mode = "Memory"
            result.append(IR.Node(ppt, mode, opcode, arg, arg2))

    aux()
    result = [node for node in result if node is not IR.NullNode]
    if len(result) == 0:
        return IR.NullNode
    if len(result) == 1:
        return result[0]
    return IR.SequenceNode(ppt, result)
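
# A sketch of parse_line() output (illustrative program point):
#
#     parse_line("example.oph:1", lex("example.oph:1", "loop: lda #$10"))
#
# returns a SequenceNode holding a "Label" node for `loop` and an
# "Immediate" node for the lda instruction; a comment-only line comes back
# as IR.NullNode.
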
context_directory = None
def parse_file(ppt, filename, load_once=False):
    "Loads an Ophis source file and returns its IR."
    global context_directory, loadedfiles
    Err.currentpoint = ppt
    old_context = context_directory
    if filename != '-':
        if context_directory is not None:
            filename = os.path.abspath(os.path.join(context_directory,
                                                    filename))
        if load_once and filename in loadedfiles:
            if Cmd.print_loaded_files:
                print("Skipping " + filename, file=sys.stderr)
            return IR.NullNode
        loadedfiles[filename] = True
    if Cmd.print_loaded_files:
        if filename != '-':
            print("Loading " + filename, file=sys.stderr)
        else:
            print("Loading from standard input", file=sys.stderr)
    try:
        if filename != '-':
            if context_directory is not None:
                filename = os.path.join(context_directory, filename)
            f = open(filename, "rt")
            linelist = f.readlines()
            f.close()
            context_directory = os.path.abspath(os.path.dirname(filename))
        else:
            context_directory = os.getcwd()
            linelist = sys.stdin.readlines()
        pptlist = ["%s:%d" % (filename, i + 1) for i in range(len(linelist))]
        lexlist = list(map(lex, pptlist, linelist))
        IRlist = list(map(parse_line, pptlist, lexlist))
        IRlist = [node for node in IRlist if node is not IR.NullNode]
        context_directory = old_context
        return IR.SequenceNode(ppt, IRlist)
    except IOError:
        Err.log("Could not read " + filename)
        context_directory = old_context
        return IR.NullNode
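
# Usage sketch (hypothetical file names): a top-level call such as
#
#     parse_file("<Top Level>", "src/main.oph")
#
# sets context_directory to the absolute path of src/ while that file is
# parsed, so a nested parse_file(ppt, "macros.oph", load_once=True) issued
# from a pragma resolves to src/macros.oph and is skipped if it has already
# been loaded. Passing '-' reads from standard input instead.
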
def parse(filenames):
    """Top-level parsing routine, taking a list of source file names
    and returning a single IR node."""
    global templabelcount
    templabelcount = 0
    nodes = [parse_file("<Top Level>", f) for f in filenames]
    if len(nodes) == 1:
        return nodes[0]
    return IR.SequenceNode("<Top level>", nodes)
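
if __name__ == "__main__":
    # Minimal demonstration sketch, not part of the assembler's normal entry
    # point: lex and parse one hard-wired line and print the results.
    demo_ppt = "<demo>:1"
    demo_tokens = lex(demo_ppt, "loop: lda #$10")
    print(demo_tokens)
    print(parse_line(demo_ppt, demo_tokens))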