334 lines
9.6 KiB
Python
334 lines
9.6 KiB
Python
"""Lexer and Parser
|
|
|
|
Constructs a list of IR nodes from a list of input strings."""
|
|
|
|
from __future__ import nested_scopes
|
|
import Ophis.Errors as Err
|
|
import Ophis.Opcodes as Ops
|
|
import Ophis.IR as IR
|
|
import Ophis.CmdLine as Cmd
|
|
import os
|
|
|
|
# Copyright 2002 Michael C. Martin.
|
|
# You may use, modify, and distribute this file under the BSD
|
|
# license: See LICENSE.txt for details.
|
|
|
|
|
|
class Lexeme:
|
|
"Class for lexer tokens. Used by lexer and parser."
|
|
def __init__(self, type="UNKNOWN", value=None):
|
|
self.type = type.upper()
|
|
self.value = value
|
|
def __str__(self):
|
|
if self.value == None:
|
|
return self.type
|
|
else:
|
|
return self.type+":"+str(self.value)
|
|
def __repr__(self):
|
|
return "Lexeme("+`self.type`+", "+`self.value`+")"
|
|
def matches(self, other):
|
|
"1 if Lexemes a and b have the same type."
|
|
return self.type == other.type
|
|
|
|
bases = {"$":("hexadecimal", 16),
|
|
"%":("binary", 2),
|
|
"0":("octal", 8)}
|
|
|
|
punctuation = "#,`<>():.+-*/&|^[]"
|
|
|
|
def lex(point, line):
|
|
"""Turns a line of source into a sequence of lexemes."""
|
|
Err.currentpoint = point
|
|
result = []
|
|
def is_opcode(op):
|
|
"Tests whether a string is an opcode or an identifier"
|
|
return op in Ops.opcodes
|
|
def add_token(token):
|
|
"Converts a substring into a single lexeme"
|
|
if token == "":
|
|
return
|
|
if token == "0":
|
|
result.append(Lexeme("NUM", 0))
|
|
return
|
|
firstchar = token[0]
|
|
rest = token[1:]
|
|
if firstchar == '"':
|
|
result.append(Lexeme("STRING", rest))
|
|
return
|
|
elif firstchar in bases:
|
|
try:
|
|
result.append(Lexeme("NUM", long(rest, bases[firstchar][1])))
|
|
return
|
|
except ValueError:
|
|
Err.log('Invalid '+bases[firstchar][0]+' constant: '+rest)
|
|
result.append(Lexeme("NUM", 0))
|
|
return
|
|
elif firstchar.isdigit():
|
|
try:
|
|
result.append(Lexeme("NUM", long(token)))
|
|
except ValueError:
|
|
Err.log('Identifiers may not begin with a number')
|
|
result.append(Lexeme("LABEL", "ERROR"))
|
|
return
|
|
elif firstchar == "'":
|
|
if len(rest) == 1:
|
|
result.append(Lexeme("NUM", ord(rest)))
|
|
else:
|
|
Err.log("Invalid character constant '"+rest+"'")
|
|
result.append(Lexeme("NUM", 0))
|
|
return
|
|
elif firstchar in punctuation:
|
|
if rest != "":
|
|
Err.log("Internal lexer error! '"+token+"' can't happen!")
|
|
result.append(Lexeme(firstchar))
|
|
return
|
|
else: # Label, opcode, or index register
|
|
id = token.lower()
|
|
if is_opcode(id):
|
|
result.append(Lexeme("OPCODE", id))
|
|
elif id == "x":
|
|
result.append(Lexeme("X"))
|
|
elif id == "y":
|
|
result.append(Lexeme("Y"))
|
|
else:
|
|
result.append(Lexeme("LABEL", id))
|
|
return
|
|
# should never reach here
|
|
Err.log("Internal lexer error: add_token fall-through")
|
|
def add_EOL():
|
|
"Adds an end-of-line lexeme"
|
|
result.append(Lexeme("EOL"))
|
|
# Actual routine begins here
|
|
value = ""
|
|
quotemode = 0
|
|
backslashmode = 0
|
|
for c in line.strip():
|
|
if backslashmode:
|
|
backslashmode = 0
|
|
value = value + c
|
|
elif c == "\\":
|
|
backslashmode = 1
|
|
elif quotemode:
|
|
if c == '"':
|
|
quotemode = 0
|
|
else:
|
|
value = value + c
|
|
elif c == ';':
|
|
add_token(value)
|
|
value = ""
|
|
break
|
|
elif c.isspace():
|
|
add_token(value)
|
|
value = ""
|
|
elif c in punctuation:
|
|
add_token(value)
|
|
add_token(c)
|
|
value = ""
|
|
elif c == '"':
|
|
add_token(value)
|
|
value = '"'
|
|
quotemode = 1
|
|
else:
|
|
value = value + c
|
|
if backslashmode:
|
|
Err.log("Backslashed newline")
|
|
if quotemode:
|
|
Err.log("Unterminated string constant")
|
|
add_token(value)
|
|
add_EOL()
|
|
return result
|
|
|
|
class ParseLine:
|
|
"Maintains the parse state of a line of code. Enables arbitrary lookahead."
|
|
def __init__(self, lexemes):
|
|
self.lexemes = lexemes
|
|
self.location = 0
|
|
def lookahead(self, i):
|
|
"""Returns the token i units ahead in the parse.
|
|
lookahead(0) returns the next token; trying to read off the end of
|
|
the sequence returns the last token in the sequence (usually EOL)."""
|
|
target = self.location+i
|
|
if target >= len(self.lexemes): target = -1
|
|
return self.lexemes[target]
|
|
def pop(self):
|
|
"Returns and removes the next element in the line."
|
|
old = self.location
|
|
if self.location < len(self.lexemes)-1: self.location += 1
|
|
return self.lexemes[old]
|
|
def expect(self, *tokens):
|
|
"""Reads a token from the ParseLine line and returns it if it's of a type
|
|
in the sequence tokens. Otherwise, it logs an error."""
|
|
token = self.pop()
|
|
if token.type not in tokens:
|
|
Err.log('Expected: "'+'", "'.join(tokens)+'"')
|
|
return token
|
|
|
|
pragma_modules = []
|
|
|
|
def parse_expr(line):
|
|
"Parses an Ophis arithmetic expression."
|
|
def atom():
|
|
"Parses lowest-priority expression components."
|
|
next = line.lookahead(0).type
|
|
if next == "NUM":
|
|
return IR.ConstantExpr(line.expect("NUM").value)
|
|
elif next == "LABEL":
|
|
return IR.LabelExpr(line.expect("LABEL").value)
|
|
elif next == "^":
|
|
line.expect("^")
|
|
return IR.PCExpr()
|
|
elif next == "[":
|
|
line.expect("[")
|
|
result = parse_expr(line)
|
|
line.expect("]")
|
|
return result
|
|
elif next == "+":
|
|
offset = 0
|
|
while next == "+":
|
|
offset += 1
|
|
line.expect("+")
|
|
next = line.lookahead(0).type
|
|
return IR.LabelExpr("*"+str(templabelcount+offset))
|
|
elif next == "-":
|
|
offset = 1
|
|
while next == "-":
|
|
offset -= 1
|
|
line.expect("-")
|
|
next = line.lookahead(0).type
|
|
return IR.LabelExpr("*"+str(templabelcount+offset))
|
|
elif next == ">":
|
|
line.expect(">")
|
|
return IR.HighByteExpr(atom())
|
|
elif next == "<":
|
|
line.expect("<")
|
|
return IR.LowByteExpr(atom())
|
|
else:
|
|
Err.log('Expected: expression')
|
|
def precedence_read(constructor, reader, separators):
|
|
"""Handles precedence. The reader argument is a function that returns
|
|
expressions that bind more tightly than these; separators is a list
|
|
of strings naming the operators at this precedence level. The
|
|
constructor argument is a class, indicating what node type holds
|
|
objects of this precedence level.
|
|
|
|
Returns a list of Expr objects with separator strings between them."""
|
|
result = [reader()] # first object
|
|
nextop = line.lookahead(0).type
|
|
while (nextop in separators):
|
|
line.expect(nextop)
|
|
result.append(nextop)
|
|
result.append(reader())
|
|
nextop = line.lookahead(0).type
|
|
if len(result) == 1: return result[0]
|
|
return constructor(result)
|
|
def term():
|
|
"Parses * and /"
|
|
return precedence_read(IR.SequenceExpr, atom, ["*", "/"])
|
|
def arith():
|
|
"Parses + and -"
|
|
return precedence_read(IR.SequenceExpr, term, ["+", "-"])
|
|
def bits():
|
|
"Parses &, |, and ^"
|
|
return precedence_read(IR.SequenceExpr, arith, ["&", "|", "^"])
|
|
return bits()
|
|
|
|
def parse_line(ppt, lexemelist):
|
|
"Turn a line of source into an IR Node."
|
|
Err.currentpoint = ppt
|
|
result = []
|
|
line = ParseLine(lexemelist)
|
|
def aux():
|
|
"Accumulates all IR nodes defined by this line."
|
|
if line.lookahead(0).type == "EOL":
|
|
pass
|
|
elif line.lookahead(1).type == ":":
|
|
newlabel=line.expect("LABEL").value
|
|
line.expect(":")
|
|
result.append(IR.Node(ppt, "Label", newlabel, IR.PCExpr()))
|
|
aux()
|
|
elif line.lookahead(0).type == "*":
|
|
global templabelcount
|
|
templabelcount = templabelcount + 1
|
|
result.append(IR.Node(ppt, "Label", "*"+str(templabelcount), IR.PCExpr()))
|
|
line.expect("*")
|
|
aux()
|
|
elif line.lookahead(0).type == "." or line.lookahead(0).type == "`":
|
|
which = line.expect(".", "`").type
|
|
if (which == "."): pragma = line.expect("LABEL").value
|
|
else: pragma = "invoke"
|
|
pragmaFunction = "pragma"+pragma.title()
|
|
for mod in pragma_modules:
|
|
if hasattr(mod, pragmaFunction):
|
|
getattr(mod, pragmaFunction)(ppt, line, result)
|
|
break
|
|
else:
|
|
Err.log("Unknown pragma "+pragma)
|
|
|
|
else: # Instruction
|
|
opcode = line.expect("OPCODE").value
|
|
if line.lookahead(0).type == "#":
|
|
mode = "Immediate"
|
|
line.expect("#")
|
|
arg = parse_expr(line)
|
|
line.expect("EOL")
|
|
elif line.lookahead(0).type == "(":
|
|
line.expect("(")
|
|
arg = parse_expr(line)
|
|
if line.lookahead(0).type == ",":
|
|
mode = "PointerX"
|
|
line.expect(",")
|
|
line.expect("X")
|
|
line.expect(")")
|
|
line.expect("EOL")
|
|
else:
|
|
line.expect(")")
|
|
tok = line.expect(",", "EOL").type
|
|
if tok == "EOL":
|
|
mode = "Pointer"
|
|
else:
|
|
mode = "PointerY"
|
|
line.expect("Y")
|
|
line.expect("EOL")
|
|
elif line.lookahead(0).type == "EOL":
|
|
mode = "Implied"
|
|
arg = None
|
|
else:
|
|
arg = parse_expr(line)
|
|
tok = line.expect("EOL", ",").type
|
|
if tok == ",":
|
|
tok = line.expect("X", "Y").type
|
|
if tok == "X": mode = "MemoryX"
|
|
else: mode = "MemoryY"
|
|
line.expect("EOL")
|
|
else: mode = "Memory"
|
|
result.append(IR.Node(ppt, mode, opcode, arg))
|
|
aux()
|
|
result = [node for node in result if node is not IR.NullNode]
|
|
if len(result) == 0: return IR.NullNode
|
|
if len(result) == 1: return result[0]
|
|
return IR.SequenceNode(ppt, result)
|
|
|
|
def parse_file(ppt, filename):
|
|
"Loads a .P65 source file, and returns an IR list."
|
|
Err.currentpoint = ppt
|
|
if Cmd.verbose > 0: print "Loading "+filename
|
|
try:
|
|
f = file(filename)
|
|
linelist = f.readlines()
|
|
f.close()
|
|
pptlist = ["%s:%d" % (filename, i+1) for i in range(len(linelist))]
|
|
lexlist = map(lex, pptlist, linelist)
|
|
IRlist = map(parse_line, pptlist, lexlist)
|
|
IRlist = [node for node in IRlist if node is not IR.NullNode]
|
|
return IR.SequenceNode(ppt, IRlist)
|
|
except IOError:
|
|
Err.log ("Could not read "+filename)
|
|
return IR.NullNode
|
|
|
|
def parse(filename):
|
|
"Top level parsing routine, taking a source file name and returning an IR list."
|
|
global templabelcount
|
|
templabelcount = 0
|
|
return parse_file("<Top Level>", filename)
|
|
|