# Ophis/src/Ophis/Frontend.py  (334 lines, 9.6 KiB, Python)
"""Lexer and Parser
Constructs a list of IR nodes from a list of input strings."""
from __future__ import nested_scopes
import Ophis.Errors as Err
import Ophis.Opcodes as Ops
import Ophis.IR as IR
import Ophis.CmdLine as Cmd
import os
# Copyright 2002 Michael C. Martin.
# You may use, modify, and distribute this file under the BSD
# license: See LICENSE.txt for details.
class Lexeme:
    """Class for lexer tokens. Used by lexer and parser."""
    def __init__(self, type="UNKNOWN", value=None):
        # Token types are normalized to upper case so type comparisons
        # are case-insensitive.
        self.type = type.upper()
        self.value = value

    def __str__(self):
        if self.value is None:
            return self.type
        else:
            return self.type+":"+str(self.value)

    def __repr__(self):
        # repr() instead of the deprecated backquote syntax, which was
        # removed in Python 3; output is identical.
        return "Lexeme("+repr(self.type)+", "+repr(self.value)+")"

    def matches(self, other):
        "1 if this lexeme and other have the same type."
        return self.type == other.type
# Maps a numeric literal's leading character to (human-readable base name,
# numeric base): $ = hexadecimal, % = binary, leading 0 = octal.
bases = {"$":("hexadecimal", 16),
         "%":("binary", 2),
         "0":("octal", 8)}
# Characters that are each a complete lexeme by themselves.
punctuation = "#,`<>():.+-*/&|^[]"
def lex(point, line):
    """Turns a line of source into a sequence of lexemes.

    point is the source location string used for error reporting; line is
    the raw text. Returns a list of Lexeme objects ending with an EOL."""
    Err.currentpoint = point
    result = []

    def is_opcode(op):
        "Tests whether a string is an opcode or an identifier"
        return op in Ops.opcodes

    def add_token(token):
        "Converts a substring into a single lexeme"
        if token == "":
            return
        if token == "0":
            # Special case: a lone "0" would otherwise be parsed as an
            # empty octal constant and rejected.
            result.append(Lexeme("NUM", 0))
            return
        firstchar = token[0]
        rest = token[1:]
        if firstchar == '"':
            result.append(Lexeme("STRING", rest))
            return
        elif firstchar in bases:
            try:
                # int() replaces the Python-2-only long(): in Python 2 it
                # auto-promotes to long on overflow, in Python 3 ints are
                # unbounded, so behavior is identical in both.
                result.append(Lexeme("NUM", int(rest, bases[firstchar][1])))
                return
            except ValueError:
                Err.log('Invalid '+bases[firstchar][0]+' constant: '+rest)
                result.append(Lexeme("NUM", 0))
                return
        elif firstchar.isdigit():
            try:
                result.append(Lexeme("NUM", int(token)))
            except ValueError:
                Err.log('Identifiers may not begin with a number')
                result.append(Lexeme("LABEL", "ERROR"))
            return
        elif firstchar == "'":
            # Character constant: exactly one character after the quote.
            if len(rest) == 1:
                result.append(Lexeme("NUM", ord(rest)))
            else:
                Err.log("Invalid character constant '"+rest+"'")
                result.append(Lexeme("NUM", 0))
            return
        elif firstchar in punctuation:
            if rest != "":
                Err.log("Internal lexer error! '"+token+"' can't happen!")
            result.append(Lexeme(firstchar))
            return
        else:  # Label, opcode, or index register
            id = token.lower()
            if is_opcode(id):
                result.append(Lexeme("OPCODE", id))
            elif id == "x":
                result.append(Lexeme("X"))
            elif id == "y":
                result.append(Lexeme("Y"))
            else:
                result.append(Lexeme("LABEL", id))
            return
        # should never reach here: every branch above returns
        Err.log("Internal lexer error: add_token fall-through")

    def add_EOL():
        "Adds an end-of-line lexeme"
        result.append(Lexeme("EOL"))

    # Actual routine begins here: scan character by character, flushing
    # the accumulated token at whitespace, punctuation, and quotes.
    value = ""
    quotemode = 0
    backslashmode = 0
    for c in line.strip():
        if backslashmode:
            # Previous char was a backslash: take this one literally.
            backslashmode = 0
            value = value + c
        elif c == "\\":
            backslashmode = 1
        elif quotemode:
            if c == '"':
                quotemode = 0
            else:
                value = value + c
        elif c == ';':
            # Comment: discard the rest of the line.
            add_token(value)
            value = ""
            break
        elif c.isspace():
            add_token(value)
            value = ""
        elif c in punctuation:
            # Punctuation both terminates the current token and is a
            # token itself.
            add_token(value)
            add_token(c)
            value = ""
        elif c == '"':
            add_token(value)
            value = '"'
            quotemode = 1
        else:
            value = value + c
    if backslashmode:
        Err.log("Backslashed newline")
    if quotemode:
        Err.log("Unterminated string constant")
    add_token(value)
    add_EOL()
    return result
class ParseLine:
    "Maintains the parse state of a line of code. Enables arbitrary lookahead."

    def __init__(self, lexemes):
        # The token sequence and a cursor into it.
        self.lexemes = lexemes
        self.location = 0

    def lookahead(self, i):
        """Returns the token i units ahead in the parse.
        lookahead(0) returns the next token; trying to read off the end of
        the sequence returns the last token in the sequence (usually EOL)."""
        index = self.location + i
        if index >= len(self.lexemes):
            # Clamp to the final token rather than raising.
            index = -1
        return self.lexemes[index]

    def pop(self):
        "Returns and removes the next element in the line."
        current = self.lexemes[self.location]
        # The cursor never advances past the final token, so pop() at the
        # end keeps yielding that token.
        if self.location < len(self.lexemes) - 1:
            self.location = self.location + 1
        return current

    def expect(self, *tokens):
        """Reads a token from the ParseLine line and returns it if it's of a type
        in the sequence tokens. Otherwise, it logs an error."""
        token = self.pop()
        if token.type not in tokens:
            Err.log('Expected: "'+'", "'.join(tokens)+'"')
        return token
# Modules searched (in order) for "pragmaFoo" handler functions by
# parse_line's pragma dispatch. Empty here; presumably filled in by the
# driver before parsing — TODO confirm against caller.
pragma_modules = []
def parse_expr(line):
    """Parses an Ophis arithmetic expression.

    line is a ParseLine; returns an IR expression node. Precedence, loosest
    to tightest: bitwise (& | ^), additive (+ -), multiplicative (* /),
    then atoms."""
    def atom():
        "Parses lowest-priority expression components."
        next = line.lookahead(0).type
        if next == "NUM":
            return IR.ConstantExpr(line.expect("NUM").value)
        elif next == "LABEL":
            return IR.LabelExpr(line.expect("LABEL").value)
        elif next == "^":
            # ^ denotes the current program counter.
            line.expect("^")
            return IR.PCExpr()
        elif next == "[":
            # Bracketed subexpression: recurse at lowest precedence.
            line.expect("[")
            result = parse_expr(line)
            line.expect("]")
            return result
        elif next == "+":
            # Forward anonymous-label reference: templabelcount is the
            # most recently defined temporary label, so one '+' refers to
            # the next one (+1), '++' to the one after (+2), etc.
            offset = 0
            while next == "+":
                offset += 1
                line.expect("+")
                next = line.lookahead(0).type
            return IR.LabelExpr("*"+str(templabelcount+offset))
        elif next == "-":
            # Backward anonymous-label reference: a single '-' is the most
            # recently defined temporary label (offset ends at 0); each
            # extra '-' reaches one label further back.
            offset = 1
            while next == "-":
                offset -= 1
                line.expect("-")
                next = line.lookahead(0).type
            return IR.LabelExpr("*"+str(templabelcount+offset))
        elif next == ">":
            line.expect(">")
            return IR.HighByteExpr(atom())
        elif next == "<":
            line.expect("<")
            return IR.LowByteExpr(atom())
        else:
            # NOTE(review): logs and implicitly returns None on a bad atom.
            Err.log('Expected: expression')
    def precedence_read(constructor, reader, separators):
        """Handles precedence. The reader argument is a function that returns
        expressions that bind more tightly than these; separators is a list
        of strings naming the operators at this precedence level. The
        constructor argument is a class, indicating what node type holds
        objects of this precedence level.
        Returns a list of Expr objects with separator strings between them."""
        result = [reader()] # first object
        nextop = line.lookahead(0).type
        while (nextop in separators):
            line.expect(nextop)
            result.append(nextop)
            result.append(reader())
            nextop = line.lookahead(0).type
        # A single operand needs no wrapper node.
        if len(result) == 1: return result[0]
        return constructor(result)
    def term():
        "Parses * and /"
        return precedence_read(IR.SequenceExpr, atom, ["*", "/"])
    def arith():
        "Parses + and -"
        return precedence_read(IR.SequenceExpr, term, ["+", "-"])
    def bits():
        "Parses &, |, and ^"
        return precedence_read(IR.SequenceExpr, arith, ["&", "|", "^"])
    # Start at the loosest-binding level.
    return bits()
def parse_line(ppt, lexemelist):
    """Turn a line of source into an IR Node.

    ppt is the source location string; lexemelist is the output of lex().
    Returns IR.NullNode, a single node, or an IR.SequenceNode."""
    Err.currentpoint = ppt
    result = []
    line = ParseLine(lexemelist)
    def aux():
        "Accumulates all IR nodes defined by this line."
        if line.lookahead(0).type == "EOL":
            pass
        elif line.lookahead(1).type == ":":
            # Named label definition (LABEL ':'): bind the label to the
            # current PC, then keep parsing the rest of the line.
            newlabel=line.expect("LABEL").value
            line.expect(":")
            result.append(IR.Node(ppt, "Label", newlabel, IR.PCExpr()))
            aux()
        elif line.lookahead(0).type == "*":
            # Anonymous (temporary) label: each '*' gets a sequential
            # machine-generated name ("*1", "*2", ...) that +/- expressions
            # in parse_expr refer to.
            global templabelcount
            templabelcount = templabelcount + 1
            result.append(IR.Node(ppt, "Label", "*"+str(templabelcount), IR.PCExpr()))
            line.expect("*")
            aux()
        elif line.lookahead(0).type == "." or line.lookahead(0).type == "`":
            # Pragma: ".name args" or the "`" shorthand for ".invoke".
            which = line.expect(".", "`").type
            if (which == "."): pragma = line.expect("LABEL").value
            else: pragma = "invoke"
            # Dispatch ".foo" to a "pragmaFoo" function on the first
            # registered module that defines one.
            pragmaFunction = "pragma"+pragma.title()
            for mod in pragma_modules:
                if hasattr(mod, pragmaFunction):
                    getattr(mod, pragmaFunction)(ppt, line, result)
                    break
            else:
                # for/else: no registered module handled this pragma.
                Err.log("Unknown pragma "+pragma)
        else: # Instruction: determine the 6502 addressing mode from the
              # argument syntax.
            opcode = line.expect("OPCODE").value
            if line.lookahead(0).type == "#":
                # "#expr" - Immediate
                mode = "Immediate"
                line.expect("#")
                arg = parse_expr(line)
                line.expect("EOL")
            elif line.lookahead(0).type == "(":
                # Parenthesized argument: one of the indirect modes.
                line.expect("(")
                arg = parse_expr(line)
                if line.lookahead(0).type == ",":
                    # "(expr, X)" - indexed indirect
                    mode = "PointerX"
                    line.expect(",")
                    line.expect("X")
                    line.expect(")")
                    line.expect("EOL")
                else:
                    line.expect(")")
                    tok = line.expect(",", "EOL").type
                    if tok == "EOL":
                        # "(expr)" - plain indirect
                        mode = "Pointer"
                    else:
                        # "(expr), Y" - indirect indexed
                        mode = "PointerY"
                        line.expect("Y")
                        line.expect("EOL")
            elif line.lookahead(0).type == "EOL":
                # No argument at all.
                mode = "Implied"
                arg = None
            else:
                # Bare expression, optionally indexed: Memory[,X|,Y].
                arg = parse_expr(line)
                tok = line.expect("EOL", ",").type
                if tok == ",":
                    tok = line.expect("X", "Y").type
                    if tok == "X": mode = "MemoryX"
                    else: mode = "MemoryY"
                    line.expect("EOL")
                else: mode = "Memory"
            result.append(IR.Node(ppt, mode, opcode, arg))
    aux()
    # Drop null nodes (e.g. from pragmas) and collapse to the simplest form.
    result = [node for node in result if node is not IR.NullNode]
    if len(result) == 0: return IR.NullNode
    if len(result) == 1: return result[0]
    return IR.SequenceNode(ppt, result)
def parse_file(ppt, filename):
    """Loads a .P65 source file, and returns an IR list.

    ppt is the source location of the directive that requested the load
    (or "<Top Level>"); on I/O failure, logs an error and returns
    IR.NullNode instead of raising."""
    Err.currentpoint = ppt
    if Cmd.verbose > 0:
        # Parenthesized form is a valid statement in Python 2 and a
        # function call in Python 3.
        print("Loading "+filename)
    try:
        # open(), not the file() builtin, which was removed in Python 3.
        f = open(filename)
        linelist = f.readlines()
        f.close()
        # One "filename:lineno" position string per source line.
        pptlist = ["%s:%d" % (filename, i+1) for i in range(len(linelist))]
        lexlist = map(lex, pptlist, linelist)
        IRlist = map(parse_line, pptlist, lexlist)
        IRlist = [node for node in IRlist if node is not IR.NullNode]
        return IR.SequenceNode(ppt, IRlist)
    except IOError:
        Err.log("Could not read "+filename)
        return IR.NullNode
def parse(filename):
    "Top level parsing routine, taking a source file name and returning an IR list."
    # Reset the anonymous-label counter so each top-level parse starts
    # numbering temporary labels from 1.
    global templabelcount
    templabelcount = 0
    return parse_file("<Top Level>", filename)