prog8/il65/plylex.py

373 lines
8.1 KiB
Python
Raw Normal View History

"""
Programming Language for 6502/6510 microprocessors, codename 'Sick'
This is the lexer for IL65 source code; it generates a stream of tokens for the parser.

Written by Irmen de Jong (irmen@razorvine.net) - license: GNU GPL 3.0
"""
2018-01-11 23:55:47 +00:00
import ast
2018-01-03 20:43:19 +00:00
import sys
import ply.lex
2018-01-07 22:45:42 +00:00
import attr
@attr.s(slots=True, frozen=True)
class SourceRef:
    """Immutable reference to a position in a source file."""

    # name of the source file
    file = attr.ib(type=str)
    # line number; a falsy value means "no line known"
    line = attr.ib(type=int)
    # column number; defaults to 0, which means "no column known"
    column = attr.ib(type=int, default=0)

    def __str__(self) -> str:
        """Format as file:line:column, file:line, or just file.

        The column form is used when the column is truthy, otherwise the
        line form when the line is truthy, otherwise only the file name.
        """
        if not self.column:
            if not self.line:
                return self.file
            return "{:s}:{:d}".format(self.file, self.line)
        return "{:s}:{:d}:{:d}".format(self.file, self.line, self.column)
2018-01-07 18:14:21 +00:00
2018-01-03 20:43:19 +00:00
# token names (order is kept exactly as declared originally)
tokens = (
    # literals, names and structural tokens
    "INTEGER", "FLOATINGPOINT", "DOTTEDNAME", "NAME", "IS",
    "CLOBBEREDREGISTER", "REGISTER", "COMMENT", "DIRECTIVE", "AUGASSIGN",
    "EQUALS", "NOTEQUALS",
    "RARROW", "RETURN", "VARTYPE", "SUB", "DATATYPE",
    "CHARACTER", "STRING", "BOOLEAN", "GOTO",
    # increment / decrement and comparisons
    "INCR", "DECR", "LT", "GT", "LE", "GE",
    # bitwise operators
    "BITAND", "BITOR", "BITXOR", "BITINVERT", "SHIFTLEFT", "SHIFTRIGHT",
    # logical operators
    "LOGICAND", "LOGICOR", "LOGICXOR", "LOGICNOT",
    # arithmetic operators that are not single-character literals
    "INTEGERDIVIDE", "MODULO", "POWER",
    # statements and line structure
    "LABEL", "IF", "PRESERVEREGS", "INLINEASM", "ENDL",
)
# single-character literal tokens
literals = list("+-*/()[]{}.,!?:")

# states for allowing %asm inclusion of raw assembly
states = (
    ("inlineasm", "exclusive"),
)

# ignore inline whitespace (the inlineasm state also ignores line breaks)
t_ignore = " \t"
t_inlineasm_ignore = " \t\r\n"

# regex rules for simple tokens: assignment and comparison
t_IS = r"="
t_AUGASSIGN = r"\+=|-=|/=|//=|\*=|\*\*=|<<=|>>=|&=|\|=|\^="
t_EQUALS = r"=="
t_NOTEQUALS = r"!="
t_LT = r"<"
t_GT = r">"
t_LE = r"<="
t_GE = r">="

# regex rules for simple tokens: bitwise operators
t_SHIFTLEFT = r"<<"
t_SHIFTRIGHT = r">>"
t_BITAND = r"&"
t_BITOR = r"\|"
t_BITXOR = r"\^"
t_BITINVERT = r"~"

# regex rules for simple tokens: arithmetic and miscellaneous
t_INTEGERDIVIDE = r"//"
t_POWER = r"\*\*"
t_INCR = r"\+\+"
t_DECR = r"--"
t_RARROW = r"->"
t_IF = "if(_[a-z]+)?"
# reserved words: maps the source spelling to the token type it lexes as
reserved = {
    "sub": "SUB",
    "goto": "GOTO",
    "return": "RETURN",
    "not": "LOGICNOT",
    "and": "LOGICAND",
    "or": "LOGICOR",
    "xor": "LOGICXOR",
    "mod": "MODULO",
}
# variable declaration keywords
reserved.update(dict.fromkeys(("var", "memory", "const"), "VARTYPE"))
# boolean literals
reserved.update(dict.fromkeys(("true", "false"), "BOOLEAN"))
# register names
reserved.update(dict.fromkeys(
    ("AX", "AY", "XY", "SC", "SI", "SZ", "A", "X", "Y"), "REGISTER"))
# plain and conditional 'if' forms
# NOTE(review): "if_get" has no obvious counterpart to "if_pos" -- possibly
# meant to be "if_neg"; confirm against the parser before changing it.
reserved.update(dict.fromkeys(
    ("if", "if_true", "if_not", "if_zero", "if_ne", "if_eq",
     "if_cc", "if_cs", "if_vc", "if_vs", "if_ge", "if_le",
     "if_gt", "if_lt", "if_pos", "if_get"), "IF"))
# rules for tokens with some actions
def t_inlineasm(t):
    r"""%asm\s*\{[^\S\n]*"""
    # Start of an inline assembly block: remember where the raw code begins
    # and switch the lexer to the exclusive 'inlineasm' state.  No token is
    # returned here; the INLINEASM token is produced by the closing brace.
    #
    # The `\s*` between '%asm' and '{' may span line breaks.  The closing-brace
    # rule only counts newlines from code_start onwards, so the ones matched
    # here must be counted now or every later line number is too low.
    t.lexer.lineno += t.value.count("\n")
    t.lexer.code_start = t.lexer.lexpos     # Record start position
    t.lexer.level = 1                       # initial brace level
    t.lexer.begin("inlineasm")              # enter state 'inlineasm'
def t_inlineasm_lbrace(t):
    r"""\{"""
    # A nested opening brace inside the assembly block: go one level deeper.
    # Nothing is returned, so no token is emitted for it.
    depth = t.lexer.level
    t.lexer.level = depth + 1
def t_inlineasm_rbrace(t):
    r"""\}"""
    # A closing brace inside the assembly block.  Only the brace that brings
    # the nesting level back to zero ends the block and yields a token.
    lex = t.lexer
    lex.level -= 1
    if lex.level != 0:
        return None
    # Everything from the recorded start up to (not including) this brace
    # is the raw assembly code fragment.
    fragment = lex.lexdata[lex.code_start:lex.lexpos - 1]
    t.value = fragment
    t.type = "INLINEASM"
    lex.lineno += fragment.count("\n")
    lex.begin("INITIAL")    # back to normal lexing rules
    return t
def t_inlineasm_comment(t):
    r""";[^\n]*"""
    # Assembly comment (semicolon to end of line): matched so that braces or
    # quotes inside it are not interpreted, then discarded.
    return None
def t_inlineasm_string(t):
    r"""
    (?<!\\)     # not preceded by a backslash
    "           # a literal double-quote
    .*?         # zero or more characters, as few as possible
    (?<!\\)     # not preceded by a backslash
    "           # a literal double-quote
    |
    (?<!\\)     # not preceded by a backslash
    '           # a literal single quote
    .*?         # zero or more characters, as few as possible
    (?<!\\)     # not preceded by a backslash
    '           # a literal single quote
    """
    # Quoted string inside an assembly block: matched so that braces inside
    # it do not affect the brace level, then discarded.
    #
    # The pattern deliberately has no inline '(?x)' flag.  PLY embeds every
    # rule in a named group of one master pattern, which puts an inline global
    # flag in the middle of the expression; Python 3.11+ rejects that.  The
    # lexer is built with PLY's default flags, which already enable verbose
    # mode, so the whitespace and comments above are still ignored.
    return None
def t_inlineasm_nonspace(t):
    r"""[^\s\{\}\'\"]+"""
    # Any run of characters that is not whitespace, a brace or a quote:
    # ordinary assembly text, consumed without producing a token.
    return None
def t_inlineasm_error(t):
    # Characters that no 'inlineasm' rule matches are not reported;
    # the lexer simply moves one character ahead.
    skip_one = t.lexer.skip
    skip_one(1)
def t_CLOBBEREDREGISTER(t):
    r"""(AX|AY|XY|A|X|Y)\?"""
    # A register name followed by '?'; the token value is the register name
    # with the trailing question mark removed.
    matched = t.value
    t.value = matched[:-1]
    return t
def t_DATATYPE(t):
    r"""\.byte|\.wordarray|\.float|\.array|\.word|\.text|\.stext|\.ptext|\.pstext|\.matrix"""
    # A datatype keyword written with a leading dot; the token value is the
    # keyword without that dot.
    keyword = t.value[1:]
    t.value = keyword
    return t
def t_LABEL(t):
    r"""[a-zA-Z_]\w*\s*:"""
    # An identifier followed by a colon (whitespace allowed in between);
    # the token value is just the identifier.
    without_colon = t.value[:-1]
    t.value = without_colon.strip()
    return t
2018-01-09 23:44:11 +00:00
def t_BOOLEAN(t):
    r"""(?:true|false)\b"""
    # Boolean literal; the token value becomes the Python bool it denotes.
    #
    # The trailing word boundary matters because this rule is tried before
    # the NAME rule: without it an identifier that merely starts with 'true'
    # or 'false' (e.g. 'falsepositive') would be split into a BOOLEAN
    # followed by a NAME.
    t.value = t.value == "true"
    return t
2018-01-03 20:43:19 +00:00
def t_DOTTEDNAME(t):
    r"""[a-zA-Z_]\w*(\.[a-zA-Z_]\w*)+"""
    # Two or more identifiers joined by dots.  None of the components may be
    # a reserved word; if one is, the error is reported and the token dropped.
    #
    # The pattern accepts any number of components, so every part is checked
    # rather than unpacking exactly two (which raised ValueError on 'a.b.c').
    if any(part in reserved for part in t.value.split(".")):
        custom_error(t, "reserved word as part of dotted name")
        return None
    return t
def t_NAME(t):
    r"""[a-zA-Z_]\w*"""
    # Identifier.  Reserved words match this pattern too, so the token type
    # is taken from the reserved-word table when the text is listed there.
    try:
        t.type = reserved[t.value]
    except KeyError:
        t.type = "NAME"
    return t
2018-01-05 21:52:23 +00:00
def t_DIRECTIVE(t):
    r"""%[a-z]+\b"""
    # A percent sign followed by lowercase letters; the token value is the
    # directive name without the percent sign.
    directive_name = t.value[1:]
    t.value = directive_name
    return t
2018-01-03 20:43:19 +00:00
def t_STRING(t):
    r"""
    (?<!\\)     # not preceded by a backslash
    "           # a literal double-quote
    .*?         # zero or more characters, as few as possible
    (?<!\\)     # not preceded by a backslash
    "           # a literal double-quote
    |
    (?<!\\)     # not preceded by a backslash
    '           # a literal single quote
    .*?         # zero or more characters, as few as possible
    (?<!\\)     # not preceded by a backslash
    '           # a literal single quote
    """
    # Quoted string literal.  The token value is the evaluated string; the
    # token type is changed to CHARACTER for single-character values.
    #
    # The pattern deliberately has no inline '(?x)' flag.  PLY embeds every
    # rule in a named group of one master pattern, which puts an inline global
    # flag in the middle of the expression; Python 3.11+ rejects that.  The
    # lexer is built with PLY's default flags, which already enable verbose
    # mode, so the whitespace and comments above are still ignored.
    #
    # The matched text uses Python string-literal syntax, so literal_eval
    # resolves the quotes and escape sequences (it only evaluates literals).
    t.value = ast.literal_eval(t.value)
    if len(t.value) == 1:
        t.type = "CHARACTER"
    # a two-character value that starts with a backslash is also a CHARACTER
    if len(t.value) == 2 and t.value[0] == '\\':
        t.type = "CHARACTER"
    return t
def t_FLOATINGPOINT(t):
    r"""((?: (?: \d* \. \d+ ) | (?: \d+ \.? ) )(?: [Ee] [+-]? \d+ ) ?)(?![a-z])"""
    # Decimal number.  The pattern also matches plain digit runs, so text
    # that int() accepts is turned into an INTEGER token; anything else is
    # converted with float() and keeps its token type.
    text = t.value
    try:
        whole = int(text)
    except ValueError:
        t.value = float(text)
    else:
        t.value = whole
        t.type = "INTEGER"
    return t
def t_INTEGER(t):
    r"""\$?[a-fA-F\d]+ | [\$%]?\d+ | %?[01]+"""
    # Integer literal: '$' prefix means hexadecimal, '%' prefix means binary,
    # no prefix means decimal.  The token value becomes the Python int.
    #
    # NOTE(review): the pattern never matches a leading '+' or '-', so the
    # sign handling below looks unreachable from the lexer -- confirm.
    # NOTE(review): the first alternative also accepts hex digits without a
    # '$' prefix, which the decimal conversion rejects with ValueError --
    # confirm whether that input is expected.
    text = t.value
    negative = False
    if text[0] in "+-":
        negative = text[0] == "-"
        text = text[1:]
    radix = {"$": 16, "%": 2}.get(text[0])
    if radix is None:
        magnitude = int(text)
    else:
        magnitude = int(text[1:], radix)
    t.value = -magnitude if negative else magnitude
    return t
def t_COMMENT(t):
    r"""[ \t]*;[^\n]*"""  # dont eat newline
    # Comments are recognised but never handed to the parser: returning
    # nothing discards the match.
    return
2018-01-03 20:43:19 +00:00
def t_PRESERVEREGS(t):
    r"""!\s*[AXY]{0,3}\s*(?!=)"""
    # An exclamation mark optionally followed by up to three of the register
    # letters A, X, Y.  The token value is the register letters only (an
    # empty string when none are given).
    #
    # Both `\s*` parts may match line breaks, so count them to keep the
    # lexer's line numbers right.
    t.lexer.lineno += t.value.count("\n")
    # Drop only the leading '!'; strip() removes any surrounding whitespace.
    # (Slicing off the last character as well would lose a register letter
    # whenever the match does not end in whitespace, e.g. '!AXY'.)
    t.value = t.value[1:].strip()
    return t
2018-01-05 21:52:23 +00:00
def t_ENDL(t):
    r"""\n+"""
    # One or more consecutive line breaks.  Every one of them advances the
    # line counter, but they are collapsed into a single "\n" token value.
    newline_count = len(t.value)
    t.value = "\n"
    t.lexer.lineno += newline_count
    return t    # end of lines are significant to the parser
2018-01-03 20:43:19 +00:00
def t_error(t):
    # Called for input that no rule matches: report the offending character
    # (through the lexer's error_function when one is attached, otherwise by
    # printing) and then skip that one character so lexing can continue.
    where = SourceRef(
        getattr(t.lexer, "source_filename", "<unknown-file>"),
        t.lineno,
        find_tok_column(t),
    )
    if not hasattr(t.lexer, "error_function"):
        print("{}: illegal character '{:s}'".format(where, t.value[0]))
    else:
        t.lexer.error_function(where, "illegal character '{:s}'", t.value[0])
    t.lexer.skip(1)
def custom_error(t, message):
    """Report a lexer error detected by a token rule.

    The message goes to the lexer's ``error_function`` when one is attached
    (called with the source reference and the message); otherwise the source
    reference and message are printed.

    Unlike ``t_error``, this does not skip any input.  It is called from a
    token rule, at which point the lexer position is already past the
    offending token; skipping one more character there would silently drop
    whatever follows it (for instance the line break, which would also leave
    the line count one short).
    """
    line, col = t.lineno, find_tok_column(t)
    filename = getattr(t.lexer, "source_filename", "<unknown-file>")
    sref = SourceRef(filename, line, col)
    if hasattr(t.lexer, "error_function"):
        t.lexer.error_function(sref, message)
    else:
        print(sref, message)
2018-01-03 20:43:19 +00:00
def find_tok_column(token):
    """Find the 1-based column of the token in its line (tabs expanded).

    The input text is taken from the lexer attached to the token when it has
    one, and from the module-level lexer otherwise.
    """
    source = getattr(token, "lexer", None)
    if source is None:
        source = lexer
    # rfind returns -1 when the token is on the first line, so adding 1 gives
    # the index of the first character of the token's line in every case.
    # (Slicing from the raw rfind result made first-line tokens report
    # column 0 while all other lines were 1-based.)
    line_start = source.lexdata.rfind('\n', 0, token.lexpos) + 1
    chunk = source.lexdata[line_start:token.lexpos]
    return len(chunk.expandtabs()) + 1
2018-01-03 20:43:19 +00:00
2018-01-08 00:51:36 +00:00
def print_warning(text: str, sourceref: SourceRef = None) -> None:
    """Print a warning in bold, prefixed with the source reference when one is given."""
    if sourceref:
        message = "warning: {}: {:s}".format(sourceref, text)
    else:
        message = "warning: " + text
    print_bold(message)
def print_bold(text: str) -> None:
    """Print text to stdout, wrapped in ANSI bold escape codes when stdout is a terminal."""
    if not sys.stdout.isatty():
        # not a terminal: emit the text unchanged
        print(text)
        return
    print("\x1b[1m" + text + "\x1b[0m", flush=True)
2018-01-03 20:43:19 +00:00
# Build the module-level lexer.  find_tok_column reads the input text from
# this object (lexer.lexdata).  The call is made after all rule definitions
# above -- presumably because PLY collects them from this module; see the
# PLY documentation.
lexer = ply.lex.lex()

if __name__ == "__main__":
    # run PLY's lexer test driver when this file is executed as a script
    ply.lex.runmain()