From 836227f9daa108c6a4abae855ad401785657a2e2 Mon Sep 17 00:00:00 2001
From: marcobaye <marcobaye@4df02467-bbd4-4a76-a152-e7ce94205b78>
Date: Sun, 14 Feb 2021 21:32:32 +0000
Subject: [PATCH] added python script to convert MASM to ACME syntax.

git-svn-id: https://svn.code.sf.net/p/acme-crossass/code-0/trunk@319 4df02467-bbd4-4a76-a152-e7ce94205b78
---
 contrib/toacme/masm2acme.py | 277 ++++++++++++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100755 contrib/toacme/masm2acme.py

diff --git a/contrib/toacme/masm2acme.py b/contrib/toacme/masm2acme.py
new file mode 100755
index 0000000..96d0f22
--- /dev/null
+++ b/contrib/toacme/masm2acme.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+import sys
+
+def line_preprocess(line):
+    """Split line into comment, strings and everything else.
+
+    Returns (parts, comment).  parts holds the string literals (doubled
+    quotes converted to backslash escapes) and the unquoted text between them
+    with runs of blanks collapsed; comment is the text from the first
+    unquoted ';' onwards, or None.  Raises Exception on an unterminated string.
+    """
+    result = []
+    part = ""
+    comment = None
+    quotes = None
+    skip = False    # True if lookahead has already consumed the next char
+    for pos, char in enumerate(line):
+        if skip:
+            skip = False
+            continue
+        # are we inside comment?
+        if comment:
+            comment += char
+            continue
+        # are we inside quotes?
+        if quotes:
+            if char == quotes and line[pos + 1:pos + 2] == quotes:
+                # doubled quote is a literal one ("a""b" really means a"b)
+                part += "\\" + char
+                skip = True
+                continue
+            part += char
+            if char == quotes:
+                # end of quotes: use singlequotes for 1-char strings
+                if len(part) == 3:
+                    part = "'" + part[1] + "'"
+                # escape backslash, singlequote, doublequote
+                if part == "'\\'":
+                    part = "'\\\\'"
+                elif part == "'''":
+                    part = "'\\''"
+                elif part == '"\\""':
+                    part = "'\"'"
+                result.append(part) # move quoted string to result list
+                part = ""
+                quotes = None
+            continue
+        if char == '"' or char == "'":
+            # new quotes, so finish old part and start new one
+            if part and part != ' ':
+                result.append(part)
+            part = char
+            quotes = char
+            continue
+        if char == ';':
+            # comment, so finish old part...
+            if part and part != ' ':
+                result.append(part)
+            part = ""
+            # ...and start comment
+            comment = char
+            continue
+        # skip blanks after blank, keep all other characters
+        if not (part.endswith(' ') and char == ' '):
+            part += char
+    # quotes still open at end of line?
+    if quotes:
+        raise Exception("Unterminated string constant in input data")
+    # append last part
+    if part:
+        result.append(part)
+    return result, comment
+
+def single_out(items, substring):
+    """Make every occurrence of substring in items an item of its own.
+
+    The text around each occurrence is kept; empty pieces are dropped.
+    """
+    tokens = []
+    for item in items:
+        for index, piece in enumerate(item.split(substring)):
+            if index:   # substring sits between neighbouring pieces
+                tokens.append(substring)
+            if piece:
+                tokens.append(piece)
+    return tokens
+
+def unquoted_tokenize(part):
+    """Split unquoted text into a list of tokens.
+
+    Must not be given string literals, as their contents would be split too.
+    """
+    # whitespace separates tokens and is thrown away
+    tokens = part.split()
+    # these characters are tokens of their own, even without blanks around
+    # them, so cut them out one kind after the other
+    for separator in ",/=+-*":
+        tokens = single_out(tokens, separator)
+    return tokens
+
+opcodes_to_keep = [    # convert_opcodes() only puts a tab in front of these
+    # std 6502:
+    "brk", "rti", "rts", "nop",
+    "php", "plp", "pha", "pla",
+    "bpl", "bmi", "bvc", "bvs", "bcc", "bcs", "bne", "beq",
+    "clc", "sec", "cli", "sei", "clv", "cld", "sed",
+    "dex", "dey", "inx", "iny",
+    "tax", "tay", "txa", "tya", "tsx", "txs",
+    # new in 65c02:
+    "phx", "plx", "phy", "ply", "bra"   # inc, dec
+]
+
+opcodes_with_arg = [   # get a tab in front once any addressing mode suffix is stripped
+    # std 6502:
+    "ora", "and", "eor", "adc", "sta", "lda", "cmp", "sbc",
+    "asl", "rol", "lsr", "ror", "dec", "inc",
+    "ldx", "stx", "cpx", "ldy", "sty", "cpy",
+    "jsr", "jmp", "bit",
+    # new in 65c02:
+    "tsb", "trb", "stz"
+]
+
+token_substitutions = {    # applied by process_code() to every unquoted token
+    ".": "*",   # program counter
+    ":not:": "not", # operator
+    ":eor:": "xor", # operator
+    ":msb:": ">",   # operator?
+}
+
+opcodes_to_replace = { # replacement is emitted as is, so mnemonics bring their own tab
+    "org": "*=",
+    "cpu": ";!cpu",         # TODO: support properly!
+    "=": "!tx",
+    "$": "!wo", # input has & here, but process_code() has already turned & into $
+    "end": "!eof",
+    "assert": "+assert",
+    "lnk": ";!source",      # TODO: support properly!
+    "asla": "\tasl",
+    "lsra": "\tlsr",
+    "rola": "\trol",
+    "rora": "\tror",
+    "dea": "\tdec",    # 65c02
+    "ina": "\tinc",    # 65c02
+}
+
+opcodes_to_rename = {  # looked up after the addressing mode suffix has been stripped
+    "clr": "stz"    # 65c02
+}
+
+def convert_opcodes(parts):
+    """Convert the mnemonic or pseudo opcode in parts[0] to ACME syntax.
+
+    parts is the token list of one statement, any label already removed.
+    A recognised addressing mode suffix ("im", "ax", "zx", "ay", "zy", "xi",
+    "iy", "i") is stripped from the opcode and the operand tokens are
+    decorated instead.  If there are no operand tokens to decorate, the
+    opcode is passed through with its suffix intact.  Returns a token list;
+    the list passed in is not modified.
+    """
+    if not parts:
+        return parts
+    op = parts[0]
+    parts = parts[1:]
+    if op in opcodes_to_keep:
+        return ["\t" + op] + parts
+    if op in opcodes_to_replace:
+        return [opcodes_to_replace[op]] + parts
+    # suffix -> (text put before first operand token, text put after last one)
+    wrappers = {
+        "ax": ("", ", x"), "zx": ("", ", x"),
+        "ay": ("", ", y"), "zy": ("", ", y"),
+        "xi": ("(", ", x)"), "iy": ("(", "), y"), "i": ("(", ")"),
+    }
+    # "jmi" and "jmix" are handled as if they were spelled "jmpi" and "jmpxi"
+    fullop = {"jmi": "jmpi", "jmix": "jmpxi"}.get(op, op)
+    # without operand tokens there is nothing to decorate (and nothing to index)
+    am = fullop[3:] if parts else ""
+    if am == "im":
+        op = fullop[:3]
+        if parts[0] == '/':
+            parts[0] = "#>"
+        elif parts[0][0] >= 'a' and parts[0][0] <= 'z':
+            parts[0] = "#< " + parts[0]
+        else:
+            parts[0] = "#" + parts[0]
+    elif am in wrappers:
+        op = fullop[:3]
+        before, after = wrappers[am]
+        parts[0] = before + parts[0]
+        parts[-1] = parts[-1] + after
+    # any other suffix is not an addressing mode, so op stays as it was
+    # convert
+    if op in opcodes_to_rename:
+        op = opcodes_to_rename[op]
+    if op in opcodes_with_arg:
+        return ["\t" + op] + parts
+    return [op] + parts
+
+def process_code(parts):
+    """Convert the code portion of a line, i.e. everything before the comment.
+
+    parts is the non-empty list of strings from line_preprocess(): string
+    literals and the unquoted text around them.  Returns (label, tokens):
+    label is the text for the start of the output line (ending in a tab if
+    tokens is not empty) and tokens is the list of converted tokens.
+    The caller joins the tokens with blanks and puts label in front.
+    """
+    # remember if line starts with whitespace, because the tokenizer drops
+    # all of it.  Tabs count as well as blanks: a tab-indented mnemonic must
+    # not be taken for a label, and a line holding just a tab has no tokens.
+    indented = parts[0][:1].isspace()
+    result = []
+    for part in parts:
+        # do not process quoted strings any further
+        if part.startswith(("'", '"')):
+            result.append(part)
+            continue
+        # convert to lower case and substitute: & becomes $
+        part = part.lower().replace("&", "$")
+        # split up into tokens and convert some of them (string literals
+        # would not be in danger anyway, as they include quotes)
+        for token in unquoted_tokenize(part):
+            result.append(token_substitutions.get(token, token))
+    # now convert
+    label = ""
+    if indented:
+        # code
+        result = convert_opcodes(result)
+    elif len(result) > 1 and result[1] == "*":
+        # symbol definition: name, "*", value (a "." in the input has
+        # become "*" as well by now)
+        symdef = result[0] + "\t="
+        result = [symdef] + convert_opcodes(result[2:])
+    else:
+        # label, possibly with code following it on the same line; there is
+        # at least one token, as the line starts with a non-whitespace char
+        label = result[0]
+        result = convert_opcodes(result[1:])
+    if result:
+        label = label + "\t"
+    return label, result
+
+def process_line(line):
+    """Convert one line of MASM source and return it in ACME syntax.
+
+    Trailing CR/LF characters are dropped; the result ends in one newline.
+    """
+    stripped = line.rstrip("\r\n")
+    # step 1: split into strings, comment and everything else
+    codeparts, comment = line_preprocess(stripped)
+    # step 2: convert whatever precedes the comment and reassemble it
+    converted = ""
+    if codeparts:
+        prefix, tokens = process_code(codeparts)
+        converted = prefix + " ".join(tokens)
+    # step 3: the comment is passed through unchanged
+    if comment:
+        converted = converted + comment
+    return converted + "\n"
+
+def convert_file(input, output):
+    """Convert the file named input to the file named output, line by line."""
+    with open(input, "rt") as infile, open(output, "wt") as outfile:
+        # every output file starts with this line
+        outfile.write(";ACME 0.97\n")
+        # the rest is the input, converted one line at a time
+        outfile.writelines(process_line(line) for line in infile)
+
+if __name__ == '__main__':
+    args = sys.argv[1:]
+    if len(args) != 2:
+        sys.exit("Error: wrong number of arguments\n"
+                 "\n"
+                 "masm2acme.py converts a file from MASM to ACME syntax.\n"
+                 "Usage: masm2acme.py INPUTFILE OUTPUTFILE\n")
+    # both arguments are file names: source first, destination second
+    convert_file(args[0], args[1])