From 76101d7f8dd254b944e5ac5fdfed9385e971da9e Mon Sep 17 00:00:00 2001
From: Irmen de Jong <irmen@razorvine.net>
Date: Tue, 5 Jan 2021 22:56:52 +0100
Subject: [PATCH] assem

---
 docs/source/todo.rst                   |   2 +
 examples/cx16/assembler/Makefile       |   8 +
 examples/cx16/assembler/assem.p8       | 102 ++++---
 examples/cx16/assembler/gen_opcodes.py | 358 +++++++++++++------------
 examples/cx16/assembler/hashes.py      |  17 ++
 examples/cx16/assembler/perfecthash.py | 180 +++++++++++++
 6 files changed, 450 insertions(+), 217 deletions(-)
 create mode 100644 examples/cx16/assembler/Makefile
 create mode 100644 examples/cx16/assembler/hashes.py
 create mode 100644 examples/cx16/assembler/perfecthash.py

diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 693c3982e..2e829a984 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -3,6 +3,8 @@ TODO
 ====
 
 - move all str* builtin functions to a strings library module, mem* to the sys module. update docs.
+- use (zp) addressing mode on 65c02 specific code rather than ldy#0 / lda (zp),y
+- optimize pointer access code @(pointer)? use a subroutine? macro?  65c02 vs 6502?
 - can we get rid of the --longOptionName command line options and only keep the short versions? https://github.com/Kotlin/kotlinx-cli/issues/50
 - detect variables that are written but never read - mark those as unused too and remove them, such as uword unused = memory("unused222", 20) - also remove the memory slab allocation
 - hoist all variable declarations up to the subroutine scope *before* even the constant folding takes place (to avoid undefined symbol errors when referring to a variable from another nested scope in the subroutine)
diff --git a/examples/cx16/assembler/Makefile b/examples/cx16/assembler/Makefile
new file mode 100644
index 000000000..1c3a152cf
--- /dev/null
+++ b/examples/cx16/assembler/Makefile
@@ -0,0 +1,8 @@
+all: perfecthash.c opcodes.asm
+
+perfecthash.c:  gen_opcodes.py
+	python gen_opcodes.py --mnemlist | gperf --no-strlen --null-strings -7 -C -E -G -m 100 > perfecthash.c
+
+opcodes.asm:  gen_opcodes.py
+	python gen_opcodes.py --parser > opcodes.asm
+
diff --git a/examples/cx16/assembler/assem.p8 b/examples/cx16/assembler/assem.p8
index b3700d898..22c06a628 100644
--- a/examples/cx16/assembler/assem.p8
+++ b/examples/cx16/assembler/assem.p8
@@ -61,12 +61,8 @@ textparse {
             return
         }
 
-        uword value = parse_number(word_addrs[2])
+        uword value = conv.any2uword(word_addrs[2])
         if strcmp("*", word_addrs[0])==0 {
-            if value == $ffff {
-                txt.print("?invalid address\n")
-                return
-            }
             program_counter = value
         } else {
             set_symbol(word_addrs[0], value)
@@ -151,19 +147,6 @@ textparse {
                     emit(lsb(cx16.r0))
                     emit(msb(cx16.r0))
                 }
-                repeat 2-num_operand_bytes {
-                    txt.print("   ")
-                }
-                txt.chrout(' ')
-                txt.print(word_addrs[0])
-                if word_addrs[1] {
-                    txt.chrout(' ')
-                    txt.print(word_addrs[1])
-                }
-                if word_addrs[2] {
-                    txt.chrout(' ')
-                    txt.print(word_addrs[2])
-                }
                 txt.chrout('\n')
             }
         } else {
@@ -207,7 +190,6 @@ textparse {
         ; -- returns true/false success status,  the value is in cx16.r0 if succesful
         ; TODO number parsing error detection
         ; TODO optimize this (coalesce various parsing options)
-        ; TODO fix number parsing by ending the number with \0 after the last digit
 
         when addr_mode {
             instructions.am_Imp, instructions.am_Acc -> {
@@ -216,49 +198,76 @@ textparse {
             }
             instructions.am_Imm -> {
                 ; lda #$12
-                cx16.r0 = parse_number(operand_ptr+1)
+                terminate_number(operand_ptr+1)
+                cx16.r0 = conv.any2uword(operand_ptr+1)
                 debug_print_value(operand_ptr+1)
                 return true
             }
-            instructions.am_Zp, instructions.am_Zpr -> {
-                ; lda  $02 / brr0 $12,label
-                cx16.r0 = parse_number(operand_ptr)
+            instructions.am_Zp -> {
+                ; lda  $02
+                terminate_number(operand_ptr)
+                cx16.r0 = conv.any2uword(operand_ptr)
+                debug_print_value(operand_ptr)
+                return true
+            }
+            instructions.am_Zpr -> {
+                ; brr0 $12,label
+                ; TODO parse the label, relative offset
+                terminate_number(operand_ptr)
+                cx16.r0 = conv.any2uword(operand_ptr)
                 debug_print_value(operand_ptr)
                 return true
             }
             instructions.am_ZpX, instructions.am_ZpY -> {
                 ; lda $02,x / lda $02,y
-                cx16.r0 = parse_number(operand_ptr)
+                ; TODO parse the ,x/y
+                terminate_number(operand_ptr)
+                cx16.r0 = conv.any2uword(operand_ptr)
                 debug_print_value(operand_ptr)
                 return true
             }
             instructions.am_Rel -> {
-                cx16.r0 = parse_number(operand_ptr)
+                ; bcc  $c000
+                terminate_number(operand_ptr)
+                cx16.r0 = conv.any2uword(operand_ptr)
                 ; TODO calcualate relative offset to current programcounter
                 debug_print_value(operand_ptr)
                 return true
             }
             instructions.am_Abs -> {
                 ; jmp $1234
-                cx16.r0 = parse_number(operand_ptr)
+                terminate_number(operand_ptr)
+                cx16.r0 = conv.any2uword(operand_ptr)
                 debug_print_value(operand_ptr)
                 return true
             }
             instructions.am_AbsX, instructions.am_AbsY -> {
                 ; sta $3000,x / sta $3000,y
-                cx16.r0 = parse_number(operand_ptr)
+                ; TODO parse the ,x/,y
+                terminate_number(operand_ptr)
+                cx16.r0 = conv.any2uword(operand_ptr)
                 debug_print_value(operand_ptr)
                 return true
             }
             instructions.am_Ind  -> {
                 ; jmp ($fffc)
-                cx16.r0 = parse_number(operand_ptr+1)
+                terminate_number(operand_ptr+1)
+                cx16.r0 = conv.any2uword(operand_ptr+1)
                 debug_print_value(operand_ptr+1)
                 return true
             }
-            instructions.am_IzX, instructions.am_IzY, instructions.am_Izp, instructions.am_IaX  -> {
-                ; lda ($02,x) / lda ($02),y / lda ($02) / jmp ($a000,x)
-                cx16.r0 = parse_number(operand_ptr+1)
+            instructions.am_IzX, instructions.am_IzY, instructions.am_IaX  -> {
+                ; lda ($02,x) / lda ($02),y / jmp ($a000,x)
+                ; TODO parse the ,x/,y
+                terminate_number(operand_ptr+1)
+                cx16.r0 = conv.any2uword(operand_ptr+1)
+                debug_print_value(operand_ptr+1)
+                return true
+            }
+            instructions.am_Izp  -> {
+                ; lda ($02)
+                terminate_number(operand_ptr+1)
+                cx16.r0 = conv.any2uword(operand_ptr+1)
                 debug_print_value(operand_ptr+1)
                 return true
             }
@@ -275,14 +284,20 @@ textparse {
         }
     }
 
-
-    sub parse_number(uword strptr) -> uword {
-        ; TODO move to conv module and optimize
-        if @(strptr)=='$'
-            return conv.hex2uword(strptr)
-        if @(strptr)=='%'
-            return conv.bin2uword(strptr)
-        return conv.str2uword(strptr)
+    sub terminate_number(uword strptr) {
+        ; replace the first terminating character after a number (such as a , or close parens)
+        ;  with a 0 to terminate the number and make the parse routine happy.
+        ; TODO remove this once the various conv routines are more robust and stop at a non-digit
+        repeat {
+            when @(strptr) {
+                0 -> return
+                ',', ')', ' ', 9, '\n' -> {
+                    @(strptr) = 0
+                    return
+                }
+            }
+            strptr++
+        }
     }
 
     sub split_input() {
@@ -442,8 +457,9 @@ instructions {
             'a' -> {
                 if @(operand_ptr+1) == 0
                     return am_Acc
-                ; some expression TODO
-                return am_Invalid
+                ; some expression
+                ; zp or absolute depends on the value of the symbol referenced
+                return am_Invalid       ; TODO
             }
             '#' -> {
                 if @(operand_ptr+1)
@@ -452,24 +468,28 @@ instructions {
             }
             '(' -> {
                 ; some indirect TODO
+                ; can be (zp), (zp,x), (zp),y, (abs), (abs,x)
                 if @(operand_ptr+1)
                     return am_Ind
                 return am_Invalid
             }
             '$' -> {
                 ; hex address TODO
+                ; can be followed by ,x or ,y
                 if @(operand_ptr+1)
                     return am_Abs
                 return am_Invalid
             }
             '%' -> {
                 ; bin address TODO
+                ; can be followed by ,x or ,y
                 if @(operand_ptr+1)
                     return am_Abs
                 return am_Invalid
             }
             '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' -> {
                 ; absolute or indexed address TODO
+                ; can be followed by ,x or ,y
                 return am_Abs
             }
         }
diff --git a/examples/cx16/assembler/gen_opcodes.py b/examples/cx16/assembler/gen_opcodes.py
index c91705bab..eae72d2e3 100644
--- a/examples/cx16/assembler/gen_opcodes.py
+++ b/examples/cx16/assembler/gen_opcodes.py
@@ -1,3 +1,4 @@
+import sys
 from collections import Counter
 from enum import IntEnum
 
@@ -299,198 +300,203 @@ for ins in Instructions:
     else:
         InstructionsByMode[ins[2]].append((ins[1], ins[0]))
 
-# build the name->modes table
 
-print("; generated by opcodes.py")
-print("; addressing modes:")
-for mode in AddrMode:
-    print(";", mode.value, "=", mode.name)
-print()
+def generate_mnemonics_parser():
+    print("; generated by opcodes.py")
+    print("; addressing modes:")
+    for mode in AddrMode:
+        print(";", mode.value, "=", mode.name)
+    print()
 
-print("""
-    .enc "petscii"  ;define an ascii to petscii encoding
-    .cdef " @", 32  ;characters
-    .cdef "AZ", $c1
-    .cdef "az", $41
-    .cdef "[[", $5b
-    .cdef "]]", $5d
-    .edef "<nothing>", [];replace with no bytes
-""")
+    print("""
+        .enc "petscii"  ;define an ascii to petscii encoding
+        .cdef " @", 32  ;characters
+        .cdef "AZ", $c1
+        .cdef "az", $41
+        .cdef "[[", $5b
+        .cdef "]]", $5d
+        .edef "<nothing>", [];replace with no bytes
+    """)
 
-for instr in sorted(InstructionsByName.items()):
-    print("i_" + instr[0] + ":\n\t.byte  ", end="")
-    if len(instr[1]) == 1:
-        # many instructions have just 1 addressing mode, save space for those
-        info = instr[1].popitem()
-        print("1,", info[0].value,",", info[1])
-    else:
-        print("0, ", end='')
-        mode_opcodes = []
-        for mode in AddrMode:
-            if mode in instr[1]:
-                mode_opcodes.append(instr[1][mode])
-            else:
-                mode_opcodes.append(0)
-        print(",".join(str(o) for o in mode_opcodes), end="")
-        print()
+    for instr in sorted(InstructionsByName.items()):
+        print("i_" + instr[0] + ":\n\t.byte  ", end="")
+        if len(instr[1]) == 1:
+            # many instructions have just 1 addressing mode, save space for those
+            info = instr[1].popitem()
+            print("1,", info[0].value,",", info[1])
+        else:
+            print("0, ", end='')
+            mode_opcodes = []
+            for mode in AddrMode:
+                if mode in instr[1]:
+                    mode_opcodes.append(instr[1][mode])
+                else:
+                    mode_opcodes.append(0)
+            print(",".join(str(o) for o in mode_opcodes), end="")
+            print()
 
+    def determine_mnemonics():
+        mnemonics = list(sorted(set(ins[1] for ins in Instructions)))
 
-def determine_mnemonics():
-    mnemonics = list(sorted(set(ins[1] for ins in Instructions)))
+        # opcodes histogram (ordered by occurrence)  (in kernal + basic roms of the c64):
+        opcode_occurrences = [
+            (32, 839), (133, 502), (165, 488), (0, 429), (208, 426), (169, 390), (76, 324), (240, 322), (2, 314), (160, 245),
+            (96, 228), (3, 201), (1, 191), (255, 186), (144, 182), (170, 175), (162, 169), (177, 165), (104, 159), (164, 158),
+            (132, 157), (201, 156), (72, 151), (141, 150), (200, 146), (173, 144), (166, 139), (176, 139), (16, 138),
+            (134, 138), (73, 127), (24, 119), (101, 113), (69, 109), (13, 107), (34, 104), (145, 103), (4, 102), (168, 101),
+            (221, 98), (230, 93), (48, 91), (189, 87), (41, 86), (6, 86), (9, 86), (8, 85), (79, 85), (138, 80), (10, 80),
+            (7, 79), (185, 77), (56, 75), (44, 75), (78, 74), (105, 73), (5, 73), (174, 73), (220, 71), (198, 69), (232, 69),
+            (36, 69), (202, 67), (152, 67), (95, 67), (100, 65), (102, 65), (247, 65), (188, 64), (136, 64), (84, 64),
+            (122, 62), (128, 61), (80, 61), (186, 60), (82, 59), (97, 58), (15, 57), (70, 57), (229, 56), (19, 55), (40, 54),
+            (183, 54), (65, 54), (233, 53), (180, 53), (12, 53), (171, 53), (197, 53), (83, 52), (248, 52), (112, 51),
+            (237, 51), (89, 50), (11, 50), (158, 50), (74, 49), (224, 48), (20, 47), (238, 47), (108, 46), (234, 46),
+            (251, 46), (254, 46), (184, 45), (14, 44), (163, 44), (226, 43), (211, 43), (88, 43), (98, 42), (17, 42),
+            (153, 42), (243, 41), (228, 41), (99, 41), (253, 41), (209, 41), (187, 39), (123, 39), (67, 39), (196, 38),
+            (68, 38), (35, 38), (172, 38), (175, 38), (161, 38), (85, 38), (191, 37), (113, 37), (182, 37), (151, 37),
+            (71, 36), (181, 35), (214, 35), (121, 35), (157, 35), (178, 35), (77, 35), (42, 34), (212, 33), (18, 33),
+            (127, 33), (241, 33), (21, 33), (249, 32), (23, 31), (245, 30), (142, 30), (55, 29), (140, 29), (46, 29),
+            (192, 29), (179, 29), (252, 29), (115, 29), (22, 29), (43, 28), (215, 28), (45, 28), (246, 28), (38, 28),
+            (86, 27), (225, 27), (25, 26), (239, 26), (58, 26), (167, 26), (147, 26), (217, 26), (149, 25), (30, 25),
+            (206, 25), (28, 24), (47, 24), (37, 24), (155, 24), (129, 23), (148, 23), (111, 23), (29, 23), (39, 23),
+            (51, 22), (193, 22), (236, 22), (120, 22), (64, 22), (204, 21), (210, 21), (244, 21), (52, 21), (66, 21),
+            (114, 20), (250, 20), (106, 20), (93, 19), (199, 19), (218, 19), (154, 19), (205, 19), (50, 19), (159, 19),
+            (194, 19), (49, 19), (190, 19), (103, 18), (216, 18), (213, 18), (107, 18), (131, 18), (63, 18), (94, 18),
+            (91, 17), (242, 17), (109, 17), (53, 16), (227, 16), (139, 16), (31, 16), (75, 16), (60, 16), (195, 15),
+            (231, 15), (62, 15), (59, 15), (87, 14), (207, 14), (27, 14), (90, 14), (110, 13), (223, 13), (57, 13),
+            (118, 12), (26, 12), (203, 12), (81, 12), (156, 12), (54, 12), (235, 12), (146, 11), (135, 11), (126, 11),
+            (150, 11), (130, 11), (143, 10), (61, 10), (219, 10), (124, 9), (222, 9), (125, 9), (119, 7), (137, 7),
+            (33, 7), (117, 5), (92, 4), (116, 3)
+        ]
 
-    # opcodes histogram (ordered by occurrence)  (in kernal + basic roms of the c64):
-    opcode_occurrences = [
-        (32, 839), (133, 502), (165, 488), (0, 429), (208, 426), (169, 390), (76, 324), (240, 322), (2, 314), (160, 245),
-        (96, 228), (3, 201), (1, 191), (255, 186), (144, 182), (170, 175), (162, 169), (177, 165), (104, 159), (164, 158),
-        (132, 157), (201, 156), (72, 151), (141, 150), (200, 146), (173, 144), (166, 139), (176, 139), (16, 138),
-        (134, 138), (73, 127), (24, 119), (101, 113), (69, 109), (13, 107), (34, 104), (145, 103), (4, 102), (168, 101),
-        (221, 98), (230, 93), (48, 91), (189, 87), (41, 86), (6, 86), (9, 86), (8, 85), (79, 85), (138, 80), (10, 80),
-        (7, 79), (185, 77), (56, 75), (44, 75), (78, 74), (105, 73), (5, 73), (174, 73), (220, 71), (198, 69), (232, 69),
-        (36, 69), (202, 67), (152, 67), (95, 67), (100, 65), (102, 65), (247, 65), (188, 64), (136, 64), (84, 64),
-        (122, 62), (128, 61), (80, 61), (186, 60), (82, 59), (97, 58), (15, 57), (70, 57), (229, 56), (19, 55), (40, 54),
-        (183, 54), (65, 54), (233, 53), (180, 53), (12, 53), (171, 53), (197, 53), (83, 52), (248, 52), (112, 51),
-        (237, 51), (89, 50), (11, 50), (158, 50), (74, 49), (224, 48), (20, 47), (238, 47), (108, 46), (234, 46),
-        (251, 46), (254, 46), (184, 45), (14, 44), (163, 44), (226, 43), (211, 43), (88, 43), (98, 42), (17, 42),
-        (153, 42), (243, 41), (228, 41), (99, 41), (253, 41), (209, 41), (187, 39), (123, 39), (67, 39), (196, 38),
-        (68, 38), (35, 38), (172, 38), (175, 38), (161, 38), (85, 38), (191, 37), (113, 37), (182, 37), (151, 37),
-        (71, 36), (181, 35), (214, 35), (121, 35), (157, 35), (178, 35), (77, 35), (42, 34), (212, 33), (18, 33),
-        (127, 33), (241, 33), (21, 33), (249, 32), (23, 31), (245, 30), (142, 30), (55, 29), (140, 29), (46, 29),
-        (192, 29), (179, 29), (252, 29), (115, 29), (22, 29), (43, 28), (215, 28), (45, 28), (246, 28), (38, 28),
-        (86, 27), (225, 27), (25, 26), (239, 26), (58, 26), (167, 26), (147, 26), (217, 26), (149, 25), (30, 25),
-        (206, 25), (28, 24), (47, 24), (37, 24), (155, 24), (129, 23), (148, 23), (111, 23), (29, 23), (39, 23),
-        (51, 22), (193, 22), (236, 22), (120, 22), (64, 22), (204, 21), (210, 21), (244, 21), (52, 21), (66, 21),
-        (114, 20), (250, 20), (106, 20), (93, 19), (199, 19), (218, 19), (154, 19), (205, 19), (50, 19), (159, 19),
-        (194, 19), (49, 19), (190, 19), (103, 18), (216, 18), (213, 18), (107, 18), (131, 18), (63, 18), (94, 18),
-        (91, 17), (242, 17), (109, 17), (53, 16), (227, 16), (139, 16), (31, 16), (75, 16), (60, 16), (195, 15),
-        (231, 15), (62, 15), (59, 15), (87, 14), (207, 14), (27, 14), (90, 14), (110, 13), (223, 13), (57, 13),
-        (118, 12), (26, 12), (203, 12), (81, 12), (156, 12), (54, 12), (235, 12), (146, 11), (135, 11), (126, 11),
-        (150, 11), (130, 11), (143, 10), (61, 10), (219, 10), (124, 9), (222, 9), (125, 9), (119, 7), (137, 7),
-        (33, 7), (117, 5), (92, 4), (116, 3)
-    ]
+        cnt = Counter()
+        for opcode, amount in opcode_occurrences:
+            cnt[AllInstructions[opcode][1]] += amount
+        cnt["nop"] = 13
+        cnt["tsb"] = 13
 
-    cnt = Counter()
-    for opcode, amount in opcode_occurrences:
-        cnt[AllInstructions[opcode][1]] += amount
-    cnt["nop"] = 13
-    cnt["tsb"] = 13
+        four_letter_mnemonics = list(sorted([ins[1] for ins in AllInstructions if len(ins[1])>3]))
+        for ins4 in four_letter_mnemonics:
+            del cnt[ins4]
+            cnt[ins4] = 1
+        mnem2 = [c[0] for c in cnt.most_common()]
+        if len(mnem2)!=len(mnemonics):
+            raise ValueError("mnem count mismatch")
+        return mnem2
 
-    four_letter_mnemonics = list(sorted([ins[1] for ins in AllInstructions if len(ins[1])>3]))
-    for ins4 in four_letter_mnemonics:
-        del cnt[ins4]
-        cnt[ins4] = 1
-    mnem2 = [c[0] for c in cnt.most_common()]
-    if len(mnem2)!=len(mnemonics):
-        raise ValueError("mnem count mismatch")
-    return mnem2
+    mnemonics = determine_mnemonics()
 
+    def first_letters():
+        firstletters = {m[0]: 0 for m in mnemonics}
+        return firstletters.keys()
 
-mnemonics = determine_mnemonics()
+    def second_letters(firstletter):
+        secondletters = {m[1]: 0 for m in mnemonics if m[0] == firstletter}
+        return secondletters.keys()
 
+    def third_letters(firstletter, secondletter):
+        thirdletters = {m[2]: 0 for m in mnemonics if m[0] == firstletter and m[1] == secondletter}
+        return thirdletters.keys()
 
-def first_letters():
-    firstletters = {m[0]: 0 for m in mnemonics}
-    return firstletters.keys()
+    def fourth_letters(firstletter, secondletter, thirdletter):
+        longmnem = [m for m in mnemonics if len(m) > 3]
+        fourthletters = {m[3]: 0 for m in longmnem if m[0] == firstletter and m[1] == secondletter and m[2] == thirdletter}
+        return fourthletters.keys()
 
-
-def second_letters(firstletter):
-    secondletters = {m[1]: 0 for m in mnemonics if m[0] == firstletter}
-    return secondletters.keys()
-
-
-def third_letters(firstletter, secondletter):
-    thirdletters = {m[2]: 0 for m in mnemonics if m[0] == firstletter and m[1] == secondletter}
-    return thirdletters.keys()
-
-
-def fourth_letters(firstletter, secondletter, thirdletter):
-    longmnem = [m for m in mnemonics if len(m) > 3]
-    fourthletters = {m[3]: 0 for m in longmnem if m[0] == firstletter and m[1] == secondletter and m[2] == thirdletter}
-    return fourthletters.keys()
-
-
-def make_tree():
-    tree = {}
-    for first in first_letters():
-        tree[first] = {
-            secondletter: {
-                thirdletter: {
-                    fourthletter: {}
-                    for fourthletter in fourth_letters(first, secondletter, thirdletter)
+    def make_tree():
+        tree = {}
+        for first in first_letters():
+            tree[first] = {
+                secondletter: {
+                    thirdletter: {
+                        fourthletter: {}
+                        for fourthletter in fourth_letters(first, secondletter, thirdletter)
+                    }
+                    for thirdletter in third_letters(first, secondletter)
                 }
-                for thirdletter in third_letters(first, secondletter)
+                for secondletter in second_letters(first)
             }
-            for secondletter in second_letters(first)
-        }
-    return tree
+        return tree
+
+    tree = make_tree()
+
+    print("get_opcode_info    .proc")
+    print("_mnem_fourth_letter = cx16.r4")
+    print("_mnem_fifth_letter = cx16.r5")
+    for first in tree:
+        print("    cmp  #'%s'" % first)
+        print("    bne  _not_%s" % first)
+        for second in tree[first]:
+            print("    cpx  #'%s'" % second)
+            print("    bne  _not_%s%s" % (first,second))
+            for third in tree[first][second]:
+                print("    cpy  #'%s'" % third)
+                print("    bne  _not_%s%s%s" % (first, second, third))
+                fourth = tree[first][second][third]
+                if fourth:
+                    if "".join(fourth.keys()) != "01234567":
+                        raise ValueError("fourth", fourth.keys())
+                    print("    bra  _check_%s%s%s" % (first, second, third))
+                else:
+                    print("    lda  _mnem_fourth_letter")   # check that the fourth letter is not present
+                    print("    bne  _invalid")
+                    print("    lda  #<i_%s%s%s" % (first, second, third))
+                    print("    ldy  #>i_%s%s%s" % (first, second, third))
+                    print("    rts")
+                print("_not_%s%s%s:" % (first, second, third))
+            print("_not_%s%s:" % (first, second))
+        print("_not_%s:" % first)
+    print("_invalid:")
+    print("    lda  #0")
+    print("    ldy  #0")
+    print("    rts")
+
+    # the 4-letter mnemonics are:
+    # smb[0-7]
+    # bbr[0-7]
+    # rmb[0-7]
+    # bbs[0-7]
+    for fourlettermnemonic in ["smb", "bbr", "rmb", "bbs"]:
+        print("_check_%s" % fourlettermnemonic)
+        print("    lda  #<_tab_%s" % fourlettermnemonic)
+        print("    ldy  #>_tab_%s" % fourlettermnemonic)
+        print("""    sta  P8ZP_SCRATCH_W2
+        sty  P8ZP_SCRATCH_W2+1    
+        bra  _check4""")
+
+    print("""_check4
+        lda  _mnem_fourth_letter
+        cmp  #'0'
+        bcc  _invalid
+        cmp  #'8'
+        bcs  _invalid
+        lda  _mnem_fifth_letter     ; must have no fifth letter
+        bne  _invalid
+        tay
+        lda  (P8ZP_SCRATCH_W2),y
+        pha
+        iny
+        lda  (P8ZP_SCRATCH_W2),y
+        tay
+        pla
+        rts""")
+
+    for fourlettermnemonic in ["smb", "bbr", "rmb", "bbs"]:
+        print("_tab_%s" % fourlettermnemonic)
+        for ii in "01234567":
+            print("    .word   i_%s%s" % (fourlettermnemonic, ii))
+
+    print("    .pend")
 
 
-tree = make_tree()
+def generate_mnem_list():
+    for m in sorted(InstructionsByName):
+        print(m.upper())
 
 
-print("get_opcode_info    .proc")
-print("_mnem_fourth_letter = cx16.r4")
-print("_mnem_fifth_letter = cx16.r5")
-for first in tree:
-    print("    cmp  #'%s'" % first)
-    print("    bne  _not_%s" % first)
-    for second in tree[first]:
-        print("    cpx  #'%s'" % second)
-        print("    bne  _not_%s%s" % (first,second))
-        for third in tree[first][second]:
-            print("    cpy  #'%s'" % third)
-            print("    bne  _not_%s%s%s" % (first, second, third))
-            fourth = tree[first][second][third]
-            if fourth:
-                if "".join(fourth.keys()) != "01234567":
-                    raise ValueError("fourth", fourth.keys())
-                print("    bra  _check_%s%s%s" % (first, second, third))
-            else:
-                print("    lda  _mnem_fourth_letter")   # check that the fourth letter is not present
-                print("    bne  _invalid")
-                print("    lda  #<i_%s%s%s" % (first, second, third))
-                print("    ldy  #>i_%s%s%s" % (first, second, third))
-                print("    rts")
-            print("_not_%s%s%s:" % (first, second, third))
-        print("_not_%s%s:" % (first, second))
-    print("_not_%s:" % first)
-print("_invalid:")
-print("    lda  #0")
-print("    ldy  #0")
-print("    rts")
-
-# the 4-letter mnemonics are:
-# smb[0-7]
-# bbr[0-7]
-# rmb[0-7]
-# bbs[0-7]
-for fourlettermnemonic in ["smb", "bbr", "rmb", "bbs"]:
-    print("_check_%s" % fourlettermnemonic)
-    print("    lda  #<_tab_%s" % fourlettermnemonic)
-    print("    ldy  #>_tab_%s" % fourlettermnemonic)
-    print("""    sta  P8ZP_SCRATCH_W2
-    sty  P8ZP_SCRATCH_W2+1    
-    bra  _check4""")
-
-print("""_check4
-    lda  _mnem_fourth_letter
-    cmp  #'0'
-    bcc  _invalid
-    cmp  #'8'
-    bcs  _invalid
-    lda  _mnem_fifth_letter     ; must have no fifth letter
-    bne  _invalid
-    tay
-    lda  (P8ZP_SCRATCH_W2),y
-    pha
-    iny
-    lda  (P8ZP_SCRATCH_W2),y
-    tay
-    pla
-    rts""")
-
-for fourlettermnemonic in ["smb", "bbr", "rmb", "bbs"]:
-    print("_tab_%s" % fourlettermnemonic)
-    for ii in "01234567":
-        print("    .word   i_%s%s" % (fourlettermnemonic, ii))
-
-print("    .pend")
+if __name__=="__main__":
+    if sys.argv[1:2]==["--mnemlist"]:       # slice, not index: a missing argument must not raise IndexError
+        generate_mnem_list()
+    elif sys.argv[1:2]==["--parser"]:
+        generate_mnemonics_parser()
+    else:   # stdout is redirected into the generated file, so report on stderr and exit nonzero
+        sys.exit("invalid arg, expected --mnemlist or --parser")
diff --git a/examples/cx16/assembler/hashes.py b/examples/cx16/assembler/hashes.py
new file mode 100644
index 000000000..84f2b7df3
--- /dev/null
+++ b/examples/cx16/assembler/hashes.py
@@ -0,0 +1,17 @@
+import re
+
+with open("perfecthash.c", "rt") as source:     # close the file as soon as it has been read
+    hashcode = source.read()
+# raw initializer entries: the text between the first { and } that follow the word "wordlist"
+entries = hashcode.split("wordlist")[1].split("{")[1].split("}")[0].strip().split(",")
+max_hash_value = int(re.search(r"MAX_HASH_VALUE = (\d+)", hashcode).group(1))
+
+if len(entries) != max_hash_value+1:
+    raise ValueError("inconsistent number of entries parsed")
+
+# entries ending in 0 become None (presumably gperf's null-pointer slots); the rest lose their quotes
+entries = [e.strip() for e in entries]
+entries = [None if e.endswith('0') else e.strip('"') for e in entries]
+
+for ix, entry in enumerate(entries):
+    print(ix, entry or "-")
diff --git a/examples/cx16/assembler/perfecthash.py b/examples/cx16/assembler/perfecthash.py
new file mode 100644
index 000000000..fcc51aeeb
--- /dev/null
+++ b/examples/cx16/assembler/perfecthash.py
@@ -0,0 +1,180 @@
+TOTAL_KEYWORDS = 98
+MIN_WORD_LENGTH = 3
+MAX_WORD_LENGTH = 4
+MIN_HASH_VALUE = 2
+MAX_HASH_VALUE = 134
+
+
+def hash(string: str, length: int) -> int:     # sums table lookups for up to the first 4 chars; shadows builtin hash()
+    asso_values = [     # 129 entries indexed by character code; a 135 pushes the sum past MAX_HASH_VALUE (134)
+      135, 135, 135, 135, 135, 135, 135, 135, 135, 135,
+      135, 135, 135, 135, 135, 135, 135, 135, 135, 135,
+      135, 135, 135, 135, 135, 135, 135, 135, 135, 135,
+      135, 135, 135, 135, 135, 135, 135, 135, 135, 135,
+      135, 135, 135, 135, 135, 135, 135, 135,  65,  62,
+       61,  58,  57,  54,  47,  46, 135, 135, 135, 135,
+      135, 135, 135, 135, 135,  26,   4,   1,   2,  33,
+        2, 135, 135,  15,  69,   4,  30,  10,  52,  17,
+        3,  34,  13,   0,   5,  29,   7,  69,  18,   6,
+       53, 135, 135, 135, 135, 135, 135, 135, 135, 135,
+      135, 135, 135, 135, 135, 135, 135, 135, 135, 135,
+      135, 135, 135, 135, 135, 135, 135, 135, 135, 135,
+      135, 135, 135, 135, 135, 135, 135, 135, 135 ]
+
+    hval = 0
+    if length > 3:
+        hval += asso_values[ord(string[3])]
+    if length > 2:
+        hval += asso_values[ord(string[2])]
+    if length > 1:
+        hval += asso_values[ord(string[1])+1]      # second char is looked up one slot further (code + 1)
+    hval += asso_values[ord(string[0])]     # length is assumed >= 1: an empty string raises IndexError here
+    return hval
+
+
+wordlist = [
+    None,
+    None,
+    "SBC",
+    "SEC",
+    "SED",
+    "DEC",
+    "BCS",
+    "BCC",
+    "BRK",
+    "TRB",
+    "DEY",
+    "TXS",
+    "CLC",
+    "CLD",
+    "TSB",
+    "TAY",
+    "PLP",
+    "SEI",
+    "CLV",
+    "PLY",
+    None,
+    "PHP",
+    "DEX",
+    None,
+    "PHY",
+    None,
+    "CLI",
+    "TAX",
+    "TSX",
+    "ROR",
+    "BRA",
+    "PLX",
+    "STP",
+    "INC",
+    None,
+    "STY",
+    "PHX",
+    "TXA",
+    "INY",
+    "PLA",
+    "BEQ",
+    "CPY",
+    "RTS",
+    "ORA",
+    "PHA",
+    "AND",
+    "ROL",
+    "STX",
+    "LSR",
+    "EOR",
+    "INX",
+    "BBS7",
+    "BBS6",
+    "CPX",
+    "BNE",
+    "STA",
+    "CMP",
+    "RTI",
+    "NOP",
+    "BBS5",
+    "ADC",
+    "ASL",
+    "BBS4",
+    "BBS3",
+    "BBR7",
+    "BBR6",
+    "BBS2",
+    "BBS1",
+    "BPL",
+    "LDY",
+    "BBS0",
+    "BMI",
+    "BBR5",
+    "BVS",
+    "BVC",
+    "BBR4",
+    "BBR3",
+    None,
+    "BIT",
+    "BBR2",
+    "BBR1",
+    "LDX",
+    "STZ",
+    "BBR0",
+    "TYA",
+    None,
+    None,
+    "JSR",
+    "WAI",
+    "LDA",
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    "SMB7",
+    "SMB6",
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    "SMB5",
+    None,
+    None,
+    "SMB4",
+    "SMB3",
+    "RMB7",
+    "RMB6",
+    "SMB2",
+    "SMB1",
+    None,
+    None,
+    "SMB0",
+    None,
+    "RMB5",
+    "JMP",
+    None,
+    "RMB4",
+    "RMB3",
+    None,
+    None,
+    "RMB2",
+    "RMB1",
+    None,
+    None,
+    "RMB0"
+    ]
+
+def in_word_set(string: str) -> bool:
+    """Return True only if string is one of the mnemonics stored in wordlist."""
+    length = len(string)
+    # hash() indexes a table that ends at code 128, so non-ASCII text can never be a keyword
+    if not MIN_WORD_LENGTH <= length <= MAX_WORD_LENGTH or max(string) > "\x7f":
+        return False
+    key = hash(string, length)
+    return key <= MAX_HASH_VALUE and wordlist[key] == string   # a None slot never equals a str