prog8/examples/cx16/assembler/assem.p8

%target cx16
%import textio
%import diskio
%import string
%import test_stack
%zeropage basicsafe
%option no_sysinit

; raw file loading of the large assembly file $c000-$ffff: 372 jiffies
; time loading and actually processing it: 700 jiffies

main {

    sub start() {
        ; Program entry point: asks for a source filename and assembles that file,
        ; or switches to interactive line-by-line entry when nothing was typed.
        txt.print("\nCommanderX16 65c02 file based assembler.\n\nfilename or enter for interactive: ")

        ; txt.input_chars is called without any length limit, so the buffer must be
        ; able to hold more than just a short filename. 80 characters is assumed to be
        ; the longest line that can be entered (one screen line) -- TODO confirm.
        str filename = "?" * 80
        if txt.input_chars(filename)
            file_input(filename)
        else
            user_input()

        ; run the routine from the imported test_stack module before exiting
        test_stack.test()
    }

    sub user_input() {
        ; Interactive mode: reads source lines from the keyboard and assembles them
        ; one at a time starting at $4000, echoing the emitted bytes.
        ; An empty line ends the session; a line that fails to assemble ends it as well.
        txt.lowercase()
        parser.program_counter = $4000
        parser.print_emit_bytes = true
        txt.print("\nEmpty line to stop.\n")

        repeat {
            ; prompt: 'A' followed by the current assembly address
            txt.chrout('A')
            txt.print_uwhex(parser.program_counter, 1)
            txt.print(": ")

            ; The typed text is stored behind a fixed leading space, so the parser
            ; always sees an indented line (first word is an instruction, not a label).
            parser.input_line[0] = ' '
            ubyte chars_entered = txt.input_chars(&parser.input_line+1)
            txt.nl()

            if chars_entered==0 {
                txt.print("exit\n")
                return
            }

            if not parser.process_line()
                break
        }

        parser.done()
    }

    sub file_input(uword filename) {
        ; Assembles the source file with the given name, read line by line from disk device 8.
        ; Prints a progress dot every 256 lines. Processing stops at the first line that
        ; fails to assemble, when the I/O status becomes nonzero, or when STOP is pressed.
        ; The summary is only printed when the file was read up to its end without errors.
        parser.print_emit_bytes = false
        ubyte success = false

        txt.print("\nreading ")
        txt.print(filename)
        txt.spc()

        if diskio.f_open(8, filename) {
            c64.SETTIM(0,0,0)       ; restart the jiffy clock, print_summary reads it back
            uword line=0
            repeat {
                void diskio.f_readline(parser.input_line)
                line++

                if not lsb(line)
                    txt.chrout('.')

                if not parser.process_line() {
                    ; show the (up to three) words of the offending line
                    txt.print("\nerror. last line was ")
                    txt.print_uw(line)
                    txt.print(": ")
                    txt.print(parser.word_addrs[0])
                    if parser.word_addrs[1] {
                        txt.spc()
                        txt.print(parser.word_addrs[1])
                    }
                    if parser.word_addrs[2] {
                        txt.spc()
                        txt.print(parser.word_addrs[2])
                    }
                    txt.nl()
                    break
                }

                ; Read the I/O status once and test the end-of-file bit with explicit
                ; parentheses, so the result does not depend on the relative precedence
                ; of '&' and '=='.
                ubyte io_status = c64.READST()
                if io_status {
                    success = (io_status & 64) == 64       ; end of file?
                    break
                }
                if c64.STOP2() {
                    txt.print("?break\n")
                    break
                }
            }
            diskio.f_close()
            parser.done()

            if success
                print_summary(line, parser.pc_min, parser.pc_max)
        } else {
            txt.print(diskio.status(8))
        }
    }

    sub print_summary(uword lines, uword start_address, uword end_address) {
        ; Prints the assembled address range, the number of source lines and the
        ; elapsed time. The time comes from the 16 bit jiffy clock value, converted
        ; at 60 jiffies per second and shown as seconds with 3 decimals.
        txt.print("\n\nstart address: ")
        txt.print_uwhex(start_address, 1)
        txt.print("\n  end address: ")
        txt.print_uwhex(end_address, 1)
        txt.print("\n        lines: ")
        txt.print_uw(lines)
        txt.print("\n   time (sec): ")

        uword jiffies = c64.RDTIM16()
        uword whole_secs = jiffies / 60
        ; leftover jiffies (0..59) converted to milliseconds; 59*1000 still fits in a uword
        uword millis = (jiffies - whole_secs*60)*1000/60

        txt.print_uw(whole_secs)
        txt.chrout('.')
        ; left-pad the milliseconds with zeros to three digits
        if millis<100 {
            txt.chrout('0')
            if millis<10
                txt.chrout('0')
        }
        txt.print_uw(millis)
        txt.nl()
    }
}

parser {
    ; byte counts per address mode id:
    ; (indexed by the instructions.am_* constants; index 0 = am_Invalid holds $ff)
    ubyte[17] operand_size = [$ff, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2]

    ; the source line being processed; split_input() cuts it up into words in place
    str input_line = "?" * 160
    ; addresses (inside input_line) of the first three words of the line, 0 = word not present
    uword[3] word_addrs
    ; address where the next emitted byte is stored
    uword program_counter = $ffff
    ; when true, emitted bytes (and their address) are echoed to the screen
    ubyte print_emit_bytes
    ; lowest and highest program counter value seen (updated by "* = ..." assignments and done())
    uword pc_min = $ffff
    uword pc_max = $0000

    sub process_line() -> ubyte {
        ; Processes the source line that is currently in input_line:
        ; lowercases it, puts spaces around any '=', splits it into words and then
        ; handles it as an assignment (second word starts with '=') or as a
        ; label and/or instruction line. Returns the result of that handler.
        string.lower(input_line)
        preprocess_assignment_spacing()
        split_input()

        if word_addrs[1] and @(word_addrs[1])=='='
            return do_assign()

        return do_label_andor_instr()
    }

    sub done() {
        ; Called when assembling has finished: raises pc_max to the final
        ; program counter if that lies beyond the highest address recorded so far.
        if pc_max<program_counter
            pc_max = program_counter
    }

    sub do_assign() -> ubyte {
        ; Handles "target = value" lines.
        ; The target is in word_addrs[0], '=' in word_addrs[1] and the value in word_addrs[2].
        ; The value is either '*' (the current program counter) or a number that
        ; conv.any2uword can parse completely; it ends up in cx16.r15.
        ; Target "*" sets the program counter, any other target is stored as a symbol.
        ; Returns true on success, otherwise prints an error and returns false.
        if word_addrs[2]==0 {
            txt.print("?syntax error\n")
            return false
        }

        ubyte value_ok
        if @(word_addrs[2])=='*' {
            cx16.r15 = program_counter
            value_ok = true
        } else {
            ; the number must span the whole word: no trailing characters allowed
            ubyte parsed = conv.any2uword(word_addrs[2])
            value_ok = parsed and @(word_addrs[2]+parsed)==0
        }

        if not value_ok {
            txt.print("?invalid operand\n")
            return false
        }

        if string.compare(word_addrs[0], "*")!=0 {
            symbols.setvalue(word_addrs[0], cx16.r15)
            return true
        }

        ; "* = value": move the program counter and track the address range
        program_counter = cx16.r15
        txt.print("\n* = ")
        txt.print_uwhex(program_counter, true)
        txt.nl()
        if program_counter<pc_min
            pc_min = program_counter
        if program_counter>pc_max
            pc_max = program_counter
        return true
    }

    sub do_label_andor_instr() -> ubyte {
        ; Handles a line that holds an optional label, an optional instruction
        ; (or assembler directive) and an optional operand.
        ; With three words the layout is: label, instruction, operand.
        ; With fewer words, a line that starts with whitespace has no label
        ; (instruction [operand]); otherwise the first word is the label.
        ; Returns true for a valid (or empty) line, false after printing an error.
        uword label_ptr = 0
        uword instr_ptr = 0
        uword operand_ptr = 0
        ubyte indented = input_line[0]==' ' or input_line[0]==9 or input_line[0]==160

        if word_addrs[2] {
            label_ptr = word_addrs[0]
            instr_ptr = word_addrs[1]
            operand_ptr = word_addrs[2]
        } else if indented {
            ; missing words are 0 so they simply leave the pointer at 0
            instr_ptr = word_addrs[0]
            operand_ptr = word_addrs[1]
        } else {
            label_ptr = word_addrs[0]
            instr_ptr = word_addrs[1]
        }

        if label_ptr {
            ; strip an optional trailing ':' from the label
            uword label_end = label_ptr + string.length(label_ptr)-1
            if @(label_end) == ':'
                @(label_end) = 0
            if instructions.match(label_ptr) {
                txt.print("?label cannot be a mnemonic\n")
                return false
            }
            symbols.setvalue(label_ptr, program_counter)
        }

        if not instr_ptr
            return true     ; empty line (or just a label)

        if @(instr_ptr)=='.'
            return process_assembler_directive(instr_ptr, operand_ptr)

        return assemble_instruction(instr_ptr, operand_ptr)
    }

    sub assemble_instruction(uword instr_ptr, uword operand_ptr) -> ubyte {
        ; Assembles a single instruction and emits its bytes at program_counter.
        ; instr_ptr points to the mnemonic text, operand_ptr to the operand text
        ; (0 when there is no operand).
        ; Returns true on success; otherwise prints an error message and returns false.
        uword instruction_info_ptr = instructions.match(instr_ptr)
        if instruction_info_ptr {
            ; we got a mnemonic match, now process the operand (and its value, if applicable, into cx16.r15)
            ubyte addr_mode = parse_operand(operand_ptr)

            if addr_mode {
                ; instructions.opcode() reports validity in the carry flag, which is
                ; tested by the if_cc directly below: keep these two statements together.
                ubyte opcode = instructions.opcode(instruction_info_ptr, addr_mode)
                if_cc {
                    ; most likely an invalid instruction BUT could also be a branching instruction
                    ; that needs its "absolute" operand recalculated as relative.
                    ; Other parsed modes get one retry with a closely related addressing mode.
                    ; NOTE(review): a branch whose target is below $0100 is parsed as am_Zp and
                    ; retried as am_Abs, not am_Rel -- confirm whether that is intended.
                    ubyte retry = false
                    when addr_mode {
                        instructions.am_Abs -> {
                            if @(instr_ptr)=='b' {
                                addr_mode = instructions.am_Rel
                                if not calc_relative_branch_into_r14()
                                    return false
                                cx16.r15 = cx16.r14
                                retry = true
                            }
                        }
                        instructions.am_Imp -> {
                            addr_mode = instructions.am_Acc
                            retry = true
                        }
                        instructions.am_Izp -> {
                            addr_mode = instructions.am_Ind
                            retry = true
                        }
                        instructions.am_Zp -> {
                            addr_mode = instructions.am_Abs
                            retry = true
                        }
                    }

                    if retry
                        opcode = instructions.opcode(instruction_info_ptr, addr_mode)

                    if not opcode {
                        txt.print("?invalid instruction\n")
                        return false
                    }
                }

                if addr_mode==instructions.am_Zpr {
                    ; instructions like BBR4 $zp,$aaaa   (dual-operand)
                    uword comma = string.find(operand_ptr,',')
                    if comma {
                        comma++
                        cx16.r13 = cx16.r15     ; keep the zero page operand while the target is parsed
                        if parse_operand(comma) {
                            ; The instruction is one byte longer than a plain branch, so the
                            ; offset is calculated as if it started one byte further on.
                            ; The program counter is restored on failure too, so that an
                            ; out-of-range branch does not leave it off by one.
                            program_counter++
                            ubyte branch_ok = calc_relative_branch_into_r14()
                            program_counter--
                            if not branch_ok
                                return false
                            ; low byte = zero page address, high byte = relative offset
                            cx16.r15 = (cx16.r14 << 8) | lsb(cx16.r13)
                        } else {
                            txt.print("?invalid operand\n")
                            return false
                        }
                    } else {
                        txt.print("?invalid operand\n")
                        return false
                    }
                }

                ubyte num_operand_bytes = operand_size[addr_mode]
                if print_emit_bytes {
                    txt.spc()
                    txt.print_uwhex(program_counter, 1)
                    txt.print("   ")
                }
                emit(opcode)
                if num_operand_bytes==1 {
                    emit(lsb(cx16.r15))
                } else if num_operand_bytes == 2 {
                    emit(lsb(cx16.r15))
                    emit(msb(cx16.r15))
                }
                if print_emit_bytes
                    txt.nl()
                return true
            }
            txt.print("?invalid operand\n")
            return false
        }
        txt.print("?invalid instruction\n")
        return false
    }

    sub calc_relative_branch_into_r14() -> ubyte {
        ; Converts the absolute target address in cx16.r15 into a branch offset
        ; relative to the end of a 2-byte instruction at program_counter.
        ; The 16 bit result is left in cx16.r14. It is in range when it fits a signed
        ; byte: $0000..$007f (forward) or $ff80..$ffff (backward).
        ; Returns true when in range, otherwise prints an error and returns false.
        cx16.r14 = cx16.r15 - program_counter - 2

        if cx16.r14 > $007f and cx16.r14 < $ff80 {
            txt.print("?branch out of range\n")
            return false
        }
        return true
    }

    sub parse_operand(uword operand_ptr) -> ubyte {
        ; parses the operand. Returns 2 things:
        ; - addressing mode id as result value or 0 (am_Invalid) when error
        ; - operand numeric value in cx16.r15 (if applicable)
        ; operand_ptr may be 0 (the instruction has no operand at all): that is
        ; treated the same as an empty operand string, i.e. implied addressing.

        ; Guard against the null pointer explicitly instead of reading the byte at
        ; memory address $0000 and relying on it being zero.
        if not operand_ptr
            return instructions.am_Imp

        ubyte @zp firstchr = @(operand_ptr)
        ubyte parsed_len
        when firstchr {
            0 -> return instructions.am_Imp
            '#' -> {
                ; lda #$99   Immediate
                operand_ptr++
                parsed_len = conv.any2uword(operand_ptr)
                if parsed_len {
                    operand_ptr += parsed_len
                    if @(operand_ptr)==0
                        return instructions.am_Imm
                }
            }
            'a' -> {
                if not @(operand_ptr+1)
                    return instructions.am_Acc      ; Accumulator - no value.

                ; TODO its a symbol/label, immediate or indexed addressing
                txt.print("TODO symbol: ")
                txt.print(operand_ptr)
                txt.nl()
            }
            '(' -> {
                ; various forms of indirect
                operand_ptr++
                parsed_len = conv.any2uword(operand_ptr)
                if parsed_len {
                    operand_ptr+=parsed_len
                    ; a value above $ff selects the absolute forms, otherwise the zero page forms
                    if msb(cx16.r15) {
                        ; absolute indirects
                        if str_is1(operand_ptr, ')')
                            return instructions.am_Ind
                        if str_is3(operand_ptr, ",x)")
                            return instructions.am_IaX
                    } else {
                        ; zero page indirects
                        if str_is1(operand_ptr, ')')
                            return instructions.am_Izp
                        if str_is3(operand_ptr, ",x)")
                            return instructions.am_IzX
                        if str_is3(operand_ptr, "),y")
                            return instructions.am_IzY
                    }
                }
            }
            '$', '%', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' -> {
                ; address optionally followed by ,x or ,y or ,address
                parsed_len = conv.any2uword(operand_ptr)
                if parsed_len {
                    operand_ptr += parsed_len
                    if msb(cx16.r15) {
                        ; absolute or abs indirects
                        if @(operand_ptr)==0
                            return instructions.am_Abs
                        if str_is2(operand_ptr, ",x")
                            return instructions.am_AbsX
                        if str_is2(operand_ptr, ",y")
                            return instructions.am_AbsY
                    } else {
                        ; zero page or zp indirects
                        if @(operand_ptr)==0
                            return instructions.am_Zp
                        if str_is2(operand_ptr, ",x")
                            return instructions.am_ZpX
                        if str_is2(operand_ptr, ",y")
                            return instructions.am_ZpY
                        if @(operand_ptr)==',' {
                            ; assume BBR $zp,$aaaa or BBS $zp,$aaaa
                            return instructions.am_Zpr
                        }
                    }
                }
            }
        }
        ; anything that did not return above is not a recognised operand
        return instructions.am_Invalid
    }

    sub process_assembler_directive(uword directive, uword operand) -> ubyte {
        ; Handles an assembler directive; only ".byte" is recognised right now.
        ; The operand is a comma separated list of numbers that each must fit in a byte;
        ; every value is emitted at the program counter.
        ; Returns true on success, otherwise prints an error and returns false.
        ubyte length = 0
        if string.compare(directive, ".byte")==0 {
            if operand
                length = conv.any2uword(operand)
        }

        ; unknown directive, missing operand, or the operand doesn't start with a number
        if not length {
            txt.print("?syntax error\n")
            return false
        }

        if msb(cx16.r15) {
            txt.print("?byte value too large\n")
            return false
        }
        if print_emit_bytes {
            txt.spc()
            txt.print_uwhex(program_counter, 1)
            txt.print("   ")
        }
        emit(lsb(cx16.r15))
        operand += length

        ; the remaining values, each one preceded by a comma
        while @(operand)==',' {
            operand++
            length = conv.any2uword(operand)
            if not length
                break
            if msb(cx16.r15) {
                txt.print("?byte value too large\n")
                return false
            }
            emit(lsb(cx16.r15))
            operand += length
        }

        if print_emit_bytes
            txt.nl()
        return true
    }

    asmsub str_is1(uword st @R0, ubyte char @A) clobbers(Y) -> ubyte @A {
        ; -- Checks whether the string at st consists of exactly the single character char.
        ;    input: string pointer in cx16.r0, character in A.
        ;    output: A=1 when the first byte equals char and the second byte is 0, else A=0.
        %asm {{
            cmp  (cx16.r0)              ; first character equal?
            bne  +
            ldy  #1
            lda  (cx16.r0),y            ; second byte must be the 0 terminator
            bne  +
            lda  #1
            rts
+           lda  #0
            rts
        }}
    }

    asmsub str_is2(uword st @R0, uword compare @AY) clobbers(Y) -> ubyte @A {
        ; -- Checks whether the string at st is exactly equal to the 2-character string compare.
        ;    input: string pointer in cx16.r0, pointer to the comparison string in AY.
        ;    The actual comparison is done by the tail of str_is3: it is entered with Y=0
        ;    so that the bytes at index 0 and 1 are compared and index 2 must be 0.
        ;    Overwrites P8ZP_SCRATCH_W1 with the compare pointer.
        %asm {{
            sta  P8ZP_SCRATCH_W1
            sty  P8ZP_SCRATCH_W1+1
            ldy  #0
            jmp  str_is3._is_2_entry
        }}
    }

    asmsub str_is3(uword st @R0, uword compare @AY) clobbers(Y) -> ubyte @A {
        ; -- Checks whether the string at st is exactly equal to the 3-character string compare.
        ;    input: string pointer in cx16.r0, pointer to the comparison string in AY.
        ;    output: A=1 when the three characters match and the byte after them in st is 0,
        ;    else A=0.
        ;    Overwrites P8ZP_SCRATCH_W1 with the compare pointer.
        ;    The label _is_2_entry is also jumped to from str_is2 (with Y=0).
        %asm {{
            sta  P8ZP_SCRATCH_W1
            sty  P8ZP_SCRATCH_W1+1
            lda  (cx16.r0)
            cmp  (P8ZP_SCRATCH_W1)      ; compare the characters at index 0
            bne  +
            ldy  #1
_is_2_entry
            lda  (cx16.r0),y            ; compare two more characters, at index Y and Y+1
            cmp  (P8ZP_SCRATCH_W1),y
            bne  +
            iny
            lda  (cx16.r0),y
            cmp  (P8ZP_SCRATCH_W1),y
            bne  +
            iny
            lda  (cx16.r0),y            ; st must end here (0 terminator)
            bne  +
            lda  #1
            rts
+           lda  #0
            rts
        }}
    }

    sub emit(ubyte value) {
        ; Stores one byte of output at the program counter and advances it.
        ; When print_emit_bytes is set, the byte is also printed in hex followed by a space.
        @(program_counter) = value
        program_counter++

        if not print_emit_bytes
            return

        txt.print_ubhex(value, 0)
        txt.spc()
    }

    sub dummy(uword operand_ptr) -> uword {
        ; Returns operand_ptr plus a random word.
        ; NOTE(review): not called anywhere in the visible code; purpose unclear.
        uword random_offset = rndw()
        return random_offset + operand_ptr
    }

    sub split_input() {
        ; Splits input_line in place into at most three whitespace separated words.
        ; The start address of each word is stored in word_addrs (0 = no such word) and
        ; the first whitespace character after a word is overwritten with a 0 terminator.
        ; Scanning stops at a ';' comment, at the end of the string, or at a fourth word.
        ubyte in_word = false
        ubyte words_found = 0
        ubyte @zp pos = 0
        ubyte @zp ch

        word_addrs[0] = 0
        word_addrs[1] = 0
        word_addrs[2] = 0

        for ch in input_line {
            when ch {
                ' ', 9, 160 -> {
                    ; whitespace ends the word that was being scanned, if any
                    if in_word {
                        input_line[pos] = 0
                        in_word = false
                    }
                }
                ';', 0 -> {
                    ; comment or end-of-string: nothing more to split
                    break
                }
                else -> {
                    if not in_word {
                        ; first character of a new word
                        if words_found==3
                            break
                        word_addrs[words_found] = &input_line + pos
                        words_found++
                        in_word = true
                    }
                }
            }
            pos++
        }

        ; terminate the last word when scanning stopped right on a separator or comment char
        ch = input_line[pos]
        when ch {
            ' ', 9, 160, ';' -> {
                input_line[pos] = 0
            }
        }
    }

    sub debug_print_words() {        ; TODO remove
        ; Debug helper: prints the words found by split_input(), each one between brackets.
        ; Word slots that are not in use contain 0; those are shown as empty brackets
        ; instead of printing whatever text happens to be stored at memory address 0.
        txt.print("(debug:) words: ")
        uword word_ptr
        for word_ptr in word_addrs {
            txt.chrout('[')
            if word_ptr
                txt.print(word_ptr)
            txt.print("] ")
        }
        txt.nl()
    }

    sub preprocess_assignment_spacing() {
        ; Rewrites input_line so that every '=' is surrounded by spaces ("a=1" -> "a = 1"),
        ; which lets split_input() see the '=' as a separate word.
        ; Lines without a '=' are left untouched.
        if not string.find(input_line, '=')
            return

        ; The scratch buffer has the same capacity as input_line (160 characters), and
        ; every write is bounds checked: each '=' makes the line 2 characters longer, so
        ; neither the scratch buffer nor input_line itself may be overrun by the result.
        ; Text that no longer fits is dropped.
        str input_line2 = "?" * 160
        ubyte @zp out_idx = 0
        ubyte @zp cc
        for cc in input_line {
            if cc=='=' {
                if out_idx > 157
                    break           ; no room left for the 3 characters " = "
                input_line2[out_idx] = ' '
                out_idx++
                input_line2[out_idx] = '='
                out_idx++
                input_line2[out_idx] = ' '
                out_idx++
            } else {
                if out_idx == 160
                    break           ; buffer is full
                input_line2[out_idx] = cc
                out_idx++
            }
        }
        input_line2[out_idx] = 0
        void string.copy(input_line2, input_line)
    }
}

symbols {
    sub setvalue(uword symbolname_ptr, uword value) {
        ; Reports a symbol definition on the screen as "symbol: name=value"
        ; (the value in hex). In this version nothing is actually stored.
        txt.print("symbol: ")
        txt.print(symbolname_ptr)
        txt.chrout('=')
        txt.print_uwhex(value, 1)
        txt.nl()
    }
}

instructions {
    ; Addressing mode ids. They are returned by parser.parse_operand(), passed to opcode()
    ; and used as index into parser.operand_size.
    ; The operand syntax noted below is what parser.parse_operand() accepts for the mode
    ; (modes without a note are only selected as a retry in parser.assemble_instruction()).
    const ubyte am_Invalid = 0      ; operand not recognised
    const ubyte am_Imp = 1          ; no operand
    const ubyte am_Acc = 2          ; a
    const ubyte am_Imm = 3          ; #value
    const ubyte am_Zp = 4           ; value up to $ff
    const ubyte am_ZpX = 5          ; value up to $ff  ,x
    const ubyte am_ZpY = 6          ; value up to $ff  ,y
    const ubyte am_Rel = 7
    const ubyte am_Abs = 8          ; value above $ff
    const ubyte am_AbsX = 9         ; value above $ff  ,x
    const ubyte am_AbsY = 10        ; value above $ff  ,y
    const ubyte am_Ind = 11         ; (value above $ff)
    const ubyte am_IzX = 12         ; (value up to $ff,x)
    const ubyte am_IzY = 13         ; (value up to $ff),y
    const ubyte am_Zpr = 14         ; value up to $ff  ,address   (BBR/BBS)
    const ubyte am_Izp = 15         ; (value up to $ff)
    const ubyte am_IaX = 16         ; (value above $ff,x)

    ; TODO: explore (benchmark) hash based matchers.   Faster (although the bulk of the time is not in the mnemonic matching)? Less memory?

    asmsub  match(uword mnemonic_ptr @AY) -> uword @AY {
        ; -- input: mnemonic_ptr in AY,   output:  pointer to instruction info structure or $0000 in AY
        ;    Reads the first five bytes at mnemonic_ptr, clears bit 7 of each, and hands them to
        ;    get_opcode_info (not defined in this part of the file; presumably in opcodes.asm):
        ;    first letter in A, second in X, third in Y, fourth in cx16.r4, fifth in cx16.r5.
        ;    Five bytes are always read, also when the string is shorter than that.
        ;    P8ZP_SCRATCH_W1 is left pointing at the mnemonic; opcode() reads it again later.
        ;    X is saved and restored around the routine.
        %asm {{
            phx
            sta  P8ZP_SCRATCH_W1
            sty  P8ZP_SCRATCH_W1+1
            lda  (P8ZP_SCRATCH_W1)
            and  #$7f   ; lowercase
            pha
            ldy  #1
            lda  (P8ZP_SCRATCH_W1),y
            and  #$7f   ; lowercase
            pha
            iny
            lda  (P8ZP_SCRATCH_W1),y
            and  #$7f   ; lowercase
            pha
            iny
            lda  (P8ZP_SCRATCH_W1),y
            and  #$7f   ; lowercase
            sta  cx16.r4                ; fourth letter in R4 (only exists for the few 4-letter mnemonics)
            iny
            lda  (P8ZP_SCRATCH_W1),y
            and  #$7f   ; lowercase
            sta  cx16.r5                ; fifth letter in R5 (should always be zero or whitespace for a valid mnemonic)
            pla                         ; third letter
            tay
            pla                         ; second letter
            tax
            pla                         ; first letter stays in A
            jsr  get_opcode_info
            plx
            rts
        }}
    }

    asmsub  opcode(uword instr_info_ptr @AY, ubyte addr_mode @X) clobbers(X) -> ubyte @A, ubyte @Pc {
        ; -- input: instruction info struct ptr @AY,  desired addr_mode @X
        ;    output: opcode @A,   valid @carrybit
        ;    A pointer with a zero high byte is treated as 'no instruction info' (not found).
        ;    As read by this routine, the info struct comes in two layouts:
        ;    - first byte nonzero: single addressing mode; byte 1 = that mode, byte 2 = the opcode
        ;    - first byte zero: table of opcodes indexed by addressing mode id, where a
        ;      $00 entry means the mode is not available (except for "brk", see below)
        ;    Uses P8ZP_SCRATCH_W2 and cx16.r5 as work storage.
        ;    NOTE(review): the "brk" check reads the mnemonic via P8ZP_SCRATCH_W1 as it was left
        ;    by match(). parser.str_is2/str_is3 also overwrite P8ZP_SCRATCH_W1, so this depends
        ;    on none of those having run since match() -- verify against the callers.
        %asm {{
            cpy  #0
            beq  _not_found
            sta  P8ZP_SCRATCH_W2
            sty  P8ZP_SCRATCH_W2+1
            stx  cx16.r5

            ; debug result address
            ;sec
            ;jsr  txt.print_uwhex
            ;lda  #13
            ;jsr  c64.CHROUT

            lda  (P8ZP_SCRATCH_W2)
            beq  _multi_addrmodes
            ldy  #1
            lda  (P8ZP_SCRATCH_W2),y
            cmp  cx16.r5               ; check single possible addr.mode
            bne  _not_found
            iny
            lda  (P8ZP_SCRATCH_W2),y    ; get opcode
            sec
            rts

_not_found  lda  #0
            clc
            rts

_multi_addrmodes
            ldy  cx16.r5
            lda  (P8ZP_SCRATCH_W2),y    ; check opcode for addr.mode
            bne  _valid
            ; opcode $00 usually means 'invalid' but for "brk" it is actually valid so check for "brk"
            lda  (P8ZP_SCRATCH_W1)
            and  #$7f       ; lowercase
            cmp  #'b'
            bne  _not_found
            ldy  #1
            lda  (P8ZP_SCRATCH_W1),y
            and  #$7f       ; lowercase
            cmp  #'r'
            bne  _not_found
            iny
            lda  (P8ZP_SCRATCH_W1),y
            and  #$7f       ; lowercase
            cmp  #'k'
            bne  _not_found
            lda  #0
_valid      sec
            rts
        }}
    }

    %asminclude "opcodes.asm", ""
}