diff --git a/8bitunity.asm b/8bitunity.asm index dbfe948..e86d286 100644 --- a/8bitunity.asm +++ b/8bitunity.asm @@ -25,8 +25,14 @@ hires equ $c057 Start - jsr test_blitSHR_orig + ; original blitSHR + ;jsr test_blitSHR_orig + + ; new blitSHR_2 (no output) + ; jsr t_blitSHR_n2 + ; blitSHR_IO (screen 2 output and input 2 screen) + jsr t_blitSHR_IO end jmp end @@ -34,6 +40,8 @@ end jmp end include "blitSHR_orig.asm" include "hires.asm" + include "blitSHR_new.asm" + org $2000 incbin "marioluigi-apple2.hires.bin" \ No newline at end of file diff --git a/bin/8bitunity.asm.rom b/bin/8bitunity.asm.rom index 78656a4..9c06a8e 100644 Binary files a/bin/8bitunity.asm.rom and b/bin/8bitunity.asm.rom differ diff --git a/blitSHR_new.asm b/blitSHR_new.asm new file mode 100644 index 0000000..7b9cfa5 --- /dev/null +++ b/blitSHR_new.asm @@ -0,0 +1,279 @@ + + + +hiresXZP equ $ec ; hires X offset (xcol) +hiresYZP equ $ed ; hires Y offset (yrow) +hiresAddrZP equ $fc ; 16 bit address of hires line (from LUT) +inputAddrZP equ $fa ; 16 bit input address (bitmap to draw) +outputAddrZP equ $ee ; 16 bit output address (buffer to save) +scr2outRowsZP equ $ce ; number of rows (output) (height) +inp2scrRowsZP equ $eb ; number of rows (input) (height) +bytesPerRowZP equ $e3 ; bytes per row (width) +toggleMainAuxZP equ $42 ; not used here + +ycounter equ $08 +xcounter equ $09 + + + +blitSHR_I subroutine + +ycounter equ $08 +xcounter equ $09 + + ; input 2 screen (no output) + ; 2372 + 11 = 2383 (= -19% ! vs 2942) + + ldy hiresYZP ; +3 ypos + lda inp2scrRowsZP ; +3 height + sta ycounter ; +3 + clc ; +2 + ; = 11 + +.nextline lda hiresLinesHI,y ; +4 + sta .inp2scr+2 ; +4 + lda hiresLinesLO,y ; +4 + sta .inp2scr+1 ; +4 + ; = 16 + + lda bytesPerRowZP ; +3 width + sta xcounter ; +3 + ldx hiresXZP ; +3 xpos + ; = 9 + +inp_addr2 lda $1000,x ; +(4|)5 +.inp2scr sta $2000,x ; +5 +.cont inx ; +2 + dec xcounter ; +6 + bne inp_addr2 ; +(2|)3 + ; xloop = 5 + 5 + 2 + 6 + 3 = 21 x width = 63 + + iny ; +2 + lda inp_addr2+1 ; +4 + adc bytesPerRowZP ; +3 + sta inp_addr2+1 ; +4 + bcc .dec_ycounter ; +(2|)3 + ; = 2 + 4 + 3 + 4 + (3 or +2+6+2) = 16 + + inc inp_addr2+2 ; +6 + clc ; +2 + + + +.dec_ycounter dec ycounter ; +6 + bne .nextline ; +(2)|3 + ; = 9 + + ; yloop = (16 + 9 + 63 + 16 + 9)*21 - 1 = 2373 - 1 = 2372 + + rts + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +blitSHR_IO subroutine + + ; screen 2 output AND + ; input 2 screen + + ; 3401 + 11 = 3412 vs 4097 = -17% + + +ycounter equ $08 +xcounter equ $09 + + + ldy hiresYZP ; +3 ypos + lda inp2scrRowsZP ; +3 height + sta ycounter ; +3 + clc ; +2 + ; = 11 + +.nextline lda hiresLinesHI,y ; +4 + sta .inp2scr+2 ; +4 + sta .scr_addr+2 ; +4 + lda hiresLinesLO,y ; +4 + sta .inp2scr+1 ; +4 + sta .scr_addr+1 ; +4 + ; =24 + + lda bytesPerRowZP ; +3 width + sta xcounter ; +3 + ldx hiresXZP ; +3 xpos + ; = 9 + +.nextbyte +.scr_addr lda $2000,x ; +4 +out_addr3 sta $1000,x ; +5(|4) + +inp_addr3 lda $1000,x ; +5(|4) +.inp2scr sta $2000,x ; +5 + + inx ; +2 + dec xcounter ; +6 + bne .nextbyte ; +(2|)3 + ; xloop = 4+5+5+5+2+6+3 = 30 x width = 90 - 1 + + iny ; +2 + lda inp_addr3+1 ; +4 + adc bytesPerRowZP ; +3 + sta inp_addr3+1 ; +4 + bcc .inc_outp_addr ; +(2|)3 + ; = 2 + 4 + 3 + 4 + (3 or +2+6+2) = 16 + + inc inp_addr3+2 ; +6 + clc ; +2 + + +.inc_outp_addr lda out_addr3+1 ; +4 + adc bytesPerRowZP ; +3 + sta out_addr3+1 ; +4 + bcc .dec_ycounter ; +(2|)3 + ; = 4 + 3 + 4 + (3 or +2+6+2) = 14 + + inc out_addr3+2 ; +6 + clc ; +2 + + +.dec_ycounter dec ycounter ; +6 + bne .nextline ; +(2)|3 + ; = 9 + + ; yloop = (24 + 9 + 90 + 16 + 14 + 9)*21 - 1 = 3402 - 1 = 3401 + + rts + +;;;;;;;;;;;;;;;;;;;;;;;;; test routines + + ; test input 2 screen only + +t_blitSHR_I subroutine + +xcol equ $06 +yrow equ $07 + + lda #0 + sta xcol + sta yrow + + lda #3 + sta bytesPerRowZP ; width + + lda #21 ; height + sta inp2scrRowsZP + sta scr2outRowsZP + + + ldx xcol ; 3 +.loop_mario_x + + stx hiresXZP ; 3 + lda yrow ; 3 + sta hiresYZP ; 3 + ; = 9 + + sec + lda #mario ; 2 + sbc #0 ; 2 + sta inp_addr2+2 ; 4 + ; = 13 + + + + jsr blitSHR_I ; 2383 + + + + + ldx xcol + inx + stx xcol + cpx #38 + bcc .loop_mario_x + + ldx #0 + stx xcol + ldy yrow + iny + sty yrow + cpy #172 + bcc .loop_mario_x + +.rts rts + + +t_blitSHR_IO subroutine + +xcol equ $06 +yrow equ $07 + + lda #0 + sta xcol + sta yrow + + lda #3 + sta bytesPerRowZP ; width + + lda #21 ; height + sta inp2scrRowsZP + sta scr2outRowsZP + + + ldx xcol ; 3 + + +.loop_mario_x + stx hiresXZP ; 3 + lda yrow ; 3 + sta hiresYZP ; 3 + ; = 9 + + sec + lda #mario ; 2 + sbc #0 ; 2 + sta inp_addr3+2 ; 4 + ; = 13 + + sec + lda #buffer ; 2 + sbc #0 ; 2 + sta out_addr3+2 ; 4 + sta inp_addr2+2 ; 4 + ; = 25 + + jsr blitSHR_IO ; 3412 + + jsr blitSHR_I ; 2320 + + + + + ldx xcol + inx + stx xcol + cpx #38 + bcc .loop_mario_x + + ldx #0 + stx xcol + ldy yrow + iny + sty yrow + cpy #172 + bcc .loop_mario_x + +.rts rts + + + + + \ No newline at end of file diff --git a/blitSHR_orig.asm b/blitSHR_orig.asm index 2387fa1..d6ba4dd 100644 --- a/blitSHR_orig.asm +++ b/blitSHR_orig.asm @@ -13,70 +13,73 @@ toggleMainAuxZP equ $42 ; not used here ;; blitSHR original code -blitSHR_orig subroutine +blitSHR_orig subroutine ; 4095 + 2 = 4097 + ; w/o output = 2940 + 2 = 2942 - ldx #0 + ldx #0 ; +2 ; Copy Screen Address from Hires Tables (using Line Offset Y and Byte Offset X) -.loopRow - ldy hiresYZP ; Y-Offset to Hires Line (ytop) - lda hiresLinesHI,y - sta hiresAddrZP+1 - lda hiresLinesLO,y - adc hiresXZP ; X-Offset to Hires Byte (xcol) - sta hiresAddrZP +.loopRow ; loopRow = 20 + 61 + 65 + 18 + 18 + 13 = 195 x height = 195 x 21 = 4095 + ; w/o outp= 20+ 3+3 + 65 + 18 + 18 + 13 = 140 x height = 2940 + ldy hiresYZP ; +3 Y-Offset to Hires Line (ytop) + lda hiresLinesHI,y ; +4 + sta hiresAddrZP+1 ; +3 + + lda hiresLinesLO,y ; +4 + adc hiresXZP ; +3 X-Offset to Hires Byte (xcol) + sta hiresAddrZP ; +3 ; Copy bytes from SHR buffer to ouput -.screen2output - lda outputAddrZP+1 - beq .input2screen ; If high-byte is zero, then skip - ldy #0 ; Y loop: Copy xxx bytes per row -.loopCopy1 ; Copy 1 byte - lda (hiresAddrZP),y - sta (outputAddrZP),y - iny - cpy bytesPerRowZP - bne .loopCopy1 ; Iterate Y loop +.screen2output ; = 7 + loopCopy1 + 3 + 2 = 61 + lda outputAddrZP+1 ; +3 + beq .input2screen ; +2(|3) If high-byte is zero, then skip + ldy #0 ; +2 Y loop: Copy xxx bytes per row +.loopCopy1 ; loopCopy1 = 19 x width - 1 = 19 x 3 - 1 = 56 + lda (hiresAddrZP),y ; +5 + sta (outputAddrZP),y ; +6 + iny ; +2 + cpy bytesPerRowZP ; +3 + bne .loopCopy1 ; +(2|)3 Iterate Y loop ; Copy bytes from input to SHR buffer - cpx inp2scrRowsZP ; Check number of input rows (for cropped sprites) - bcs .incAddress1 -.input2screen - clc - lda inputAddrZP+1 - beq .incAddress1 ; If high-byte is zero, then skip - ldy #0 ; Y loop: Copy xxx bytes per row -.loopCopy2 - lda (inputAddrZP),y ; Copy 1 byte - sta (hiresAddrZP),y - iny - cpy bytesPerRowZP ; Iterate Y loop - bne .loopCopy2 + cpx inp2scrRowsZP ; +3 Check number of input rows (for cropped sprites) + bcs .incAddress1 ; +(2|)3 +.input2screen ; = 9 + loopCopy2 = 65 + clc ; +2 + lda inputAddrZP+1 ; +3 + beq .incAddress1 ; +2(|3) If high-byte is zero, then skip + ldy #0 ; +2 Y loop: Copy xxx bytes per row +.loopCopy2 ; loopCopy2 = 19 x width - 1 = 19 x 3 - 1 = 56 + lda (inputAddrZP),y ; +5 Copy 1 byte + sta (hiresAddrZP),y ; +6 + iny ; +2 + cpy bytesPerRowZP ; +3 Iterate Y loop + bne .loopCopy2 ; +(2|)3 -.incAddress1 - clc ; Increment address of output block - lda outputAddrZP - adc bytesPerRowZP ; Move by xxx bytes - sta outputAddrZP - bcc .nocarry1 ; Check if carry to high-byte - inc outputAddrZP+1 +.incAddress1 ; = 18 + clc ; +2 Increment address of output block + lda outputAddrZP ; +3 + adc bytesPerRowZP ; +3 Move by xxx bytes + sta outputAddrZP ; +3 + bcc .nocarry1 ; +2(|3) Check if carry to high-byte + inc outputAddrZP+1 ; +5 .nocarry1 -.incAddress2 - clc ; Increment address of input block - lda inputAddrZP - adc bytesPerRowZP ; Move by xxx bytes - sta inputAddrZP - bcc .nocarry2 ; Check if carry to high byte - inc inputAddrZP+1 +.incAddress2 ; = 18 + clc ; +2 Increment address of input block + lda inputAddrZP ; +3 + adc bytesPerRowZP ; +3 Move by xxx bytes + sta inputAddrZP ; +3 + bcc .nocarry2 ; +2(|3) Check if carry to high byte + inc inputAddrZP+1 ; +5 .nocarry2 .nextRow - ; Move to next row - inc hiresYZP ; Increment Hires Line offset - inx - cpx scr2outRowsZP - bcc .loopRow ; Iterate X loop (rows) + ; Move to next row ; = 13 + inc hiresYZP ; +5 Increment Hires Line offset + inx ; +2 + cpx scr2outRowsZP ; +3 + bcc .loopRow ; +(2|)3 Iterate X loop (rows) rts @@ -102,40 +105,48 @@ yrow equ $07 clc ; needed because not in routine ! .loop_mario_x - lda #buffer - sta outputAddrZP+1 + lda #buffer ; 2 + sta outputAddrZP+1 ; 3 + ; = 10 - lda #mario - sta inputAddrZP+1 + lda #mario ; 2 + sta inputAddrZP+1 ; 3 + ; = 10 - lda xcol - sta hiresXZP - lda yrow - sta hiresYZP + lda xcol ; 3 + sta hiresXZP ; 3 + lda yrow ; 3 + sta hiresYZP ; 3 + ; = 12 - jsr blitSHR_orig + jsr blitSHR_orig ; 10 + 10 + 12 + 4097 = 4129 - lda #buffer - sta inputAddrZP+1 + lda #buffer ; 2 + sta inputAddrZP+1 ; 3 + ; = 10 - lda #0 - sta outputAddrZP+1 + lda #0 ; 2 + sta outputAddrZP+1 ; 3 + ; = 5 - lda xcol - sta hiresXZP - lda yrow - sta hiresYZP + lda xcol ; 3 + sta hiresXZP ; 3 + lda yrow ; 3 + sta hiresYZP ; 3 + ; = 12 - clc ; needed because not in routine ! - jsr blitSHR_orig + clc ; +2 needed because not in routine ! + + jsr blitSHR_orig ; 10 + 5 + 12 + 2 + 2942 = 2971 + ; 2971 + 4129 = 7100 ldx xcol inx @@ -148,7 +159,7 @@ yrow equ $07 ldx yrow inx stx yrow - cpx #191-21 + cpx #172 bcc .loop_mario_x .rts rts diff --git a/hires.asm b/hires.asm index 3753c2a..12810f9 100644 --- a/hires.asm +++ b/hires.asm @@ -49,7 +49,7 @@ clear_hgr1 subroutine bpl .loop rts - + org $0900 hiresLinesHI hex 2024282C3034383C hex 2024282C3034383C @@ -75,6 +75,8 @@ hiresLinesHI hex 22262A2E32363A3E hex 23272B2F33373B3F hex 23272B2F33373B3F + + org $0a00 hiresLinesLO hex 0000000000000000 @@ -126,5 +128,7 @@ mario ; 3x21 hex A8808A hex AA80AA + org $b00 + buffer ; 3x21 ds.b 63,00 \ No newline at end of file