From 69ed76a65d5e3e7e9458cdc50a685bb2968e20b7 Mon Sep 17 00:00:00 2001 From: Lucas Scharenbroich Date: Mon, 22 Nov 2021 16:56:53 -0600 Subject: [PATCH] Optimize the simple mixed BG0/BG1 handler There is enough room in the 32-byte exception handler to inline the 9-byte epilogue when generating the code sequence for mixed BG1/BG0 rendering. This code sequence is generated once and run for as many frames as the word appears on screen, so saving an unconditional branch (3 cycles) at the cost of 60 cycles is probably worth it. --- macros/CORE.MACS.S | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/macros/CORE.MACS.S b/macros/CORE.MACS.S index b4ea604..38b55d3 100644 --- a/macros/CORE.MACS.S +++ b/macros/CORE.MACS.S @@ -301,8 +301,8 @@ mixed cmp #$FFFF ; All 1's in the mask is fully transparent ldx _X_REG ; Get the addressing offset ldal JTableOffset,x ; Get the address offset and add to the base address - adc _BASE_ADDR ; of the current code field line - adc #{]3&$F000} ; adjust for the current row offset + ora _BASE_ADDR ; of the current code field row (2 rows per bank) $0000 or $8000 + ora #{]3&$7000} ; adjust for the current line offset within the row sta: ]3+1,y tay ; This becomes the new address that we use to patch in @@ -324,8 +324,22 @@ mixed cmp #$FFFF ; All 1's in the mask is fully transparent ldal ]1,x sta: $0006,y - lda #$0D80 ; branch to the prologue (BRA *+15) - sta: $0008,y +; Copy the top 9 bytes down. We have 23 bytes of space and are only using 8. Since 9 + 8 = 17 < 23, we +; can save 3 cycles per word by eliminating the BRA instruction + +; lda #$0D80 ; branch to the prologue (BRA *+15) +; sta: $0008,y + + lda: $0017,y + sta: $0008,y + lda: $0019,y + sta: $000A,y + lda: $001B,y + sta: $000C,y + lda: $001D,y + sta: $000E,y + lda: $001E,y + sta: $000F,y ldy _Y_REG ; restore original y-register value and move on bra next