Break up the Ypos loop into pre/loop/post segments to optimize. Saved ~5,000 cycles/sec. When applied to the other routines, this will save a few thousand more cycles.

This commit is contained in:
Lucas Scharenbroich 2022-07-31 11:51:42 -05:00
parent 4c21d6e217
commit 7a6c4e5ff4
1 changed files with 162 additions and 4 deletions

View File

@ -5,7 +5,7 @@
; Based on the current value of StartY in the direct page. Set up the dispatch
; information so that the BltRange driver will render the correct code field
; lines in the correct order
_ApplyBG0YPos
_ApplyBG0YPosOld
:rtbl_idx_x2 equ tmp0
:virt_line_x2 equ tmp1
@ -51,7 +51,7 @@ _ApplyBG0YPos
ldal BTableLow,x ; Get the address of the first code field line
tay
ldal BTableHigh,x ; Target bank in low byte, current bank in high
ldal BTableHigh,x ; Target bank in low byte
pha
txa
@ -69,7 +69,8 @@ _ApplyBG0YPos
sta :virt_line_x2
plb
CopyRTableToStkAddr :rtbl_idx_x2 ; X = rtbl_idx_x2 on return
jsr _CopyRTableToStkAddr
; CopyRTableToStkAddr :rtbl_idx_x2 ; X = rtbl_idx_x2 on return
txa ; carry flag is unchanged
adc :draw_count_x2 ; advance the index into the RTable
@ -87,6 +88,159 @@ _ApplyBG0YPos
plb
rts
; This is an optimized version of _ApplyBG0YPos. We pre-compute the breakdown across the bank
; boundaries in order to eliminate the minimum calculation and some loop variable updates
; from the inner loop.
;
; The work is split into three segments:
;
;   :pre   copies enough lines to bring virt_line up to the next 16-line boundary
;   :loop  copies 16 lines per iteration with a simple down-counter
;   :post  copies the remaining lines (16 or fewer)
;
; Reads:    StartY, ScreenHeight, BTableLow, BTableHigh
; Writes:   StartYMod208, tmp0-tmp5 (via the local equates below)
; Clobbers: A, X, Y
; The data bank register and the stack pointer are restored before returning.
_ApplyBG0YPos
:rtbl_idx_x2 equ tmp0
:virt_line_x2 equ tmp1
:lines_left_x2 equ tmp2
:draw_count_x2 equ tmp3
:stk_save equ tmp4
:line_count equ tmp5
; First task is to fill in the STK_ADDR values by copying them from the RTable array. We
; copy from RTable[i] into BlitField[StartY+i]. As with all of this code, the difficult part
; is decomposing the update across banks
stz :rtbl_idx_x2 ; Start copying from the first entry in the table
lda StartY ; This is the base line of the virtual screen
jsr Mod208
sta StartYMod208
asl
sta :virt_line_x2 ; Keep track of it (x2 because it indexes word-sized table entries)
phb ; Save the current bank
tsc ; we intentionally leak one byte of stack in each loop
sta :stk_save ; iteration, so save the stack to repair at the end
; Copy a range of addresses from the table into the destination bank. If we restrict ourselves to
; rectangular playfields, this can be optimized to just subtracting a constant value. See the
; Templates::SetScreenAddrs subroutine.
lda ScreenHeight
asl
sta :lines_left_x2
; This is the verbose part -- figure out how many lines to draw. We don't want to artificially limit
; the height of the visible screen (for example, doing an animated wipe while scrolling), so the screen
; height could be anything from 1 to 200.
;
; For larger values, we want to break things up on 16-line boundaries based on the virt_line value. So,
;
; draw_count = min(lines_left, 16 - (virt_line % 16))
; Pre-loop: Calculate the number of lines to copy to get the loop into a bank-aligned state
;
; lines_in_bank = 16 - (virt_line % 16)
:pre
ldx :virt_line_x2
ldal BTableLow,x ; Get the address of the first code field line
tay
ldal BTableHigh,x ; Target bank in low byte
pha ; pushed here, pulled into the data bank register by the plb below
txa
and #$001E ; A = 2 * (virt_line % 16)
eor #$FFFF ; one's complement, then sec + adc #32 gives
sec ; A = 32 - 2 * (virt_line % 16), i.e. lines_in_bank x2
adc #32
min :lines_left_x2 ; macro defined elsewhere; presumably leaves min(A, lines_left_x2) in A
sta :draw_count_x2 ; Do this many lines
tax
clc ; pre-advance virt_line_x2 because we have the value
adc :virt_line_x2
sta :virt_line_x2
plb
jsr _CopyRTableToStkAddr ; relies on X = rtbl_idx_x2 on return
txa ; carry flag is unchanged (still clear from the add above)
adc :draw_count_x2 ; advance the index into the RTable
sta :rtbl_idx_x2
lda :lines_left_x2 ; subtract the number of lines we just completed
sec
sbc :draw_count_x2
sta :lines_left_x2
jeq :done ; if there are no lines left, we're done!
cmp #33 ; lines_left_x2 is even, so "< 33" means 16 lines or fewer
jcc :post ; if there are 16 lines or less left, jump to post
; Now we are in the main loop setup. We know that virt_line is a multiple of 16 and, because of
; the test above, that more than 16 lines remain. We calculate the number of full 16-line
; iterations and put that in an auxiliary count variable to simplify the loop update. The
; remainder (lines_left % 16) is stored back in lines_left_x2 for the post-loop segment.
tax
and #$001E ; this is the number of lines in post (x2)
sta :lines_left_x2
txa
lsr ; divide lines_left_x2 by 32 to get the number
lsr ; of full 16-line blocks
lsr
lsr
lsr
sta :line_count ; single byte count, saves 9 cycles per loop iteration
:loop
ldx :virt_line_x2
ldal BTableLow,x ; Get the address of the first code field line
tay
ldal BTableHigh,x ; Target bank in low byte
pha
lda #32 ; Do this many lines (x2)
tax
clc ; pre-advance virt_line_x2 because we have the value
adc :virt_line_x2
sta :virt_line_x2
plb
CopyRTableToStkAddr :rtbl_idx_x2 ; macro expanded inline here; :pre and :post call the subroutine instead
txa ; carry flag is unchanged
adc #32 ; advance the index into the RTable
sta :rtbl_idx_x2
dec :line_count
jne :loop
lda :lines_left_x2 ; any remainder left for the post segment?
beq :done
; Draw the remaining lines (16 or fewer). No need to update loop variables because we
; know we are in the last iteration
:post
ldx :virt_line_x2
ldal BTableLow,x ; Get the address of the first code field line
tay
ldal BTableHigh,x ; Target bank in low byte
pha
ldx :lines_left_x2 ; Do this many lines
plb
jsr _CopyRTableToStkAddr
:done
lda :stk_save ; repair the stack bytes leaked by the pha/plb pairs
tcs
plb ; restore the bank saved by the phb at the top
rts
; Unrolled copy routine to move RTable entries into STK_ADDR position.
;
; A = index into the RTable array (x2)
@ -163,4 +317,8 @@ x02 ldal RTable+02,x
x01 ldal RTable+00,x
sta: STK_ADDR+$0000,y
bottom
<<<
<<<
; Subroutine wrapper around the CopyRTableToStkAddr macro so that callers outside of a hot
; loop can share a single expansion via jsr instead of expanding the macro inline.
;
; The macro argument is hard-wired to tmp0, so a caller must keep its RTable index in tmp0
; (_ApplyBG0YPos aliases tmp0 as :rtbl_idx_x2). The register inputs and outputs are those
; of the macro itself.
_CopyRTableToStkAddr
CopyRTableToStkAddr tmp0
rts