From f5a27362a7f8fa9945b80806f579279ccc0e7670 Mon Sep 17 00:00:00 2001 From: Lucas Scharenbroich Date: Thu, 28 Jul 2022 11:56:05 -0500 Subject: [PATCH 1/5] Remove commented out code --- src/Memory.s | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/Memory.s b/src/Memory.s index c4c449d..fc52753 100644 --- a/src/Memory.s +++ b/src/Memory.s @@ -28,9 +28,7 @@ InitMemory lda EngineMode _NewHandle ; returns LONG Handle on stack plx ; base address of the new handle pla ; high address 00XX of the new handle (bank) -; _Deref -; stx Buff00 -; sta Buff00+2 + :no_bnk0_buff PushLong #0 ; space for result @@ -41,9 +39,6 @@ InitMemory lda EngineMode _NewHandle ; returns LONG Handle on stack plx ; base address of the new handle pla ; high address 00XX of the new handle (bank) -; _Deref -; stx Buff01 -; sta Buff01+2 PushLong #0 ; space for result From 0f920acd03a2e6553c146772e4d7a01a6d5b849d Mon Sep 17 00:00:00 2001 From: Lucas Scharenbroich Date: Thu, 28 Jul 2022 11:57:14 -0500 Subject: [PATCH 2/5] Eliminate redundant register loads --- src/blitter/Horz.s | 152 +++++++++++++++++------------------- src/blitter/TemplateUtils.s | 5 +- src/blitter/Vert.s | 119 +++++++++++++--------------- 3 files changed, 128 insertions(+), 148 deletions(-) diff --git a/src/blitter/Horz.s b/src/blitter/Horz.s index eb7aa5d..496956f 100644 --- a/src/blitter/Horz.s +++ b/src/blitter/Horz.s @@ -33,8 +33,8 @@ _RestoreBG0Opcodes lda LastPatchOffset ; If zero, there are no saved opcodes sta :exit_offset -:loop ldx :virt_line_x2 +:loop ldal BTableLow,x ; Get the address of the first code field line tay @@ -47,8 +47,7 @@ _RestoreBG0Opcodes txa ; lda :virt_line_x2 and #$001E eor #$FFFF - inc - clc + sec adc #32 min :lines_left_x2 sta :draw_count_x2 ; Do half of this many lines @@ -60,12 +59,13 @@ _RestoreBG0Opcodes clc adc :exit_offset ; Add some offsets to get the base address in the code field line - jsr RestoreOpcode + jsr (RestoreOpcode,x) lda :virt_line_x2 ; advance to the 
virtual line after the segment we just clc ; filled in adc :draw_count_x2 sta :virt_line_x2 + tax lda :lines_left_x2 ; subtract the number of lines we just completed sec @@ -113,16 +113,16 @@ _ApplyBG0XPosPre _ApplyBG0XPos -:virt_line equ tmp1 -:lines_left equ tmp2 -:draw_count equ tmp3 +:stk_save equ tmp0 +:virt_line_x2 equ tmp1 +:lines_left_x2 equ tmp2 +:draw_count_x2 equ tmp3 :exit_offset equ tmp4 :entry_offset equ tmp5 :exit_bra equ tmp6 :exit_address equ tmp7 :base_address equ tmp8 -:draw_count_x2 equ tmp9 -:opcode equ tmp0 +:opcode equ tmp9 :odd_entry_offset equ tmp10 ; If there are saved opcodes that have not been restored, do not run this routine @@ -133,10 +133,12 @@ _ApplyBG0XPos ; This code is fairly succinct. See the corresponding code in Vert.s for more detailed comments. :ok lda StartYMod208 ; This is the base line of the virtual screen - sta :virt_line ; Keep track of it + asl + sta :virt_line_x2 ; Keep track of it lda ScreenHeight - sta :lines_left + asl + sta :lines_left_x2 ; Calculate the exit and entry offsets into the code fields. This is a bit tricky, because odd-aligned ; rendering causes the left and right edges to move in a staggered fashion. @@ -280,32 +282,30 @@ _ApplyBG0XPos ; 3. 
Writes the JMP entry point to enter the code field phb ; Save the existing bank + tsc + sta :stk_save + :loop - lda :virt_line - asl ; This will clear the carry bit - tax + ldx :virt_line_x2 ldal BTableLow,x ; Get the address of the first code field line tay ; Save it to use as the base address + + clc adc :exit_offset ; Add some offsets to get the base address in the code field line sta :exit_address sty :base_address - sep #$20 ldal BTableHigh,x pha - plb ; This is the bank that will receive the updates - rep #$20 + plb - lda :virt_line - and #$000F + txa + and #$001E eor #$FFFF - inc - clc - adc #16 - min :lines_left + sec + adc #32 + min :lines_left_x2 - sta :draw_count ; Do this many lines - asl sta :draw_count_x2 ; First step is to set the BRA instruction to exit the code field at the proper location. There @@ -317,64 +317,63 @@ _ApplyBG0XPos ; screen ; y is already set to :base_address tax ; :draw_count_x2 - lda :exit_address ; Save from this location - jsr SaveOpcode + clc ; advance to the virtual line after the segment we just + adc :virt_line_x2 ; filled in + sta :virt_line_x2 + lda :exit_address ; Save from this location + jsr (SaveOpcode,x) ; X = :exit_address on return + + txy ; ldy :exit_address -- starting at this address ldx :draw_count_x2 ; Do this many lines lda :exit_bra ; Copy this value into all of the lines - ldy :exit_address ; starting at this address - jsr SetConst + jsr (SetConst,x) ; All registers are preserved ; Next, patch in the CODE_ENTRY value, which is the low byte of a JMP instruction. 
This is an ; 8-bit operation and, since the PEA code is bank aligned, we use the entry_offset value directly sep #$20 - ldx :draw_count_x2 +; ldx :draw_count_x2 lda :entry_offset ldy :base_address - jsr SetCodeEntry + jsr (SetCodeEntry,x) ; All registers are preserved ; Now, patch in the opcode - ldx :draw_count_x2 +; ldx :draw_count_x2 lda :opcode - ldy :base_address ; Y-register is preserved, this can be removed - jsr SetCodeEntryOpcode + jsr (SetCodeEntryOpcode,x) ; All registers are preserved ; If this is an odd entry, also set the odd_entry low byte and save the operand high byte lda :odd_entry_offset beq :not_odd - ldx :draw_count_x2 - ldy :base_address ; Y-register is preserved, this can be removed - jsr SetOddCodeEntry +; NOTE: SetOddCodeEntry and SaveHighOperand can probably be combined to eliminate call/return overhead - ldx :draw_count_x2 - ldy :base_address ; Y-register is preserved, this can be removed - pei :exit_address - jmp :SaveHighOperand ; Only used once, so "inline" it +; ldx :draw_count_x2 + jsr (SetOddCodeEntry,x) ; All registers are preserved + +; ldx :draw_count_x2 + jmp (:SaveHighOperand,x) ; Only used once, so "inline" it :save_high_op_rtn :not_odd - rep #$20 + rep #$21 ; clear the carry ; Do the end of the loop -- update the virtual line counter and reduce the number ; of lines left to render - lda :virt_line ; advance to the virtual line after the segment we just - clc ; filled in - adc :draw_count - sta :virt_line - - lda :lines_left ; subtract the number of lines we just completed + lda :lines_left_x2 ; subtract the number of lines we just completed sec - sbc :draw_count - sta :lines_left + sbc :draw_count_x2 + sta :lines_left_x2 jne :loop + lda :stk_save + tcs plb rts @@ -387,45 +386,43 @@ _ApplyBG0XPos ; Y = starting line * $1000 ; A = code field location * $1000 :SaveHighOperand - jmp (:tbl,x) - -:tbl da :bottom + da :bottom da :do01,:do02,:do03,:do04 da :do05,:do06,:do07,:do08 da :do09,:do10,:do11,:do12 da :do13,:do14,:do15,:do16 
-:do15 plx +:do15 ldx :exit_address ; accumulator is in 8-bit mode, so can't use TAX bra :x15 -:do14 plx +:do14 ldx :exit_address bra :x14 -:do13 plx +:do13 ldx :exit_address bra :x13 -:do12 plx +:do12 ldx :exit_address bra :x12 -:do11 plx +:do11 ldx :exit_address bra :x11 -:do10 plx +:do10 ldx :exit_address bra :x10 -:do09 plx +:do09 ldx :exit_address bra :x09 -:do08 plx +:do08 ldx :exit_address bra :x08 -:do07 plx +:do07 ldx :exit_address bra :x07 -:do06 plx +:do06 ldx :exit_address bra :x06 -:do05 plx +:do05 ldx :exit_address bra :x05 -:do04 plx +:do04 ldx :exit_address bra :x04 -:do03 plx +:do03 ldx :exit_address bra :x03 -:do02 plx +:do02 ldx :exit_address bra :x02 -:do01 plx +:do01 ldx :exit_address bra :x01 -:do16 plx +:do16 ldx :exit_address :x16 lda $F002,x sta OPCODE_HIGH_SAVE+$F000,y :x15 lda $E002,x @@ -469,9 +466,7 @@ _ApplyBG0XPos ; Y = starting line * $1000 ; A = code field location * $1000 SaveOpcode - jmp (:tbl,x) - -:tbl da :bottom + da :bottom da :do01,:do02,:do03,:do04 da :do05,:do06,:do07,:do08 da :do09,:do10,:do11,:do12 @@ -550,9 +545,7 @@ SaveOpcode ; Y = starting line * $1000 ; A = code field location * $1000 RestoreOpcode - jmp (:tbl,x) - -:tbl da :bottom + da :bottom da :do01,:do02,:do03,:do04 da :do05,:do06,:do07,:do08 da :do09,:do10,:do11,:do12 @@ -631,8 +624,7 @@ RestoreOpcode ; Y = starting line * $1000 ; A = address low byte SetCodeEntry - jmp (:tbl,x) -:tbl da :bottom-00,:bottom-03,:bottom-06,:bottom-09 + da :bottom-00,:bottom-03,:bottom-06,:bottom-09 da :bottom-12,:bottom-15,:bottom-18,:bottom-21 da :bottom-24,:bottom-27,:bottom-30,:bottom-33 da :bottom-36,:bottom-39,:bottom-42,:bottom-45 @@ -663,8 +655,7 @@ SetCodeEntry ; Y = starting line * $1000 ; A = address low byte SetOddCodeEntry - jmp (:tbl,x) -:tbl da :bottom-00,:bottom-03,:bottom-06,:bottom-09 + da :bottom-00,:bottom-03,:bottom-06,:bottom-09 da :bottom-12,:bottom-15,:bottom-18,:bottom-21 da :bottom-24,:bottom-27,:bottom-30,:bottom-33 da 
:bottom-36,:bottom-39,:bottom-42,:bottom-45 @@ -695,8 +686,7 @@ SetOddCodeEntry ; Y = starting line * $1000 ; A = opcode value SetCodeEntryOpcode - jmp (:tbl,x) -:tbl da :bottom-00,:bottom-03,:bottom-06,:bottom-09 + da :bottom-00,:bottom-03,:bottom-06,:bottom-09 da :bottom-12,:bottom-15,:bottom-18,:bottom-21 da :bottom-24,:bottom-27,:bottom-30,:bottom-33 da :bottom-36,:bottom-39,:bottom-42,:bottom-45 diff --git a/src/blitter/TemplateUtils.s b/src/blitter/TemplateUtils.s index 1f11770..0dae3d5 100644 --- a/src/blitter/TemplateUtils.s +++ b/src/blitter/TemplateUtils.s @@ -87,9 +87,8 @@ Counter equ tmp3 ; A = value ; ; Set M to 0 or 1 -SetConst ; Need a blank line here, otherwise the :tbl local variable resolveds backwards - jmp (:tbl,x) -:tbl da :bottom-00,:bottom-03,:bottom-06,:bottom-09 +SetConst + da :bottom-00,:bottom-03,:bottom-06,:bottom-09 da :bottom-12,:bottom-15,:bottom-18,:bottom-21 da :bottom-24,:bottom-27,:bottom-30,:bottom-33 da :bottom-36,:bottom-39,:bottom-42,:bottom-45 diff --git a/src/blitter/Vert.s b/src/blitter/Vert.s index 042cce3..eafe61f 100644 --- a/src/blitter/Vert.s +++ b/src/blitter/Vert.s @@ -7,29 +7,36 @@ ; lines in the correct order _ApplyBG0YPos -:rtbl_idx equ tmp0 -:virt_line equ tmp1 -:lines_left equ tmp2 -:draw_count equ tmp3 +:rtbl_idx_x2 equ tmp0 +:virt_line_x2 equ tmp1 +:lines_left_x2 equ tmp2 +:draw_count_x2 equ tmp3 +:stk_save equ tmp4 ; First task is to fill in the STK_ADDR values by copying them from the RTable array. We ; copy from RTable[i] into BlitField[StartY+i]. 
As with all of this code, the difficult part ; is decomposing the update across banks - stz :rtbl_idx ; Start copying from the first entry in the table + stz :rtbl_idx_x2 ; Start copying from the first entry in the table lda StartY ; This is the base line of the virtual screen jsr Mod208 sta StartYMod208 - sta :virt_line ; Keep track of it + asl + sta :virt_line_x2 ; Keep track of it + + phb ; Save the current bank + tsc ; we intentionally leak one byte of stack in each loop + sta :stk_save ; iteration, so save the stack to repair at the end ; copy a range of address from the table into the destination bank. If we restrict ourselves to ; rectangular playfields, this can be optimized to just subtracting a constant value. See the ; Templates::SetScreenAddrs subroutine. lda ScreenHeight - sta :lines_left + asl + sta :lines_left_x2 ; This is the verbose part -- figure out how many lines to draw. We don't want to artificially limit ; the height of the visible screen (for example, doing an animated wipe while scrolling), so the screen @@ -38,62 +45,47 @@ _ApplyBG0YPos ; For larger values, we want to break things up on 16-line boundaries based on the virt_line value. So, ; ; draw_count = min(lines_left, (16 - (virt_line % 16)) -; -; Note that almost everything in this loop can be done with 8-bit operations sincc the values are -; all under 200. The one exception is the virt_line value which could exceed 256. This will be -; a later optimization and might save around 10 cycles per iteration, or up to ~120 cycles per frame -; and ~2,500 per secord. This is ~1% of our total CPU budget and is *just* enough cycles to be -; interesting.... 
Another 8 cycles could be removed by doing all calculatinos pre-multiplied by 2 -; to avoid several 'asl' instructions - phb + :loop - lda :virt_line - asl - tax + ldx :virt_line_x2 ldal BTableLow,x ; Get the address of the first code field line tay - sep #$20 - ldal BTableHigh,x + ldal BTableHigh,x ; Target bank in low byte, current bank in high pha - plb ; This is the bank that will receive the updates - rep #$20 - lda :virt_line - and #$000F + txa + and #$001E eor #$FFFF - inc - clc - adc #16 - min :lines_left + sec + adc #32 + min :lines_left_x2 - sta :draw_count ; Do this many lines - asl + sta :draw_count_x2 ; Do this many lines tax - lda :rtbl_idx ; Read from this location in the RTable - asl + clc ; pre-advance virt_line_2 because we have the value + adc :virt_line_x2 + sta :virt_line_x2 - jsr CopyRTableToStkAddr + plb + jsr (:CopyRTableToStkAddr,x) ; X = rtbl_idx_x2 on return - lda :virt_line ; advance to the virtual line after the segment we just - clc ; filled in - adc :draw_count - sta :virt_line + txa ; carry flag is unchanged + adc :draw_count_x2 ; advance the index into the RTable + sta :rtbl_idx_x2 - lda :rtbl_idx ; advance the index into the RTable - adc :draw_count - sta :rtbl_idx - lda :lines_left ; subtract the number of lines we just completed + lda :lines_left_x2 ; subtract the number of lines we just completed sec - sbc :draw_count - sta :lines_left + sbc :draw_count_x2 + sta :lines_left_x2 jne :loop - plb -:out + lda :stk_save + tcs + plb rts ; Unrolled copy routine to move RTable intries into STK_ADDR position. 
@@ -101,44 +93,43 @@ _ApplyBG0YPos ; A = intect into the RTable array (x2) ; Y = starting line * $1000 ; X = number of lines (x2) -CopyRTableToStkAddr - jmp (:tbl,x) -:tbl da :none +:CopyRTableToStkAddr + da :none da :do01,:do02,:do03,:do04 da :do05,:do06,:do07,:do08 da :do09,:do10,:do11,:do12 da :do13,:do14,:do15,:do16 -:do15 tax +:do15 ldx :rtbl_idx_x2 bra :x15 -:do14 tax +:do14 ldx :rtbl_idx_x2 bra :x14 -:do13 tax +:do13 ldx :rtbl_idx_x2 bra :x13 -:do12 tax +:do12 ldx :rtbl_idx_x2 bra :x12 -:do11 tax +:do11 ldx :rtbl_idx_x2 bra :x11 -:do10 tax +:do10 ldx :rtbl_idx_x2 bra :x10 -:do09 tax +:do09 ldx :rtbl_idx_x2 bra :x09 -:do08 tax +:do08 ldx :rtbl_idx_x2 bra :x08 -:do07 tax +:do07 ldx :rtbl_idx_x2 bra :x07 -:do06 tax +:do06 ldx :rtbl_idx_x2 bra :x06 -:do05 tax +:do05 ldx :rtbl_idx_x2 bra :x05 -:do04 tax +:do04 ldx :rtbl_idx_x2 bra :x04 -:do03 tax +:do03 ldx :rtbl_idx_x2 bra :x03 -:do02 tax +:do02 ldx :rtbl_idx_x2 bra :x02 -:do01 tax +:do01 ldx :rtbl_idx_x2 bra :x01 -:do16 tax +:do16 ldx :rtbl_idx_x2 ldal RTable+30,x sta STK_ADDR+$F000,y :x15 ldal RTable+28,x From fa731f4b2dd4f7d570ea6f43f65350b10bcfd4df Mon Sep 17 00:00:00 2001 From: Lucas Scharenbroich Date: Thu, 28 Jul 2022 12:15:00 -0500 Subject: [PATCH 3/5] Streamline restore BG0; experiment with more aggressive inlining --- src/blitter/Horz.s | 145 ++++++++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 75 deletions(-) diff --git a/src/blitter/Horz.s b/src/blitter/Horz.s index 496956f..66c0f25 100644 --- a/src/blitter/Horz.s +++ b/src/blitter/Horz.s @@ -20,6 +20,7 @@ _RestoreBG0Opcodes :lines_left_x2 equ tmp2 :draw_count_x2 equ tmp3 :exit_offset equ tmp4 +:stk_save equ tmp5 phb ; Save data bank @@ -33,16 +34,17 @@ _RestoreBG0Opcodes lda LastPatchOffset ; If zero, there are no saved opcodes sta :exit_offset - ldx :virt_line_x2 + tsc + sta :stk_save + :loop + ldx :virt_line_x2 ldal BTableLow,x ; Get the address of the first code field line tay - sep #$20 - ldal BTableHigh,x + ldal 
BTableHigh,x ; This intentionally leaks one byte on the stack pha plb ; This is the bank that will receive the updates - rep #$20 txa ; lda :virt_line_x2 and #$001E @@ -54,18 +56,15 @@ _RestoreBG0Opcodes ; y is already set to :base_address tax ; :draw_count * 2 + clc + adc :virt_line_x2 + sta :virt_line_x2 tya - clc adc :exit_offset ; Add some offsets to get the base address in the code field line - jsr (RestoreOpcode,x) - - lda :virt_line_x2 ; advance to the virtual line after the segment we just - clc ; filled in - adc :draw_count_x2 - sta :virt_line_x2 - tax + jmp (:tgt,x) +:tgt RestoreOpcode lda :lines_left_x2 ; subtract the number of lines we just completed sec @@ -73,9 +72,11 @@ _RestoreBG0Opcodes sta :lines_left_x2 jne :loop + stz LastPatchOffset ; Clear the value once completed -:out + lda :stk_save + tcs plb rts @@ -318,7 +319,7 @@ _ApplyBG0XPos ; y is already set to :base_address tax ; :draw_count_x2 clc ; advance to the virtual line after the segment we just - adc :virt_line_x2 ; filled in + adc :virt_line_x2 ; filled in sta :virt_line_x2 lda :exit_address ; Save from this location @@ -334,14 +335,12 @@ _ApplyBG0XPos sep #$20 -; ldx :draw_count_x2 lda :entry_offset ldy :base_address jsr (SetCodeEntry,x) ; All registers are preserved ; Now, patch in the opcode -; ldx :draw_count_x2 lda :opcode jsr (SetCodeEntryOpcode,x) ; All registers are preserved @@ -350,12 +349,7 @@ _ApplyBG0XPos lda :odd_entry_offset beq :not_odd -; NOTE: SetOddCodeEntry and SaveHighOperand can probably be combined to eliminate call/return overhead - -; ldx :draw_count_x2 jsr (SetOddCodeEntry,x) ; All registers are preserved - -; ldx :draw_count_x2 jmp (:SaveHighOperand,x) ; Only used once, so "inline" it :save_high_op_rtn @@ -544,77 +538,78 @@ SaveOpcode ; X = number of lines * 2, 0 to 32 ; Y = starting line * $1000 ; A = code field location * $1000 -RestoreOpcode - da :bottom - da :do01,:do02,:do03,:do04 - da :do05,:do06,:do07,:do08 - da :do09,:do10,:do11,:do12 - da 
:do13,:do14,:do15,:do16 +RestoreOpcode mac + da bottom + da do01,do02,do03,do04 + da do05,do06,do07,do08 + da do09,do10,do11,do12 + da do13,do14,do15,do16 -:do15 tax - bra :x15 -:do14 tax - bra :x14 -:do13 tax - bra :x13 -:do12 tax - bra :x12 -:do11 tax - bra :x11 -:do10 tax - bra :x10 -:do09 tax - bra :x09 -:do08 tax - bra :x08 -:do07 tax - bra :x07 -:do06 tax - bra :x06 -:do05 tax - bra :x05 -:do04 tax - bra :x04 -:do03 tax - bra :x03 -:do02 tax - bra :x02 -:do01 tax - bra :x01 -:do16 tax -:x16 lda OPCODE_SAVE+$F000,y +do15 tax + bra x15 +do14 tax + bra x14 +do13 tax + bra x13 +do12 tax + bra x12 +do11 tax + bra x11 +do10 tax + bra x10 +do09 tax + bra x09 +do08 tax + bra x08 +do07 tax + bra x07 +do06 tax + bra x06 +do05 tax + bra x05 +do04 tax + bra x04 +do03 tax + bra x03 +do02 tax + bra x02 +do01 tax + bra x01 +do16 tax +x16 lda OPCODE_SAVE+$F000,y sta $F000,x -:x15 lda OPCODE_SAVE+$E000,y +x15 lda OPCODE_SAVE+$E000,y sta $E000,x -:x14 lda OPCODE_SAVE+$D000,y +x14 lda OPCODE_SAVE+$D000,y sta $D000,x -:x13 lda OPCODE_SAVE+$C000,y +x13 lda OPCODE_SAVE+$C000,y sta $C000,x -:x12 lda OPCODE_SAVE+$B000,y +x12 lda OPCODE_SAVE+$B000,y sta $B000,x -:x11 lda OPCODE_SAVE+$A000,y +x11 lda OPCODE_SAVE+$A000,y sta $A000,x -:x10 lda OPCODE_SAVE+$9000,y +x10 lda OPCODE_SAVE+$9000,y sta $9000,x -:x09 lda OPCODE_SAVE+$8000,y +x09 lda OPCODE_SAVE+$8000,y sta $8000,x -:x08 lda OPCODE_SAVE+$7000,y +x08 lda OPCODE_SAVE+$7000,y sta $7000,x -:x07 lda OPCODE_SAVE+$6000,y +x07 lda OPCODE_SAVE+$6000,y sta $6000,x -:x06 lda OPCODE_SAVE+$5000,y +x06 lda OPCODE_SAVE+$5000,y sta $5000,x -:x05 lda OPCODE_SAVE+$4000,y +x05 lda OPCODE_SAVE+$4000,y sta $4000,x -:x04 lda OPCODE_SAVE+$3000,y +x04 lda OPCODE_SAVE+$3000,y sta $3000,x -:x03 lda OPCODE_SAVE+$2000,y +x03 lda OPCODE_SAVE+$2000,y sta $2000,x -:x02 lda OPCODE_SAVE+$1000,y +x02 lda OPCODE_SAVE+$1000,y sta $1000,x -:x01 lda: OPCODE_SAVE+$0000,y +x01 lda: OPCODE_SAVE+$0000,y sta: $0000,x -:bottom rts +bottom + <<< ; SetCodeEntry ; From 
dedb6575462ce694e26ce7edd652c3ab57906a8e Mon Sep 17 00:00:00 2001 From: Lucas Scharenbroich Date: Thu, 28 Jul 2022 12:18:54 -0500 Subject: [PATCH 4/5] Remove redundant initialization --- demos/kfest-2022/demo-1/App.Main.s | 6 ------ 1 file changed, 6 deletions(-) diff --git a/demos/kfest-2022/demo-1/App.Main.s b/demos/kfest-2022/demo-1/App.Main.s index 63e1e6a..882b4ef 100644 --- a/demos/kfest-2022/demo-1/App.Main.s +++ b/demos/kfest-2022/demo-1/App.Main.s @@ -56,12 +56,6 @@ appTmp0 equ 28 stz StartY stz frameCount -; Initialize the graphics screen playfield - - pea #320 - pea #200 - _GTESetScreenMode - ; Load a tileset pea #^tiledata From 456744027dbd3fa080e152b88e39ebb56adfb709 Mon Sep 17 00:00:00 2001 From: Lucas Scharenbroich Date: Thu, 28 Jul 2022 13:15:46 -0500 Subject: [PATCH 5/5] Inline all functions that are only called once Eliminates the JSR/RTS overhead for the copy functions. Combined with the other streamlining, we save around 60 - 70 cycles per bank, or a total savings of around 10,000 cycles per second when running at full screen. This doesn't really change the FPS, but just gives some cycles back to the main application logic. 
--- src/blitter/Horz.s | 294 ++++++++++++++++++------------------ src/blitter/TemplateUtils.s | 18 ++- src/blitter/Vert.s | 111 +++++++------- 3 files changed, 217 insertions(+), 206 deletions(-) diff --git a/src/blitter/Horz.s b/src/blitter/Horz.s index 66c0f25..75840de 100644 --- a/src/blitter/Horz.s +++ b/src/blitter/Horz.s @@ -63,8 +63,7 @@ _RestoreBG0Opcodes tya adc :exit_offset ; Add some offsets to get the base address in the code field line - jmp (:tgt,x) -:tgt RestoreOpcode + RestoreOpcode lda :lines_left_x2 ; subtract the number of lines we just completed sec @@ -323,12 +322,12 @@ _ApplyBG0XPos sta :virt_line_x2 lda :exit_address ; Save from this location - jsr (SaveOpcode,x) ; X = :exit_address on return + SaveOpcode ; X = :exit_address on return txy ; ldy :exit_address -- starting at this address ldx :draw_count_x2 ; Do this many lines lda :exit_bra ; Copy this value into all of the lines - jsr (SetConst,x) ; All registers are preserved + SetConst ; All registers are preserved ; Next, patch in the CODE_ENTRY value, which is the low byte of a JMP instruction. 
This is an ; 8-bit operation and, since the PEA code is bank aligned, we use the entry_offset value directly @@ -337,21 +336,20 @@ _ApplyBG0XPos lda :entry_offset ldy :base_address - jsr (SetCodeEntry,x) ; All registers are preserved + SetCodeEntry ; All registers are preserved ; Now, patch in the opcode lda :opcode - jsr (SetCodeEntryOpcode,x) ; All registers are preserved + SetCodeEntryOpcode ; All registers are preserved ; If this is an odd entry, also set the odd_entry low byte and save the operand high byte lda :odd_entry_offset - beq :not_odd + jeq :not_odd - jsr (SetOddCodeEntry,x) ; All registers are preserved - jmp (:SaveHighOperand,x) ; Only used once, so "inline" it -:save_high_op_rtn + SetOddCodeEntry ; All registers are preserved + SaveHighOperand :exit_address ; Only used once, so "inline" it :not_odd rep #$21 ; clear the carry @@ -379,77 +377,78 @@ _ApplyBG0XPos ; X = number of lines * 2, 0 to 32 ; Y = starting line * $1000 ; A = code field location * $1000 -:SaveHighOperand - da :bottom - da :do01,:do02,:do03,:do04 - da :do05,:do06,:do07,:do08 - da :do09,:do10,:do11,:do12 - da :do13,:do14,:do15,:do16 +SaveHighOperand mac + jmp (dispTbl,x) +dispTbl da bottom + da do01,do02,do03,do04 + da do05,do06,do07,do08 + da do09,do10,do11,do12 + da do13,do14,do15,do16 -:do15 ldx :exit_address ; accumulator is in 8-bit mode, so can't use TAX - bra :x15 -:do14 ldx :exit_address - bra :x14 -:do13 ldx :exit_address - bra :x13 -:do12 ldx :exit_address - bra :x12 -:do11 ldx :exit_address - bra :x11 -:do10 ldx :exit_address - bra :x10 -:do09 ldx :exit_address - bra :x09 -:do08 ldx :exit_address - bra :x08 -:do07 ldx :exit_address - bra :x07 -:do06 ldx :exit_address - bra :x06 -:do05 ldx :exit_address - bra :x05 -:do04 ldx :exit_address - bra :x04 -:do03 ldx :exit_address - bra :x03 -:do02 ldx :exit_address - bra :x02 -:do01 ldx :exit_address - bra :x01 -:do16 ldx :exit_address -:x16 lda $F002,x +do15 ldx ]1 ; accumulator is in 8-bit mode, so can't use TAX + bra x15 
+do14 ldx ]1 + bra x14 +do13 ldx ]1 + bra x13 +do12 ldx ]1 + bra x12 +do11 ldx ]1 + bra x11 +do10 ldx ]1 + bra x10 +do09 ldx ]1 + bra x09 +do08 ldx ]1 + bra x08 +do07 ldx ]1 + bra x07 +do06 ldx ]1 + bra x06 +do05 ldx ]1 + bra x05 +do04 ldx ]1 + bra x04 +do03 ldx ]1 + bra x03 +do02 ldx ]1 + bra x02 +do01 ldx ]1 + bra x01 +do16 ldx ]1 +x16 lda $F002,x sta OPCODE_HIGH_SAVE+$F000,y -:x15 lda $E002,x +x15 lda $E002,x sta OPCODE_HIGH_SAVE+$E000,y -:x14 lda $D002,x +x14 lda $D002,x sta OPCODE_HIGH_SAVE+$D000,y -:x13 lda $C002,x +x13 lda $C002,x sta OPCODE_HIGH_SAVE+$C000,y -:x12 lda $B002,x +x12 lda $B002,x sta OPCODE_HIGH_SAVE+$B000,y -:x11 lda $A002,x +x11 lda $A002,x sta OPCODE_HIGH_SAVE+$A000,y -:x10 lda $9002,x +x10 lda $9002,x sta OPCODE_HIGH_SAVE+$9000,y -:x09 lda $8002,x +x09 lda $8002,x sta OPCODE_HIGH_SAVE+$8000,y -:x08 lda $7002,x +x08 lda $7002,x sta OPCODE_HIGH_SAVE+$7000,y -:x07 lda $6002,x +x07 lda $6002,x sta OPCODE_HIGH_SAVE+$6000,y -:x06 lda $5002,x +x06 lda $5002,x sta OPCODE_HIGH_SAVE+$5000,y -:x05 lda $4002,x +x05 lda $4002,x sta OPCODE_HIGH_SAVE+$4000,y -:x04 lda $3002,x +x04 lda $3002,x sta OPCODE_HIGH_SAVE+$3000,y -:x03 lda $2002,x +x03 lda $2002,x sta OPCODE_HIGH_SAVE+$2000,y -:x02 lda $1002,x +x02 lda $1002,x sta OPCODE_HIGH_SAVE+$1000,y -:x01 lda: $0002,x +x01 lda: $0002,x sta: OPCODE_HIGH_SAVE+$0000,y -:bottom jmp :save_high_op_rtn +bottom <<< ; SaveOpcode ; @@ -459,77 +458,79 @@ _ApplyBG0XPos ; X = number of lines * 2, 0 to 32 ; Y = starting line * $1000 ; A = code field location * $1000 -SaveOpcode - da :bottom - da :do01,:do02,:do03,:do04 - da :do05,:do06,:do07,:do08 - da :do09,:do10,:do11,:do12 - da :do13,:do14,:do15,:do16 +SaveOpcode mac + jmp (dispTbl,x) +dispTbl da bottom + da do01,do02,do03,do04 + da do05,do06,do07,do08 + da do09,do10,do11,do12 + da do13,do14,do15,do16 -:do15 tax - bra :x15 -:do14 tax - bra :x14 -:do13 tax - bra :x13 -:do12 tax - bra :x12 -:do11 tax - bra :x11 -:do10 tax - bra :x10 -:do09 tax - bra :x09 -:do08 tax - bra 
:x08 -:do07 tax - bra :x07 -:do06 tax - bra :x06 -:do05 tax - bra :x05 -:do04 tax - bra :x04 -:do03 tax - bra :x03 -:do02 tax - bra :x02 -:do01 tax - bra :x01 -:do16 tax -:x16 lda $F000,x +do15 tax + bra x15 +do14 tax + bra x14 +do13 tax + bra x13 +do12 tax + bra x12 +do11 tax + bra x11 +do10 tax + bra x10 +do09 tax + bra x09 +do08 tax + bra x08 +do07 tax + bra x07 +do06 tax + bra x06 +do05 tax + bra x05 +do04 tax + bra x04 +do03 tax + bra x03 +do02 tax + bra x02 +do01 tax + bra x01 +do16 tax +x16 lda $F000,x sta OPCODE_SAVE+$F000,y -:x15 lda $E000,x +x15 lda $E000,x sta OPCODE_SAVE+$E000,y -:x14 lda $D000,x +x14 lda $D000,x sta OPCODE_SAVE+$D000,y -:x13 lda $C000,x +x13 lda $C000,x sta OPCODE_SAVE+$C000,y -:x12 lda $B000,x +x12 lda $B000,x sta OPCODE_SAVE+$B000,y -:x11 lda $A000,x +x11 lda $A000,x sta OPCODE_SAVE+$A000,y -:x10 lda $9000,x +x10 lda $9000,x sta OPCODE_SAVE+$9000,y -:x09 lda $8000,x +x09 lda $8000,x sta OPCODE_SAVE+$8000,y -:x08 lda $7000,x +x08 lda $7000,x sta OPCODE_SAVE+$7000,y -:x07 lda $6000,x +x07 lda $6000,x sta OPCODE_SAVE+$6000,y -:x06 lda $5000,x +x06 lda $5000,x sta OPCODE_SAVE+$5000,y -:x05 lda $4000,x +x05 lda $4000,x sta OPCODE_SAVE+$4000,y -:x04 lda $3000,x +x04 lda $3000,x sta OPCODE_SAVE+$3000,y -:x03 lda $2000,x +x03 lda $2000,x sta OPCODE_SAVE+$2000,y -:x02 lda $1000,x +x02 lda $1000,x sta OPCODE_SAVE+$1000,y -:x01 lda: $0000,x +x01 lda: $0000,x sta: OPCODE_SAVE+$0000,y -:bottom rts +bottom + <<< ; RestoreOpcode ; @@ -539,7 +540,8 @@ SaveOpcode ; Y = starting line * $1000 ; A = code field location * $1000 RestoreOpcode mac - da bottom + jmp (dispTbl,x) +dispTbl da bottom da do01,do02,do03,do04 da do05,do06,do07,do08 da do09,do10,do11,do12 @@ -618,13 +620,14 @@ bottom ; X = number of lines * 2, 0 to 32 ; Y = starting line * $1000 ; A = address low byte -SetCodeEntry - da :bottom-00,:bottom-03,:bottom-06,:bottom-09 - da :bottom-12,:bottom-15,:bottom-18,:bottom-21 - da :bottom-24,:bottom-27,:bottom-30,:bottom-33 - da 
:bottom-36,:bottom-39,:bottom-42,:bottom-45 - da :bottom-48 -:top sta CODE_ENTRY+$F000,y +SetCodeEntry mac + jmp (dispTbl,x) +dispTbl da bottom-00,bottom-03,bottom-06,bottom-09 + da bottom-12,bottom-15,bottom-18,bottom-21 + da bottom-24,bottom-27,bottom-30,bottom-33 + da bottom-36,bottom-39,bottom-42,bottom-45 + da bottom-48 + sta CODE_ENTRY+$F000,y sta CODE_ENTRY+$E000,y sta CODE_ENTRY+$D000,y sta CODE_ENTRY+$C000,y @@ -640,7 +643,8 @@ SetCodeEntry sta CODE_ENTRY+$2000,y sta CODE_ENTRY+$1000,y sta: CODE_ENTRY+$0000,y -:bottom rts +bottom + <<< ; SetOddCodeEntry ; @@ -649,13 +653,14 @@ SetCodeEntry ; X = number of lines * 2, 0 to 32 ; Y = starting line * $1000 ; A = address low byte -SetOddCodeEntry - da :bottom-00,:bottom-03,:bottom-06,:bottom-09 - da :bottom-12,:bottom-15,:bottom-18,:bottom-21 - da :bottom-24,:bottom-27,:bottom-30,:bottom-33 - da :bottom-36,:bottom-39,:bottom-42,:bottom-45 - da :bottom-48 -:top sta ODD_ENTRY+$F000,y +SetOddCodeEntry mac + jmp (dispTbl,x) +dispTbl da bottom-00,bottom-03,bottom-06,bottom-09 + da bottom-12,bottom-15,bottom-18,bottom-21 + da bottom-24,bottom-27,bottom-30,bottom-33 + da bottom-36,bottom-39,bottom-42,bottom-45 + da bottom-48 + sta ODD_ENTRY+$F000,y sta ODD_ENTRY+$E000,y sta ODD_ENTRY+$D000,y sta ODD_ENTRY+$C000,y @@ -671,7 +676,8 @@ SetOddCodeEntry sta ODD_ENTRY+$2000,y sta ODD_ENTRY+$1000,y sta: ODD_ENTRY+$0000,y -:bottom rts +bottom + <<< ; SetCodeEntryOpcode ; @@ -680,13 +686,14 @@ SetOddCodeEntry ; X = number of lines * 2, 0 to 32 ; Y = starting line * $1000 ; A = opcode value -SetCodeEntryOpcode - da :bottom-00,:bottom-03,:bottom-06,:bottom-09 - da :bottom-12,:bottom-15,:bottom-18,:bottom-21 - da :bottom-24,:bottom-27,:bottom-30,:bottom-33 - da :bottom-36,:bottom-39,:bottom-42,:bottom-45 - da :bottom-48 -:top sta CODE_ENTRY_OPCODE+$F000,y +SetCodeEntryOpcode mac + jmp (dispTbl,x) +dispTbl da bottom-00,bottom-03,bottom-06,bottom-09 + da bottom-12,bottom-15,bottom-18,bottom-21 + da 
bottom-24,bottom-27,bottom-30,bottom-33 + da bottom-36,bottom-39,bottom-42,bottom-45 + da bottom-48 + sta CODE_ENTRY_OPCODE+$F000,y sta CODE_ENTRY_OPCODE+$E000,y sta CODE_ENTRY_OPCODE+$D000,y sta CODE_ENTRY_OPCODE+$C000,y @@ -702,4 +709,5 @@ SetCodeEntryOpcode sta CODE_ENTRY_OPCODE+$2000,y sta CODE_ENTRY_OPCODE+$1000,y sta: CODE_ENTRY_OPCODE+$0000,y -:bottom rts +bottom + <<< diff --git a/src/blitter/TemplateUtils.s b/src/blitter/TemplateUtils.s index 0dae3d5..534b19e 100644 --- a/src/blitter/TemplateUtils.s +++ b/src/blitter/TemplateUtils.s @@ -87,13 +87,14 @@ Counter equ tmp3 ; A = value ; ; Set M to 0 or 1 -SetConst - da :bottom-00,:bottom-03,:bottom-06,:bottom-09 - da :bottom-12,:bottom-15,:bottom-18,:bottom-21 - da :bottom-24,:bottom-27,:bottom-30,:bottom-33 - da :bottom-36,:bottom-39,:bottom-42,:bottom-45 - da :bottom-48 -:top sta $F000,y +SetConst mac + jmp (dispTbl,x) +dispTbl da bottom-00,bottom-03,bottom-06,bottom-09 + da bottom-12,bottom-15,bottom-18,bottom-21 + da bottom-24,bottom-27,bottom-30,bottom-33 + da bottom-36,bottom-39,bottom-42,bottom-45 + da bottom-48 + sta $F000,y sta $E000,y sta $D000,y sta $C000,y @@ -109,7 +110,8 @@ SetConst sta $2000,y sta $1000,y sta: $0000,y -:bottom rts +bottom + <<< ; SetDPAddrs ; diff --git a/src/blitter/Vert.s b/src/blitter/Vert.s index eafe61f..e949b73 100644 --- a/src/blitter/Vert.s +++ b/src/blitter/Vert.s @@ -69,13 +69,12 @@ _ApplyBG0YPos sta :virt_line_x2 plb - jsr (:CopyRTableToStkAddr,x) ; X = rtbl_idx_x2 on return + CopyRTableToStkAddr :rtbl_idx_x2 ; X = rtbl_idx_x2 on return txa ; carry flag is unchanged adc :draw_count_x2 ; advance the index into the RTable sta :rtbl_idx_x2 - lda :lines_left_x2 ; subtract the number of lines we just completed sec sbc :draw_count_x2 @@ -93,73 +92,75 @@ _ApplyBG0YPos ; A = intect into the RTable array (x2) ; Y = starting line * $1000 ; X = number of lines (x2) -:CopyRTableToStkAddr - da :none - da :do01,:do02,:do03,:do04 - da :do05,:do06,:do07,:do08 - da 
:do09,:do10,:do11,:do12 - da :do13,:do14,:do15,:do16 -:do15 ldx :rtbl_idx_x2 - bra :x15 -:do14 ldx :rtbl_idx_x2 - bra :x14 -:do13 ldx :rtbl_idx_x2 - bra :x13 -:do12 ldx :rtbl_idx_x2 - bra :x12 -:do11 ldx :rtbl_idx_x2 - bra :x11 -:do10 ldx :rtbl_idx_x2 - bra :x10 -:do09 ldx :rtbl_idx_x2 - bra :x09 -:do08 ldx :rtbl_idx_x2 - bra :x08 -:do07 ldx :rtbl_idx_x2 - bra :x07 -:do06 ldx :rtbl_idx_x2 - bra :x06 -:do05 ldx :rtbl_idx_x2 - bra :x05 -:do04 ldx :rtbl_idx_x2 - bra :x04 -:do03 ldx :rtbl_idx_x2 - bra :x03 -:do02 ldx :rtbl_idx_x2 - bra :x02 -:do01 ldx :rtbl_idx_x2 - bra :x01 -:do16 ldx :rtbl_idx_x2 +CopyRTableToStkAddr mac + jmp (dispTbl,x) +dispTbl da bottom + da do01,do02,do03,do04 + da do05,do06,do07,do08 + da do09,do10,do11,do12 + da do13,do14,do15,do16 +do15 ldx ]1 + bra x15 +do14 ldx ]1 + bra x14 +do13 ldx ]1 + bra x13 +do12 ldx ]1 + bra x12 +do11 ldx ]1 + bra x11 +do10 ldx ]1 + bra x10 +do09 ldx ]1 + bra x09 +do08 ldx ]1 + bra x08 +do07 ldx ]1 + bra x07 +do06 ldx ]1 + bra x06 +do05 ldx ]1 + bra x05 +do04 ldx ]1 + bra x04 +do03 ldx ]1 + bra x03 +do02 ldx ]1 + bra x02 +do01 ldx ]1 + bra x01 +do16 ldx ]1 ldal RTable+30,x sta STK_ADDR+$F000,y -:x15 ldal RTable+28,x +x15 ldal RTable+28,x sta STK_ADDR+$E000,y -:x14 ldal RTable+26,x +x14 ldal RTable+26,x sta STK_ADDR+$D000,y -:x13 ldal RTable+24,x +x13 ldal RTable+24,x sta STK_ADDR+$C000,y -:x12 ldal RTable+22,x +x12 ldal RTable+22,x sta STK_ADDR+$B000,y -:x11 ldal RTable+20,x +x11 ldal RTable+20,x sta STK_ADDR+$A000,y -:x10 ldal RTable+18,x +x10 ldal RTable+18,x sta STK_ADDR+$9000,y -:x09 ldal RTable+16,x +x09 ldal RTable+16,x sta STK_ADDR+$8000,y -:x08 ldal RTable+14,x +x08 ldal RTable+14,x sta STK_ADDR+$7000,y -:x07 ldal RTable+12,x +x07 ldal RTable+12,x sta STK_ADDR+$6000,y -:x06 ldal RTable+10,x +x06 ldal RTable+10,x sta STK_ADDR+$5000,y -:x05 ldal RTable+08,x +x05 ldal RTable+08,x sta STK_ADDR+$4000,y -:x04 ldal RTable+06,x +x04 ldal RTable+06,x sta STK_ADDR+$3000,y -:x03 ldal RTable+04,x +x03 ldal RTable+04,x 
sta STK_ADDR+$2000,y -:x02 ldal RTable+02,x +x02 ldal RTable+02,x sta STK_ADDR+$1000,y -:x01 ldal RTable+00,x +x01 ldal RTable+00,x sta: STK_ADDR+$0000,y -:none rts +bottom + <<< \ No newline at end of file