From 4c31a0d05667fc6375af8d94c0d9be6142998bae Mon Sep 17 00:00:00 2001 From: Lucas Scharenbroich Date: Fri, 19 Nov 2021 10:24:09 -0600 Subject: [PATCH] Reorg of exception handling code in the core blitter * Moved V-flag handling outside of the 32-byte exception handler * Switched relative branches to JMP to save a cycle per word * Updated macros to create a full code snippet instead of assuming certain values exist in the exception handler buffer --- macros/CORE.MACS.S | 45 ++++++++++++++++------------ src/blitter/Template.s | 68 ++++++++++++++++++++++++++---------------- 2 files changed, 69 insertions(+), 44 deletions(-) diff --git a/macros/CORE.MACS.S b/macros/CORE.MACS.S index e3e53c9..340044e 100644 --- a/macros/CORE.MACS.S +++ b/macros/CORE.MACS.S @@ -219,7 +219,7 @@ mixed cmp #$FFFF ; All 1's in the mask is fully transparent ; This is the slowest path because there is a *lot* of work to do. So much that it's ; worth it to change up the environment to optimize things a bit more. ; -; Need to fill in the first 8 bytes of the JMP handler with the following code sequence +; Need to fill in the first 10 bytes of the JMP handler with the following code sequence ; ; lda (00),y ; and #MASK @@ -252,6 +252,9 @@ mixed cmp #$FFFF ; All 1's in the mask is fully transparent lda ]1 sta: $0006,y + lda #$0D80 ; branch to the prologue (BRA *+15) + sta: $0008,y + ldy _Y_REG ; restore original y-register value and move on bra next @@ -321,6 +324,9 @@ mixed cmp #$FFFF ; All 1's in the mask is fully transparent ldal ]1,x sta: $0006,y + lda #$0D80 ; branch to the prologue (BRA *+15) + sta: $0008,y + ldy _Y_REG ; restore original y-register value and move on bra next @@ -346,12 +352,11 @@ next ; ]3 : code field offset CopyMaskedDWord MAC -; Need to fill in the first 8 bytes of the JMP handler with the following code sequence +; Need to fill in the first 6 bytes of the JMP handler with the following code sequence ; ; lda (00),y ; and $80,x ; ora $00,x -; bcc *+4 ldx _X_REG ; Get the 
addressing offset ldal JTableOffset,x ; Get the address offset and add to the base address @@ -377,7 +382,7 @@ CopyMaskedDWord MAC sta: $0004,x ; Set ORA 00,x opcode rep #$30 - lda #$0290 ; BCC *+4 + lda #$0F80 ; branch to the prologue (BRA *+17) sta: $0006,x eom @@ -392,25 +397,17 @@ CopyMaskedDWord MAC ; ]3 : code field offset CopyMaskedDynSpriteWord MAC -; Need to fill in the first 12(!!) bytes of the JMP handler with the following code sequence +; Need to fill in the first 14 bytes of the JMP handler with the following code sequence ; ; lda (00),y ; and $80,x ; ora $00,x ; and #MASK ; ora #DATA +; bra *+11 ; ; If MASK == 0, then we can do a PEA. If MASK == $FFFF, then fall back to the simple Dynamic Masked ; code. -; -; If the tile priority bit is set, then we use an alternate bit of code that changes to oder -; of operations to and can't make an assumption about the transparency -; -; lda (00),y -; and #MASK -; ora #DATA -; and $80,x -; ora $00,x ldx _X_REG ; Get the addressing offset @@ -432,11 +429,21 @@ CopyMaskedDynSpriteWord MAC ora #$80 sta: $0003,x ; Set AND 00,x operand lda #$35 - sta: $0002,x ; Set AND 00,x operand + sta: $0002,x ; Set AND 00,x opcode lda #$15 - sta: $0004,x ; Set ORA 00,x operand - rep #$30 + sta: $0004,x ; Set ORA 00,x opcode - lda #$0290 ; BCC *+4 - sta: $0006,x + lda #$29 + sta: $0006,x ; AND #$0000 opcode + lda #$09 + sta: $0009,x ; ORA #$0000 opcode + rep #$20 + + lda ]1+32 ; insert the tile mask and data into the exception + sta: $0007,x ; handler. + lda ]1 + sta: $000A,x + + lda #$0980 ; branch to the prologue (BRA *+11) + sta: $000C,x eom diff --git a/src/blitter/Template.s b/src/blitter/Template.s index 18eec80..1685115 100644 --- a/src/blitter/Template.s +++ b/src/blitter/Template.s @@ -37,8 +37,18 @@ PagePatches da {long_0-base+2} da {loop_back-base+2} da {loop_exit_3-base+2} da {even_exit-base+2} + da {jmp_rtn_1-base+2} + da {jmp_rtn_2-base+2} + +]index equ 0 + lup 82 ; All the snippet addresses. 
The two JMP + da {snippets-base+{]index*32}+31} ; instructions are at the end of each of + da {snippets-base+{]index*32}+28} ; the 32-byte buffers +]index equ ]index+1 + --^ PagePatchNum equ *-PagePatches +; Locations that need a bank byte set for long addressing modes BankPatches da {long_0-base+3} da {long_1-base+3} da {long_2-base+3} @@ -483,7 +493,9 @@ BuildBank ; 13 banks for a total of 208 lines, which is what is required to render 26 tiles ; to cover the full screen vertical scrolling. ; -; The 'base' location is always assumed to be on a 4kb ($1000) boundary +; The 'base' location is always assumed to be on a 4kb ($1000) boundary. We make sure that +; the code is assembled on a page boundary to help with alignment + ds \,$00 ; pad to the next page boundary base entry_1 ldx #0000 ; Used for LDA 00,x addressing entry_2 ldy #0000 ; Used for LDA (00),y addressing @@ -538,7 +550,7 @@ long_3 stal *+5-base full_return jml blt_return ; Full exit ; Re-enable interrupts and continue -- the even_exit JMP from the previous line will jump here every -; 8 or 16 lines in order to give the system some extra time to handle interrupts. +; 8 or 16 lines in order to give the system time to handle interrupts. enable_int ldal stk_save+1 ; restore the stack tcs sep #$20 ; 8-bit mode @@ -554,13 +566,26 @@ enable_int ldal stk_save+1 ; restore the stack rep #$20 bra entry_1 +; The even/odd branch of this line's exception handler will return here. This is mostly +; a space-saving measure to allow for more code in the exception handlers themselves, but +; also simplifies the relocation process since we only have to update a single address +; in each exception handler, rather than two. 
+; +; Once working, this code should be able to be interleaved with the r_jmp_rtn code +; above to eliminate a couple of branches +jmp_rtn + bvs jmp_rtn_v ; overflow set means this is the right edge (entry) + clc ; carry is set only for edge operations; force clear +jmp_rtn_1 jmp l_jmp_rtn-base +jmp_rtn_v rep #$41 ; clear V and C +jmp_rtn_2 jmp r_jmp_rtn-base ; This is the spot that needs to be page-aligned. In addition to simplifying the entry address ; and only needing to update a byte instad of a word, because the code breaks out of the ; code field with a BRA instruction, we keep everything within a page to avoid the 1-cycle ; page-crossing penalty of the branch. - ds 166 + ds \,$00 ; pad to the next page boundary loop_exit_1 jmp odd_exit-base ; +0 Alternate exit point depending on whether the left edge is loop_exit_2 jmp even_exit-base ; +3 odd-aligned @@ -685,32 +710,25 @@ epilogue_1 tsc ; r_edge rep #$41 ; brl r_jmp_rtn ; 3 - +; Each snippet is provided 32 bytes of space. The constant code is filled in from the end and +; it is the responsibility of the code that fills in the handler to create a valid program in the +; first 23 bytes, which are available to be manipulated. +; +; Note that the code that's assembled in the first bytes of these snippets is just an example. Every +; routine that creates an exception handler *MUST* write a full set of instructions since there is +; no guarantee of what was written previously. 
ds \,$00 ; pad to the next page boundary ]index equ 0 snippets lup 82 - ds 2 ; space for a 2-byte sequence; LDA (00),y LDA 00,x LDA 0,s + ds 2 ; space for all exception handlers and #$0000 ; the mask operand will be set when the tile is drawn ora #$0000 ; the data operand will be set when the tile is drawn - bcs *+6 - pha - brl loop+3+{3*]index} ; use relative branch for convenience - bvs *+6 ; overflow set means this is the right edge (entry) - clc ; carry is set only for edge operations; force clear - brl l_jmp_rtn - rep #$41 ; clear V and C - brl r_jmp_rtn ; 25 bytes - ds 7 ; padding + ds 15 ; extra padding + + bcs :byte ; if C = 0, just push the data and return + pha ; 1 byte + jmp loop+3+{3*]index}-base ; 3 bytes : absolute JMP, one cycle faster than BRL +:byte jmp jmp_rtn-base ; 3 bytes ]index equ ]index+1 --^ -top - - - - - - - - - - +top \ No newline at end of file