iigs-game-engine/src/blitter/Template.s

334 lines
17 KiB
ArmAsm

; Template and equates for GTE blitter
mx %00
DP_ADDR equ entry_1-base+1 ; offset to patch in the direct page for dynamic tiles
BG1_ADDR equ entry_2-base+1 ; offset to patch in the Y-reg for BG1 (dp),y addressing
STK_ADDR equ entry_3-base+1 ; offset to patch in the stack (SHR) right edge address
; BNK_ADDR equ entry_0-base+1 ; offset to patch in the address of a Bank 0 memory location to load the bank register
DP_ENTRY equ entry_1-base
TWO_LYR_ENTRY equ entry_2-base
ONE_LYR_ENTRY equ entry_3-base
; BANK_ENTRY equ entry_0-base
CODE_ENTRY_OPCODE equ entry_jmp-base
CODE_ENTRY equ entry_jmp-base+1 ; low byte of the page-aligned jump address
ODD_ENTRY equ odd_entry-base+1
CODE_TOP equ loop-base
CODE_LEN equ top-base
CODE_EXIT equ even_exit-base
OPCODE_SAVE equ odd_low_save-base ; spot to save the code field opcode when patching exit BRA
OPCODE_HIGH_SAVE equ odd_high_save-base ; save the second and third byte
FULL_RETURN equ full_return-base ; offset that returns from the blitter
ENABLE_INT equ enable_int-base ; offset that re-enable interrupts and continues
LINES_PER_BANK equ 16
SNIPPET_BASE equ snippets-base
; offsets from each snippet base address for the different entry points
SNIPPET_ENTRY_1 equ 0 ; two layer + dynamic tile + sprite
SNIPPET_ENTRY_2 equ 4 ; (two layer | dynamic tile) + sprite
SNIPPET_ENTRY_3 equ 18 ; sprite under dynamic tile
SNIPPET_ENTRY_4 equ 19 ; two layer + dynamic tile (no sprite)
; Locations that need the page offset added
;
; Table of 16-bit offsets from 'base', one 'da' word per location. Every fixed entry has the
; form {label-base+2}: the third byte of the instruction at that label, which is the high
; byte of its 16-bit address operand (JMP, or the low 16 bits of a long LDA/STA).
PagePatches da {long_0-base+2}
da {long_1-base+2}
da {long_2-base+2}
da {long_3-base+2}
; da {long_4-base+2}
da {long_5-base+2}
da {long_6-base+2}
da {odd_entry-base+2}
da {loop_exit_1-base+2}
da {loop_exit_2-base+2}
da {loop_back-base+2}
da {loop_exit_3-base+2}
da {even_exit-base+2}
da {jmp_rtn_1-base+2}
; da {jmp_rtn_2-base+2}
; Two more entries are generated for each of the 82 snippets. Snippets are spaced 32 bytes
; apart; +17 and +29 are the high operand bytes of the JMP instructions that sit at snippet
; offsets 15 and 27.
]index equ 0
lup 82 ; Patch anything that needs updating within the snippets
da {snippets-base+{]index*32}+17}
da {snippets-base+{]index*32}+29}
]index equ ]index+1
--^
PagePatchNum equ *-PagePatches ; size of the table in bytes (each 'da' entry is two bytes)
; Locations that need a bank byte set for long addressing modes
;
; Table of 16-bit offsets from 'base', one 'da' word per location. Every entry has the form
; {label-base+3}: the fourth byte of the instruction at that label, which is the bank byte
; of a long (24-bit) address. For long_0 this is the 'dfb $00' that follows the 3-byte JMP
; at entry_jmp; it only acts as a bank byte once that opcode is patched to a long LDA.
BankPatches da {long_0-base+3}
da {long_1-base+3}
da {long_2-base+3}
da {long_3-base+3}
; da {long_4-base+3}
da {long_5-base+3}
da {long_6-base+3}
BankPatchNum equ *-BankPatches ; size of the table in bytes (each 'da' entry is two bytes)
; Start of the template code. This code is replicated 16 times per bank and spans
; 13 banks for a total of 208 lines, which is what is required to render 26 tiles
; (8 lines each) and cover the full screen with vertical scrolling.
;
; The 'base' location is always assumed to be on a 4kb ($1000) boundary. We make sure that
; the code is assembled on a page boundary to help with alignment
ds \,$00 ; pad to the next page boundary
; Line entry. 'base' is the reference point for every offset equate and patch table entry in
; this file. The immediate operands of entry_1, entry_2 and entry_3 are placeholders; their
; offsets are published as DP_ADDR, BG1_ADDR and STK_ADDR so that they can be patched.
base
;entry_0 lda #0000 ; Used to set per-scanline bank register
; tcs
; plb
entry_1 ldx #0000 ; Used for LDA 00,x addressing (Dynamic Tiles)
entry_2 ldy #0000 ; Used for LDA (00),y addressing (Second Layer; BG1)
entry_3 lda #0000 ; Sets screen address (right edge)
tcs ; stack pointer = the right edge address loaded above
long_0
entry_jmp jmp $0100
dfb $00 ; if the screen is odd-aligned, then the opcode is set to
; $AF to convert to a LDA long instruction. This puts the
; first two bytes of the instruction field in the accumulator
; and falls through to the next instruction.
; We structure the line so that the entry point only needs to
; update the low-byte of the address, this means it takes only
; an amortized 4-cycles per line to set the entry point break
;
; Right-edge handler (reached only by falling through the patched entry_jmp). The low byte
; of the accumulator is the opcode found in the code field, the high byte is the byte after it.
bit #$000B ; Check the bottom nibble to quickly identify a PEA instruction
bne r_is_not_pea ; This costs 5 cycles in the fast-path
xba ; fast code for PEA: swap so the byte after the opcode is in the low half
r_jmp_rtn sep #$20 ; shared return code path by all methods (8-bit accumulator)
two_byte_rtn pha ; push a single byte
rep #$61 ; Clear Carry, Overflow and M bits #$20
odd_entry jmp $0100 ; unconditionally jump into the "next" instruction in the
; code field. This is OK, even if the entry point was the
; last instruction, because there is a JMP at the end of
; the code field, so the code will simply jump to that
; instruction directly.
;
; As with the original entry point, because all of the
; code field is page-aligned, only the low byte needs to
; be updated when the scroll position changes
r_is_not_pea bit #$0040 ; Check bit 6 to distinguish between JMP and all of the LDA variants
bne r_is_jmp
; Store the two bytes in the accumulator into the 'dfb $00,$00' slot below (STA long is
; 4 bytes + SEP is 2 bytes = *+6) so that they are executed in place.
long_1 stal *+6-base ; Everything else is a two-byte LDA opcode + PHA
sep #$20 ; Lift 8-bit mode here to save a cycle in the LDA
dfb $00,$00 ; placeholder: overwritten by the STA long above before it is executed
bra two_byte_rtn
r_is_jmp sep #$41 ; Set the C and V flags which tells a snippet to push only the low byte
; Copy the 16-bit operand of entry_jmp into the operand of the JMP below (STA long is
; 4 bytes + the JMP opcode is 1 byte = *+5).
long_2 ldal entry_jmp+1-base
long_3 stal *+5-base
jmp $0000 ; Jumps into the exception code, which returns to r_jmp_rtn
; The next labels are special, in that they are entry points into special subroutines. They are special
; because they are within the first 256 bytes of each code field, which allows them to be selectable
; by patching the low byte of the JMP instructions.
; Return to caller -- the even_exit JMP from the previous line will jump here when a render is complete
full_return jml blt_return ; Full exit (blt_return is defined outside this file)
; The even/odd branch of this line's exception handler will return here. This is mostly
; a space-saving measure to allow for more code in the exception handlers themselves, but
; also simplifies the relocation process since we only have to update a single address
; in each exception handler, rather than two.
;
; Once working, this code should be able to be interleaved with the r_jmp_rtn code
; above to eliminate a couple of branches
;
; The snippets' ':byte' path jumps here. V set selects the right-edge return (r_is_jmp sets
; C and V before jumping); V clear selects the left-edge return at l_jmp_rtn.
jmp_rtn
bvs r_jmp_rtn
jmp_rtn_1 jmp l_jmp_rtn-base ; Could inline the code and save 3 cycles / line
; If we switch even/odd exit points, could fall through
; to the even_exit JMP at the head of the PEA field to
; save 6 cycles.
; Re-enable interrupts and continue -- the even_exit JMP from the previous line will jump here every
; 8 or 16 lines in order to give the system time to handle interrupts.
enable_int ldal stk_save+1 ; restore the stack (stk_save is defined outside this file)
tcs
sep #$30 ; 8-bit mode
ldal STATE_REG
tax ; Save the value
and #$CF ; Read Bank 0 / Write Bank 0
stal STATE_REG
cli ; interrupts are enabled only between this CLI and the SEI below
nop ; Give a couple of cycles
sei
txa ; Restore the state
stal STATE_REG
rep #$30 ; back to 16-bit accumulator and index registers
bra entry_1 ; resume at the start of this line, which reloads X, Y and the stack pointer
; This is the spot that needs to be page-aligned. In addition to simplifying the entry address
; and only needing to update a byte instead of a word, because the code breaks out of the
; code field with a BRA instruction, we keep everything within a page to avoid the 1-cycle
; page-crossing penalty of the branch.
;
; Layout (offsets from the page boundary): two 3-byte exit JMPs at +0 and +3, 82 three-byte
; PEA entries at +6..+251, the wrap-around JMP at +252 and a final exit JMP at +255.
ds \,$00 ; pad to the next page boundary
loop_exit_1 jmp odd_exit-base ; +0 Alternate exit point depending on whether the left edge is
loop_exit_2 jmp even_exit-base ; +3 odd-aligned
loop lup 82 ; +6 Set up 82 PEA instructions, which is 328 pixels and consumes 246 bytes
pea $0000 ; This is 41 8x8 tiles in width. Need to have N+1 tiles for screen overlap
--^
loop_back jmp loop-base ; +252 Ensure execution continues to loop around
loop_exit_3 jmp even_exit-base ; +255
; Left-edge handler. loop_exit_1 jumps here; loop_exit_2 and loop_exit_3 go straight to
; even_exit. The code field bytes that were overwritten by the exit are expected in the
; odd_low_save / odd_high_save slots (published as OPCODE_SAVE / OPCODE_HIGH_SAVE).
odd_exit sep #$21 ; 8-bit mode and set the carry just in case we get to a snippet JMP
long_5 ldal OPCODE_SAVE ; Load the opcode that was saved
bit #$0B ; same low-nibble test as the right edge: a zero result means PEA
bne :chk_jmp
long_6 ldal OPCODE_HIGH_SAVE+1 ; get the high byte of the PEA operand
; Fall-through when we have to push a byte on the left edge. Must be 8-bit on entry. Optimized
; for the PEA $0000 case -- only 17 cycles to handle the edge, so pretty good
pha
rep #$21 ; back to a 16-bit accumulator with the carry clear
; JMP opcode = $4C, JML opcode = $5C
even_exit jmp $1000 ; Jump to the next line.
ds 1 ; space so that the last line in a bank can be patched into a JML
:chk_jmp mx %10 ; 8-bit accumulator / 16-bit registers
bit #$40 ; bit 6 separates JMP from the LDA variants, as on the right edge
bne l_is_jmp
rep #$20 ; saved 3 cycles using 8-bit mode, but give it back here.
odd_low_save dfb $00,$00 ; save the first and second bytes of the code field. Works for LDA dp,x and LDA (0),y
; The two saved bytes above are executed in place, then the high byte of the result is pushed.
l_jmp_rtn xba ; bring the high byte into the low half of the accumulator
sep #$20 ; 8-bit accumulator so PHA pushes one byte
pha
rep #$61 ; Clear everything C, V and M
bra even_exit
l_is_jmp
rep #$20 ; Back to 16-bit mode (carry was set above)
; sec ; Set the C flag (V is always cleared at this point) which tells a snippet to push only the high byte
; The $4C opcode and the two saved bytes below form a JMP instruction that is executed in place.
dfb $4C ; Expect a JMP instruction
odd_high_save dfb $00,$00 ; The high 2 bytes of the 3-byte code field sequence is always stashed here
; Special epilogue: skip a number of bytes and jump back into the code field. This is useful for
; large, floating panels in the attract mode of a game, or to overlay solid
; dialog while still animating the play field
;
; The sequence subtracts the immediate operand of the SBC from the stack pointer. The operand
; and both JMP targets are assembled as zero placeholders.
; NOTE(review): nothing in this file references epilogue_1 or :out, so presumably these are
; patched by code elsewhere -- confirm.
epilogue_1 tsc ; A = stack pointer
sec ; set carry so SBC performs a plain subtraction
sbc #0 ; placeholder for the number of bytes to skip
tcs ; stack pointer = A
jmp $0000 ; This jumps back into the code field
:out jmp $0000 ; This jumps to the next epilogue chain element
ds 1
; These are the special code snippets -- there is a 1:1 relationship between each snippet space
; and a 3-byte entry in the code field. Thus, each snippet has a hard-coded JMP to return to
; the next code field location
;
; The snippet is required to handle the odd-alignment in-line; there is no facility for
; patching or intercepting these values due to their complexity. The only requirements
; are:
;
; 1. Carry Clear -> 16-bit write and return to the next code field operand
; 2. Carry Set
; a. Overflow set -> Low 8-bit write and return to the next code field operand
; b. Overflow clear -> High 8-bit write and exit the line
; c. Always clear the Carry flags. It's actually OK to leave the overflow bit in
; its passed state, because having the carry bit clear prevents evaluation of
; the V bit.
;
; In order to improve performance, especially for two-layer tiles + sprites, the
; snippet code has a fixed structure so that the constant DATA and MASK values
; always exist in the same location, regardless of the tile type. The
; tradeoff is that there is a different entry point into the snippet based on the
; tile type, but that is significantly cheaper to look up and patch into the code
; field JMP instruction than it is to rebuild 20+ bytes of code each time.
;
; There are different snippet templates + offset tables based on the EngineMode
;
; EngineMode
;
; ENGINE_MODE_TWO_LAYER NO
; ENGINE_MODE_DYN_TILES NO
;
; Snippet Template
; None.
;
; ENGINE_MODE_TWO_LAYER YES
; ENGINE_MODE_DYN_TILES NO
;
; Snippet Template
;
; ds 4
; lda (00),y <-- Single Entry Point
; and #MASK <-- Mask is always at byte 8
; ora #DATA <-- Data is always at byte 11
; bcs _alt
; pha
; jmp NEXT
; _alt jmp RTN
;
; ENGINE_MODE_TWO_LAYER NO
; ENGINE_MODE_DYN_TILES YES
;
; Snippet Template
;
; ds 4
; lda 00,x <-- Single Entry Point
; and #MASK
; ora #DATA
; bcs _alt
; pha
; jmp NEXT
; _alt jmp RTN
;
; ENGINE_MODE_TWO_LAYER YES
; ENGINE_MODE_DYN_TILES YES
;
; Snippet Template
;
; lda (00),y <-- Entry Point 1
; and $80,x
; ora $00,x <-- Entry Point 2 (Change this word to "lda (00),y" or "lda 00,x", or "ora 00,x" depending on combination)
; and #MASK
; ora #DATA
; bcs _alt
; _16bit pha
; jmp NEXT
; db 1 <--- Entry Point 3 (opcode for an LDA #DATA instruction)
; lda (00),y <--- Entry Point 4 (sneak this in here to avoid extra branch)
; and $80,x
; ora $00,x
; bcc _16bit
; _alt jmp RTN (29 bytes)
;
; Note that the code that's assembled in these snippets is just a template. Every routine that utilizes
; an exception handler *MUST* patch up the routines. There are different routines based on the Engine Mode.
;
; The LDA (00),y opcodes have a fixed operand, but the dynamic tile instructions are determined by the
; dynamic tile id and must be set each time.
ds \,$00 ; pad to the next page boundary
; 82 snippets of exactly 32 bytes each, one per PEA slot in the code field. The number at the
; start of each comment below is the byte offset of that instruction within the snippet; the
; SNIPPET_ENTRY_* equates and the +17 / +29 entries in PagePatches depend on these offsets.
; For snippet ]index the (dp),y operand is {81-]index}*2, so it runs from 162 down to 0, and
; the return JMP targets the code field slot that follows slot ]index.
]index equ 0
snippets lup 82
lda ({{81-]index}*2}),y ; 0: Pre-set the LDA (XX),y instructions
and $80,x ; 2: The direct page instructions are placeholders and get overwritten
ora $00,x ; 4: This gets patched out often
and #$0000 ; 6: the mask operand will be set when the tile is drawn
ora #$0000 ; 9: the data operand will be set when the tile is drawn
bcs :byte ; 12: if C = 0, just push the data and return
:word pha ; 14:
jmp loop+3+{3*]index}-base ; 15: Return address offset within the code field
db $A9 ; 18: LDA #DATA opcode
lda ({{81-]index}*2}),y ; 19: Pre-set the LDA (XX),y instructions
and $80,x ; 21:
ora $00,x ; 23:
bcc :word ; 25:
:byte jmp jmp_rtn-base ; 27:
ds 2 ; 30: Padding to make a full 32 bytes
]index equ ]index+1
--^
; End of the template; CODE_LEN is computed as top-base.
top