Another checkpoint; converging on the working implementation

This commit is contained in:
Lucas Scharenbroich 2022-05-31 08:43:26 -05:00
parent 7909113a97
commit 78d7dafe14
15 changed files with 535 additions and 259 deletions

View File

@ -11,6 +11,7 @@
"crossrunner": "C:\\Programs\\Crossrunner\\Crossrunner.exe"
},
"scripts": {
"archive": "%npm_package_config_cadius% EXTRACTFILE ",
"test": "npm run build && build-image.bat %npm_package_config_cadius% && %npm_package_config_gsport%",
"build": "%npm_package_config_merlin32% -V %npm_package_config_macros% ./src/Master.s",
"build:debug": "%npm_package_config_merlin32% -V %npm_package_config_macros% ./src/Debug.s",

View File

@ -93,7 +93,7 @@ _CoreStartUp
jsr EngineReset ; All of the resources are allocated, put the engine in a known state
jsr InitGraphics ; Initialize all of the graphics-related data
; jsr InitSprites ; Initialize the sprite subsystem
jsr InitSprites ; Initialize the sprite subsystem
jsr InitTiles ; Initialize the tile subsystem
jsr InitTimers ; Initialize the timer subsystem

View File

@ -96,7 +96,7 @@ LastKey equ 116
LastTick equ 118
ForceSpriteFlag equ 120
VBuffArrayPtr equ 122
;VBuffArrayPtr equ 122
SpriteRemovedFlag equ 126 ; Indicate if any sprites were removed this frame
activeSpriteList equ 128 ; 32 bytes for the active sprite list (can persist across frames)
@ -147,8 +147,10 @@ _TILE_ID equ 158 ; Copy of the tile descriptor
; Define free space for the application to use
; FREE_SPACE_DP2 equ 160
DP2_DIRTY_TILE_COUNT equ 160 ; Local copy of dirty tile count to avoid banking
DP2_DIRTY_TILE_COUNT equ 160 ; Local copy of dirty tile count to avoid banking
DP2_DIRTY_TILE_CALLBACK equ 162
SPRITE_VBUFF_PTR equ 224 ; 32 bytes of adjusted pointers to VBuffArray addresses
; End direct page values
; EngineMode definitions
@ -242,4 +244,4 @@ ScreenModeWidth EXT
ScreenModeHeight EXT
_SpriteBits EXT
_SpriteBitsNot EXT
VBuffArrayAddr EXT
VBuffArray EXT

View File

@ -102,22 +102,22 @@ _Render
; The _ApplyTilesFast is the same as _ApplyTiles, but we use the _RenderTileFast subroutine
_ApplyTilesFast
ldx DirtyTileCount
tdc
clc
adc #$100 ; move to the next page
tcd
lda DirtyTileCount ; Cache the dirty tile count
sta DP2_DIRTY_TILE_COUNT
stx DP2_DIRTY_TILE_COUNT ; Cache the dirty tile count
jsr _PopDirtyTilesFast
stz DirtyTileCount
tdc ; Move back to the original direct page
sec
sbc #$100
tcd
stz DirtyTileCount ; Reset the dirty tile count
rts
; The _ApplyTiles function is responsible for rendering all of the dirty tiles into the code
@ -198,17 +198,10 @@ _ApplyDirtyTiles
; Only render solid tiles and sprites
_RenderDirtyTile
ldx TileStore+TS_VBUFF_ADDR_COUNT,y ; How many sprites are on this tile?
lda TileStore+TS_SPRITE_FLAG,y
beq NoSpritesDirty ; This is faster if there are no sprites
lda TileStore+TS_TILE_ID,y ; Check if the tile has
jmp (dirty_dispatch,x)
dirty_dispatch
da NoSpritesDirty
da OneSpriteDirty
da TwoSpritesDirty
da ThreeSpritesDirty
da FourSpritesDirty
; TODO: handle sprite drawing
; The rest of this function handles that non-sprite blit, which is super fast since it blits directly from the
; tile data store to the graphics screen with no masking. The only extra work is selecting a blit function
@ -218,12 +211,12 @@ dirty_dispatch
; Y is set to the top-left address of the tile in SHR screen
; A is set to the address of the tile data
NoSpritesDirty
tyx
ldy TileStore+TS_SCREEN_ADDR,x ; Get the on-screen address of this tile
lda TileStore+TS_TILE_ADDR,x ; load the address of this tile's data (pre-calculated)
lda TileStore+TS_DIRTY_TILE_DISP,y
stal :nsd+1
ldx TileStore+TS_SCREEN_ADDR,y ; Get the on-screen address of this tile
lda TileStore+TS_TILE_ADDR,y ; load the address of this tile's data (pre-calculated)
plb ; set the code field bank
jmp (TileStore+TS_DIRTY_TILE_DISP,x) ; go to the tile copy routine (just basics)
:nsd jmp $0000
; Use some temporary space for the spriteIdx array (maximum of 4 entries)
stkSave equ tmp9
@ -240,7 +233,7 @@ ThreeSpritesDirty
TwoSpritesDirty
sta tileAddr
sty screenAddr
stx screenAddr
plb
tsc

View File

@ -19,19 +19,32 @@ InitSprites
cpx #$FFFE
bne :loop2
; Clear values in the sprite array
; Set the VBuff array addresses for each sprite, since they're static
; ldx #{MAX_SPRITES-1}*2
;:loop3 stz _Sprites+TILE_STORE_ADDR_1,x
; dex
; dex
; bpl :loop3
; Assign every sprite slot its fixed address inside VBuffArray. Slots 0-7 start at the
; first row of the array and slots 8-15 start at its fourth row; consecutive slots are
; spaced 4 tiles (8 bytes) apart. X is the sprite index * 2 and is deliberately *not*
; reset between the two loops.
                ldx   #0
                lda   #VBuffArray
:loop3          sta   _Sprites+VBUFF_ARRAY_ADDR,x
                clc
                adc   #4*2                          ; skip ahead 4 tiles
                inx
                inx
                cpx   #8*2                          ; stop after slot 7
                bcc   :loop3

; Now do the second set of sprites
                lda   #VBuffArray+{3*{TILE_STORE_WIDTH*2}}
:loop4          sta   _Sprites+VBUFF_ARRAY_ADDR,x
                clc
                adc   #4*2                          ; skip ahead 4 tiles
                inx
                inx
                cpx   #MAX_SPRITES*2                ; X enters at 8*2, so the bound must be the full slot count;
                bcc   :loop4                        ; comparing against 8*2 here ended the loop after slot 8 only
; Precalculate some bank values
jsr _CacheSpriteBanks
rts
; _RenderSprites
;
; The function is responsible for updating all of the rendering information based on any changes
@ -59,7 +72,7 @@ InitSprites
; a. If it is not marked in the DirtyTile list
; * Clear its bit from the TileStore's TS_SPRITE_FLAG
; * Add the tile to the DirtyTile list
;
;t
; 2. If a sprite is marked as SPRITE_STATUS_REMOVED, then
; A. Clear its bit from the SpriteBits bitmap
; B. For each tile the sprite overlaps with:
@ -134,6 +147,8 @@ _DoPhase1
lda _SpriteBits,y ; Clear from the sprite bitmap
sta SpriteRemovedFlag ; Stick a non-zero value here
trb SpriteMap
lda #SPRITE_STATUS_EMPTY ; Mark as empty so no error if we try to Add a sprite here again
sta _Sprites+SPRITE_STATUS,y
jmp _ClearSpriteFromTileStore ; Clear the tile flags, add to the dirty tile list and done
@ -154,15 +169,6 @@ _DoPhase1
:no_move
jmp _MarkDirtySpriteTiles
; Once all of the sprite values have been calculated, we need to scan the dirty tile list and
; collapse the sprite information down to no more than 4 vbuff references per tile. We used to
; do this on the fly in the renderer, but that required differentiating between tile with and
; without sprites in the core rendering function. My lifting this up, we simplify the core code
; and possible open up some optimization opportunities.
_SetTileStoreVBuffAddrs
; Dispatch table. It's uninteresting, so it's tucked out of the way
phase1 dw :phase1_0
dw :phase1_1,:phase1_2,:phase1_3,:phase1_4
@ -200,8 +206,8 @@ phase1 dw :phase1_0
:phase1_2 ldy activeSpriteList+2
jsr _DoPhase1
:phase1_1 ldy activeSpriteList
jsr _DoPhase1
:phase1_0 jmp _SetTileStoreVBuffAddrs
jmp _DoPhase1
:phase1_0 rts
; Utility function to calculate the difference in tile positions between a sprite's current
; position and its previous position. This gets interesting because the number of tiles
@ -299,9 +305,6 @@ _AddSprite
lda _SpriteBits,x ; Get the bit flag for this sprite slot
tsb SpriteMap ; Mark it in the sprite map bit field
; txa ; And return the sprite ID
; clc ; Mark that the sprite was successfully added
rts
; Macro to make the unrolled loop more concise
@ -314,7 +317,7 @@ TSClearSprite mac
ldy TileStoreLookup+]1,x
lda TileStore+TS_SPRITE_FLAG,y
and tmp0
and tmp0
sta TileStore+TS_SPRITE_FLAG,y
lda TileStore+TS_DIRTY,y

View File

@ -66,14 +66,11 @@ _CalcDirtySprite
clc
lda _Sprites+SPRITE_CLIP_TOP,y
adc StartYMod208 ; Adjust for the scroll offset
pha ; Cache
tax ; Cache
and #$FFF8 ; mask first to ensure LSR will clear the carry
lsr
lsr
tax
lda TileStoreLookupYTable,x ; Even numbers from [0, 100] (50 elements)
sta RowTop
pla
sta RowTop ; Even numbers from [0, 100] (50 elements)
; Get the position of the top edge within the tile and then add it to the sprite's height
; to calculate the number of tiles that are overlapped. We use the actual width and height
@ -90,7 +87,7 @@ _CalcDirtySprite
and #$0018
sta AreaIndex
txa ; Get the verical offset in the VBUFF memory
txa ; Get the vertical offset in the VBUFF memory
asl
tax
ldal :vbuff_mul,x
@ -108,13 +105,19 @@ _CalcDirtySprite
adc RowTop
sta _Sprites+TS_LOOKUP_INDEX,y ; This is the index into the TileStoreLookup table
; Create an offset value for loading the calculated VBUFF addresses within the core renderer
eor #$FFFF
sec
adc _Sprites+VBUFF_ARRAY_ADDR,y
sta tmp1 ; Spill this value to direct page temp space
; Calculate the final address of the sprite data in the stamp buffer. We have to move earlier
; in the buffer based on the horizontal offset and move up for each vertical offset.
txa
and #$0003
tax
adc tmp0 ; add to the vertical offset
; Subtract this value from the SPRITE_DISP address
@ -134,12 +137,17 @@ _CalcDirtySprite
and #$000C
lsr ; max value = 4 = 0x04
ora AreaIndex ; merge into the area index
sta _Sprites+TS_COVERAGE_SIZE,y ; Save this value as a key to the coverage size of the sprite
; No need to copy the TileStore addresses into the Sprite's TILE_STORE_ADDR values. Just
; hold a copy of the corner offset into the lookup table and the sprite's size in tiles.
; Then, when we need to erase we can just lookup the values in the TileStoreLookup table.
; Last task. Since we don't need to use the X-register to cache values; load the direct page 2
; offset for the SPRITE_VBUFF_PTR and save it
tya
ora #$100
tax
lda tmp1
sta SPRITE_VBUFF_PTR,x
sta _Sprites+TS_COVERAGE_SIZE,y
mdsOut rts
@ -147,12 +155,6 @@ mdsOut rts
; parallel structure to the Tile Store. This allows us to use the same TileStoreLookup offset
; to index into the array of 16 sprite VBUFF addresses that are bound to a given tile
_MarkDirtySpriteTiles
lda VBuffArrayAddr,y ; Get the base address for the TileStore VBuff array for this sprite
sta VBuffArrayPtr
lda _Sprites+TS_VBUFF_BASE,y ; This is the final upper-left cornder for this frame
sta VBuffOrigin
lda _SpriteBits,y
sta SpriteBit
@ -184,10 +186,6 @@ TSSetSprite mac
ora TileStore+TS_SPRITE_FLAG,y
sta TileStore+TS_SPRITE_FLAG,y
lda VBuffOrigin
adc ]2
sta [tmp0],y ; This is *very* carefully constructed....
lda TileStore+TS_DIRTY,y
bne next
@ -203,31 +201,68 @@ TSSetSprite mac
next
<<<
ROW equ TILE_STORE_WIDTH*2
COL equ TILE_DATA_SPAN
:mark1x1
ldx _Sprites+VBUFF_ARRAY_ADDR,y ; get the address of this sprite's vbuff values
lda _Sprites+TS_VBUFF_BASE,y ; get the starting vbuff address
sta: {0*ROW}+{0*COL},x ; Put in the vbuff address
ldx _Sprites+TS_LOOKUP_INDEX,y
TSSetSprite 0*{TS_LOOKUP_SPAN*2};#0
TSSetSprite 0*{TS_LOOKUP_SPAN*2}
rts
:mark1x2
ldx _Sprites+VBUFF_ARRAY_ADDR,y
lda _Sprites+TS_VBUFF_BASE,y
sta: {0*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {0*ROW}+{1*COL},x
ldx _Sprites+TS_LOOKUP_INDEX,y
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0;#{0*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+2;#{0*VBUFF_TILE_ROW_BYTES}+{1*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+2
rts
:mark1x3
ldx _Sprites+VBUFF_ARRAY_ADDR,y
lda _Sprites+TS_VBUFF_BASE,y
sta: {0*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {0*ROW}+{1*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {0*ROW}+{2*COL},x
ldx _Sprites+TS_LOOKUP_INDEX,y
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0;#{0*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+2;#{0*VBUFF_TILE_ROW_BYTES}+{1*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+4;#{0*VBUFF_TILE_ROW_BYTES}+{2*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+2
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+4
rts
:mark2x1
ldx _Sprites+VBUFF_ARRAY_ADDR,y
lda _Sprites+TS_VBUFF_BASE,y
sta: {0*ROW}+{0*COL},x
adc #VBUFF_TILE_ROW_BYTES
sta: {1*ROW}+{0*COL},x
ldx _Sprites+TS_LOOKUP_INDEX,y
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0;#{0*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
TSSetSprite 1*{TS_LOOKUP_SPAN*2}+0;#{1*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0
TSSetSprite 1*{TS_LOOKUP_SPAN*2}+0
rts
:mark2x2
ldx _Sprites+VBUFF_ARRAY_ADDR,y
lda _Sprites+TS_VBUFF_BASE,y
sta: {0*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {0*ROW}+{1*COL},x
adc #VBUFF_TILE_ROW_BYTES-VBUFF_TILE_COL_BYTES
sta: {1*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {1*ROW}+{1*COL},x
ldx _Sprites+TS_LOOKUP_INDEX,y
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0;#{0*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+2;#{0*VBUFF_TILE_ROW_BYTES}+{1*VBUFF_TILE_COL_BYTES}
@ -236,6 +271,20 @@ next
rts
:mark2x3
ldx _Sprites+VBUFF_ARRAY_ADDR,y
lda _Sprites+TS_VBUFF_BASE,y
sta: {0*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {0*ROW}+{1*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {0*ROW}+{2*COL},x
adc #VBUFF_TILE_ROW_BYTES-{2*VBUFF_TILE_COL_BYTES}
sta: {1*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {1*ROW}+{1*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {1*ROW}+{2*COL},x
ldx _Sprites+TS_LOOKUP_INDEX,y
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0;#{0*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+2;#{0*VBUFF_TILE_ROW_BYTES}+{1*VBUFF_TILE_COL_BYTES}
@ -246,6 +295,14 @@ next
rts
:mark3x1
ldx _Sprites+VBUFF_ARRAY_ADDR,y
lda _Sprites+TS_VBUFF_BASE,y
sta: {0*ROW}+{0*COL},x
adc #VBUFF_TILE_ROW_BYTES
sta: {1*ROW}+{0*COL},x
adc #VBUFF_TILE_ROW_BYTES
sta: {2*ROW}+{0*COL},x
ldx _Sprites+TS_LOOKUP_INDEX,y
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0;#{0*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
TSSetSprite 1*{TS_LOOKUP_SPAN*2}+0;#{1*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
@ -253,6 +310,20 @@ next
rts
:mark3x2
ldx _Sprites+VBUFF_ARRAY_ADDR,y
lda _Sprites+TS_VBUFF_BASE,y
sta: {0*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {0*ROW}+{1*COL},x
adc #VBUFF_TILE_ROW_BYTES-VBUFF_TILE_COL_BYTES
sta: {1*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {1*ROW}+{1*COL},x
adc #VBUFF_TILE_ROW_BYTES-VBUFF_TILE_COL_BYTES
sta: {2*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {2*ROW}+{1*COL},x
ldx _Sprites+TS_LOOKUP_INDEX,y
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0;#{0*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+2;#{0*VBUFF_TILE_ROW_BYTES}+{1*VBUFF_TILE_COL_BYTES}
@ -263,6 +334,26 @@ next
rts
:mark3x3
ldx _Sprites+VBUFF_ARRAY_ADDR,y
lda _Sprites+TS_VBUFF_BASE,y
sta: {0*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {0*ROW}+{1*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {0*ROW}+{2*COL},x
adc #VBUFF_TILE_ROW_BYTES-{2*VBUFF_TILE_COL_BYTES}
sta: {1*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {1*ROW}+{1*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {1*ROW}+{2*COL},x
adc #VBUFF_TILE_ROW_BYTES-{2*VBUFF_TILE_COL_BYTES}
sta: {2*ROW}+{0*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {2*ROW}+{1*COL},x
adc #VBUFF_TILE_COL_BYTES
sta: {2*ROW}+{2*COL},x
ldx _Sprites+TS_LOOKUP_INDEX,y
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+0;#{0*VBUFF_TILE_ROW_BYTES}+{0*VBUFF_TILE_COL_BYTES}
TSSetSprite 0*{TS_LOOKUP_SPAN*2}+2;#{0*VBUFF_TILE_ROW_BYTES}+{1*VBUFF_TILE_COL_BYTES}

View File

@ -120,14 +120,6 @@ InitTiles
; lda TileProcs ; Same for non-dirty, non-sprite base case
; stal TileStore+TS_BASE_TILE_DISP,x
; *** DEPRECATED ***
; lda :vbuff ; array of sprite vbuff addresses per tile
; stal TileStore+TS_VBUFF_ARRAY_ADDR,x
; clc
; adc #32
; sta :vbuff
; *** ********** ***
; The next set of values are constants that are simply used as cached parameters to avoid needing to
; calculate any of these values during tile rendering
@ -267,3 +259,211 @@ _SetBG0YPos
stx OldStartY ; First change, so preserve the value
:out rts
; Macro helper for the bit test tree
;
; dobit sprite_index;dest_slot;next;exit
;
;  ]1 = bit position being tested, which is also the sprite index
;  ]2 = destination slot; the vbuff address is stored at sprite_ptr0+{]2*4}
;  ]3 = label to continue at when more bits remain set in A
;  ]4 = label to jump to when this was the last set bit
;
; On entry A holds the remaining (already shifted) sprite bitmap and Y holds the tile
; store index. X is used as scratch to keep the remaining bits across the load. When
; the tested bit is clear, execution falls through to whatever follows the macro.
dobit           mac
                lsr                                     ; move the tested bit into the carry
                bcc   next_bit                          ; clear: this sprite is not on the tile
                beq   last_bit                          ; set, and no other bits remain
                tax                                     ; keep the remaining bits
                lda   (SPRITE_VBUFF_PTR+{]1*2}),y       ; pointer table is indexed by sprite (]1), not by slot (]2)
                sta   sprite_ptr0+{]2*4}
                txa
                jmp   ]3
last_bit        lda   (SPRITE_VBUFF_PTR+{]1*2}),y
                sta   sprite_ptr0+{]2*4}
                jmp   ]4
next_bit
                <<<
; Specialization for the first sprite which can just return the vbuff address
; in a register if there is only one sprite intersecting the tile
;
; dobit1 sprite_index;dest_slot;next;exit
;
;  ]1 = bit position being tested, which is also the sprite index
;  ]2 = destination slot; used only when more sprites follow
;  ]3 = label to continue at when more bits remain set in A
;  ]4 = label to jump to when this is the only sprite; the vbuff address is left in A
;       and is *not* written to sprite_ptr0
;
; On entry A holds the sprite bitmap and Y holds the tile store index. X is scratch.
dobit1          mac
                lsr                                     ; move the tested bit into the carry
                bcc   next_bit                          ; clear: this sprite is not on the tile
                beq   last_bit                          ; set, and it is the only sprite
                tax                                     ; keep the remaining bits
                lda   (SPRITE_VBUFF_PTR+{]1*2}),y       ; pointer table is indexed by sprite (]1), not by slot (]2)
                sta   sprite_ptr0+{]2*4}
                txa
                jmp   ]3
last_bit        lda   (SPRITE_VBUFF_PTR+{]1*2}),y       ; single sprite: hand the address back in A
                jmp   ]4
next_bit
                <<<
; Optimization discussion. In the Sprite2.s file, we calculate the VBUFF address for each tile overlapped
; by a sprite:
;
; 4 lda VBuffOrigin
; 3 adc ]2
; 7 sta [tmp0],y
;
; and then in this macro it is loaded again and copied to the direct page. If a sprite is never drawn, this is
; wasted work (which is not too often since >4 sprites would need to be overlapping), but still.
;
; 6 ldy: {]1*TILE_STORE_SIZE},x
; 4 sty sprite_ptr0+{]2*4}
;
; Since we know *exactly* which sprite is being accessed, the _Sprites+TS_VBUFF_BASE,y value can be loaded without
; an index
;
; 5 lda _Sprites+TS_VBUFF_BASE+{]1*2}
; 6 adc {]1*TILE_STORE_SIZE},x
; 4 sta sprite_ptr0+{]2*4}
; 2 tya
;
; = a savings of at least (24 - 17) = 7 cycles per tile and more if the sprite is skipped.
;
; The problem is that this still required storing a value for the sprite in the tile store. What is ideal is
; if there is a way to know implicitly which relative tile offset we are on for a given sprite and use
; that to calculate the offset...
;
; What do we know
; X = current tile
; Sprite+TS_LOOKUP_INDEX
;
; txa
; sbc _Sprites+TS_LOOKUP_INDEX+{]1*2}
; tay
; lda _Sprites+TS_VBUFF_BASE+{]1*2}
; adc DisplacementTable,y
; sta sprite_ptr0+{]2*4}
;
; Have the sprite select a table base which holds the offset values, pre-adjusted for the TS_LOOKUP_INDEX. The table
; values are fixed. Yes!! This is the solution!! It will only need 288 bytes of total space
;
; Best implementation will pass the Tile Store index in Y instead of X
;
; 5 lda _Sprites+VBUFF_TABLE+{]1*2}
; 6 sta self_mod
; 6 lda $0000,x
; 4 sta sprite_ptr0+{]2*4}
; 2 tya
;
; or
;
; 5 lda _Sprites+VBUFF_TABLE+{]1*2}
; 4 sta tmp0
; 7 lda (tmp0),y
; 4 sta sprite_ptr0+{]2*4}
; 2 txa
;
; Even better, if the VBUFF_TABLE (only 32 bytes) was already stored in the second direct page
;
; 7 lda (VBUFF_TABLE+{]1*2}),y
; 5 adc _Sprites+VBUFF_TABLE+{]1*2}
; 4 sta sprite_ptr0+{]2*4}
; 2 txa
;
; Final saving compared to current implementation is (24 - 18) = 6 cycles per tile and we eliminate
; the need to pre-calculate
;
; Test for the final sprite slot (the 4th in this case). If the bit is set, store the
; vbuff address and exit immediately without looking at any remaining bits.
;
; stpbit sprite_index;dest_slot;exit
;
;  ]1 = bit position being tested, which is also the sprite index
;  ]2 = destination slot; the vbuff address is stored at sprite_ptr0+{]2*4}
;  ]3 = label to jump to once the address has been stored
;
; When the tested bit is clear, execution falls through to whatever follows the macro.
stpbit          mac
                lsr                                     ; move the tested bit into the carry
                bcc   next_bit
                lda   (SPRITE_VBUFF_PTR+{]1*2}),y       ; pointer table is indexed by sprite (]1), not by slot (]2)
                sta   sprite_ptr0+{]2*4}
                jmp   ]3
next_bit
                <<<
; Last bit test which *must* be set, so no shift or branch is needed
;
; endbit sprite_index;dest_slot;exit
;
;  ]1 = bit position, which is also the sprite index
;  ]2 = destination slot; the vbuff address is stored at sprite_ptr0+{]2*4}
;  ]3 = label to jump to once the address has been stored (it is also still in A)
endbit          mac
                lda   (SPRITE_VBUFF_PTR+{]1*2}),y       ; pointer table is indexed by sprite (]1), not by slot (]2)
                sta   sprite_ptr0+{]2*4}
                jmp   ]3
                <<<
; OPTIMIZATION:
;
; bit #$00FF ; Optimization to skip the first 8 bits if they are all zeros
; bne norm_entry
; xba
; jmp skip_entry
;
; Placed at the entry point
; This is a complex, but fast macro that is expanded inside the core tile rendering code. It
; takes a bitmap of sprites in the Accumulator and then extracts the VBuff addresses for the
; target TileStore entry and places them in specific direct page locations (sprite_ptr0 plus
; 4 bytes per slot).
;
; The body is a tree of bit tests. Each dobit1/dobit/stpbit/endbit line is invoked with
; (bit position; destination slot; continue label; exit label). The b_<bit>_<count> labels
; are the entry points used after <count> sprites have already been found and the next bit
; to examine is <bit>. At most four sprites are collected; any further set bits are ignored.
;
; Inputs:
; A = sprite bitmap (assumed to be non-zero)
; Y = tile store index
; D = second work page
; B = vbuff array bank
; Output:
; A = vbuff address of the sprite when only one sprite intersects the tile (see dobit1)
; X = used as scratch by the helper macros whenever more than one sprite is found
;
; ]1 address of single sprite process
; ]2 address of two sprite process
; ]3 address of three sprite process
; ]4 address of four sprite process
SpriteBitsToVBuffAddrs mac
dobit1 0;0;b_1_1;]1
dobit1 1;0;b_2_1;]1
dobit1 2;0;b_3_1;]1
dobit1 3;0;b_4_1;]1
dobit1 4;0;b_5_1;]1
dobit1 5;0;b_6_1;]1
dobit1 6;0;b_7_1;]1
dobit1 7;0;b_8_1;]1
dobit1 8;0;b_9_1;]1
dobit1 9;0;b_10_1;]1
dobit1 10;0;b_11_1;]1
dobit1 11;0;b_12_1;]1
dobit1 12;0;b_13_1;]1
dobit1 13;0;b_14_1;]1
dobit1 14;0;b_15_1;]1
endbit 15;0;]1
b_1_1 dobit 1;1;b_2_2;]2
b_2_1 dobit 2;1;b_3_2;]2
b_3_1 dobit 3;1;b_4_2;]2
b_4_1 dobit 4;1;b_5_2;]2
b_5_1 dobit 5;1;b_6_2;]2
b_6_1 dobit 6;1;b_7_2;]2
b_7_1 dobit 7;1;b_8_2;]2
b_8_1 dobit 8;1;b_9_2;]2
b_9_1 dobit 9;1;b_10_2;]2
b_10_1 dobit 10;1;b_11_2;]2
b_11_1 dobit 11;1;b_12_2;]2
b_12_1 dobit 12;1;b_13_2;]2
b_13_1 dobit 13;1;b_14_2;]2
b_14_1 dobit 14;1;b_15_2;]2
b_15_1 endbit 15;1;]2
b_2_2 dobit 2;2;b_3_3;]3
b_3_2 dobit 3;2;b_4_3;]3
b_4_2 dobit 4;2;b_5_3;]3
b_5_2 dobit 5;2;b_6_3;]3
b_6_2 dobit 6;2;b_7_3;]3
b_7_2 dobit 7;2;b_8_3;]3
b_8_2 dobit 8;2;b_9_3;]3
b_9_2 dobit 9;2;b_10_3;]3
b_10_2 dobit 10;2;b_11_3;]3
b_11_2 dobit 11;2;b_12_3;]3
b_12_2 dobit 12;2;b_13_3;]3
b_13_2 dobit 13;2;b_14_3;]3
b_14_2 dobit 14;2;b_15_3;]3
b_15_2 endbit 15;2;]3
b_3_3 stpbit 3;3;]4
b_4_3 stpbit 4;3;]4
b_5_3 stpbit 5;3;]4
b_6_3 stpbit 6;3;]4
b_7_3 stpbit 7;3;]4
b_8_3 stpbit 8;3;]4
b_9_3 stpbit 9;3;]4
b_10_3 stpbit 10;3;]4
b_11_3 stpbit 11;3;]4
b_12_3 stpbit 12;3;]4
b_13_3 stpbit 13;3;]4
b_14_3 stpbit 14;3;]4
b_15_3 endbit 15;3;]4
<<<

View File

@ -52,8 +52,23 @@ _CallTable
adrl _TSRender-1
adrl _TSLoadTileSet-1
adrl _TSCreateSpriteStamp-1
adrl _TSAddSprite-1
adrl _TSMoveSprite-1
adrl _TSUpdateSprite-1
adrl _TSRemoveSprite-1
_CTEnd
; Caller-side macros for the four sprite tool functions. Each expands to a UserTool call
; whose argument is a per-function constant ($1000, $1100, $1200, $1300) plus GTEToolNum.
; NOTE(review): the high byte ($10-$13) is presumably the function's position in
; _CallTable -- confirm against the full table when adding or reordering entries.
_GTEAddSprite MAC
UserTool $1000+GTEToolNum
<<<
_GTEMoveSprite MAC
UserTool $1100+GTEToolNum
<<<
_GTEUpdateSprite MAC
UserTool $1200+GTEToolNum
<<<
_GTERemoveSprite MAC
UserTool $1300+GTEToolNum
<<<
; Helper function to set the data back to the toolset default
_SetDataBank sep #$20
lda #^TileStore
@ -285,6 +300,21 @@ _TSAddSprite
_TSExit #0;#8
; _TSMoveSprite
;
; Tool entry point for moving a sprite. Pulls three word-sized parameters off of the
; stack frame and hands them to _MoveSprite in registers:
;
;   A = sprite slot
;   X = sprite x position
;   Y = sprite y position
;
; Leaves through _TSExit with #0 and #6 (presumably the result code and the number of
; parameter bytes to drop -- the macro is defined outside of this view).
_TSMoveSprite
:spriteY        equ    FirstParam+0
:spriteX        equ    FirstParam+2
:spriteSlot     equ    FirstParam+4

                _TSEntry

                lda    :spriteY,s                   ; Y position -> Y register
                tay
                lda    :spriteX,s                   ; X position -> X register
                tax
                lda    :spriteSlot,s                ; slot stays in the accumulator
                jsr    _MoveSprite

                _TSExit #0;#6
_TSUpdateSprite
:vbuff equ FirstParam+0
:spriteFlags equ FirstParam+2
@ -300,6 +330,15 @@ _TSUpdateSprite
_TSExit #0;#6
; _TSRemoveSprite
;
; Tool entry point for removing a sprite. Reads the single word parameter (the sprite
; slot) from the stack frame into A and passes it to the sprite subsystem.
;
; This entry point previously called _UpdateSprite, which is the routine behind
; _TSUpdateSprite, so a remove request was never forwarded to the remove code.
; NOTE(review): _RemoveSprite is expected to live alongside _AddSprite / _MoveSprite in
; the sprite module -- confirm the label name before merging.
_TSRemoveSprite
:spriteSlot     equ    FirstParam+0

                _TSEntry

                lda    :spriteSlot,s                ; A = sprite slot to remove
                jsr    _RemoveSprite

                _TSExit #0;#2
; Insert the GTE code
put Math.s

View File

@ -73,7 +73,7 @@ Counter equ tmp3
; Patch 8-bit or 16-bit values into the bank. These are a set of unrolled loops to
; quickly patch in a constanct value, or a value from an array into a given set of
; quickly patch in a constant value, or a value from an array into a given set of
; templates.
;
; Because we have structured everything as parallel code blocks, most updates to the blitter

View File

@ -27,6 +27,18 @@ _TBCopyTileDataAndMaskToCBuffV
jsr _TBCopyTileDataToCBuffV
jmp _TBCopyTileMaskToCBuffV
; _CopyTileDataToDP2
;
; Copies the data of one tile into the tmp_tile_data buffer. The copy is unrolled by the
; assembler: 8 lines, each copied as two 16-bit words (4 bytes per line, 32 bytes total).
;
; Inputs:
;   X = offset of the tile's data relative to tiledata (read with long addressing)
;
; Only the accumulator is modified; X and Y are left untouched.
; NOTE(review): tmp_tile_data is presumably a direct page buffer on the second work
; page (hence "DP2") -- confirm against its definition.
_CopyTileDataToDP2
]line equ 0
lup 8
ldal tiledata+{]line*4},x
sta tmp_tile_data+{]line*4}
ldal tiledata+{]line*4}+2,x
sta tmp_tile_data+{]line*4}+2
]line equ ]line+1
--^
rts
_TBCopyTileDataToCBuff
]line equ 0
lup 8

View File

@ -84,18 +84,6 @@ CopyNoSprites
:tiledisp jmp $0000 ; render the tile
; Let's make a macro helper for the bit test tree
; dobit src_offset,dest,next_target,end_target
dobit MAC
beq last_bit
ldx: ]1,y
stx ]2
jmp ]3
last_bit ldx: ]1,y
stx ]2
jmp ]4
EOM
; The sprite code is just responsible for quickly copying all of the sprite data
; into the direct page temp area.
@ -117,83 +105,9 @@ dirty_sprite_dispatch
da CopyThreeSprites
da CopyFourSprites ; MAX, don't bother with more than 4 sprites per tile
; This is very similar to the code in the dirty tile renderer, but we can't reuse
; because that code draws directly to the graphics screen, and this code draws
; to a temporary buffer that has a different stride.
; ldy TileStore+TS_VBUFF_ARRAY_ADDR,x ; base address of the VBUFF sprite address array for this tile
;
; lsr
; bcc :loop_0_bit_1
; dobit $0000;sprite_ptr0;:loop_1_bit_1;CopyOneSprite
;:loop_0_bit_1 lsr
; bcc :loop_0_bit_2
; dobit $0002;sprite_ptr0;:loop_1_bit_2;CopyOneSprite
;:loop_0_bit_2 lsr
; bcc :loop_0_bit_3
; dobit $0004;sprite_ptr0;:loop_1_bit_3;CopyOneSprite
;:loop_0_bit_3 lsr
; bcc :loop_0_bit_4
; dobit $0006;sprite_ptr0;:loop_1_bit_4;CopyOneSprite
;:loop_0_bit_4 lsr
; bcc :loop_0_bit_5
; dobit $0008;sprite_ptr0;:loop_1_bit_5;CopyOneSprite
;:loop_0_bit_5 lsr
; bcc :loop_0_bit_6
; dobit $000A;sprite_ptr0;:loop_1_bit_6;CopyOneSprite
;:loop_0_bit_6 lsr
; bcc :loop_0_bit_7
; dobit $000C;sprite_ptr0;:loop_1_bit_7;CopyOneSprite
;:loop_0_bit_7 lsr
; bcc :loop_0_bit_8
; dobit $000E;sprite_ptr0;:loop_1_bit_8;CopyOneSprite
;:loop_0_bit_8 lsr
; bcc :loop_0_bit_9
; dobit $0010;sprite_ptr0;:loop_1_bit_9;CopyOneSprite
;:loop_0_bit_9 lsr
; bcc :loop_0_bit_10
; ldx: $0012,y
; stx spriteIdx
; cmp #0
; jne :loop_1_bit_10
; jmp CopyOneSprite
;:loop_0_bit_10 lsr
; bcc :loop_0_bit_11
; dobit $0014;sprite_ptr0;:loop_1_bit_11;CopyOneSprite
;:loop_0_bit_11 lsr
; bcc :loop_0_bit_12
; dobit $0016;sprite_ptr0;:loop_1_bit_12;CopyOneSprite
;:loop_0_bit_12 lsr
; bcc :loop_0_bit_13
; dobit $0018;sprite_ptr0;:loop_1_bit_13;CopyOneSprite
;:loop_0_bit_13 lsr
; bcc :loop_0_bit_14
; dobit $001A;sprite_ptr0;:loop_1_bit_14;CopyOneSprite
;:loop_0_bit_14 lsr
; bcc :loop_0_bit_15
; dobit $001C;sprite_ptr0;:loop_1_bit_15;CopyOneSprite
;:loop_0_bit_15 ldx: $001E,y
; stx spriteIdx
; jmp CopyOneSprite
; We can optimize later, for now just copy the sprite data and mask into its own
; direct page buffer and combine with the tile data later
;
; We set up direct page pointers to the mask bank and use the bank register for the
; data.
CopyFourSprites

View File

@ -30,7 +30,6 @@ _TBFastSpriteTile_VH
; Need to update the X-register before calling this
_TBApplySpriteData
ldx _SPR_X_REG ; set to the unaligned tile block address in the sprite plane
]line equ 0
lup 8
lda blttmp+{]line*4}

View File

@ -2,18 +2,15 @@
put ../Defs.s
put TileStoreDefs.s
put ../blitter/Template.s
;-------------------------------------------------------------------------------------
;
; Buffer space
ds 256
put ../blitter/Template.s
;-------------------------------------------------------------------------------------
TileStore ENT
ds {TILE_STORE_SIZE*17}
ds {TILE_STORE_SIZE*TILE_STORE_NUM}
;-------------------------------------------------------------------------------------
;
@ -23,7 +20,7 @@ TileStore ENT
DirtyTileCount ENT
ds 2
DirtyTiles ENT
ds TILE_STORE_SIZE ; At most this many tiles can possibly be update at once
ds TILE_STORE_SIZE ; At most this many tiles can possibly be updated at once
;-------------------------------------------------------------------------------------
;
@ -373,9 +370,15 @@ ScreenModeWidth ENT
ScreenModeHeight ENT
dw 200,192,200,176,160,160,160,128,144,192,102,1
; List of addresses of the VBuff arrays for each Tile Store entry, indexed by sprite index
VBuffArrayAddr ENT
ds MAX_SPRITES*2
; VBuff arrays for each sprite. We need at least a 3x3 block for each sprite and the shape of the
; array must match the TileStore structure. The TileStore is 41 blocks wide. To keep things simple
; we allocate 8 sprites in the first row and 8 more sprites in the 4th row. So we need to allocate a
; total of 6 rows of TileStore space
;
; It is *critical* that this array be placed in a memory location that is greater than the largest
; TileStore offset.
VBuffArray ENT
ds 6*{TILE_STORE_WIDTH*2}
; Convert sprite index to a bit position
_SpriteBits ENT

View File

@ -14,24 +14,17 @@ TS_CODE_ADDR_HIGH equ TILE_STORE_SIZE*5
TS_WORD_OFFSET equ TILE_STORE_SIZE*6 ; const value, word offset value for this tile if LDA (dp),y instructions re used
TS_BASE_ADDR equ TILE_STORE_SIZE*7 ; const value, because there are two rows of tiles per bank, this is set to $0000 ot $8000.
TS_SCREEN_ADDR equ TILE_STORE_SIZE*8 ; cached value of on-screen location of tile. Used for DirtyRender.
;TS_VBUFF_ARRAY_ADDR equ TILE_STORE_SIZE*9 ; const value to an aligned 32-byte array starting at $8000 in TileStore bank
TS_BASE_TILE_COPY equ TILE_STORE_SIZE*9 ; derived from TS_TILE_ID to optimize tile copy to support sprite rendering
TS_BASE_TILE_DISP equ TILE_STORE_SIZE*10 ; derived from TS_TILE_ID to optimize base (non-sprite) tile dispatch in the Render function
TS_DIRTY_TILE_DISP equ TILE_STORE_SIZE*11 ; derived from TS_TILE_ID to optimize dirty tile dispatch in the Render function
; Hold values for up to 4 sprites per tile
TS_VBUFF_ADDR_0 equ TILE_STORE_SIZE*12
TS_VBUFF_ADDR_1 equ TILE_STORE_SIZE*13
TS_VBUFF_ADDR_2 equ TILE_STORE_SIZE*14
TS_VBUFF_ADDR_3 equ TILE_STORE_SIZE*15
TS_VBUFF_ADDR_COUNT equ TILE_STORE_SIZE*16 ; replace usage of TS_VBUFF_ARRAY_ADDR with this later
TILE_STORE_NUM equ 12 ; Need this many parallel arrays
; Sprite data structures. We cache quite a few pieces of information about the sprite
; to make calculations faster, so this is hidden from the caller.
MAX_SPRITES equ 16
SPRITE_REC_SIZE equ 52
SPRITE_REC_SIZE equ 42
; Mark each sprite as ADDED, UPDATED, MOVED, REMOVED depending on the actions applied to it
; on this frame. Quick note, the same Sprite ID cannot be removed and added in the same frame.
@ -68,6 +61,7 @@ SPRITE_HEIGHT equ {MAX_SPRITES*32}
SPRITE_CLIP_WIDTH equ {MAX_SPRITES*34}
SPRITE_CLIP_HEIGHT equ {MAX_SPRITES*36}
TS_VBUFF_BASE equ {MAX_SPRITES*38} ; Finalized VBUFF address based on the sprite position and tile offsets
VBUFF_ARRAY_ADDR equ {MAX_SPRITES*40} ; Fixed address where this sprite's VBUFF addresses are stores. The array is the same shape as TileStore, but much smaller
;TILE_DATA_OFFSET equ {MAX_SPRITES*2}
;TILE_STORE_ADDR_1 equ {MAX_SPRITES*12}
;TILE_STORE_ADDR_2 equ {MAX_SPRITES*14}

View File

@ -6,8 +6,8 @@
; If there are sprites, then the sprite data is flattened and stored into a direct page buffer
; and then copied into the code field
_RenderTileFast
; lda TileStore+TS_VBUFF_ADDR_COUNT,x ; How many sprites are on this tile?
; bne SpriteDispatch ; This is faster if there are no sprites
lda TileStore+TS_SPRITE_FLAG,x ; any sprites on this line?
bne SpriteDispatch
NoSpriteFast
lda TileStore+TS_CODE_ADDR_HIGH,x ; load the bank of the target code field line
@ -15,7 +15,7 @@ NoSpriteFast
ldy TileStore+TS_CODE_ADDR_LOW,x ; load the address of the code field
lda TileStore+TS_BASE_TILE_DISP,x ; go to the tile copy routine (just basics)
stal nsf_patch+1
lda TileStore+TS_TILE_ADDR,x ; load the address of this tile's data (pre-calculated)
lda TileStore+TS_TILE_ADDR,x ; load the address of this tile's data (pre-calculated)
plb ; set the code field bank
nsf_patch jmp $0000
@ -24,35 +24,60 @@ nsf_patch jmp $0000
FastTileProcs dw _TBCopyDataFast,_TBCopyDataFast,_TBCopyDataFast,_TBCopyDataFast
; dw _TBCopyDataFast,_TBCopyDataFast,_TBCopyDataVFast,_TBCopyDataVFast
SpriteDispatch
tax
jmp (:,x) ; Dispatch to the other routines
: da NoSpriteFast ; Placeholder
da OneSpriteFast
da TwoSpritesFast
da ThreeSpritesFast
da FourSpritesFast
; NOTE: Inlining the dispatch would eliminate a JSR,RTS,LDX, and JMP (abs,x) because the exit code
; could jump directly to the target address. Net savings of 20 cycles per tile. For a 16x16
; sprite with a 3x3 block coverage this is 180 cycles per frame per block... This would also
; preserve a register
;
; For comparison, a fast one sprite copy takes 22 cycles per word, so this would save
; about 1/2 block of render time per tile.
;
; Need to determine if the sprite or tile data is on top, as that will decide whether the
; sprite or tile data is copied into the temporary buffer first. Also, if TWO_LAYER is set
; then the mask information must be copied as well....This is the last decision point.
; Pointers to sprite data and masks
spritedata_0 equ tmp0
spritedata_1 equ tmp2
spritedata_2 equ tmp4
spritedata_3 equ tmp6
spritemask_0 equ tmp8
spritemask_1 equ tmp10
spritemask_2 equ tmp12
spritemask_3 equ tmp14
SpriteDispatch
txy
SpriteBitsToVBuffAddrs OneSpriteFast;OneSpriteFast;OneSpriteFast;OneSpriteFast
sta sprite_ptr0
ldx TileStore+TS_TILE_ADDR,y
jsr _CopyTileDataToDP2 ; preserves Y
lda TileStore+TS_CODE_ADDR_HIGH,y ; load the bank of the target code field line
pha ; and put on the stack for later. Has TileStore bank in high byte.
ldx sprite_ptr0 ; address of sprite vbuff info
lda TileStore+TS_CODE_ADDR_LOW,y ; load the address of the code field
tay
; jmp _TBApplySpriteData2
_TBApplySpriteData2
]line equ 0
lup 8
lda blttmp+{]line*4}
andl spritemask+{]line*SPRITE_PLANE_SPAN},x
oral spritedata+{]line*SPRITE_PLANE_SPAN},x
sta: $0004+{]line*$1000},y
lda blttmp+{]line*4}+2
andl spritemask+{]line*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{]line*SPRITE_PLANE_SPAN}+2,x
sta: $0001+{]line*$1000},y
]line equ ]line+1
--^
rts
; Where there are sprites involved, the first step is to call a routine to copy the
; tile data into a temporary buffer. Then the sprite data is merged and placed into
; the code field.
;
; A = vbuff address
; Y = tile store address
OneSpriteFast
tyx
lda TileStore+TS_TILE_ADDR,y
per :-1
jmp (TileStore+TS_BASE_TILE_COPY,x) ; Copy the tile data to the temporary buffer
:
ldx TileStore+TS_VBUFF_ADDR_0,y ; address of the sprite data
tax ; address of the sprite data
lda TileStore+TS_BASE_TILE_COPY,y ; copy routine (handles flips and other behaviors)
stal osf_copy+1
osf_copy jsr $0000
; ldx TileStore+TS_VBUFF_ADDR_0,y ; address of the sprite data
lda TileStore+TS_CODE_ADDR_HIGH,y ; load the bank of the target code field line
pha ; and put on the stack for later.
lda TileStore+TS_CODE_ADDR_LOW,y
@ -71,17 +96,17 @@ OneSpriteFast
rts
TwoSpritesFast
tyx
lda TileStore+TS_TILE_ADDR,y
per :-1
jmp (TileStore+TS_BASE_TILE_COPY,x) ; Copy the tile data to the temporary buffer
:
lda TileStore+TS_VBUFF_ADDR_0,y ; address of the sprite data
sta spritedata_0
sta spritemask_0
lda TileStore+TS_VBUFF_ADDR_1,y ; address of the sprite data
sta spritedata_1
sta spritemask_1
; tyx
; lda TileStore+TS_TILE_ADDR,y
; per :-1
; jmp (TileStore+TS_BASE_TILE_COPY,x) ; Copy the tile data to the temporary buffer
;:
; lda TileStore+TS_VBUFF_ADDR_0,y ; address of the sprite data
; sta spritedata_0
; sta spritemask_0
; lda TileStore+TS_VBUFF_ADDR_1,y ; address of the sprite data
; sta spritedata_1
; sta spritemask_1
lda TileStore+TS_CODE_ADDR_HIGH,y ; load the bank of the target code field line
pha ; and put on the stack for later.
@ -89,33 +114,33 @@ TwoSpritesFast
tay
plb ; set the code field bank
TwoSpritesToCodeField 0
TwoSpritesToCodeField 1
TwoSpritesToCodeField 2
TwoSpritesToCodeField 3
TwoSpritesToCodeField 4
TwoSpritesToCodeField 5
TwoSpritesToCodeField 6
TwoSpritesToCodeField 7
; TwoSpritesToCodeField 0
; TwoSpritesToCodeField 1
; TwoSpritesToCodeField 2
; TwoSpritesToCodeField 3
; TwoSpritesToCodeField 4
; TwoSpritesToCodeField 5
; TwoSpritesToCodeField 6
; TwoSpritesToCodeField 7
rts
ThreeSpritesFast
FourSpritesFast
tyx
lda TileStore+TS_TILE_ADDR,y
per :-1
jmp (TileStore+TS_BASE_TILE_COPY,x) ; Copy the tile data to the temporary buffer
:
lda TileStore+TS_VBUFF_ADDR_0,y ; address of the sprite data
sta spritedata_0
sta spritemask_0
lda TileStore+TS_VBUFF_ADDR_1,y
sta spritedata_1
sta spritemask_1
lda TileStore+TS_VBUFF_ADDR_2,y
sta spritedata_2
sta spritemask_2
; tyx
; lda TileStore+TS_TILE_ADDR,y
; per :-1
; jmp (TileStore+TS_BASE_TILE_COPY,x) ; Copy the tile data to the temporary buffer
;:
; lda TileStore+TS_VBUFF_ADDR_0,y ; address of the sprite data
; sta spritedata_0
; sta spritemask_0
; lda TileStore+TS_VBUFF_ADDR_1,y
; sta spritedata_1
; sta spritemask_1
; lda TileStore+TS_VBUFF_ADDR_2,y
; sta spritedata_2
; sta spritemask_2
lda TileStore+TS_CODE_ADDR_HIGH,y ; load the bank of the target code field line
pha ; and put on the stack for later.
@ -123,13 +148,13 @@ FourSpritesFast
tay
plb ; set the code field bank
ThreeSpritesToCodeField 0
ThreeSpritesToCodeField 1
ThreeSpritesToCodeField 2
ThreeSpritesToCodeField 3
ThreeSpritesToCodeField 4
ThreeSpritesToCodeField 5
ThreeSpritesToCodeField 6
ThreeSpritesToCodeField 7
; ThreeSpritesToCodeField 0
; ThreeSpritesToCodeField 1
; ThreeSpritesToCodeField 2
; ThreeSpritesToCodeField 3
; ThreeSpritesToCodeField 4
; ThreeSpritesToCodeField 5
; ThreeSpritesToCodeField 6
; ThreeSpritesToCodeField 7
rts