More sprite streamlining

* Added a TS_LAST_VBUFF cached value in the tile store
* Added a fast path for single sprite w/fast test
* Improved raster timing granularity for visual profiling

All of the code paths are generally good.  Both the _RenderSprites
and _ApplyDirtyTiles functions take a fair bit of raster time. Will
continue to try and streamline the data structures and code to
reduce overhead.
This commit is contained in:
Lucas Scharenbroich 2022-02-22 02:35:21 -06:00
parent 93ed3b3f16
commit 2507724a49
5 changed files with 204 additions and 41 deletions

View File

@ -81,11 +81,13 @@ BG1TileMapPtr equ 86
SCBArrayPtr equ 90 ; Used for palette binding
SpriteBanks equ 94 ; Bank bytes for the sprite data and sprite mask
LastRender equ 96 ; Record which render function was last executed
; DamagedSprites equ 98
; gap
SpriteMap equ 100 ; Bitmap of open sprite slots.
ActiveSpriteCount equ 102
BankLoad equ 104
Next equ 106
TileStoreBankAndBank01 equ 106 ; Packed bank word for PEI/PLB: TileStore bank in LSB, $01 in MSB (filled in by _CacheSpriteBanks)
TileStoreBankAndTileDataBank equ 108 ; Packed bank word for PEI/PLB: TileStore bank in LSB, tiledata bank in MSB (filled in by _CacheSpriteBanks)
Next equ 110
activeSpriteList equ 128 ; 32 bytes for the active sprite list (can persist across frames)
AppSpace equ 160 ; 16 bytes of space reserved for application use
@ -155,7 +157,7 @@ MAX_TILES equ {26*41} ; Number of tiles in the code fiel
TILE_STORE_SIZE equ {MAX_TILES*2} ; The tile store contains a tile descriptor in each slot
TS_TILE_ID equ TILE_STORE_SIZE*0 ; tile descriptor for this location
TS_DIRTY equ TILE_STORE_SIZE*1 ; Flag. Used to prevent a tile from being queues multiple times per frame
TS_DIRTY equ TILE_STORE_SIZE*1 ; Flag. Used to prevent a tile from being queued multiple times per frame
TS_SPRITE_FLAG equ TILE_STORE_SIZE*2 ; Bitfield of all sprites that intersect this tile. 0 if no sprites.
TS_TILE_ADDR equ TILE_STORE_SIZE*3 ; cached value, the address of the tiledata for this tile
TS_CODE_ADDR_LOW equ TILE_STORE_SIZE*4 ; const value, address of this tile in the code fields
@ -164,3 +166,5 @@ TS_WORD_OFFSET equ TILE_STORE_SIZE*6 ; const value, word offset val
TS_BASE_ADDR equ TILE_STORE_SIZE*7 ; const value, because there are two rows of tiles per bank, this is set to $0000 or $8000.
TS_SCREEN_ADDR equ TILE_STORE_SIZE*8 ; cached value of on-screen location of tile. Used for DirtyRender.
TS_VBUFF_ARRAY_ADDR equ TILE_STORE_SIZE*9 ; const value to an aligned 32-byte array starting at $8000 in TileStore bank
TS_TILE_DISP equ TILE_STORE_SIZE*10 ; derived from TS_TILE_ID to optimize tile dispatch in the Render function
TS_LAST_VBUFF equ TILE_STORE_SIZE*11 ; a cached copy of one of the valid VBUFF values in the array

View File

@ -162,9 +162,16 @@ RenderDirty ENT
_RenderDirty
lda LastRender ; If the full renderer was last called, we assume that
bne :norecalc ; the scroll positions have likely changed, so recalculate
lda #2 ; blue
jsr _SetBorderColor
jsr _RecalcTileScreenAddrs ; them to make sure sprites draw at the correct screen address
:norecalc
lda #3 ; purple
jsr _SetBorderColor
jsr _RenderSprites
lda #4 ; dk. green
jsr _SetBorderColor
jsr _ApplyDirtyTiles
lda #1
sta LastRender
@ -192,50 +199,32 @@ _ApplyDirtyTiles
; Only render solid tiles and sprites
_RenderDirtyTile
pea >TileStore ; Need that addressing flexibility here. Callers responsible for restoring bank reg
plb
ldal TileStore+TS_SPRITE_FLAG,x ; This is a bitfield of all the sprites that intersect this tile, only care if non-zero or not
bne dirty_sprite
; The rest of this function handles that non-sprite blit, which is super fast since it blits directly from the
; tile data store to the graphics screen with no masking. The only extra work is selecting a blit function
; based on the tile flip flags.
pei TileStoreBankAndBank01 ; Special value that has the TileStore bank in LSB and $01 in MSB
plb
txy
lda TileStore+TS_SPRITE_FLAG,y ; This is a bitfield of all the sprites that intersect this tile, only care if non-zero or not
beq :nosprite
jsr BuildActiveSpriteArray ; Build the sprite index list from the bit field
sta ActiveSpriteCount
lda TileStore+TS_VBUFF_ARRAY_ADDR,y ; Scratch space
sta _SPR_X_REG
phy
ldy spriteIdx
lda (_SPR_X_REG),y
sta _SPR_X_REG
ply
lda TileStore+TS_TILE_ID,y ; build the finalized tile descriptor
and #TILE_VFLIP_BIT+TILE_HFLIP_BIT ; get the lookup value
xba
tax
ldal DirtyTileSpriteProcs,x
stal :tiledisp+1
bra :sprite
:nosprite
lda TileStore+TS_TILE_ID,y ; build the finalized tile descriptor
and #TILE_VFLIP_BIT+TILE_HFLIP_BIT ; get the lookup value
xba
tax
ldx TileStore+TS_TILE_DISP,y ; get the finalized tile descriptor
ldal DirtyTileProcs,x ; load and patch in the appropriate subroutine
stal :tiledisp+1
:sprite
ldx TileStore+TS_TILE_ADDR,y ; load the address of this tile's data (pre-calculated)
lda TileStore+TS_SCREEN_ADDR,y ; Get the on-screen address of this tile
pha
tay
; pha
; lda TileStore+TS_WORD_OFFSET,y ; We don't support this in dirty rendering mode
; ply
; pea $0101
; plb
lda TileStore+TS_WORD_OFFSET,y
ply
pea $0101
plb
plb ; set the bank
; B is set to Bank 01
@ -245,6 +234,39 @@ _RenderDirtyTile
:tiledisp jmp $0000 ; render the tile
; Handler for the sprite path
;
; Entry: A = TS_SPRITE_FLAG value for this tile (non-zero), X = tile store offset
; Exit:  Data bank register is left pointing at the tile data bank on every path
;        (callers are responsible for restoring the bank register).
;
; Stack discipline: the PEI below pushes TWO bytes. The first PLB consumes the low byte
; (TileStore bank); the high byte (tile data bank) stays on the stack and must be pulled
; by exactly one more PLB on every exit path before control returns to the caller.
dirty_sprite
pei TileStoreBankAndTileDataBank ; Special value that has the TileStore bank in LSB and TileData bank in MSB
plb ; B = TileStore bank; A (the sprite bitfield) is not changed by PLB
; Now do all of the deferred work of actually drawing the sprites. We put considerable effort into
; figuring out if there is only one sprite or more than one since we optimize the former case as it
; is very common and can be done significantly faster.
;
; We use the logic operation of A & (A - 1) which removes the least-significant set bit position. If
; this results in a zero value, then we know that there is only a single sprite and can move to an
; optimized routine.
dec
and TileStore+TS_SPRITE_FLAG,x
bne multi_sprite
; Now we are on the fast path. There is only one sprite at this tile, so load the cached VBUFF address
; from the TileStore data structure. This is only guaranteed to be a VBUFF address from one of the
; sprites at the tile, but if there is only one, it's the value we want.
lda TileStore+TS_LAST_VBUFF,x
pha ; park the VBUFF address; X is still needed as the tile store index
ldy TileStore+TS_TILE_ADDR,x ; load the address of this tile's data
lda TileStore+TS_SCREEN_ADDR,x ; Get the on-screen address of this tile
plx ; Set the vbuff address
plb ; pull the second PEI byte: B = tile data bank, stack is balanced again
jmp FastBlit ; tail call; FastBlit's RTS returns to our caller
; This is the code path to handle tile with multiple, overlapping sprites.
; Not implemented yet, so nothing is drawn -- but the second byte pushed by the PEI above
; must still be pulled here, otherwise RTS would pull a corrupt return address.
multi_sprite
plb ; discard/consume the tile data bank byte to rebalance the stack (same exit state as the fast path)
rts ; TBD
DirtyTileProcs dw _TBDirtyTile_00,_TBDirtyTile_0H,_TBDirtyTile_V0,_TBDirtyTile_VH
DirtyTileSpriteProcs dw _TBDirtySpriteTile_00,_TBDirtySpriteTile_0H,_TBDirtySpriteTile_V0,_TBDirtySpriteTile_VH
@ -384,6 +406,118 @@ BuildActiveSpriteArray
:out_3 lda #6
rts
; Fast blit that directly combines a tile with a single sprite and renders directly to the screen
;
; A = screen address
; X = sprite VBUFF address
; Y = tile data address
;
; For each of the 8 tile rows, two words are produced as
;
;     (tiledata AND spritemask) OR spritedata
;
; and stored through the direct page register, which is temporarily pointed at the
; screen address passed in A.
;
; Notes:
;  * tiledata is read with absolute,y addressing, so the data bank register must already
;    select the tile data bank on entry. spritemask / spritedata use long addressing.
;  * The caller's direct page register is saved with PHD and restored with PLD.
;  * Interrupts are masked with SEI for the duration and unconditionally re-enabled with
;    CLI on exit, regardless of the interrupt state on entry.
;  * _R0W1 / _R0W0 are macros defined outside this view; presumably they switch the
;    memory read/write banking needed to store to the screen -- TODO confirm.
;  * Carry is cleared once up front and relied upon by every later ADC #320. The
;    LDA/AND/ORA/STA/TDC/TCD instructions leave carry alone; this assumes the _R0W1
;    macro does too and that the screen address arithmetic never overflows -- confirm.
FastBlit
TILE_DATA_SPAN equ 4
phd
sei
clc
; Direct page = screen address, so "sta $00" below writes to the tile's top-left word
tcd
_R0W1
; Rows 0 and 1. Consecutive rows are stored $A0 (160) bytes apart.
lda tiledata+{0*TILE_DATA_SPAN},y
andl spritemask+{0*SPRITE_PLANE_SPAN},x
oral spritedata+{0*SPRITE_PLANE_SPAN},x
sta $00
lda tiledata+{0*TILE_DATA_SPAN}+2,y
andl spritemask+{0*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{0*SPRITE_PLANE_SPAN}+2,x
sta $02
lda tiledata+{1*TILE_DATA_SPAN},y
andl spritemask+{1*SPRITE_PLANE_SPAN},x
oral spritedata+{1*SPRITE_PLANE_SPAN},x
sta $A0
lda tiledata+{1*TILE_DATA_SPAN}+2,y
andl spritemask+{1*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{1*SPRITE_PLANE_SPAN}+2,x
sta $A2
; Advance the direct page by two rows (2 * 160 = 320 bytes)
tdc
adc #320
tcd
; Rows 2 and 3
lda tiledata+{2*TILE_DATA_SPAN},y
andl spritemask+{2*SPRITE_PLANE_SPAN},x
oral spritedata+{2*SPRITE_PLANE_SPAN},x
sta $00
lda tiledata+{2*TILE_DATA_SPAN}+2,y
andl spritemask+{2*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{2*SPRITE_PLANE_SPAN}+2,x
sta $02
lda tiledata+{3*TILE_DATA_SPAN},y
andl spritemask+{3*SPRITE_PLANE_SPAN},x
oral spritedata+{3*SPRITE_PLANE_SPAN},x
sta $A0
lda tiledata+{3*TILE_DATA_SPAN}+2,y
andl spritemask+{3*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{3*SPRITE_PLANE_SPAN}+2,x
sta $A2
; Advance the direct page by two rows
tdc
adc #320
tcd
; Rows 4 and 5
lda tiledata+{4*TILE_DATA_SPAN},y
andl spritemask+{4*SPRITE_PLANE_SPAN},x
oral spritedata+{4*SPRITE_PLANE_SPAN},x
sta $00
lda tiledata+{4*TILE_DATA_SPAN}+2,y
andl spritemask+{4*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{4*SPRITE_PLANE_SPAN}+2,x
sta $02
lda tiledata+{5*TILE_DATA_SPAN},y
andl spritemask+{5*SPRITE_PLANE_SPAN},x
oral spritedata+{5*SPRITE_PLANE_SPAN},x
sta $A0
lda tiledata+{5*TILE_DATA_SPAN}+2,y
andl spritemask+{5*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{5*SPRITE_PLANE_SPAN}+2,x
sta $A2
; Advance the direct page by two rows
tdc
adc #320
tcd
; Rows 6 and 7
lda tiledata+{6*TILE_DATA_SPAN},y
andl spritemask+{6*SPRITE_PLANE_SPAN},x
oral spritedata+{6*SPRITE_PLANE_SPAN},x
sta $00
lda tiledata+{6*TILE_DATA_SPAN}+2,y
andl spritemask+{6*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{6*SPRITE_PLANE_SPAN}+2,x
sta $02
lda tiledata+{7*TILE_DATA_SPAN},y
andl spritemask+{7*SPRITE_PLANE_SPAN},x
oral spritedata+{7*SPRITE_PLANE_SPAN},x
sta $A0
lda tiledata+{7*TILE_DATA_SPAN}+2,y
andl spritemask+{7*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{7*SPRITE_PLANE_SPAN}+2,x
sta $A2
; Undo the _R0W1 state change, re-enable interrupts and restore the caller's direct page
_R0W0
cli
pld
rts
; Run through all of the active sprites and put then on-screen. We have three different heuristics depending on
; how many active sprites there are intersecting this tile.

View File

@ -535,12 +535,22 @@ _GetTileAt
clc
rts
; Small initialization routine to cache the banks for the sprite data and mask
; Small initialization routine to cache the banks for the sprite data and mask and tile/sprite stuff
; Builds three packed 16-bit "bank pair" values and stores them in direct page locations
; so that render code can push both banks with a single PEI and then pull them one at a
; time with PLB (the low byte is pulled first).
;
;   SpriteBanks                  = spritedata bank in LSB, spritemask bank in MSB
;   TileStoreBankAndBank01       = TileStore bank in LSB,  $01 in MSB
;   TileStoreBankAndTileDataBank = TileStore bank in LSB,  tiledata bank in MSB
;
; NOTE(review): the layouts above assume the assembler's '#>' operand yields address
; bits 8-23 (bank ends up in the high byte after the AND #$FF00) and '#^' yields the
; bank byte in the low byte -- confirm against the assembler documentation.
;
; Clobbers A. No inputs.
_CacheSpriteBanks
lda #>spritemask
and #$FF00 ; keep only the bank byte, positioned in the MSB
ora #^spritedata ; merge the spritedata bank into the LSB
sta SpriteBanks
lda #$0100 ; $01 in the MSB
ora #^TileStore ; TileStore bank in the LSB
sta TileStoreBankAndBank01
lda #>tiledata
and #$FF00 ; keep only the bank byte, positioned in the MSB
ora #^TileStore ; TileStore bank in the LSB
sta TileStoreBankAndTileDataBank
rts
; This is 13 blocks wide

View File

@ -331,6 +331,7 @@ _MarkDirtySprite
lda VBuffOrigin
sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
; lda VBuffOrigin ; This is an interesting case. The mapping between the tile store
; adc #{0*4}+{0*256} ; and the sprite buffers changes as the StartX, StartY values change
@ -362,6 +363,7 @@ _MarkDirtySprite
lda VBuffOrigin
adc #{0*4}+{1*8*SPRITE_PLANE_SPAN}
sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x
@ -382,6 +384,7 @@ _MarkDirtySprite
lda VBuffOrigin
adc #{0*4}+{2*8*SPRITE_PLANE_SPAN}
sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x
@ -403,6 +406,7 @@ _MarkDirtySprite
lda VBuffOrigin
adc #{1*4}+{0*8*SPRITE_PLANE_SPAN}
sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x
@ -424,6 +428,7 @@ _MarkDirtySprite
lda VBuffOrigin
adc #{1*4}+{1*8*SPRITE_PLANE_SPAN}
sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x
@ -445,6 +450,7 @@ _MarkDirtySprite
lda VBuffOrigin
adc #{1*4}+{2*8*SPRITE_PLANE_SPAN}
sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x
@ -466,6 +472,7 @@ _MarkDirtySprite
lda VBuffOrigin
adc #{2*4}+{0*8*SPRITE_PLANE_SPAN}
sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x
@ -487,6 +494,7 @@ _MarkDirtySprite
lda VBuffOrigin
adc #{2*4}+{1*8*SPRITE_PLANE_SPAN}
sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x
@ -508,6 +516,7 @@ _MarkDirtySprite
lda VBuffOrigin
adc #{2*4}+{2*8*SPRITE_PLANE_SPAN}
sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x

View File

@ -590,6 +590,8 @@ InitTiles
lda #0
stal TileStore+TS_TILE_ID,x ; clear the tile store with the special zero tile
stal TileStore+TS_TILE_ADDR,x
stal TileStore+TS_TILE_DISP,x
stal TileStore+TS_LAST_VBUFF,x
stal TileStore+TS_SPRITE_FLAG,x ; no sprites are set at the beginning
lda #$FFFF ; none of the tiles are dirty
@ -693,11 +695,15 @@ _SetTile
beq :nochange
stal TileStore+TS_TILE_ID,x ; Value is different, store it.
jsr _GetTileAddr
stal TileStore+TS_TILE_ADDR,x ; Committed to drawing this tile, so get the address of the tile in the tiledata bank for later
; txa ; Add this tile to the list of dirty tiles to refresh
ldal TileStore+TS_TILE_ID,x
and #TILE_VFLIP_BIT+TILE_HFLIP_BIT ; get the lookup value
xba
stal TileStore+TS_TILE_DISP,x
; txa ; Add this tile to the list of dirty tiles to refresh
jmp _PushDirtyTileX ; on the next call to _ApplyTiles
:nochange rts