More sprite streamlining

* Added a TS_LAST_VBUFF cached value in the tile store
* Added a fast path for single sprite w/fast test
* Improved raster timing granularity for visual profiling

All of the code paths are generally good.  Both the _RenderSprites
and _ApplyDirtyTiles functions take a fair bit of raster time. Will
continue to try and streamline the data structures and code to
reduce overhead.
This commit is contained in:
Lucas Scharenbroich 2022-02-22 02:35:21 -06:00
parent 93ed3b3f16
commit 2507724a49
5 changed files with 204 additions and 41 deletions

View File

@ -81,11 +81,13 @@ BG1TileMapPtr equ 86
SCBArrayPtr equ 90 ; Used for palette binding SCBArrayPtr equ 90 ; Used for palette binding
SpriteBanks equ 94 ; Bank bytes for the sprite data and sprite mask SpriteBanks equ 94 ; Bank bytes for the sprite data and sprite mask
LastRender equ 96 ; Record which render function was last executed LastRender equ 96 ; Record which render function was last executed
; DamagedSprites equ 98 ; gap
SpriteMap equ 100 ; Bitmap of open sprite slots. SpriteMap equ 100 ; Bitmap of open sprite slots.
ActiveSpriteCount equ 102 ActiveSpriteCount equ 102
BankLoad equ 104 BankLoad equ 104
Next equ 106 TileStoreBankAndBank01 equ 106 ; Packed word: TileStore bank in the low byte, $01 in the high byte (built in _CacheSpriteBanks)
TileStoreBankAndTileDataBank equ 108 ; Packed word: TileStore bank in the low byte, tiledata bank in the high byte (built in _CacheSpriteBanks)
Next equ 110
activeSpriteList equ 128 ; 32 bytes for the active sprite list (can persist across frames) activeSpriteList equ 128 ; 32 bytes for the active sprite list (can persist across frames)
AppSpace equ 160 ; 16 bytes of space reserved for application use AppSpace equ 160 ; 16 bytes of space reserved for application use
@ -155,7 +157,7 @@ MAX_TILES equ {26*41} ; Number of tiles in the code fiel
TILE_STORE_SIZE equ {MAX_TILES*2} ; The tile store contains a tile descriptor in each slot TILE_STORE_SIZE equ {MAX_TILES*2} ; The tile store contains a tile descriptor in each slot
TS_TILE_ID equ TILE_STORE_SIZE*0 ; tile descriptor for this location TS_TILE_ID equ TILE_STORE_SIZE*0 ; tile descriptor for this location
TS_DIRTY equ TILE_STORE_SIZE*1 ; Flag. Used to prevent a tile from being queues multiple times per frame TS_DIRTY equ TILE_STORE_SIZE*1 ; Flag. Used to prevent a tile from being queued multiple times per frame
TS_SPRITE_FLAG equ TILE_STORE_SIZE*2 ; Bitfield of all sprites that intersect this tile. 0 if no sprites. TS_SPRITE_FLAG equ TILE_STORE_SIZE*2 ; Bitfield of all sprites that intersect this tile. 0 if no sprites.
TS_TILE_ADDR equ TILE_STORE_SIZE*3 ; cached value, the address of the tiledata for this tile TS_TILE_ADDR equ TILE_STORE_SIZE*3 ; cached value, the address of the tiledata for this tile
TS_CODE_ADDR_LOW equ TILE_STORE_SIZE*4 ; const value, address of this tile in the code fields TS_CODE_ADDR_LOW equ TILE_STORE_SIZE*4 ; const value, address of this tile in the code fields
@ -164,3 +166,5 @@ TS_WORD_OFFSET equ TILE_STORE_SIZE*6 ; const value, word offset val
TS_BASE_ADDR equ TILE_STORE_SIZE*7 ; const value, because there are two rows of tiles per bank, this is set to $0000 or $8000. TS_BASE_ADDR equ TILE_STORE_SIZE*7 ; const value, because there are two rows of tiles per bank, this is set to $0000 or $8000.
TS_SCREEN_ADDR equ TILE_STORE_SIZE*8 ; cached value of on-screen location of tile. Used for DirtyRender. TS_SCREEN_ADDR equ TILE_STORE_SIZE*8 ; cached value of on-screen location of tile. Used for DirtyRender.
TS_VBUFF_ARRAY_ADDR equ TILE_STORE_SIZE*9 ; const value to an aligned 32-byte array starting at $8000 in TileStore bank TS_VBUFF_ARRAY_ADDR equ TILE_STORE_SIZE*9 ; const value to an aligned 32-byte array starting at $8000 in TileStore bank
TS_TILE_DISP equ TILE_STORE_SIZE*10 ; derived from TS_TILE_ID to optimize tile dispatch in the Render function
TS_LAST_VBUFF equ TILE_STORE_SIZE*11 ; a cached copy of one of the valid VBUFF values in the array

View File

@ -162,9 +162,16 @@ RenderDirty ENT
_RenderDirty _RenderDirty
lda LastRender ; If the full renderer was last called, we assume that lda LastRender ; If the full renderer was last called, we assume that
bne :norecalc ; the scroll positions have likely changed, so recalculate bne :norecalc ; the scroll positions have likely changed, so recalculate
lda #2 ; blue
jsr _SetBorderColor
jsr _RecalcTileScreenAddrs ; them to make sure sprites draw at the correct screen address jsr _RecalcTileScreenAddrs ; them to make sure sprites draw at the correct screen address
:norecalc :norecalc
lda #3 ; purple
jsr _SetBorderColor
jsr _RenderSprites jsr _RenderSprites
lda #4 ; dk. green
jsr _SetBorderColor
jsr _ApplyDirtyTiles jsr _ApplyDirtyTiles
lda #1 lda #1
sta LastRender sta LastRender
@ -192,50 +199,32 @@ _ApplyDirtyTiles
; Only render solid tiles and sprites ; Only render solid tiles and sprites
_RenderDirtyTile _RenderDirtyTile
pea >TileStore ; Need that addressing flexibility here. Callers responsible for restoring bank reg ldal TileStore+TS_SPRITE_FLAG,x ; This is a bitfield of all the sprites that intersect this tile, only care if non-zero or not
plb bne dirty_sprite
; The rest of this function handles that non-sprite blit, which is super fast since it blits directly from the
; tile data store to the graphics screen with no masking. The only extra work is selecting a blit function
; based on the tile flip flags.
pei TileStoreBankAndBank01 ; Special value that has the TileStore bank in LSB and $01 in MSB
plb plb
txy txy
lda TileStore+TS_SPRITE_FLAG,y ; This is a bitfield of all the sprites that intersect this tile, only care if non-zero or not ldx TileStore+TS_TILE_DISP,y ; get the finalized tile descriptor
beq :nosprite
jsr BuildActiveSpriteArray ; Build the sprite index list from the bit field
sta ActiveSpriteCount
lda TileStore+TS_VBUFF_ARRAY_ADDR,y ; Scratch space
sta _SPR_X_REG
phy
ldy spriteIdx
lda (_SPR_X_REG),y
sta _SPR_X_REG
ply
lda TileStore+TS_TILE_ID,y ; build the finalized tile descriptor
and #TILE_VFLIP_BIT+TILE_HFLIP_BIT ; get the lookup value
xba
tax
ldal DirtyTileSpriteProcs,x
stal :tiledisp+1
bra :sprite
:nosprite
lda TileStore+TS_TILE_ID,y ; build the finalized tile descriptor
and #TILE_VFLIP_BIT+TILE_HFLIP_BIT ; get the lookup value
xba
tax
ldal DirtyTileProcs,x ; load and patch in the appropriate subroutine ldal DirtyTileProcs,x ; load and patch in the appropriate subroutine
stal :tiledisp+1 stal :tiledisp+1
:sprite
ldx TileStore+TS_TILE_ADDR,y ; load the address of this tile's data (pre-calculated) ldx TileStore+TS_TILE_ADDR,y ; load the address of this tile's data (pre-calculated)
lda TileStore+TS_SCREEN_ADDR,y ; Get the on-screen address of this tile lda TileStore+TS_SCREEN_ADDR,y ; Get the on-screen address of this tile
pha tay
; pha
; lda TileStore+TS_WORD_OFFSET,y ; We don't support this in dirty rendering mode
; ply
; pea $0101
; plb
lda TileStore+TS_WORD_OFFSET,y
ply
pea $0101
plb
plb ; set the bank plb ; set the bank
; B is set to Bank 01 ; B is set to Bank 01
@ -245,6 +234,39 @@ _RenderDirtyTile
:tiledisp jmp $0000 ; render the tile :tiledisp jmp $0000 ; render the tile
; Handler for the sprite path
;
; Reached by a branch from _RenderDirtyTile when the tile's TS_SPRITE_FLAG word is non-zero.
;
; On entry:
;   A = the TS_SPRITE_FLAG value for this tile (non-zero)
;   X = tile store offset of the tile
;
; Stack discipline: the PEI below pushes two bank bytes. The first PLB consumes the low byte
; (TileStore bank); the high byte (tiledata bank) stays on the stack and MUST be pulled by
; every exit path before control leaves this handler, otherwise RTS returns to a bogus address.
dirty_sprite
pei TileStoreBankAndTileDataBank ; Special value that has the TileStore bank in LSB and TileData bank in MSB
plb ; B = TileStore bank so the absolute,X accesses below hit the tile store
; Now do all of the deferred work of actually drawing the sprites. We put considerable effort into
; figuring out if there is only one sprite or more than one since we optimize the former case as it
; is very common and can be done significantly faster.
;
; We use the logic operation of A & (A - 1) which removes the least-significant set bit position. If
; this results in a zero value, then we know that there is only a single sprite and can move to an
; optimized routine.
dec
and TileStore+TS_SPRITE_FLAG,x
bne multi_sprite
; Now we are on the fast path. There is only one sprite at this tile, so load the cached VBUFF address
; from the TileStore data structure. This is only guaranteed to be a VBUFF address from one of the
; sprites at the tile, but if there is only one, it's the value we want.
lda TileStore+TS_LAST_VBUFF,x
pha ; Park the VBUFF address; X is still needed as the tile store index
ldy TileStore+TS_TILE_ADDR,x ; load the address of this tile's data
lda TileStore+TS_SCREEN_ADDR,x ; Get the on-screen address of this tile
plx ; Set the vbuff address
plb ; Pull the remaining PEI byte: B = tiledata bank, stack is balanced again
jmp FastBlit ; Tail call with A = screen address, X = VBUFF address, Y = tile data address
; This is the code path to handle tile with multiple, overlapping sprites.
;
; Not implemented yet, but it still has to leave the stack balanced: the tiledata bank byte pushed
; by the PEI at dirty_sprite is still on the stack here. Pull it (matching the bank state the fast
; path leaves behind) so that RTS pops the caller's real return address.
multi_sprite
plb ; Balance the PEI from dirty_sprite; B = tiledata bank
rts ; TBD
DirtyTileProcs dw _TBDirtyTile_00,_TBDirtyTile_0H,_TBDirtyTile_V0,_TBDirtyTile_VH DirtyTileProcs dw _TBDirtyTile_00,_TBDirtyTile_0H,_TBDirtyTile_V0,_TBDirtyTile_VH
DirtyTileSpriteProcs dw _TBDirtySpriteTile_00,_TBDirtySpriteTile_0H,_TBDirtySpriteTile_V0,_TBDirtySpriteTile_VH DirtyTileSpriteProcs dw _TBDirtySpriteTile_00,_TBDirtySpriteTile_0H,_TBDirtySpriteTile_V0,_TBDirtySpriteTile_VH
@ -384,6 +406,118 @@ BuildActiveSpriteArray
:out_3 lda #6 :out_3 lda #6
rts rts
; Fast blit that directly combines a tile with a single sprite and renders directly to the screen
;
; A = screen address
; X = sprite VBUFF address
; Y = tile data address
;
; Every output word is computed as (tiledata AND spritemask) OR spritedata. The tile is processed as
; 8 rows of two words each (offsets +0 and +2 within a row), with tile rows TILE_DATA_SPAN bytes apart
; and sprite rows SPRITE_PLANE_SPAN bytes apart.
;
; tiledata is read with absolute,Y addressing, so the data bank register must already select the
; tiledata bank (the dirty_sprite handler pulls it with PLB just before jumping here). The sprite
; mask and data are read with long addressing and do not depend on the data bank register.
;
; The direct page register is saved, pointed at the screen address for the duration of the blit so
; that the stores can use short direct-page addressing, and restored before returning.
; A is clobbered; X and Y are not modified by this routine.
;
; NOTE(review): the +2 stride between the two loads of each row implies a 16-bit accumulator -- confirm
; against the callers' register width.
; NOTE(review): interrupts are disabled with SEI and unconditionally re-enabled with CLI at the end, so
; a caller that entered with interrupts disabled will get them back enabled -- confirm this is intended.
FastBlit
TILE_DATA_SPAN equ 4
phd
sei
; Carry is cleared once here. None of the LDA/AND/ORA/STA/TDC/TCD instructions below change it, so the
; ADC #320 steps rely on it staying clear (assumes the direct page value never wraps past $FFFF -- TODO confirm).
clc
tcd
; _R0W1 / _R0W0 are macros defined elsewhere; presumably they switch memory shadowing so that the
; direct-page stores below land in the graphics screen -- verify against the macro definitions.
_R0W1
; Rows 0 and 1: row 0 is stored at D+$00/$02 and row 1 at D+$A0/$A2 ($A0 = 160 bytes further on)
lda tiledata+{0*TILE_DATA_SPAN},y
andl spritemask+{0*SPRITE_PLANE_SPAN},x
oral spritedata+{0*SPRITE_PLANE_SPAN},x
sta $00
lda tiledata+{0*TILE_DATA_SPAN}+2,y
andl spritemask+{0*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{0*SPRITE_PLANE_SPAN}+2,x
sta $02
lda tiledata+{1*TILE_DATA_SPAN},y
andl spritemask+{1*SPRITE_PLANE_SPAN},x
oral spritedata+{1*SPRITE_PLANE_SPAN},x
sta $A0
lda tiledata+{1*TILE_DATA_SPAN}+2,y
andl spritemask+{1*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{1*SPRITE_PLANE_SPAN}+2,x
sta $A2
; Advance the direct page by 320 bytes (2 x $A0) to reach the next pair of rows
tdc
adc #320
tcd
; Rows 2 and 3
lda tiledata+{2*TILE_DATA_SPAN},y
andl spritemask+{2*SPRITE_PLANE_SPAN},x
oral spritedata+{2*SPRITE_PLANE_SPAN},x
sta $00
lda tiledata+{2*TILE_DATA_SPAN}+2,y
andl spritemask+{2*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{2*SPRITE_PLANE_SPAN}+2,x
sta $02
lda tiledata+{3*TILE_DATA_SPAN},y
andl spritemask+{3*SPRITE_PLANE_SPAN},x
oral spritedata+{3*SPRITE_PLANE_SPAN},x
sta $A0
lda tiledata+{3*TILE_DATA_SPAN}+2,y
andl spritemask+{3*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{3*SPRITE_PLANE_SPAN}+2,x
sta $A2
; Advance the direct page by another 320 bytes
tdc
adc #320
tcd
; Rows 4 and 5
lda tiledata+{4*TILE_DATA_SPAN},y
andl spritemask+{4*SPRITE_PLANE_SPAN},x
oral spritedata+{4*SPRITE_PLANE_SPAN},x
sta $00
lda tiledata+{4*TILE_DATA_SPAN}+2,y
andl spritemask+{4*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{4*SPRITE_PLANE_SPAN}+2,x
sta $02
lda tiledata+{5*TILE_DATA_SPAN},y
andl spritemask+{5*SPRITE_PLANE_SPAN},x
oral spritedata+{5*SPRITE_PLANE_SPAN},x
sta $A0
lda tiledata+{5*TILE_DATA_SPAN}+2,y
andl spritemask+{5*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{5*SPRITE_PLANE_SPAN}+2,x
sta $A2
; Advance the direct page by another 320 bytes
tdc
adc #320
tcd
; Rows 6 and 7
lda tiledata+{6*TILE_DATA_SPAN},y
andl spritemask+{6*SPRITE_PLANE_SPAN},x
oral spritedata+{6*SPRITE_PLANE_SPAN},x
sta $00
lda tiledata+{6*TILE_DATA_SPAN}+2,y
andl spritemask+{6*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{6*SPRITE_PLANE_SPAN}+2,x
sta $02
lda tiledata+{7*TILE_DATA_SPAN},y
andl spritemask+{7*SPRITE_PLANE_SPAN},x
oral spritedata+{7*SPRITE_PLANE_SPAN},x
sta $A0
lda tiledata+{7*TILE_DATA_SPAN}+2,y
andl spritemask+{7*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{7*SPRITE_PLANE_SPAN}+2,x
sta $A2
; Undo the setup in reverse order: macro state, interrupt mask, then the caller's direct page
_R0W0
cli
pld
rts
; Run through all of the active sprites and put them on-screen. We have three different heuristics depending on ; Run through all of the active sprites and put them on-screen. We have three different heuristics depending on
; how many active sprites there are intersecting this tile. ; how many active sprites there are intersecting this tile.

View File

@ -535,12 +535,22 @@ _GetTileAt
clc clc
rts rts
; Small initialization routine to cache the banks for the sprite data and mask ; Small initialization routine to cache the banks for the sprite data and mask and tile/sprite stuff
;
; Builds three packed 16-bit words in the direct page, each holding two bank bytes, so that render code
; can push both banks with a single PEI and pull them one at a time with PLB (the low byte is pulled first).
;
;   SpriteBanks                  = spritemask bank (high byte) | spritedata bank (low byte)
;   TileStoreBankAndBank01       = $01 (high byte)             | TileStore bank (low byte)
;   TileStoreBankAndTileDataBank = tiledata bank (high byte)   | TileStore bank (low byte)
;
; NOTE(review): this reading assumes the assembler's `#>` operand yields address bits 8-23 (so the
; AND #$FF00 keeps the bank byte in the high byte) and `#^` yields the bank byte in the low byte --
; confirm against the assembler documentation. A is clobbered.
_CacheSpriteBanks _CacheSpriteBanks
lda #>spritemask lda #>spritemask
and #$FF00 and #$FF00
ora #^spritedata ora #^spritedata
sta SpriteBanks sta SpriteBanks
; TileStore bank paired with the constant bank $01
lda #$0100
ora #^TileStore
sta TileStoreBankAndBank01
; TileStore bank paired with the tiledata bank
lda #>tiledata
and #$FF00
ora #^TileStore
sta TileStoreBankAndTileDataBank
rts rts
; This is 13 blocks wide ; This is 13 blocks wide

View File

@ -331,6 +331,7 @@ _MarkDirtySprite
lda VBuffOrigin lda VBuffOrigin
sta [tmp0],y sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
; lda VBuffOrigin ; This is an interesting case. The mapping between the tile store ; lda VBuffOrigin ; This is an interesting case. The mapping between the tile store
; adc #{0*4}+{0*256} ; and the sprite buffers changes as the StartX, StartY values change ; adc #{0*4}+{0*256} ; and the sprite buffers changes as the StartX, StartY values change
@ -362,6 +363,7 @@ _MarkDirtySprite
lda VBuffOrigin lda VBuffOrigin
adc #{0*4}+{1*8*SPRITE_PLANE_SPAN} adc #{0*4}+{1*8*SPRITE_PLANE_SPAN}
sta [tmp0],y sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x oral TileStore+TS_SPRITE_FLAG,x
@ -382,6 +384,7 @@ _MarkDirtySprite
lda VBuffOrigin lda VBuffOrigin
adc #{0*4}+{2*8*SPRITE_PLANE_SPAN} adc #{0*4}+{2*8*SPRITE_PLANE_SPAN}
sta [tmp0],y sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x oral TileStore+TS_SPRITE_FLAG,x
@ -403,6 +406,7 @@ _MarkDirtySprite
lda VBuffOrigin lda VBuffOrigin
adc #{1*4}+{0*8*SPRITE_PLANE_SPAN} adc #{1*4}+{0*8*SPRITE_PLANE_SPAN}
sta [tmp0],y sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x oral TileStore+TS_SPRITE_FLAG,x
@ -424,6 +428,7 @@ _MarkDirtySprite
lda VBuffOrigin lda VBuffOrigin
adc #{1*4}+{1*8*SPRITE_PLANE_SPAN} adc #{1*4}+{1*8*SPRITE_PLANE_SPAN}
sta [tmp0],y sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x oral TileStore+TS_SPRITE_FLAG,x
@ -445,6 +450,7 @@ _MarkDirtySprite
lda VBuffOrigin lda VBuffOrigin
adc #{1*4}+{2*8*SPRITE_PLANE_SPAN} adc #{1*4}+{2*8*SPRITE_PLANE_SPAN}
sta [tmp0],y sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x oral TileStore+TS_SPRITE_FLAG,x
@ -466,6 +472,7 @@ _MarkDirtySprite
lda VBuffOrigin lda VBuffOrigin
adc #{2*4}+{0*8*SPRITE_PLANE_SPAN} adc #{2*4}+{0*8*SPRITE_PLANE_SPAN}
sta [tmp0],y sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x oral TileStore+TS_SPRITE_FLAG,x
@ -487,6 +494,7 @@ _MarkDirtySprite
lda VBuffOrigin lda VBuffOrigin
adc #{2*4}+{1*8*SPRITE_PLANE_SPAN} adc #{2*4}+{1*8*SPRITE_PLANE_SPAN}
sta [tmp0],y sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x oral TileStore+TS_SPRITE_FLAG,x
@ -508,6 +516,7 @@ _MarkDirtySprite
lda VBuffOrigin lda VBuffOrigin
adc #{2*4}+{2*8*SPRITE_PLANE_SPAN} adc #{2*4}+{2*8*SPRITE_PLANE_SPAN}
sta [tmp0],y sta [tmp0],y
stal TileStore+TS_LAST_VBUFF,x
lda SpriteBit lda SpriteBit
oral TileStore+TS_SPRITE_FLAG,x oral TileStore+TS_SPRITE_FLAG,x

View File

@ -590,6 +590,8 @@ InitTiles
lda #0 lda #0
stal TileStore+TS_TILE_ID,x ; clear the tile store with the special zero tile stal TileStore+TS_TILE_ID,x ; clear the tile store with the special zero tile
stal TileStore+TS_TILE_ADDR,x stal TileStore+TS_TILE_ADDR,x
stal TileStore+TS_TILE_DISP,x
stal TileStore+TS_LAST_VBUFF,x
stal TileStore+TS_SPRITE_FLAG,x ; no sprites are set at the beginning stal TileStore+TS_SPRITE_FLAG,x ; no sprites are set at the beginning
lda #$FFFF ; none of the tiles are dirty lda #$FFFF ; none of the tiles are dirty
@ -693,11 +695,15 @@ _SetTile
beq :nochange beq :nochange
stal TileStore+TS_TILE_ID,x ; Value is different, store it. stal TileStore+TS_TILE_ID,x ; Value is different, store it.
jsr _GetTileAddr jsr _GetTileAddr
stal TileStore+TS_TILE_ADDR,x ; Committed to drawing this tile, so get the address of the tile in the tiledata bank for later stal TileStore+TS_TILE_ADDR,x ; Committed to drawing this tile, so get the address of the tile in the tiledata bank for later
; txa ; Add this tile to the list of dirty tiles to refresh ldal TileStore+TS_TILE_ID,x
and #TILE_VFLIP_BIT+TILE_HFLIP_BIT ; get the lookup value
xba
stal TileStore+TS_TILE_DISP,x
; txa ; Add this tile to the list of dirty tiles to refresh
jmp _PushDirtyTileX ; on the next call to _ApplyTiles jmp _PushDirtyTileX ; on the next call to _ApplyTiles
:nochange rts :nochange rts