From 2507724a49efdc0b0e85ec09eb461ec34313cdb2 Mon Sep 17 00:00:00 2001 From: Lucas Scharenbroich Date: Tue, 22 Feb 2022 02:35:21 -0600 Subject: [PATCH] More sprite streamlining * Added a TS_LAST_VBUFF cached value in the tile store * Added a fast path for single sprite w/fast test * Improved raster timing granularity for visual profiling All of the code paths are generally good. Both the _RenderSprites and _ApplyDirtyTiles functions take a fair bit of raster time. Will continue to try and streamline the data structures and code to reduce overhead. --- src/Defs.s | 10 ++- src/Render.s | 204 ++++++++++++++++++++++++++++++++++++-------- src/Sprite.s | 12 ++- src/Sprite2.s | 9 ++ src/blitter/Tiles.s | 10 ++- 5 files changed, 204 insertions(+), 41 deletions(-) diff --git a/src/Defs.s b/src/Defs.s index 970a391..896eb34 100644 --- a/src/Defs.s +++ b/src/Defs.s @@ -81,11 +81,13 @@ BG1TileMapPtr equ 86 SCBArrayPtr equ 90 ; Used for palette binding SpriteBanks equ 94 ; Bank bytes for the sprite data and sprite mask LastRender equ 96 ; Record which reder function was last executed -; DamagedSprites equ 98 +; gap SpriteMap equ 100 ; Bitmap of open sprite slots. ActiveSpriteCount equ 102 BankLoad equ 104 -Next equ 106 +TileStoreBankAndBank01 equ 106 +TileStoreBankAndTileDataBank equ 108 +Next equ 110 activeSpriteList equ 128 ; 32 bytes for the active sprite list (can persist across frames) AppSpace equ 160 ; 16 bytes of space reserved for application use @@ -155,7 +157,7 @@ MAX_TILES equ {26*41} ; Number of tiles in the code fiel TILE_STORE_SIZE equ {MAX_TILES*2} ; The tile store contains a tile descriptor in each slot TS_TILE_ID equ TILE_STORE_SIZE*0 ; tile descriptor for this location -TS_DIRTY equ TILE_STORE_SIZE*1 ; Flag. Used to prevent a tile from being queues multiple times per frame +TS_DIRTY equ TILE_STORE_SIZE*1 ; Flag. 
Used to prevent a tile from being queued multiple times per frame TS_SPRITE_FLAG equ TILE_STORE_SIZE*2 ; Bitfield of all sprites that intersect this tile. 0 if no sprites. TS_TILE_ADDR equ TILE_STORE_SIZE*3 ; cached value, the address of the tiledata for this tile TS_CODE_ADDR_LOW equ TILE_STORE_SIZE*4 ; const value, address of this tile in the code fields @@ -164,3 +166,5 @@ TS_WORD_OFFSET equ TILE_STORE_SIZE*6 ; const value, word offset val TS_BASE_ADDR equ TILE_STORE_SIZE*7 ; const value, because there are two rows of tiles per bank, this is set to $0000 ot $8000. TS_SCREEN_ADDR equ TILE_STORE_SIZE*8 ; cached value of on-screen location of tile. Used for DirtyRender. TS_VBUFF_ARRAY_ADDR equ TILE_STORE_SIZE*9 ; const value to an aligned 32-byte array starting at $8000 in TileStore bank +TS_TILE_DISP equ TILE_STORE_SIZE*10 ; derived from TS_TILE_ID to optimize tile dispatch in the Render function +TS_LAST_VBUFF equ TILE_STORE_SIZE*11 ; a cached copy of one of the valid VBUFF values in the array \ No newline at end of file diff --git a/src/Render.s b/src/Render.s index 28d5c5f..a7ecc17 100644 --- a/src/Render.s +++ b/src/Render.s @@ -162,9 +162,16 @@ RenderDirty ENT _RenderDirty lda LastRender ; If the full renderer was last called, we assume that bne :norecalc ; the scroll positions have likely changed, so recalculate + lda #2 ; blue + jsr _SetBorderColor jsr _RecalcTileScreenAddrs ; them to make sure sprites draw at the correct screen address :norecalc + lda #3 ; purple + jsr _SetBorderColor jsr _RenderSprites + + lda #4 ; dk. green + jsr _SetBorderColor jsr _ApplyDirtyTiles lda #1 sta LastRender @@ -192,50 +199,32 @@ _ApplyDirtyTiles ; Only render solid tiles and sprites _RenderDirtyTile - pea >TileStore ; Need that addressing flexibility here. 
Callers responsible for restoring bank reg - plb + ldal TileStore+TS_SPRITE_FLAG,x ; This is a bitfield of all the sprites that intersect this tile, only care if non-zero or not + bne dirty_sprite + +; The rest of this function handles that non-sprite blit, which is super fast since it blits directly from the +; tile data store to the graphics screen with no masking. The only extra work is selecting a blit function +; based on the tile flip flags. + + pei TileStoreBankAndBank01 ; Special value that has the TileStore bank in LSB and $01 in MSB plb txy - lda TileStore+TS_SPRITE_FLAG,y ; This is a bitfield of all the sprites that intersect this tile, only care if non-zero or not - beq :nosprite - - jsr BuildActiveSpriteArray ; Build the sprite index list from the bit field - sta ActiveSpriteCount - - lda TileStore+TS_VBUFF_ARRAY_ADDR,y ; Scratch space - sta _SPR_X_REG - phy - ldy spriteIdx - lda (_SPR_X_REG),y - sta _SPR_X_REG - ply - - lda TileStore+TS_TILE_ID,y ; build the finalized tile descriptor - and #TILE_VFLIP_BIT+TILE_HFLIP_BIT ; get the lookup value - xba - tax - ldal DirtyTileSpriteProcs,x - stal :tiledisp+1 - bra :sprite - -:nosprite - lda TileStore+TS_TILE_ID,y ; build the finalized tile descriptor - and #TILE_VFLIP_BIT+TILE_HFLIP_BIT ; get the lookup value - xba - tax + ldx TileStore+TS_TILE_DISP,y ; get the finalized tile descriptor ldal DirtyTileProcs,x ; load and patch in the appropriate subroutine stal :tiledisp+1 -:sprite ldx TileStore+TS_TILE_ADDR,y ; load the address of this tile's data (pre-calculated) lda TileStore+TS_SCREEN_ADDR,y ; Get the on-screen address of this tile - pha + tay + +; pha +; lda TileStore+TS_WORD_OFFSET,y ; We don't support this in dirty rendering mode +; ply + +; pea $0101 +; plb - lda TileStore+TS_WORD_OFFSET,y - ply - pea $0101 - plb plb ; set the bank ; B is set to Bank 01 @@ -245,6 +234,39 @@ _RenderDirtyTile :tiledisp jmp $0000 ; render the tile +; Handler for the sprite path +dirty_sprite + pei 
TileStoreBankAndTileDataBank ; Special value that has the TileStore bank in LSB and TileData bank in MSB + plb + +; Now do all of the deferred work of actually drawing the sprites. We put considerable effort into +; figuring out if there is only one sprite or more than one since we optimize the former case as it +; is very common and can be done significantly faster. +; +; We use the logic operation of A & (A - 1) which removes the least-significant set bit position. If +; this results in a zero value, then we know that there is only a single sprite and can move to an +; optimized routine. + + dec + and TileStore+TS_SPRITE_FLAG,x + bne multi_sprite + +; Now we are on the fast path. There is only one sprite at this tile, so load the cached VBUFF address +; from the TileStore data structure. This is only guaranteed to be a VBUFF address from one of the +; sprites at the tile, but if there is only one, it's the value we want. + + lda TileStore+TS_LAST_VBUFF,x + pha + ldy TileStore+TS_TILE_ADDR,x ; load the address of this tile's data + lda TileStore+TS_SCREEN_ADDR,x ; Get the on-screen address of this tile + plx ; Set the vbuff address + plb + jmp FastBlit + +; This is the code path to handle tile with multiple, overlapping sprites. 
+multi_sprite plb ; pull the second byte left on the stack by the pei above so that rts sees the real return address + rts ; TBD + DirtyTileProcs dw _TBDirtyTile_00,_TBDirtyTile_0H,_TBDirtyTile_V0,_TBDirtyTile_VH DirtyTileSpriteProcs dw _TBDirtySpriteTile_00,_TBDirtySpriteTile_0H,_TBDirtySpriteTile_V0,_TBDirtySpriteTile_VH @@ -384,6 +406,118 @@ BuildActiveSpriteArray :out_3 lda #6 rts +; Fast blit that directly combines a tile with a single sprite and renders directly to the screen +; +; A = screen address +; X = sprite VBUFF address +; Y = tile data address +FastBlit +TILE_DATA_SPAN equ 4 + + phd + sei + clc + tcd + + _R0W1 + + lda tiledata+{0*TILE_DATA_SPAN},y + andl spritemask+{0*SPRITE_PLANE_SPAN},x + oral spritedata+{0*SPRITE_PLANE_SPAN},x + sta $00 + + lda tiledata+{0*TILE_DATA_SPAN}+2,y + andl spritemask+{0*SPRITE_PLANE_SPAN}+2,x + oral spritedata+{0*SPRITE_PLANE_SPAN}+2,x + sta $02 + + lda tiledata+{1*TILE_DATA_SPAN},y + andl spritemask+{1*SPRITE_PLANE_SPAN},x + oral spritedata+{1*SPRITE_PLANE_SPAN},x + sta $A0 + + lda tiledata+{1*TILE_DATA_SPAN}+2,y + andl spritemask+{1*SPRITE_PLANE_SPAN}+2,x + oral spritedata+{1*SPRITE_PLANE_SPAN}+2,x + sta $A2 + + tdc + adc #320 + tcd + + lda tiledata+{2*TILE_DATA_SPAN},y + andl spritemask+{2*SPRITE_PLANE_SPAN},x + oral spritedata+{2*SPRITE_PLANE_SPAN},x + sta $00 + + lda tiledata+{2*TILE_DATA_SPAN}+2,y + andl spritemask+{2*SPRITE_PLANE_SPAN}+2,x + oral spritedata+{2*SPRITE_PLANE_SPAN}+2,x + sta $02 + + lda tiledata+{3*TILE_DATA_SPAN},y + andl spritemask+{3*SPRITE_PLANE_SPAN},x + oral spritedata+{3*SPRITE_PLANE_SPAN},x + sta $A0 + + lda tiledata+{3*TILE_DATA_SPAN}+2,y + andl spritemask+{3*SPRITE_PLANE_SPAN}+2,x + oral spritedata+{3*SPRITE_PLANE_SPAN}+2,x + sta $A2 + + tdc + adc #320 + tcd + + lda tiledata+{4*TILE_DATA_SPAN},y + andl spritemask+{4*SPRITE_PLANE_SPAN},x + oral spritedata+{4*SPRITE_PLANE_SPAN},x + sta $00 + + lda tiledata+{4*TILE_DATA_SPAN}+2,y + andl spritemask+{4*SPRITE_PLANE_SPAN}+2,x + oral 
spritedata+{4*SPRITE_PLANE_SPAN}+2,x + sta $02 + + lda tiledata+{5*TILE_DATA_SPAN},y + andl 
spritemask+{5*SPRITE_PLANE_SPAN},x + oral spritedata+{5*SPRITE_PLANE_SPAN},x + sta $A0 + + lda tiledata+{5*TILE_DATA_SPAN}+2,y + andl spritemask+{5*SPRITE_PLANE_SPAN}+2,x + oral spritedata+{5*SPRITE_PLANE_SPAN}+2,x + sta $A2 + + tdc + adc #320 + tcd + + lda tiledata+{6*TILE_DATA_SPAN},y + andl spritemask+{6*SPRITE_PLANE_SPAN},x + oral spritedata+{6*SPRITE_PLANE_SPAN},x + sta $00 + + lda tiledata+{6*TILE_DATA_SPAN}+2,y + andl spritemask+{6*SPRITE_PLANE_SPAN}+2,x + oral spritedata+{6*SPRITE_PLANE_SPAN}+2,x + sta $02 + + lda tiledata+{7*TILE_DATA_SPAN},y + andl spritemask+{7*SPRITE_PLANE_SPAN},x + oral spritedata+{7*SPRITE_PLANE_SPAN},x + sta $A0 + + lda tiledata+{7*TILE_DATA_SPAN}+2,y + andl spritemask+{7*SPRITE_PLANE_SPAN}+2,x + oral spritedata+{7*SPRITE_PLANE_SPAN}+2,x + sta $A2 + + _R0W0 + cli + pld + rts + ; Run through all of the active sprites and put then on-screen. We have three different heuristics depending on ; how many active sprites there are intersecting this tile. diff --git a/src/Sprite.s b/src/Sprite.s index 68ebe23..096eab1 100644 --- a/src/Sprite.s +++ b/src/Sprite.s @@ -535,12 +535,22 @@ _GetTileAt clc rts -; Small initialization routine to cache the banks for the sprite data and mask +; Small initialization routine to cache the banks for the sprite data and mask and tile/sprite stuff _CacheSpriteBanks lda #>spritemask and #$FF00 ora #^spritedata sta SpriteBanks + + lda #$0100 + ora #^TileStore + sta TileStoreBankAndBank01 + + lda #>tiledata + and #$FF00 + ora #^TileStore + sta TileStoreBankAndTileDataBank + rts ; This is 13 blocks wide diff --git a/src/Sprite2.s b/src/Sprite2.s index be125ff..5fbfa84 100644 --- a/src/Sprite2.s +++ b/src/Sprite2.s @@ -331,6 +331,7 @@ _MarkDirtySprite lda VBuffOrigin sta [tmp0],y + stal TileStore+TS_LAST_VBUFF,x ; lda VBuffOrigin ; This is an interesting case. 
The mapping between the tile store ; adc #{0*4}+{0*256} ; and the sprite buffers changes as the StartX, StartY values change @@ -362,6 +363,7 @@ _MarkDirtySprite lda VBuffOrigin adc #{0*4}+{1*8*SPRITE_PLANE_SPAN} sta [tmp0],y + stal TileStore+TS_LAST_VBUFF,x lda SpriteBit oral TileStore+TS_SPRITE_FLAG,x @@ -382,6 +384,7 @@ _MarkDirtySprite lda VBuffOrigin adc #{0*4}+{2*8*SPRITE_PLANE_SPAN} sta [tmp0],y + stal TileStore+TS_LAST_VBUFF,x lda SpriteBit oral TileStore+TS_SPRITE_FLAG,x @@ -403,6 +406,7 @@ _MarkDirtySprite lda VBuffOrigin adc #{1*4}+{0*8*SPRITE_PLANE_SPAN} sta [tmp0],y + stal TileStore+TS_LAST_VBUFF,x lda SpriteBit oral TileStore+TS_SPRITE_FLAG,x @@ -424,6 +428,7 @@ _MarkDirtySprite lda VBuffOrigin adc #{1*4}+{1*8*SPRITE_PLANE_SPAN} sta [tmp0],y + stal TileStore+TS_LAST_VBUFF,x lda SpriteBit oral TileStore+TS_SPRITE_FLAG,x @@ -445,6 +450,7 @@ _MarkDirtySprite lda VBuffOrigin adc #{1*4}+{2*8*SPRITE_PLANE_SPAN} sta [tmp0],y + stal TileStore+TS_LAST_VBUFF,x lda SpriteBit oral TileStore+TS_SPRITE_FLAG,x @@ -466,6 +472,7 @@ _MarkDirtySprite lda VBuffOrigin adc #{2*4}+{0*8*SPRITE_PLANE_SPAN} sta [tmp0],y + stal TileStore+TS_LAST_VBUFF,x lda SpriteBit oral TileStore+TS_SPRITE_FLAG,x @@ -487,6 +494,7 @@ _MarkDirtySprite lda VBuffOrigin adc #{2*4}+{1*8*SPRITE_PLANE_SPAN} sta [tmp0],y + stal TileStore+TS_LAST_VBUFF,x lda SpriteBit oral TileStore+TS_SPRITE_FLAG,x @@ -508,6 +516,7 @@ _MarkDirtySprite lda VBuffOrigin adc #{2*4}+{2*8*SPRITE_PLANE_SPAN} sta [tmp0],y + stal TileStore+TS_LAST_VBUFF,x lda SpriteBit oral TileStore+TS_SPRITE_FLAG,x diff --git a/src/blitter/Tiles.s b/src/blitter/Tiles.s index b38615a..0dfa016 100644 --- a/src/blitter/Tiles.s +++ b/src/blitter/Tiles.s @@ -590,6 +590,8 @@ InitTiles lda #0 stal TileStore+TS_TILE_ID,x ; clear the tile store with the special zero tile stal TileStore+TS_TILE_ADDR,x + stal TileStore+TS_TILE_DISP,x + stal TileStore+TS_LAST_VBUFF,x stal TileStore+TS_SPRITE_FLAG,x ; no sprites are set at the beginning lda #$FFFF ; none 
of the tiles are dirty @@ -693,11 +695,15 @@ _SetTile beq :nochange stal TileStore+TS_TILE_ID,x ; Value is different, store it. - jsr _GetTileAddr stal TileStore+TS_TILE_ADDR,x ; Committed to drawing this tile, so get the address of the tile in the tiledata bank for later -; txa ; Add this tile to the list of dirty tiles to refresh + ldal TileStore+TS_TILE_ID,x + and #TILE_VFLIP_BIT+TILE_HFLIP_BIT ; get the lookup value + xba + stal TileStore+TS_TILE_DISP,x + +; txa ; Add this tile to the list of dirty tiles to refresh jmp _PushDirtyTileX ; on the next call to _ApplyTiles :nochange rts