Complete unrolled loop for clearing sprites from the tile store

This commit is contained in:
Lucas Scharenbroich 2022-05-23 15:18:34 -05:00
parent 9da3616c1d
commit 5577105be8
5 changed files with 135 additions and 217 deletions

View File

@ -94,8 +94,9 @@ UserId equ 112 ; Memory manager user Id to use
ToolNum equ 114 ; Tool number assigned to us ToolNum equ 114 ; Tool number assigned to us
LastKey equ 116 LastKey equ 116
LastTick equ 118 LastTick equ 118
ForceSpriteFlag equ 120
Next equ 120 Next equ 122
activeSpriteList equ 128 ; 32 bytes for the active sprite list (can persist across frames) activeSpriteList equ 128 ; 32 bytes for the active sprite list (can persist across frames)
; tiletmp equ 178 ; 16 bytes of temp storage for the tile renderers ; tiletmp equ 178 ; 16 bytes of temp storage for the tile renderers

View File

@ -28,7 +28,7 @@ _Render
jsr _ApplyBG0XPosPre jsr _ApplyBG0XPosPre
; jsr _ApplyBG1XPosPre ; jsr _ApplyBG1XPosPre
; jsr _RenderSprites ; Once the BG0 X and Y positions are committed, update sprite data jsr _RenderSprites ; Once the BG0 X and Y positions are committed, update sprite data
; jsr _UpdateBG0TileMap ; and the tile maps. These subroutines build up a list of tiles ; jsr _UpdateBG0TileMap ; and the tile maps. These subroutines build up a list of tiles
; jsr _UpdateBG1TileMap ; that need to be updated in the code field ; jsr _UpdateBG1TileMap ; that need to be updated in the code field

View File

@ -132,11 +132,39 @@ _AddSprite
rts rts
; Macro to make the unrolled loop more concise
;
; Argument:
;
;   ]1 = constant byte offset added to TileStoreLookup (selects which covered tile to process)
;
; On entry:
;
;   X    = base index into TileStoreLookup (the callers load it from _Sprites+TS_LOOKUP_INDEX)
;   tmp0 = AND mask for this sprite (_ClearSpriteFromTileStore caches _SpriteBitsNot there)
;
; The macro
;
; 1. Loads the tile store address from a fixed offset
; 2. Clears the sprite bit from the TS_SPRITE_FLAG location
; 3. Checks if the tile is already marked dirty
; 4. If the tile was NOT already dirty, marks it dirty and pushes the tile store address
;    on the stack so that it can be added to the DirtyTiles list later
;
; Clobbers A and Y; X is left untouched so the macro can be invoked back-to-back with
; different offsets. The stack is intentionally left unbalanced: one value is pushed per newly
; dirtied tile and the caller is responsible for pulling them off again.
TSClearSprite mac
ldy TileStoreLookup+]1,x ; Y = tile store offset of the tile at this fixed position
lda TileStore+TS_SPRITE_FLAG,y
and tmp0 ; mask off this sprite's bit
sta TileStore+TS_SPRITE_FLAG,y
lda TileStore+TS_DIRTY,y
bne next ; already dirty; nothing to mark and nothing to push
inc ; A is zero here, so this produces a non-zero dirty marker
sta TileStore+TS_DIRTY,y
phy ; remember this tile for the second (DirtyTiles) phase
next
<<<
; Alternate implementation that uses the TS_COVERAGE_SIZE and TS_LOOKUP_INDEX properties to ; Alternate implementation that uses the TS_COVERAGE_SIZE and TS_LOOKUP_INDEX properties to
; load the old values directly from the TileStoreLookup table, rather than caching them. ; load the old values directly from the TileStoreLookup table, rather than caching them.
; This is more efficient, because the work in MarkDirtySprite is independent of the ; This is more efficient, because the work in MarkDirtySprite is independent of the
; sprite size and, by inlining the _PushDirtyTile logic, we can save a fair amount of overhead ; sprite size and, by inlining the _PushDirtyTile logic, we can save a fair amount of overhead
_ClearSpriteFromTileStore2 _ClearSpriteFromTileStore
tsc
sta tmp1 ; We use the stack as a counter
lda _SpriteBitsNot,y ; Cache this value in a direct page location
sta tmp0
ldx _Sprites+TS_COVERAGE_SIZE,y ldx _Sprites+TS_COVERAGE_SIZE,y
jmp (csfts_tbl,x) jmp (csfts_tbl,x)
csfts_tbl dw csfts_1x1,csfts_1x2,csfts_1x3,csfts_out csfts_tbl dw csfts_1x1,csfts_1x2,csfts_1x3,csfts_out
@ -144,216 +172,112 @@ csfts_tbl dw csfts_1x1,csfts_1x2,csfts_1x3,csfts_out
dw csfts_3x1,csfts_3x2,csfts_3x3,csfts_out dw csfts_3x1,csfts_3x2,csfts_3x3,csfts_out
dw csfts_out,csfts_out,csfts_out,csfts_out dw csfts_out,csfts_out,csfts_out,csfts_out
; Just a single value to clear and add to the dirty tile list csfts_3x3 ldx _Sprites+TS_LOOKUP_INDEX,y
TSClearSprite 0
TSClearSprite 2
TSClearSprite 4
TSClearSprite 1*{TS_LOOKUP_SPAN*2}
TSClearSprite 1*{TS_LOOKUP_SPAN*2}+2
TSClearSprite 1*{TS_LOOKUP_SPAN*2}+4
TSClearSprite 2*{TS_LOOKUP_SPAN*2}
TSClearSprite 2*{TS_LOOKUP_SPAN*2}+2
TSClearSprite 2*{TS_LOOKUP_SPAN*2}+4
jmp csfts_finish
; Unrolled handlers reached through the csfts_tbl jump table. Every handler has the same shape:
;
; 1. Load X with the sprite's TS_LOOKUP_INDEX (Y is the sprite record index on entry)
; 2. Invoke TSClearSprite once for each tile position that the sprite covered
; 3. Jump to csfts_finish to move the pushed tile addresses onto the DirtyTiles list
;
; X must be loaded before the first TSClearSprite because the macro overwrites Y.
;
; In a csfts_RxC label, R is the number of {TS_LOOKUP_SPAN*2} steps that are visited and
; C is the number of consecutive 2-byte entries visited at each step.
; NOTE(review): presumably R = tile rows and C = tile columns -- confirm against the
; layout of TileStoreLookup.
csfts_3x2 ldx _Sprites+TS_LOOKUP_INDEX,y
TSClearSprite 0
TSClearSprite 2
TSClearSprite 1*{TS_LOOKUP_SPAN*2}
TSClearSprite 1*{TS_LOOKUP_SPAN*2}+2
TSClearSprite 2*{TS_LOOKUP_SPAN*2}
TSClearSprite 2*{TS_LOOKUP_SPAN*2}+2
jmp csfts_finish
; 3 span steps, 1 entry each
csfts_3x1 ldx _Sprites+TS_LOOKUP_INDEX,y
TSClearSprite 0
TSClearSprite 1*{TS_LOOKUP_SPAN*2}
TSClearSprite 2*{TS_LOOKUP_SPAN*2}
jmp csfts_finish
; 2 span steps, 3 entries each
csfts_2x3 ldx _Sprites+TS_LOOKUP_INDEX,y
TSClearSprite 0
TSClearSprite 2
TSClearSprite 4
TSClearSprite 1*{TS_LOOKUP_SPAN*2}
TSClearSprite 1*{TS_LOOKUP_SPAN*2}+2
TSClearSprite 1*{TS_LOOKUP_SPAN*2}+4
jmp csfts_finish
; 2 span steps, 2 entries each
csfts_2x2 ldx _Sprites+TS_LOOKUP_INDEX,y
TSClearSprite 0
TSClearSprite 2
TSClearSprite 1*{TS_LOOKUP_SPAN*2}
TSClearSprite 1*{TS_LOOKUP_SPAN*2}+2
jmp csfts_finish
; 2 span steps, 1 entry each
csfts_2x1 ldx _Sprites+TS_LOOKUP_INDEX,y
TSClearSprite 0
TSClearSprite 1*{TS_LOOKUP_SPAN*2}
jmp csfts_finish
; 1 span step, 3 entries
csfts_1x3 ldx _Sprites+TS_LOOKUP_INDEX,y
TSClearSprite 0
TSClearSprite 2
TSClearSprite 4
jmp csfts_finish
; 1 span step, 2 entries
csfts_1x2 ldx _Sprites+TS_LOOKUP_INDEX,y
TSClearSprite 0
TSClearSprite 2
jmp csfts_finish
csfts_1x1 ldx _Sprites+TS_LOOKUP_INDEX,y csfts_1x1 ldx _Sprites+TS_LOOKUP_INDEX,y
lda TileStoreLookup,x TSClearSprite 0
tax
lda TileStore+TS_SPRITE_FLAG,x ; Second phase; put all the dirty tiles on the DirtyTiles list
and _SpriteBitsNot,y csfts_finish
sta TileStore+TS_SPRITE_FLAG,x tsc
eor #$FFFF
sec
adc tmp1 ; Looks weird, but calculates (tmp1 - acc)
lda TileStore+TS_DIRTY,x tax ; This is 2 * N where N is the number of dirty tiles
bne csfts_1x1_out ldy DirtyTileCount ; Grab a copy of the old index (for addressing)
inc ; any non-zero value will work clc
sta TileStore+TS_DIRTY,x ; and is 1 cycle faster than loading a constant value adc DirtyTileCount ; Add the new items to the list
sta DirtyTileCount
txa jmp (dtloop,x)
ldx DirtyTileCount dtloop dw csfts_out, dtloop1, dtloop2, dtloop3
sta DirtyTiles,x dw dtloop4, dtloop5, dtloop6, dtloop7
inx dw dtloop8, dtloop9, dtloop10, dtloop11
inx
stx DirtyTileCount
csfts_1x2
csfts_1x3
csfts_2x1
csfts_2x3
csfts_3x1
csfts_3x2
csfts_3x3
csfts_1x1_out
rts
; This is a more interesting case where the ability to batch things up starts to produce some dtloop11 pla
; efficiency gains sta DirtyTiles+20,y
csfts_2x2 ldx _Sprites+TS_LOOKUP_INDEX,y ; Get the address of the old top-left corner dtloop10 pla
tay sta DirtyTiles+18,y
ldx TileStoreLookup,y dtloop9 pla
sta DirtyTiles+16,y
lda TileStore+TS_SPRITE_FLAG,x dtloop8 pla
and _SpriteBits sta DirtyTiles+14,y
sta TileStore+TS_SPRITE_FLAG,x dtloop7 pla
sta DirtyTiles+12,y
lda TileStore+TS_DIRTY,x dtloop6 pla
beq *+3 sta DirtyTiles+10,y
phx dtloop5 pla
sta DirtyTiles+8,y
dtloop4 pla
ldx TileStoreLookup+2,y sta DirtyTiles+6,y
dtloop3 pla
lda TileStore+TS_SPRITE_FLAG,x sta DirtyTiles+4,y
and _SpriteBits dtloop2 pla
sta TileStore+TS_SPRITE_FLAG,x sta DirtyTiles+2,y
dtloop1 pla
lda TileStore+TS_DIRTY,x sta DirtyTiles+0,y
beq *+3
phx
ldx TileStoreLookup+TS_LOOKUP_SPAN,y
lda TileStore+TS_SPRITE_FLAG,x
and _SpriteBits
sta TileStore+TS_SPRITE_FLAG,x
lda TileStore+TS_DIRTY,x
beq *+3
phx
ldx TileStoreLookup+TS_LOOKUP_SPAN+2,y
lda TileStore+TS_SPRITE_FLAG,x
and _SpriteBits
sta TileStore+TS_SPRITE_FLAG,x
ldy DirtyTileCount
lda TileStore+TS_DIRTY,x
beq skip_2x2
txa
sta DirtyTiles,y
sta TileStore+TS_DIRTY,x
skip_2x2
pla
beq :done1
sta DirtyTiles+2,x
tay
sta TileStore+TS_DIRTY,y
pla
beq :done2
sta DirtyTiles+4,x
tay
sta TileStore+TS_DIRTY,y
pla
beq :done3
sta DirtyTiles+6,x
tay
sta TileStore+TS_DIRTY,y
; Maximum number of dirty tiles reached. Just fall through.
pla
txa
adc #8
sta DirtyTileCount
rts
:done3
txa
adc #6
sta DirtyTileCount
rts
:done2
txa
adc #4
sta DirtyTileCount
rts
:done1
inx
inx
stx DirtyTileCount
rts
lda _SpriteBitsNot,y ; Cache the bit value for this sprite
ldy TileStoreLookup,x ; Get the tile store offset
and TileStore+TS_SPRITE_FLAG,y
sta TileStore+TS_SPRITE_FLAG,y
csfts_out rts csfts_out rts
; Run through the list of tile store offsets that this sprite was last drawn into and mark
; those tiles as dirty. The largest number of tiles that a sprite could possibly cover is 20
; (an unaligned 4x3 sprite), covering a 5x4 area of play field tiles.
;
; Y register = sprite record index
_CSFTS_Out rts
_ClearSpriteFromTileStore
; ldx _Sprites+TILE_STORE_ADDR_1,y
; beq _CSFTS_Out
; ldal TileStore+TS_SPRITE_FLAG,x ; Clear the bit in the bit field. This seems wasteful, but
; and _SpriteBitsNot,y ; there is no indexed form of TSB/TRB and caching the value in
; stal TileStore+TS_SPRITE_FLAG,x ; a direct page location, only saves 1 or 2 cycles per and costs 10.
; jsr _PushDirtyTileX
; ldx _Sprites+TILE_STORE_ADDR_2,y
; beq _CSFTS_Out
; ldal TileStore+TS_SPRITE_FLAG,x
; and _SpriteBitsNot,y
; stal TileStore+TS_SPRITE_FLAG,x
; jsr _PushDirtyTileX
; ldx _Sprites+TILE_STORE_ADDR_3,y
; beq _CSFTS_Out
; ldal TileStore+TS_SPRITE_FLAG,x
; and _SpriteBitsNot,y
; stal TileStore+TS_SPRITE_FLAG,x
; jsr _PushDirtyTileX
; ldx _Sprites+TILE_STORE_ADDR_4,y
; beq _CSFTS_Out
; ldal TileStore+TS_SPRITE_FLAG,x
; and _SpriteBitsNot,y
; stal TileStore+TS_SPRITE_FLAG,x
; jsr _PushDirtyTileX
; ldx _Sprites+TILE_STORE_ADDR_5,y
; beq :out
; ldal TileStore+TS_SPRITE_FLAG,x
; and _SpriteBitsNot,y
; stal TileStore+TS_SPRITE_FLAG,x
; jsr _PushDirtyTileX
; ldx _Sprites+TILE_STORE_ADDR_6,y
; beq :out
; ldal TileStore+TS_SPRITE_FLAG,x
; and _SpriteBitsNot,y
; stal TileStore+TS_SPRITE_FLAG,x
; jsr _PushDirtyTileX
; ldx _Sprites+TILE_STORE_ADDR_7,y
; beq :out
; ldal TileStore+TS_SPRITE_FLAG,x
; and _SpriteBitsNot,y
; stal TileStore+TS_SPRITE_FLAG,x
; jsr _PushDirtyTileX
; ldx _Sprites+TILE_STORE_ADDR_8,y
; beq :out
; ldal TileStore+TS_SPRITE_FLAG,x
; and _SpriteBitsNot,y
; stal TileStore+TS_SPRITE_FLAG,x
; jsr _PushDirtyTileX
; ldx _Sprites+TILE_STORE_ADDR_9,y
; beq :out
; ldal TileStore+TS_SPRITE_FLAG,x
; and _SpriteBitsNot,y
; stal TileStore+TS_SPRITE_FLAG,x
; jmp _PushDirtyTileX
:out rts
; This function looks at the sprite list and renders the sprite plane data into the appropriate ; This function looks at the sprite list and renders the sprite plane data into the appropriate
; tiles in the code field. There are a few phases to this routine. The assumption is that ; tiles in the code field. There are a few phases to this routine. The assumption is that
; any sprite that needs to be re-drawn has been marked as DIRTY or DAMAGED. ; any sprite that needs to be re-drawn has been marked as DIRTY or DAMAGED.
@ -437,7 +361,7 @@ phase1 dw :phase1_0
; tile store locations to the dirty tile list. ; tile store locations to the dirty tile list.
_DoPhase1 _DoPhase1
lda _Sprites+SPRITE_STATUS,y lda _Sprites+SPRITE_STATUS,y
ora forceSpriteFlag ora ForceSpriteFlag
bit #SPRITE_STATUS_MOVED+SPRITE_STATUS_REMOVED bit #SPRITE_STATUS_MOVED+SPRITE_STATUS_REMOVED
beq :no_clear beq :no_clear
jsr _ClearSpriteFromTileStore jsr _ClearSpriteFromTileStore
@ -520,7 +444,7 @@ phase2 dw :phase2_0
_DoPhase2 _DoPhase2
lda _Sprites+SPRITE_STATUS,y lda _Sprites+SPRITE_STATUS,y
beq :out ; If phase 1 marked us as empty, do nothing beq :out ; If phase 1 marked us as empty, do nothing
ora forceSpriteFlag ora ForceSpriteFlag
and #SPRITE_STATUS_ADDED+SPRITE_STATUS_MOVED+SPRITE_STATUS_UPDATED and #SPRITE_STATUS_ADDED+SPRITE_STATUS_MOVED+SPRITE_STATUS_UPDATED
beq :out beq :out
@ -580,7 +504,6 @@ RebuildSpriteArray
stx ActiveSpriteCount stx ActiveSpriteCount
rts rts
forceSpriteFlag ds 2
_RenderSprites _RenderSprites
; Check to see if any sprites have been added or removed. If so, then we regenerate the active ; Check to see if any sprites have been added or removed. If so, then we regenerate the active
@ -617,7 +540,7 @@ _RenderSprites
; occupies and old locations that it no longer covers. It's possible that just testing ; occupies and old locations that it no longer covers. It's possible that just testing
; for equality would be the easiest win to know when we can skip everything. ; for equality would be the easiest win to know when we can skip everything.
stz forceSpriteFlag stz ForceSpriteFlag
lda StartX lda StartX
cmp OldStartX cmp OldStartX
bne :force_update bne :force_update
@ -628,7 +551,7 @@ _RenderSprites
:force_update :force_update
lda #SPRITE_STATUS_MOVED lda #SPRITE_STATUS_MOVED
sta forceSpriteFlag sta ForceSpriteFlag
:no_change :no_change
; Dispatch to the first phase of rendering the sprites. By pre-building the list, we know exactly ; Dispatch to the first phase of rendering the sprites. By pre-building the list, we know exactly
@ -636,12 +559,12 @@ _RenderSprites
; of an iterating variable ; of an iterating variable
ldx ActiveSpriteCount ldx ActiveSpriteCount
jmp (phase1,x) ; jmp (phase1,x)
phase1_rtn phase1_rtn
; Dispatch to the second phase of rendering the sprites. ; Dispatch to the second phase of rendering the sprites.
ldx ActiveSpriteCount ldx ActiveSpriteCount
jmp (phase2,x) ; jmp (phase2,x)
phase2_rtn phase2_rtn
rts rts

View File

@ -744,7 +744,3 @@ skip
;_SpriteWidth dw 4,8,4,8 ;_SpriteWidth dw 4,8,4,8
;_SpriteWidthMinus1 dw 3,7,3,7 ;_SpriteWidthMinus1 dw 3,7,3,7
;_SpriteCols dw 1,2,1,2 ;_SpriteCols dw 1,2,1,2
; Convert sprite index to a bit position
_SpriteBits dw $0001,$0002,$0004,$0008,$0010,$0020,$0040,$0080,$0100,$0200,$0400,$0800,$1000,$2000,$4000,$8000
_SpriteBitsNot dw $FFFE,$FFFD,$FFFB,$FFF7,$FFEF,$FFDF,$FFBF,$FF7F,$FEFF,$FDFF,$FBFF,$F7FF,$EFFF,$DFFF,$BFFF,$7FFF

View File

@ -223,8 +223,6 @@ xPos equ FirstParam+2
lda xPos,s lda xPos,s
jsr _SetBG0XPos jsr _SetBG0XPos
lda yPos,s lda yPos,s
bpl *+5
lda #0
jsr _SetBG0YPos jsr _SetBG0YPos
_TSExit #0;#4 _TSExit #0;#4