iigs-game-engine/src/blitter/Tiles10000.s
Lucas Scharenbroich 8bb17895a9 Rough outline of streamlined sprite subsystem
* Split the creation of the sprite stamps from adding the
  sprites themselves.  This allows for 48 stamps that can
  be pre-rendered and quickly reassigned to sprites for
  animations.

* Inlined all calls to PushDirtyTile.  This both removed
  significant overhead from calling the small function and,
  since almost all callers we checking multiple tiles, we
  were able to avoid incrementing the count each time and
  just add a single incrments at the end.

* Switched from recording each tile that a sprite intersects
  with each from to only recording the top-left tile and the
  overlap size.  This reduced overhead for larger sprites
  and removed the needs for an end-of-list marker.

* Much more aggressive caching of Sprite and Tile Store
  values in order to streamline the inner tile dispatch
  routines.

* Moving TileStore and Sprites (and other supporting
  data structures) into a separate data bank.  Needed just
  for size purposes and provide micro-optimizations by
  opening up the use of abs,y addressing modes.

* Revamped multi-sprite rendering code to avoid the need to
  copy any masks and all stacked sprites can be drawn
  via a sequence of and [addrX],y; ora (addrX),y where
  addrX is set once per tile.

* General streamlining to reduct overhead. This work was
  focused on removing as much per-tile overhead as possible.
2022-04-20 07:43:16 -05:00

214 lines
7.8 KiB
ArmAsm

; _TBSolidSpriteTile
;
; Renders solid tiles with sprites layered on top of the tile data. Because we need to combine
; data from the sprite plane, tile data and write to the code field (which are all in different banks),
; there is no way to do everything inline, so a composite tile is created on the fly and written to
; a direct page buffer. This direct page buffer is then used to render the tile.
_TBSolidSpriteTile_00
_TBSolidSpriteTile_0H
jsr _TBCopyTileDataToCBuff ; Copy the tile into the compositing buffer (using correct x-register)
jsr _TBApplySpriteData ; Overlay the data from the sprite plane (and copy into the code field)
jmp _TBFillPEAOpcode ; Fill in the code field opcodes
_TBSolidSpriteTile_V0
_TBSolidSpriteTile_VH
jsr _TBCopyTileDataToCBuffV
jsr _TBApplySpriteData
jmp _TBFillPEAOpcode
; Fast variation that does not need to set the opcode
_TBFastSpriteTile_00
_TBFastSpriteTile_0H
jsr _TBCopyTileDataToCBuff ; Copy the tile into the compositing buffer
jmp _TBApplySpriteData ; Overlay the data form the sprite plane (and copy into the code field)
_TBFastSpriteTile_V0
_TBFastSpriteTile_VH
jsr _TBCopyTileDataToCBuffV
jmp _TBApplySpriteData
; Need to update the X-register before calling this
_TBApplySpriteData
ldx _SPR_X_REG ; set to the unaligned tile block address in the sprite plane
]line equ 0
lup 8
lda blttmp+{]line*4}
andl spritemask+{]line*SPRITE_PLANE_SPAN},x
oral spritedata+{]line*SPRITE_PLANE_SPAN},x
sta: $0004+{]line*$1000},y
lda blttmp+{]line*4}+2
andl spritemask+{]line*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{]line*SPRITE_PLANE_SPAN}+2,x
sta: $0001+{]line*$1000},y
]line equ ]line+1
--^
rts
_TBApplySpriteDataOne
ldx spriteIdx
]line equ 0
lup 8
lda blttmp+{]line*4}
andl spritemask+{]line*SPRITE_PLANE_SPAN},x
oral spritedata+{]line*SPRITE_PLANE_SPAN},x
sta: $0004+{]line*$1000},y
lda blttmp+{]line*4}+2
andl spritemask+{]line*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{]line*SPRITE_PLANE_SPAN}+2,x
sta: $0001+{]line*$1000},y
]line equ ]line+1
--^
rts
_TBApplySpriteDataTwo
]line equ 0
lup 8
lda blttmp+{]line*4}
ldx spriteIdx+2
andl spritemask+{]line*SPRITE_PLANE_SPAN},x
oral spritedata+{]line*SPRITE_PLANE_SPAN},x
ldx spriteIdx
andl spritemask+{]line*SPRITE_PLANE_SPAN},x
oral spritedata+{]line*SPRITE_PLANE_SPAN},x
sta: $0004+{]line*$1000},y
lda blttmp+{]line*4}+2
ldx spriteIdx+2
andl spritemask+{]line*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{]line*SPRITE_PLANE_SPAN}+2,x
ldx spriteIdx
andl spritemask+{]line*SPRITE_PLANE_SPAN}+2,x
oral spritedata+{]line*SPRITE_PLANE_SPAN}+2,x
sta: $0001+{]line*$1000},y
]line equ ]line+1
--^
rts
; Copy tile data into the direct page compositing buffer. The main reason to do this in full passes is
; because we can avoid needing to use both the X and Y registers during the compositing process and
; reserve Y to hold the code field address.
;
; Also, we can get away with not setting the bank register, this is a wash in terms of speed, but results
; in simpler, more composable subroutines
_TBCopyTileDataToCBuff
]line equ 0
lup 8
ldal tiledata+{]line*4},x
sta blttmp+{]line*4}
ldal tiledata+{]line*4}+2,x
sta blttmp+{]line*4}+2
]line equ ]line+1
--^
rts
;_TBCopyTileDataToCBuffH
;]line equ 0
; lup 8
; ldal tiledata+{]line*4}+64,x
; sta blttmp+{]line*4}
;
; ldal tiledata+{]line*4}+64+2,x
; sta blttmp+{]line*4}+2
;]line equ ]line+1
; --^
; rts
_TBCopyTileDataToCBuffV
]src equ 7
]dest equ 0
lup 8
ldal tiledata+{]src*4},x
sta blttmp+{]dest*4}
ldal tiledata+{]src*4}+2,x
sta blttmp+{]dest*4}+2
]src equ ]src-1
]dest equ ]dest+1
--^
rts
;_TBCopyTileDataToCBuffVH
;]src equ 7
;]dest equ 0
; lup 8
; ldal tiledata+{]src*4}+64,x
; sta blttmp+{]dest*4}
;
; ldal tiledata+{]src*4}+64+2,x
; sta blttmp+{]dest*4}+2
;]src equ ]src-1
;]dest equ ]dest+1
; --^
; rts
; Copy tile mask data into the direct page compositing buffer.
_TBCopyTileMaskToCBuff
]line equ 0
lup 8
ldal tiledata+{]line*4}+32,x
sta blttmp+{]line*4}+32
ldal tiledata+{]line*4}+32+2,x
sta blttmp+{]line*4}+32+2
]line equ ]line+1
--^
rts
;_TBCopyTileMaskToCBuffH
;]line equ 0
; lup 8
; ldal tiledata+{]line*4}+32+64,x
; sta blttmp+{]line*4}+32
;
; ldal tiledata+{]line*4}+32+64+2,x
; sta blttmp+{]line*4}+32+2
;]line equ ]line+1
; --^
; rts
_TBCopyTileMaskToCBuffV
]src equ 7
]dest equ 0
lup 8
ldal tiledata+{]src*4}+32,x
sta blttmp+{]dest*4}+32
ldal tiledata+{]src*4}+32+2,x
sta blttmp+{]dest*4}+32+2
]src equ ]src-1
]dest equ ]dest+1
--^
rts
;_TBCopyTileMaskToCBuffVH
;]src equ 7
;]dest equ 0
; lup 8
; ldal tiledata+{]src*4}+32+64,x
; sta blttmp+{]dest*4}+32
;
; ldal tiledata+{]src*4}+32+64+2,x
; sta blttmp+{]dest*4}+32+2
;]src equ ]src-1
;]dest equ ]dest+1
; --^
; rts
; Copy just the data into the code field from the composite buffer
_TBSolidComposite
]line equ 0
lup 8
lda blttmp+{]line*4}
sta: $0004+{]line*$1000},y
lda blttmp+{]line*4}+2
sta: $0001+{]line*$1000},y
]line equ ]line+1
--^
rts