Mostly working PPU integration with GTE rendering pipeline

This commit is contained in:
Lucas Scharenbroich 2023-05-21 17:50:05 -05:00
parent 64f7106143
commit d3a7eecc6e
6 changed files with 516 additions and 52 deletions

View File

@ -37,6 +37,9 @@ Tmp3 equ 246
Tmp4 equ 248
Tmp5 equ 250
FTblPtr equ 224
FTblTmp equ 228
phk
plb
sta MyUserId ; GS/OS passes the memory manager user ID for the application into the program
@ -1080,6 +1083,6 @@ nmiTask
ds \,$00 ; pad to the next page boundary
PPU_MEM
CHR_ROM put chr2.s ; 8K of CHR-ROM at PPU memory $0000 - $2000
PPU_NT ds $2000 ; Nametable memory from $2000 - $3000, $3F00 - $3F14 is palette RAM
PPU_OAM ds 256 ; 256 bytes of separate OAM RAM
PPU_NT ds $2000 ; Nametable memory from $2000 - $3000, $3F00 - $3F14 is palette RAM
PPU_OAM ds 256 ; 256 bytes of separate OAM RAM

View File

@ -2,6 +2,29 @@
;
; Any read/write to the PPU registers in the ROM is intercepted and passed here.
; const8 <value> -- emit eight bytes that all hold the same value (]1).
const8 mac
db ]1,]1,]1,]1,]1,]1,]1,]1
<<<
; const32 <value> -- emit 32 bytes as four runs of eight bytes holding
; value, value+1, value+2 and value+3. Each run is produced by const8.
const32 mac
const8 ]1
const8 ]1+1
const8 ]1+2
const8 ]1+3
<<<
; rep8 <byte list> -- emit the same `db ]1` line eight times in a row.
; The y2low / y2high tables invoke it with a comma-separated list of eight bytes.
; NOTE(review): this relies on the whole comma-separated list arriving as the single
; argument ]1 -- confirm against the assembler's macro argument separator rules.
rep8 mac
db ]1
db ]1
db ]1
db ]1
db ]1
db ]1
db ]1
db ]1
<<<
mx %11
dw $a5a5 ; marker to find in memory
ppuaddr ds 2 ; 16-bit ppu address
@ -495,34 +518,409 @@ PPUDMA_WRITE ENT
y_offset equ 16
x_offset equ 16
drawOAMSprites
:tmp equ 238
; Scan the OAM memory and copy the values of the sprites that need to be drawn. There are two reasons to do this
;
; 1. Freeze the OAM memory at this instant so that the NES ISR can keep running without changing values
; 2. We have to scan this list twice -- once to build up the shadow list and once to actually render the sprites
; Storage for the filtered sprite list built by scanOAMSprites.
;
; spriteCount previously used `ds 0`, which reserves no storage: the label then aliased the
; pad byte itself, so there was no padding at all and a 16-bit access would have touched
; whatever follows.  It now owns one real byte, followed by the pad byte.
OAM_COPY    ds    256                   ; Copy of the OAM entries that will be drawn, 4 bytes per sprite
spriteCount ds    1                     ; Number of bytes used in OAM_COPY (sprite count * 4)
            db    0                     ; Pad in case we want to access spriteCount using 16-bit instructions
; 248 is reserved for the blitter
mx %00
; scanOAMSprites
;
; Snapshot the sprites that need drawing.  Walks PPU_OAM from entry 1 through entry 63
; (entry 0 is never considered) and appends every 4-byte record that survives the three
; filters below to OAM_COPY.
;
; Out:  spriteCount = number of bytes appended (sprites kept * 4)
;       returns with 16-bit accumulator and index registers
scanOAMSprites
            sep   #$30                  ; 8-bit registers: X wrapping back to zero ends the walk
            ldy   #0                    ; Y = write offset into OAM_COPY
            ldx   #4                    ; X = read offset into PPU_OAM; always skip sprite 0

:check
            lda   PPU_OAM,x             ; byte 0: Y-coordinate
            cmp   #200+y_offset-9
            bcs   :advance              ; at or past the cutoff line -- leave it out

            lda   PPU_OAM+1,x           ; byte 1: tile; $FC is an empty tile, don't draw it
            cmp   #$FC
            beq   :advance

            lda   PPU_OAM+3,x           ; byte 3: X-coordinate; off the edge is left out, too
            cmp   #241
            bcs   :advance

            rep   #$20                  ; copy the record as two 16-bit words
            lda   PPU_OAM,x
            sta   OAM_COPY,y
            lda   PPU_OAM+2,x
            sta   OAM_COPY+2,y
            sep   #$20

            iny                         ; next free slot in OAM_COPY
            iny
            iny
            iny

:advance
            inx                         ; next OAM entry
            inx
            inx
            inx
            bne   :check                ; X == 0 again after entry 63

            sty   spriteCount           ; Count * 4
            rep   #$30
            rts
; The screen is 200 lines tall. It is worth being exact when building the list because one extra
; draw + shadow sequence takes at least 1,000 cycles.
shadowBitmap ds 32 ; Provide enough space for the full ppu range (240 lines) + 16 since the y coordinate can be off-screen
; A representation of the list as [top, bot) pairs
shadowListCount dw 0 ; Pad for 16-bit comparisons
shadowListTop ds 64
shadowListBot ds 64
mx %00
; buildShadowBitmap
;
; Rebuild shadowBitmap from the sprites held in OAM_COPY.  For each sprite, the two
; bytes selected by y2idx receive the y2low / y2high bit patterns for (Y-coordinate + 1).
;
; In:   16-bit registers; spriteCount / OAM_COPY as left by scanOAMSprites
; Out:  shadowBitmap updated; returns with 16-bit registers
buildShadowBitmap

; Clear the bitmap with fifteen 16-bit stores (bytes 0..29)
]n          equ   0
            lup   15
            stz   shadowBitmap+]n
]n          =     ]n+2
            --^

; OR in the bits for every sprite in the list
            sep   #$30
            ldx   #0
            cpx   spriteCount
            beq   :done                 ; empty list, nothing to mark

:mark
            phx                         ; X is reused as the bitmap index below

            ldy   OAM_COPY,x
            iny                         ; This is the y-coordinate of the top of the sprite

            ldx   y2idx,y               ; X = byte of shadowBitmap that holds this line

            lda   y2low,y               ; bits that fall in the first byte
            ora   shadowBitmap,x
            sta   shadowBitmap,x

            lda   y2high,y              ; bits that spill into the second byte
            ora   shadowBitmap+1,x
            sta   shadowBitmap+1,x

            plx                         ; back to the sprite list offset
            inx
            inx
            inx
            inx
            cpx   spriteCount
            bcc   :mark

:done
            rep   #$30
            rts
; y2idx: 256-byte table, eight identical entries per value, counting $00..$1F.
; buildShadowBitmap indexes it with a y-coordinate to get the shadowBitmap byte
; offset for that line (y / 8).
y2idx const32 $00
const32 $04
const32 $08
const32 $0C ; 128 bytes
const32 $10
const32 $14
const32 $18
const32 $1C
; Repeating pattern of 8 consecutive 1 bits
;
; y2low / y2high are indexed by y-coordinate in buildShadowBitmap.  y2low is ORed into
; the shadowBitmap byte selected by y2idx and y2high into the byte after it, so each
; pair (e.g. $3F / $C0) marks eight lines split across two bitmap bytes.
; The eight-byte pattern is intended to repeat for every group of eight lines.
y2low rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
y2high rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
; 32 entries (index * 8) used to convert a byte step in the shadow bitmap to a scanline.
; shadowBitmapToList reads it as mul8-2,x so that bitmap byte 2 maps to scanline 0.
mul8 db $00,$08,$10,$18,$20,$28,$30,$38
db $40,$48,$50,$58,$60,$68,$70,$78
db $80,$88,$90,$98,$A0,$A8,$B0,$B8
db $C0,$C8,$D0,$D8,$E0,$E8,$F0,$F8
; Given a bit pattern, a LUT that counts the bits before the first set bit (MSB -> LSB), e.g. $0F = 4, $3F = 2
; 256 entries; entry 0 (no bit set) holds 0.  shadowBitmapToList also indexes it with an
; inverted bitmap byte, which gives the position of the first clear bit instead.
offset db 0,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 ; 0, 1, 2, 4, 8, 16
db 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 ; 32
db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
; Scan the bitmap list and call BltRange on the ranges
mx %00
; drawShadowList
;
; For every entry recorded in shadowListTop / shadowListBot, call function 0 of the
; table at FTblPtr (BltRange) through LngJmp with X = top and Y = bottom.
; A top of 200 or more, or a bottom of 201 or more, stops on a BRK.
;
; Expects 16-bit registers.
drawShadowList
            ldx   #0
            cpx   shadowListCount
            beq   :done                 ; nothing in the list

:each
            phx                         ; keep the list index across the call

            lda   shadowListBot,x       ; entries are single bytes, so drop the high byte
            and   #$00FF
            tay
            cpy   #201
            bcc   :bot_ok
            brk   $cc
:bot_ok
            lda   shadowListTop,x
            and   #$00FF
            tax
            cpx   #200
            bcc   :top_ok
            brk   $dd
:top_ok
            lda   #0                    ; Invoke the BltRange function
            jsl   LngJmp

            plx
            inx
            cpx   shadowListCount
            bcc   :each

:done
            rts
; Alternate between BltRange and PEISlam to expose the screen
;
; Walks the shadow list in order.  For each entry, function 0 (BltRange) is called for
; the lines between the end of the previous entry and the top of this one, then
; function 1 (PEISlam) is called for the entry itself.  A final function 0 call covers
; everything from the end of the last entry to line 200.
;
; Out-of-range or inverted entries stop on a BRK.  Expects 16-bit registers.
; NOTE(review): when an entry's top equals the previous end (e.g. top = 0 for the
; first entry) function 0 is called with X == Y -- confirm BltRange accepts that.
exposeShadowList
:prev_end   equ   Tmp0
:range_top  equ   Tmp1
:range_bot  equ   Tmp2

            ldx   #0
            stx   :prev_end             ; nothing has been exposed yet
            cpx   shadowListCount
            beq   :finish

:each
            phx                         ; keep the list index across the calls

            lda   shadowListTop,x
            and   #$00FF
            sta   :range_top
            cmp   #200
            bcc   :top_ok
            brk   $44
:top_ok
            lda   shadowListBot,x
            and   #$00FF
            sta   :range_bot
            cmp   #201
            bcc   :bot_ok
            brk   $66
:bot_ok
            cmp   :range_top            ; bottom must not be above top
            bcs   :order_ok
            brk   $55
:order_ok
            ldx   :prev_end
            ldy   :range_top
            lda   #0
            jsl   LngJmp                ; Draw the background up to this range

            ldx   :range_top
            ldy   :range_bot
            sty   :prev_end             ; This is where we ended
            lda   #1
            jsl   LngJmp                ; Expose the already-drawn sprites

            plx
            inx
            cpx   shadowListCount
            bcc   :each

:finish
            ldx   :prev_end             ; Expose the final part
            ldy   #200
            lda   #0
            jsl   LngJmp
            rts
; This routine needs to adjust the y-coordinates based on the offset of the GTE playfield within
; the PPU RAM
;
; Converts shadowBitmap into a list of ranges: shadowListTop[i] / shadowListBot[i] for
; i < shadowListCount.  The scan is a two-state loop: :zero_loop looks for the first byte
; with any bit set and records the range top; :one_loop looks for the first byte with any
; bit clear and records the range bottom.  Scanlines are computed from mul8-2,x, so bitmap
; byte 2 corresponds to line 0.
;
; Runs in 8-bit mode internally and returns with 16-bit registers.
; Stops on BRK $13 if 64 or more ranges were produced.
shadowBitmapToList
:top equ Tmp0
:bottom equ Tmp2
; (:bottom is defined but not referenced in this routine)
sep #$30
ldx #2 ; Start at the third row (y_offset = 16) and walk the bitmap through byte 27
lda #0
sta shadowListCount ; zero out the shadow list count
; This loop is called when we are not tracking a sprite range
:zero_loop
ldy shadowBitmap,x
beq :zero_next
lda mul8-2,x ; This is the scanline we're on (offset by the starting byte)
clc
adc offset,y ; This is the first line defined by the bit pattern
sta :top
bra :one_next
:zero_next
inx
cpx #28 ; End at byte 27
bcc :zero_loop
bra :exit ; ended while not tracking a sprite, so exit the function
:one_loop
lda shadowBitmap,x ; if the next byte is all sprite, just continue
eor #$FF
beq :one_next
tay ; Use the inverted bitfield in order to re-use the same lookup table
lda mul8-2,x
clc
adc offset,y
; A = line of the first clear bit in this byte = exclusive bottom of the range
ldy shadowListCount
sta shadowListBot,y
lda :top
sta shadowListTop,y
iny
sty shadowListCount
; NOTE(review): the rest of this byte is not re-examined after the range is closed, so
; set bits below the first clear bit do not open a new range until the next byte -- confirm
; this is acceptable.
bra :zero_next
:one_next
inx
cpx #28
bcc :one_loop
; If we end while tracking a sprite, add to the list as the last item
ldx shadowListCount
lda :top
sta shadowListTop,x
lda #200
sta shadowListBot,x
inx
stx shadowListCount
:exit
rep #$30
; Sanity check: the list arrays are 64 bytes each
lda shadowListCount
cmp #64
bcc *+4
brk $13
rts
; Helper to bounce into the function in the FTblPtr. See IIgs TN #90
;
; In:   A    = function index (table entries are 4 bytes each)
;       X, Y = passed through to the target; Y is parked in FTblTmp while it serves
;              as the table index and is reloaded before the jump
;       FTblPtr = long pointer to the function table
; Out:  does not return here.  The RTL "returns" into the target function, whose own
;       RTL then goes back to whoever did the JSL LngJmp.  A is not preserved.
;
; Expects 16-bit registers.
;
; RTL pulls exactly three bytes: (addr-1) low, (addr-1) high, bank.  After the PHA
; only ONE placeholder byte may be pushed before the STA 1,S.  The extra PHP that was
; here shifted the frame by a byte, so RTL took the stale address-high byte as the bank
; and left one byte behind on the stack.
LngJmp
            sty   FTblTmp               ; save Y, it is needed as the table index
            asl
            asl                         ; index * 4
            tay
            iny
            lda   [FTblPtr],y           ; A = address bits 8-23 (low byte: addr high, high byte: bank)
            pha                         ; the bank byte lands in the slot RTL pulls last
            dey
            lda   [FTblPtr],y           ; A = address bits 0-15
            dec                         ; RTL adds one to the address it pulls
            phb                         ; reserve a single byte; its value is overwritten below
            sta   1,s                   ; replace placeholder + stale addr-high byte with addr-1
            ldy   FTblTmp               ; Restore the y register
            rtl
; Callback entrypoint from the GTE renderer
;
; In:   A   = phase (0 = phase 1, anything else = phase 2)
;       X:Y = long pointer to the renderer's function table (X = bank, Y = offset);
;             stored in FTblPtr for LngJmp
; Out:  returns with RTL; the caller's data bank and direct page are restored
;
; Expects 16-bit registers.
drawOAMSprites
            phb
            phd
            phk
            plb                         ; data bank = this code's bank

            pha                         ; keep the phase while A is used to set the direct page
            lda   DPSave
            tcd

; Save the pointer to the function table

            sty   FTblPtr
            stx   FTblPtr+2
            pla

; Check what phase we're in
;
; Phase 1: A = 0
; Phase 2: A = 1

            cmp   #0
            bne   :phase2

; This is phase 1.  We will build the sprite list and draw the background in the areas covered by
; sprites.  This phase draws the sprites, too

; We need to "freeze" the OAM values, otherwise they can change while we build the rendering
; pipeline.  Interrupts are masked BEFORE nmiCount is sampled: sampling first left a window in
; which an interrupt could bump the counter and trip the check below even though the scan
; itself was never interrupted.

            sei
            ldal  nmiCount
            pha
            jsr   scanOAMSprites        ; Filter out any sprites that don't need to be drawn
            pla
            cmpl  nmiCount
            beq   :frozen
            brk   $1F                   ; Should not have serviced the VBL interrupt here....
:frozen
            cli                         ; NOTE(review): re-enables interrupts unconditionally -- confirm the
                                        ; callback is never entered with interrupts already masked

            jsr   buildShadowBitmap     ; Run through and quickly create a bitmap of lines with sprites
            jsr   shadowBitmapToList    ; Scan the bitmap and create (top, bottom) pairs of ranges
            jsr   drawShadowList        ; Draw the background lines that have sprites on them
            jsr   drawSprites           ; Draw the sprites on top of the lines they occupy
            bra   :exit

; In Phase 2 we scan the shadow list and alternately blit the background in empty areas and
; PEI slam the sprite regions

:phase2
            jsr   exposeShadowList      ; Show everything on the SHR screen

; Return from the callback

:exit
            pld
            plb
            rtl
drawSprites
:tmp equ Tmp0
sep #$30 ; 8-bit cpu
ldx #4 ; Ok to always skip sprite 0
:oam_loop
lda PPU_OAM+3,x ; remove this test once we can clip sprites
cmp #241
bcs :hidden
; Run through the copy of the OAM memory
lda PPU_OAM+1,x ; $FC is an empty tile, don't draw it
cmp #$FC
beq :hidden
ldx #0
cpx spriteCount
bne oam_loop
rep #$30
rts
lda PPU_OAM,x ; Y-coordinate
cmp #200+y_offset-9
bcs :hidden
mx %11
oam_loop
phx ; Save x
phx
lda OAM_COPY,x ; Y-coordinate
inc ; Compensate for PPU delayed scanline
rep #$30
and #$00FF
asl
@ -539,26 +937,26 @@ drawOAMSprites
adc #$2000-{y_offset*160}+x_offset
sta :tmp
lda PPU_OAM+3,x
lda OAM_COPY+3,x
lsr
and #$007F
clc
adc :tmp
tay
lda PPU_OAM+2,x
lda OAM_COPY+2,x
pha
bit #$0040 ; horizontal flip
bne :hflip
lda PPU_OAM,x ; Load the tile index into the high byte (x256)
lda OAM_COPY,x ; Load the tile index into the high byte (x256)
and #$FF00
lsr ; multiple by 128
tax
bra :noflip
:hflip
lda PPU_OAM,x ; Load the tile index into the high byte (x256)
lda OAM_COPY,x ; Load the tile index into the high byte (x256)
and #$FF00
lsr ; multiple by 128
adc #64 ; horizontal flip
@ -567,22 +965,20 @@ drawOAMSprites
:noflip
pla
asl
; and #$0080 ; Set the vflip bit
and #$0106 ; Set the vflip bit and palette select bits
drawTilePatch
jsl $000000 ; Draw the tile on the graphics screen
sep #$30
plx
plx ; Restore the counter
inx
inx
inx
inx
cpx spriteCount
bcc oam_loop
:hidden
inx
inx
inx
inx
bne :oam_loop
rep #$30
rts
plp
plb
rtl

View File

@ -153,6 +153,11 @@ _DoOverlay
:disp jsl $000000
rts
; Callback structure with pointers to internal rendering functions
; Entry 0 is BltRange and entry 1 is PEISlam, each stored as a long address (adrl).
; _RenderNES hands the address of this block to the external sprite renderer
; (bank in X, offset in Y).
ExtFuncBlock
adrl BltRange
adrl PEISlam
; Special NES renderer that externalizes the sprite rendering in order to exceed the internal limit of 16 sprites
_RenderNES
jsr _ApplyBG0YPos
@ -172,7 +177,6 @@ _RenderNES
:no_tile
jsr _ApplyTiles ; This function actually draws the new tiles into the code field
; jsr _ApplyBG0XPos ; Patch the code field instructions with exit BRA opcode
stz tmp1 ; virt_line_x2
lda #16*2
@ -194,22 +198,45 @@ _RenderNES
lda tmp4
stal nesBottomOffset
ldx #0 ; Blit the full virtual buffer to the screen
ldy ScreenHeight
jsr _BltRange
; This is a tricky part. The NES does not keep sprites sorted, so we need an alternative way to figure out
; which lines to shadow and which ones not to. Our compromise is to build a bitmap of lines that the sprite
; occupy and then scan through that quickly.
;
; This is handled by the callback in two phases. We pass pointers to the internal function the callback needs
; access to. If there is no function defined, do nothing
lda ExtSpriteRenderer
ora ExtSpriteRenderer+2
beq :no_sprite
beq :no_render
lda ExtSpriteRenderer
stal :patch+1
stal :patch1+1
stal :patch2+1
lda ExtSpriteRenderer+1
stal :patch+2
:patch jsl $000000
stal :patch1+2
stal :patch2+2
:no_sprite
; Start the two-phase rendering process. First turn off shading and invoke the callback to
; draw sprite regions
jsr _ShadowOff
lda #0 ; Signal we're in phase 1 (shadowing off)
ldx #^ExtFuncBlock
ldy #ExtFuncBlock
:patch1 jsl $000000
; Now perform the second phase which renders the whole screen and exposes the sprites that were
; drawn in the first phase
jsr _ShadowOn
lda #1 ; Signal we're in phase 2 (shadowing on)
ldx #^ExtFuncBlock
ldy #ExtFuncBlock
:patch2 jsl $000000
:no_render
stz tmp1 ; :virt_line_x2
lda #16*2
sta tmp2 ; :lines_left_x2
@ -402,7 +429,6 @@ _DrawFinalPass
ldy _Sprites+SPRITE_CLIP_TOP,x ; PEI Slam to the top of the overlay (:bottom is greater than this value)
ldx :cursor
sty :cursor
; brk $44
jsr _PEISlam
lda 3,s ; Retrieve the sprite index
tax

View File

@ -144,6 +144,7 @@ zpToUse = userId+4
lda zpToUse,s ; Get the direct page address
phd ; Save the current direct page
tcd ; Set to our working direct page space
stal tool_direct_page ; Stash a copy in memory
txa
and #$00FF ; Get just the tool number

View File

@ -85,6 +85,7 @@ _BltRange
lda BlitterDP ; Set the direct page to the blitter data
tcd
php ; save the current processor flags
sei ; disable interrupts
_R0W1
tsc ; save the stack pointer
@ -95,7 +96,7 @@ blt_entry jml $000000 ; Jump into the blitter code $XX/YY00
blt_return _R0W0
stk_save lda #0000 ; load the stack
tcs
cli ; re-enable interrupts
plp ; re-enable interrupts (maybe, if interrupts disabled when we are called, they are not re-endabled)
pld ; restore the direct page
sep #$20
@ -106,3 +107,16 @@ stk_save lda #0000 ; load the stack
plb ; restore the bank
rts
; External entry point. Can be called directly from another bank
;
; Long-call (JSL / RTL) wrapper around _BltRange.  Saves the caller's direct page and
; data bank, switches to the direct page stashed in tool_direct_page, calls
; _SetDataBank and then _BltRange, and restores both before returning.
; The wrapper itself leaves X and Y untouched on the way in; A is overwritten.
BltRange
            phd                         ; caller's direct page
            phb                         ; caller's data bank

            ldal  tool_direct_page
            tcd                         ; switch to the stashed direct page
            jsr   _SetDataBank          ; only affects accumulator

            jsr   _BltRange

            plb
            pld
            rtl

View File

@ -14,17 +14,19 @@
; Y = last line (exclusive), valid range >X up to 200
_PEISlam
cpx #200
bcc *+3
rts
bcc *+4
brk $14
; rts
cpy #201
bcc *+3
rts
bcc *+4
brk $15
; rts
txa
tya ; x must be less than y
stal :screen_width_1
tya
txa
cmpl :screen_width_1
bcs *+3
bcc *+3
rts
@ -74,6 +76,10 @@ _PEISlam
adcl :screen_width_1
tcs
cmp #$9D00
bcc *+4
brk $85 ; Kill if stack it out of range
dey ; decrement the total counter, if zero then we're done
beq :exit
@ -115,3 +121,21 @@ _PEISlam
:stk_save ds 2
:screen_width_1 ds 2
; A stashed memory location just in case we need it. This is filled in the GTEStartUp()
tool_direct_page ds 2
; External entry point. Can be called directly from another bank
;
; Long-call (JSL / RTL) wrapper around _PEISlam.  Saves the caller's direct page and
; data bank, switches to the direct page stashed in tool_direct_page, calls
; _SetDataBank and then _PEISlam, and restores both before returning.
; The wrapper itself leaves X and Y untouched on the way in; A is overwritten.
PEISlam
            phd                         ; caller's direct page
            phb                         ; caller's data bank

            ldal  tool_direct_page
            tcd                         ; switch to the stashed direct page
            jsr   _SetDataBank          ; only affects accumulator

            jsr   _PEISlam

            plb
            pld
            rtl