Mostly working PPU integration with GTE rendering pipeline

This commit is contained in:
Lucas Scharenbroich 2023-05-21 17:50:05 -05:00
parent 64f7106143
commit d3a7eecc6e
6 changed files with 516 additions and 52 deletions

View File

@ -37,6 +37,9 @@ Tmp3 equ 246
Tmp4 equ 248 Tmp4 equ 248
Tmp5 equ 250 Tmp5 equ 250
FTblPtr equ 224
FTblTmp equ 228
phk phk
plb plb
sta MyUserId ; GS/OS passes the memory manager user ID for the application into the program sta MyUserId ; GS/OS passes the memory manager user ID for the application into the program
@ -1080,6 +1083,6 @@ nmiTask
ds \,$00 ; pad to the next page boundary ds \,$00 ; pad to the next page boundary
CHR_ROM put chr2.s ; 8K of CHR-ROM at PPU memory $0000 - $2000 CHR_ROM put chr2.s ; 8K of CHR-ROM at PPU memory $0000 - $2000
PPU_NT ds $2000 ; Nametable memory from $2000 - $3000, $3F00 - $3F14 is palette RAM PPU_NT ds $2000 ; Nametable memory from $2000 - $3000, $3F00 - $3F14 is palette RAM
PPU_OAM ds 256 ; 256 bytes of separate OAM RAM PPU_OAM ds 256 ; 256 bytes of separate OAM RAM

View File

@ -2,6 +2,29 @@
; ;
; Any read/write to the PPU registers in the ROM is intercepted and passed here. ; Any read/write to the PPU registers in the ROM is intercepted and passed here.
; const8 -- emit the argument ]1 as eight consecutive bytes
const8 mac
db ]1,]1,]1,]1,]1,]1,]1,]1
; const32 -- emit 32 bytes: eight copies each of ]1, ]1+1, ]1+2 and ]1+3
const32 mac
const8 ]1
const8 ]1+1
const8 ]1+2
const8 ]1+3
; rep8 -- repeat the `db ]1` directive eight times in a row
; NOTE(review): the tables built with rep8 pass a comma-separated byte list; this presumably
; relies on the assembler treating the whole list as the single argument ]1 -- confirm.
rep8 mac
db ]1
db ]1
db ]1
db ]1
db ]1
db ]1
db ]1
db ]1
mx %11 mx %11
dw $a5a5 ; marker to find in memory dw $a5a5 ; marker to find in memory
ppuaddr ds 2 ; 16-bit ppu address ppuaddr ds 2 ; 16-bit ppu address
@ -495,34 +518,409 @@ PPUDMA_WRITE ENT
y_offset equ 16 y_offset equ 16
x_offset equ 16 x_offset equ 16
drawOAMSprites ; Scan the OAM memory and copy the values of the sprites that need to be drawn. There are two reasons to do this
:tmp equ 238 ;
; 1. Freeze the OAM memory at this instant so that the NES ISR can keep running without changing values
; 2. We have to scan this list twice -- once to build up the shadow list and once to actually render the sprites
; Frozen copy of the OAM entries that passed the visibility tests, 4 bytes per sprite
OAM_COPY ds 256
; Number of bytes used in OAM_COPY (sprite count * 4). This must reserve a real byte: with
; `ds 0` the label aliased the pad byte below, so no pad existed and a 16-bit access would
; have picked up whatever is assembled after it.
spriteCount ds 1
db 0 ; Pad in case we want to access spriteCount using 16-bit instructions
; 248 is reserved for the blitter mx %00
sep #$30
ldx #4 ; Always skip sprite 0
ldy #0
lda PPU_OAM,x ; Y-coordinate
cmp #200+y_offset-9
bcs :skip
lda PPU_OAM+1,x ; $FC is an empty tile, don't draw it
cmp #$FC
beq :skip
lda PPU_OAM+3,x ; If X-coordinate is off the edge skip it, too.
cmp #241
bcs :skip
rep #$20
lda PPU_OAM,x
sta OAM_COPY,y
lda PPU_OAM+2,x
sta OAM_COPY+2,y
sep #$20
bne :loop
sty spriteCount ; Count * 4
rep #$30
; Screen is 200 lines tall. It's worth being exact when building the list because one extra
; draw + shadow sequence takes at least 1,000 cycles.
shadowBitmap ds 32 ; Provide enough space for the full ppu range (240 lines) + 16 since the y coordinate can be off-screen
; A representation of the list as [top, bot) pairs
shadowListCount dw 0 ; Pad for 16-bit comparisons
shadowListTop ds 64
shadowListBot ds 64
mx %00
; zero out the bitmap (16-bit writes)
]n equ 0
lup 15
stz shadowBitmap+]n
]n = ]n+2
; Run through the list of visible sprites and ORA in the bits that represent them
sep #$30
ldx #0
cpx spriteCount
beq :exit
; ldy PPU_OAM,x
ldy OAM_COPY,x
iny ; This is the y-coordinate of the top of the sprite
ldx y2idx,y ; Get the index into the shadowBitmap array for this y coordinate
lda y2low,y ; Get the bit pattern for the first byte
ora shadowBitmap,x
sta shadowBitmap,x
lda y2high,y ; Get the bit pattern for the second byte
ora shadowBitmap+1,x
sta shadowBitmap+1,x
cpx spriteCount
bcc :loop
rep #$30
; y2idx -- indexed by a y coordinate (0-255), gives the byte index into shadowBitmap (y / 8).
; Each const32 row emits 32 bytes covering four consecutive index values, 8 rows = 256 entries.
y2idx const32 $00
const32 $04
const32 $08
const32 $0C ; 128 bytes so far
const32 $10
const32 $14
const32 $18
const32 $1C
; Repeating pattern of 8 consecutive 1 bits, split across two adjacent shadowBitmap bytes.
; Both tables are read with the sprite's top line in Y: y2low is the mask ORed into the byte
; selected by y2idx, y2high is the mask ORed into the byte after it. Each pair of entries
; (e.g. $7F / $80) adds up to exactly eight set bits, with bit 7 being the first line of a byte.
y2low rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
y2high rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
; mul8 -- multiply a step (byte index) in the shadow bitmap by 8 to get its first scanline.
; The table holds 32 entries; the 200-line screen only needs the first 25 or so. It is read
; as mul8-2,x because the bitmap walk starts at byte 2.
mul8 db $00,$08,$10,$18,$20,$28,$30,$38
db $40,$48,$50,$58,$60,$68,$70,$78
db $80,$88,$90,$98,$A0,$A8,$B0,$B8
db $C0,$C8,$D0,$D8,$E0,$E8,$F0,$F8
; offset -- 256-entry LUT that, given a bit pattern, counts the bits before the first set bit
; when scanning MSB -> LSB, e.g. $0F = 4, $3F = 2. The entry for $00 (no bit set) is 0.
offset db 0,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 ; $00-$1F: 0, 1, 2, 4, 8, 16
db 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 ; $20-$3F: 32
db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 ; $40-$5F
db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 ; $60-$7F
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ; $80-$FF: bit 7 is set
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
; Scan the bitmap list and call BltRange on the ranges
mx %00
ldx #0
cpx shadowListCount
beq :exit
lda shadowListBot,x
and #$00FF
cpy #201
bcc *+4
brk $cc
lda shadowListTop,x
and #$00FF
cpx #200
bcc *+4
brk $dd
lda #0 ; Invoke the BltRange function
jsl LngJmp
cpx shadowListCount
bcc :loop
; Alternate between BltRange and PEISlam to expose the screen
:last equ Tmp0
:top equ Tmp1
:bottom equ Tmp2
ldx #0
stx :last
cpx shadowListCount
beq :exit
lda shadowListTop,x
and #$00FF
sta :top
cmp #200
bcc *+4
brk $44
lda shadowListBot,x
and #$00FF
sta :bottom
cmp #201
bcc *+4
brk $66
cmp :top
bcs *+4
brk $55
ldx :last
ldy :top
lda #0
jsl LngJmp ; Draw the background up to this range
ldx :top
ldy :bottom
sty :last ; This is where we ended
lda #1
jsl LngJmp ; Expose the already-drawn sprites
cpx shadowListCount
bcc :loop
ldx :last ; Expose the final part
ldy #200
lda #0
jsl LngJmp
; Walk shadowBitmap and record each run of set bits as a (top, bottom) scanline pair in
; shadowListTop / shadowListBot.
;
; This routine needs to adjust the y-coordinates based on the offset of the GTE playfield within
; the PPU RAM
:top equ Tmp0
:bottom equ Tmp2
sep #$30
ldx #2 ; Start at the third row (y_offset = 16) and walk the bitmap for 25 bytes (200 lines of height)
lda #0
sta shadowListCount ; zero out the shadow list count
; This loop is called when we are not tracking a sprite range
ldy shadowBitmap,x
beq :zero_next
lda mul8-2,x ; This is the scanline we're on (offset by the starting byte)
; NOTE(review): adc depends on the carry flag at this point -- confirm it is clear here
adc offset,y ; This is the first line defined by the bit pattern
sta :top
bra :one_next
cpx #28 ; End at byte 27
bcc :zero_loop
bra :exit ; ended while not tracking a sprite, so exit the function
lda shadowBitmap,x ; if the next byte is all sprite, just continue
eor #$FF
beq :one_next
tay ; Use the inverted bitfield in order to re-use the same lookup table
lda mul8-2,x
; NOTE(review): same carry dependency as the adc above -- confirm
adc offset,y
ldy shadowListCount
sta shadowListBot,y ; A = first line below the run that is not covered by a sprite
lda :top
sta shadowListTop,y
sty shadowListCount
bra :zero_next
cpx #28
bcc :one_loop
; If we end while tracking a sprite, add to the list as the last item
ldx shadowListCount
lda :top
sta shadowListTop,x
lda #200 ; the run extends to the bottom of the 200-line screen
sta shadowListBot,x
stx shadowListCount
rep #$30
; Sanity check: shadowListTop / shadowListBot hold 64 bytes each, so trap with brk $13
; if the count has reached 64
lda shadowListCount
cmp #64
bcc *+4
brk $13
; Helper to bounce into the function in the FTblPtr. See IIgs TN #90
sty FTblTmp
lda [FTblPtr],y
lda [FTblPtr],y
phb phb
php sta 1,s
ldy FTblTmp ; Restore the y register
; Callback entrypoint from the GTE renderer
;
; The renderer calls this twice per frame with:
;   A = phase (0 = phase 1, 1 = phase 2)
;   X = bank of the function table (ExtFuncBlock)
;   Y = address of the function table within that bank
phk phk
plb plb
lda DPSave
; Save the pointer to the function table
sty FTblPtr
stx FTblPtr+2
; Check what phase we're in
; Phase 1: A = 0
; Phase 2: A = 1
cmp #0
bne :phase2
; This is phase 1. We will build the sprite list and draw the background in the areas covered by
; sprites. This phase draws the sprites, too
ldal nmiCount ; Sample the NMI counter before scanning the OAM
; We need to "freeze" the OAM values, otherwise the NES ISR can change them while we build the rendering pipeline
jsr scanOAMSprites ; Filter out any sprites that don't need to be drawn
cmpl nmiCount ; The counter must not have moved while the OAM was being copied
beq *+4
brk $1F ; Should not have serviced the VBL interrupt here....
jsr buildShadowBitmap ; Run through and quickly create a bitmap of lines with sprites
jsr shadowBitmapToList ; Scan the bitmap and create (top, bottom) pairs of ranges
jsr drawShadowList ; Draw the background lines that have sprites on them
jsr drawSprites ; Draw the sprites on top of the lines they occupy
bra :exit
; In Phase 2 we scan the shadow list and alternately blit the background in empty areas and
; PEI slam the sprite regions
jsr exposeShadowList ; Show everything on the SHR screen
; Return from the callback
:tmp equ Tmp0
sep #$30 ; 8-bit cpu sep #$30 ; 8-bit cpu
ldx #4 ; Ok to always skip sprite 0
:oam_loop ; Run through the copy of the OAM memory
lda PPU_OAM+3,x ; remove this test once we can clip sprites
cmp #241
bcs :hidden
lda PPU_OAM+1,x ; $FC is an empty tile, don't draw it ldx #0
cmp #$FC cpx spriteCount
beq :hidden bne oam_loop
rep #$30
lda PPU_OAM,x ; Y-coordinate mx %11
cmp #200+y_offset-9 oam_loop
bcs :hidden phx ; Save x
phx lda OAM_COPY,x ; Y-coordinate
inc ; Compensate for PPU delayed scanline inc ; Compensate for PPU delayed scanline
rep #$30 rep #$30
and #$00FF and #$00FF
asl asl
@ -539,26 +937,26 @@ drawOAMSprites
adc #$2000-{y_offset*160}+x_offset adc #$2000-{y_offset*160}+x_offset
sta :tmp sta :tmp
lda PPU_OAM+3,x lda OAM_COPY+3,x
lsr lsr
and #$007F and #$007F
clc clc
adc :tmp adc :tmp
tay tay
lda PPU_OAM+2,x lda OAM_COPY+2,x
pha pha
bit #$0040 ; horizontal flip bit #$0040 ; horizontal flip
bne :hflip bne :hflip
lda PPU_OAM,x ; Load the tile index into the high byte (x256) lda OAM_COPY,x ; Load the tile index into the high byte (x256)
and #$FF00 and #$FF00
lsr ; multiple by 128 lsr ; multiple by 128
tax tax
bra :noflip bra :noflip
:hflip :hflip
lda PPU_OAM,x ; Load the tile index into the high byte (x256) lda OAM_COPY,x ; Load the tile index into the high byte (x256)
and #$FF00 and #$FF00
lsr ; multiple by 128 lsr ; multiple by 128
adc #64 ; horizontal flip adc #64 ; horizontal flip
@ -567,22 +965,20 @@ drawOAMSprites
:noflip :noflip
pla pla
asl asl
; and #$0080 ; Set the vflip bit
and #$0106 ; Set the vflip bit and palette select bits and #$0106 ; Set the vflip bit and palette select bits
drawTilePatch drawTilePatch
jsl $000000 ; Draw the tile on the graphics screen jsl $000000 ; Draw the tile on the graphics screen
sep #$30 sep #$30
plx plx ; Restore the counter
cpx spriteCount
bcc oam_loop
:hidden rep #$30
inx rts
bne :oam_loop

View File

@ -153,6 +153,11 @@ _DoOverlay
:disp jsl $000000 :disp jsl $000000
rts rts
; Callback structure with pointers to internal rendering functions
adrl BltRange
adrl PEISlam
; Special NES renderer that externalizes the sprite rendering in order to exceed the internal limit of 16 sprites ; Special NES renderer that externalizes the sprite rendering in order to exceed the internal limit of 16 sprites
_RenderNES _RenderNES
jsr _ApplyBG0YPos jsr _ApplyBG0YPos
@ -172,7 +177,6 @@ _RenderNES
:no_tile :no_tile
jsr _ApplyTiles ; This function actually draws the new tiles into the code field jsr _ApplyTiles ; This function actually draws the new tiles into the code field
; jsr _ApplyBG0XPos ; Patch the code field instructions with exit BRA opcode
stz tmp1 ; virt_line_x2 stz tmp1 ; virt_line_x2
lda #16*2 lda #16*2
@ -194,22 +198,45 @@ _RenderNES
lda tmp4 lda tmp4
stal nesBottomOffset stal nesBottomOffset
ldx #0 ; Blit the full virtual buffer to the screen ; This is a tricky part. The NES does not keep sprites sorted, so we need an alternative way to figure out
ldy ScreenHeight ; which lines to shadow and which ones not to. Our compromise is to build a bitmap of lines that the sprite
jsr _BltRange ; occupy and then scan through that quickly.
; This is handled by the callback in two phases. We pass pointers to the internal function the callback needs
; access to. If there is no function defined, do nothing
lda ExtSpriteRenderer lda ExtSpriteRenderer
ora ExtSpriteRenderer+2 ora ExtSpriteRenderer+2
beq :no_sprite beq :no_render
lda ExtSpriteRenderer lda ExtSpriteRenderer
stal :patch+1 stal :patch1+1
stal :patch2+1
lda ExtSpriteRenderer+1 lda ExtSpriteRenderer+1
stal :patch+2 stal :patch1+2
:patch jsl $000000 stal :patch2+2
:no_sprite ; Start the two-phase rendering process. First turn off shading and invoke the callback to
; draw sprite regions
jsr _ShadowOff
lda #0 ; Signal we're in phase 1 (shadowing off)
ldx #^ExtFuncBlock
ldy #ExtFuncBlock
:patch1 jsl $000000
; Now perform the second phase which renders the whole screen and exposes the sprites that were
; drawn in the first phase
jsr _ShadowOn
lda #1 ; Signal we're in phase 2 (shadowing on)
ldx #^ExtFuncBlock
ldy #ExtFuncBlock
:patch2 jsl $000000
stz tmp1 ; :virt_line_x2 stz tmp1 ; :virt_line_x2
lda #16*2 lda #16*2
sta tmp2 ; :lines_left_x2 sta tmp2 ; :lines_left_x2
@ -402,7 +429,6 @@ _DrawFinalPass
ldy _Sprites+SPRITE_CLIP_TOP,x ; PEI Slam to the top of the overlay (:bottom is greater than this value) ldy _Sprites+SPRITE_CLIP_TOP,x ; PEI Slam to the top of the overlay (:bottom is greater than this value)
ldx :cursor ldx :cursor
sty :cursor sty :cursor
; brk $44
jsr _PEISlam jsr _PEISlam
lda 3,s ; Retrieve the sprite index lda 3,s ; Retrieve the sprite index
tax tax

View File

@ -144,6 +144,7 @@ zpToUse = userId+4
lda zpToUse,s ; Get the direct page address lda zpToUse,s ; Get the direct page address
phd ; Save the current direct page phd ; Save the current direct page
tcd ; Set to our working direct page space tcd ; Set to our working direct page space
stal tool_direct_page ; Stash a copy in memory
txa txa
and #$00FF ; Get just the tool number and #$00FF ; Get just the tool number

View File

@ -85,6 +85,7 @@ _BltRange
lda BlitterDP ; Set the direct page to the blitter data lda BlitterDP ; Set the direct page to the blitter data
tcd tcd
php ; save the current processor flags
sei ; disable interrupts sei ; disable interrupts
_R0W1 _R0W1
tsc ; save the stack pointer tsc ; save the stack pointer
@ -95,7 +96,7 @@ blt_entry jml $000000 ; Jump into the blitter code $XX/YY00
blt_return _R0W0 blt_return _R0W0
stk_save lda #0000 ; load the stack stk_save lda #0000 ; load the stack
tcs tcs
cli ; re-enable interrupts plp ; re-enable interrupts (maybe, if interrupts disabled when we are called, they are not re-endabled)
pld ; restore the direct page pld ; restore the direct page
sep #$20 sep #$20
@ -106,3 +107,16 @@ stk_save lda #0000 ; load the stack
plb ; restore the bank plb ; restore the bank
rts rts
; External entry point. Can be called directly from another bank
ldal tool_direct_page
jsr _SetDataBank ; only affects accumulator
jsr _BltRange

View File

@ -14,17 +14,19 @@
; Y = last line (exclusive), valid range >X up to 200 ; Y = last line (exclusive), valid range >X up to 200
_PEISlam _PEISlam
cpx #200 cpx #200
bcc *+3 bcc *+4
rts brk $14
; rts
cpy #201 cpy #201
bcc *+3 bcc *+4
rts brk $15
; rts
txa tya ; x must be less than y
stal :screen_width_1 stal :screen_width_1
tya txa
cmpl :screen_width_1 cmpl :screen_width_1
bcs *+3 bcc *+3
rts rts
@ -74,6 +76,10 @@ _PEISlam
adcl :screen_width_1 adcl :screen_width_1
tcs tcs
cmp #$9D00
bcc *+4
brk $85 ; Kill if stack it out of range
dey ; decrement the total counter, if zero then we're done dey ; decrement the total counter, if zero then we're done
beq :exit beq :exit
@ -115,3 +121,21 @@ _PEISlam
:stk_save ds 2 :stk_save ds 2
:screen_width_1 ds 2 :screen_width_1 ds 2
; A stashed memory location just in case we need it. This is filled in by GTEStartUp()
tool_direct_page ds 2
; External entry point. Can be called directly from another bank
ldal tool_direct_page
jsr _SetDataBank ; only affects accumulator
jsr _PEISlam