diff --git a/demos/smb/Main.s b/demos/smb/Main.s index e1abedb..9087b8e 100644 --- a/demos/smb/Main.s +++ b/demos/smb/Main.s @@ -37,6 +37,9 @@ Tmp3 equ 246 Tmp4 equ 248 Tmp5 equ 250 +FTblPtr equ 224 +FTblTmp equ 228 + phk plb sta MyUserId ; GS/OS passes the memory manager user ID for the application into the program @@ -1080,6 +1083,6 @@ nmiTask ds \,$00 ; pad to the next page boundary PPU_MEM CHR_ROM put chr2.s ; 8K of CHR-ROM at PPU memory $0000 - $2000 -PPU_NT ds $2000 ; Nametable memory from $2000 - $3000, $3F00 - $3F14 is palette RAM -PPU_OAM ds 256 ; 256 bytes of separate OAM RAM - +PPU_NT ds $2000 ; Nametable memory from $2000 - $3000, $3F00 - $3F14 is palette RAM +PPU_OAM ds 256 ; 256 bytes of separate OAM RAM + diff --git a/demos/smb/ppu.s b/demos/smb/ppu.s index 6e1df35..2b05252 100644 --- a/demos/smb/ppu.s +++ b/demos/smb/ppu.s @@ -2,6 +2,29 @@ ; ; Any read/write to the PPU registers in the ROM is intercepted and passed here. + +const8 mac + db ]1,]1,]1,]1,]1,]1,]1,]1 + <<< + +const32 mac + const8 ]1 + const8 ]1+1 + const8 ]1+2 + const8 ]1+3 + <<< + +rep8 mac + db ]1 + db ]1 + db ]1 + db ]1 + db ]1 + db ]1 + db ]1 + db ]1 + <<< + mx %11 dw $a5a5 ; marker to find in memory ppuaddr ds 2 ; 16-bit ppu address @@ -495,34 +518,409 @@ PPUDMA_WRITE ENT y_offset equ 16 x_offset equ 16 -drawOAMSprites -:tmp equ 238 +; Scan the OAM memory and copy the values of the sprites that need to be drawn. There are two reasons to do this +; +; 1. Freeze the OAM memory at this instanct so that the NES ISR can keep running without changing values +; 2. We have to scan this list twice -- once to build up the shadow list and once to actually render the sprites +OAM_COPY ds 256 +spriteCount ds 0 + db 0 ; Pad in case we can to access using 16-bit instructions -; 248 is reserved for the blitter + mx %00 +scanOAMSprites + sep #$30 + + ldx #4 ; Always skip sprite 0 + ldy #0 + +:loop + lda PPU_OAM,x ; Y-coordinate + cmp #200+y_offset-9 + bcs :skip + + lda PPU_OAM+1,x ; $FC is an empty tile, don't draw it + cmp #$FC + beq :skip + + lda PPU_OAM+3,x ; If X-coordinate is off the edge skip it, too. + cmp #241 + bcs :skip + + rep #$20 + lda PPU_OAM,x + sta OAM_COPY,y + lda PPU_OAM+2,x + sta OAM_COPY+2,y + sep #$20 + + iny + iny + iny + iny + +:skip + inx + inx + inx + inx + bne :loop + + sty spriteCount ; Count * 4 + rep #$30 + rts + +; Screen is 200 lines tall. It's worth it be exact when building the list because one extra +; draw + shadow sequence takes at least 1,000 cycles. +shadowBitmap ds 32 ; Provide enough space for the full ppu range (240 lines) + 16 since the y coordinate can be off-screen + +; A representation of the list as [top, bot) pairs +shadowListCount dw 0 ; Pad for 16-bit comparisons +shadowListTop ds 64 +shadowListBot ds 64 + + mx %00 +buildShadowBitmap + +; zero out the bitmap (16-bit writes) +]n equ 0 + lup 15 + stz shadowBitmap+]n +]n = ]n+2 + --^ + +; Run through the list of visible sprites and ORA in the bits that represent them + sep #$30 + + ldx #0 + cpx spriteCount + beq :exit + +:loop + phx + +; ldy PPU_OAM,x + ldy OAM_COPY,x + iny ; This is the y-coordinate of the top of the sprite + + ldx y2idx,y ; Get the index into the shadowBitmap array for this y coordinate + lda y2low,y ; Get the bit pattern for the first byte + ora shadowBitmap,x + sta shadowBitmap,x + lda y2high,y ; Get the bit pattern for the second byte + ora shadowBitmap+1,x + sta shadowBitmap+1,x + + plx + inx + inx + inx + inx + cpx spriteCount + bcc :loop + +:exit + rep #$30 + rts + +y2idx const32 $00 + const32 $04 + const32 $08 + const32 $0C ; 128 bytes + const32 $10 + const32 $14 + const32 $18 + const32 $1C + +; Repeating pattern of 8 consecutive 1 bits +y2low rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01 + rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01 + rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01 + rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01 + +y2high rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE + rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE + rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE + rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE + +; 25 entries to multiple steps in the shadow bitmap to scanlines +mul8 db $00,$08,$10,$18,$20,$28,$30,$38 + db $40,$48,$50,$58,$60,$68,$70,$78 + db $80,$88,$90,$98,$A0,$A8,$B0,$B8 + db $C0,$C8,$D0,$D8,$E0,$E8,$F0,$F8 + +; Given a bit pattern, create a LUT that count to the first set bit (MSB -> LSB), e.g. $0F = 4, $3F = 2 +offset db 0,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 ; 0, 1, 2, 4, 8, 16 + db 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 ; 32 + db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 + db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 + db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + +; Scan the bitmap list and call BltRange on the ranges + mx %00 +drawShadowList + ldx #0 + cpx shadowListCount + beq :exit + +:loop + phx + + lda shadowListBot,x + and #$00FF + tay + cpy #201 + bcc *+4 + brk $cc + + lda shadowListTop,x + and #$00FF + tax + cpx #200 + bcc *+4 + brk $dd + + lda #0 ; Invoke the BltRange function + jsl LngJmp + + plx + inx + cpx shadowListCount + bcc :loop +:exit + rts + +; Altername between BltRange and PEISlam to expose the screen +exposeShadowList +:last equ Tmp0 +:top equ Tmp1 +:bottom equ Tmp2 + + ldx #0 + stx :last + cpx shadowListCount + beq :exit + +:loop + phx + + lda shadowListTop,x + and #$00FF + sta :top + + cmp #200 + bcc *+4 + brk $44 + + lda shadowListBot,x + and #$00FF + sta :bottom + + cmp #201 + bcc *+4 + brk $66 + + cmp :top + bcs *+4 + brk $55 + + ldx :last + ldy :top + lda #0 + jsl LngJmp ; Draw the background up to this range + + ldx :top + ldy :bottom + sty :last ; This is where we ended + lda #1 + jsl LngJmp ; Expose the already-drawn sprites + + plx + inx + cpx shadowListCount + bcc :loop + +:exit + ldx :last ; Expose the final part + ldy #200 + lda #0 + jsl LngJmp + rts + +; This routine needs to adjust the y-coordinates based of the offset of the GTE playfield within +; the PPU RAM +shadowBitmapToList +:top equ Tmp0 +:bottom equ Tmp2 + + sep #$30 + + ldx #2 ; Start at he third row (y_offset = 16) walk the bitmap for 25 bytes (200 lines of height) + lda #0 + sta shadowListCount ; zero out the shadow list count + +; This loop is called when we are not tracking a sprite range +:zero_loop + ldy shadowBitmap,x + beq :zero_next + + lda mul8-2,x ; This is the scanline we're on (offset by the starting byte) + clc + adc offset,y ; This is the first line defined by the bit pattern + sta :top + bra :one_next + +:zero_next + inx + cpx #28 ; End at byte 27 + bcc :zero_loop + bra :exit ; ended while not tracking a sprite, so exit the function + +:one_loop + lda shadowBitmap,x ; if the next byte is all sprite, just continue + eor #$FF + beq :one_next + + tay ; Use the inverted bitfield in order to re-use the same lookup table + lda mul8-2,x + clc + adc offset,y + + ldy shadowListCount + sta shadowListBot,y + lda :top + sta shadowListTop,y + iny + sty shadowListCount + bra :zero_next + +:one_next + inx + cpx #28 + bcc :one_loop + +; If we end while tracking a sprite, add to the list as the last item + + ldx shadowListCount + lda :top + sta shadowListTop,x + lda #200 + sta shadowListBot,x + inx + stx shadowListCount + +:exit + rep #$30 + lda shadowListCount + cmp #64 + bcc *+4 + brk $13 + + + rts + +; Helper to bounce into the function in the FTblPtr. See IIgs TN #90 +LngJmp + sty FTblTmp + asl + asl + tay + iny + lda [FTblPtr],y + pha + dey + lda [FTblPtr],y + dec phb - php + sta 1,s + ldy FTblTmp ; Restore the y register + rtl + +; Callback entrypoint from the GTE renderer +drawOAMSprites + phb + phd phk plb + pha + + lda DPSave + tcd + +; Save the pointer to the function table + + sty FTblPtr + stx FTblPtr+2 + + pla + +; Check what phase we're in +; +; Phase 1: A = 0 +; Phase 2: A = 1 + + cmp #0 + bne :phase2 + +; This is phase 1. We will build the sprite list and draw the background in the areas covered by +; sprites. This phase draws the sprites, too + + ldal nmiCount + pha + +; We need to "freeze" the OAM values, otherwise they can change between when we build the rendering pipeline + + sei + jsr scanOAMSprites ; Filter out any sprites that don't need to be drawn + pla + cmpl nmiCount + beq *+4 + brk $1F ; Should not have serviced the VBL interrupt here.... + cli + + jsr buildShadowBitmap ; Run though and quickly create a bitmap of lines with sprites + jsr shadowBitmapToList ; Can the bitmap and create (top, bottom) pairs of ranges + + jsr drawShadowList ; Draw the background lines that have sprite on them + jsr drawSprites ; Draw the sprites on top of the lines they occupy + + bra :exit + +; In Phase 2 we scan the shadow list and alternately blit the background in empty areas and +; PEI slam the sprite regions +:phase2 + jsr exposeShadowList ; Show everything on the SHR screen + +; Return form the callback +:exit + pld + plb + rtl + +drawSprites +:tmp equ Tmp0 + sep #$30 ; 8-bit cpu - ldx #4 ; Ok to always skip sprite 0 -:oam_loop - lda PPU_OAM+3,x ; remove this test once we can clip sprites - cmp #241 - bcs :hidden +; Run through the copy of the OAM memory - lda PPU_OAM+1,x ; $FC is an empty tile, don't draw it - cmp #$FC - beq :hidden + ldx #0 + cpx spriteCount + bne oam_loop + rep #$30 + rts - lda PPU_OAM,x ; Y-coordinate - cmp #200+y_offset-9 - bcs :hidden + mx %11 +oam_loop + phx ; Save x - phx + lda OAM_COPY,x ; Y-coordinate inc ; Compensate for PPU delayed scanline + rep #$30 and #$00FF asl @@ -539,26 +937,26 @@ drawOAMSprites adc #$2000-{y_offset*160}+x_offset sta :tmp - lda PPU_OAM+3,x + lda OAM_COPY+3,x lsr and #$007F clc adc :tmp tay - lda PPU_OAM+2,x + lda OAM_COPY+2,x pha bit #$0040 ; horizontal flip bne :hflip - lda PPU_OAM,x ; Load the tile index into the high byte (x256) + lda OAM_COPY,x ; Load the tile index into the high byte (x256) and #$FF00 lsr ; multiple by 128 tax bra :noflip :hflip - lda PPU_OAM,x ; Load the tile index into the high byte (x256) + lda OAM_COPY,x ; Load the tile index into the high byte (x256) and #$FF00 lsr ; multiple by 128 adc #64 ; horizontal flip @@ -567,22 +965,20 @@ drawOAMSprites :noflip pla asl -; and #$0080 ; Set the vflip bit and #$0106 ; Set the vflip bit and palette select bits drawTilePatch jsl $000000 ; Draw the tile on the graphics screen sep #$30 - plx + plx ; Restore the counter + inx + inx + inx + inx + cpx spriteCount + bcc oam_loop -:hidden - inx - inx - inx - inx - bne :oam_loop + rep #$30 + rts - plp - plb - rtl \ No newline at end of file diff --git a/src/Render.s b/src/Render.s index 5c870e6..3fb9b4c 100644 --- a/src/Render.s +++ b/src/Render.s @@ -153,6 +153,11 @@ _DoOverlay :disp jsl $000000 rts +; Callback structure with pointers to internal rendering functions +ExtFuncBlock + adrl BltRange + adrl PEISlam + ; Special NES renderer that externalizes the sprite rendering in order to exceed the internal limit of 16 sprites _RenderNES jsr _ApplyBG0YPos @@ -172,7 +177,6 @@ _RenderNES :no_tile jsr _ApplyTiles ; This function actually draws the new tiles into the code field -; jsr _ApplyBG0XPos ; Patch the code field instructions with exit BRA opcode stz tmp1 ; virt_line_x2 lda #16*2 @@ -194,22 +198,45 @@ _RenderNES lda tmp4 stal nesBottomOffset - ldx #0 ; Blit the full virtual buffer to the screen - ldy ScreenHeight - jsr _BltRange +; This is a tricky part. The NES does not keep sprites sorted, so we need an alternative way to figure out +; which lines to shadow and which ones not to. Our compromise is to build a bitmap of lines that the sprite +; occupy and then scan through that quickly. +; +; This is handled by the callback in two phases. We pass pointers to the internal function the callback needs +; access to. If there is no function defined, do nothing lda ExtSpriteRenderer ora ExtSpriteRenderer+2 - beq :no_sprite + beq :no_render lda ExtSpriteRenderer - stal :patch+1 + stal :patch1+1 + stal :patch2+1 lda ExtSpriteRenderer+1 - stal :patch+2 -:patch jsl $000000 + stal :patch1+2 + stal :patch2+2 -:no_sprite +; Start the two-phase rendering process. First turn off shading and invoke the callback to +; draw sprite regions + jsr _ShadowOff + + lda #0 ; Signal we're in phase 1 (shadowing off) + ldx #^ExtFuncBlock + ldy #ExtFuncBlock +:patch1 jsl $000000 + +; Now perform the second phase which renders the whole screen and exposes the sprites that were +; drawins in the first phase + + jsr _ShadowOn + + lda #1 ; Signal we're in phase 2 (shadowing on) + ldx #^ExtFuncBlock + ldy #ExtFuncBlock +:patch2 jsl $000000 + +:no_render stz tmp1 ; :virt_line_x2 lda #16*2 sta tmp2 ; :lines_left_x2 @@ -402,7 +429,6 @@ _DrawFinalPass ldy _Sprites+SPRITE_CLIP_TOP,x ; PEI Slam to the top of the overlay (:bottom is greater than this value) ldx :cursor sty :cursor -; brk $44 jsr _PEISlam lda 3,s ; Retrieve the sprite index tax diff --git a/src/Tool.s b/src/Tool.s index e0f64ba..348840d 100644 --- a/src/Tool.s +++ b/src/Tool.s @@ -144,6 +144,7 @@ zpToUse = userId+4 lda zpToUse,s ; Get the direct page address phd ; Save the current direct page tcd ; Set to our working direct page space + stal tool_direct_page ; Stash a copy in memory txa and #$00FF ; Get just the tool number diff --git a/src/blitter/Blitter.s b/src/blitter/Blitter.s index bf5419f..25de41b 100644 --- a/src/blitter/Blitter.s +++ b/src/blitter/Blitter.s @@ -85,6 +85,7 @@ _BltRange lda BlitterDP ; Set the direct page to the blitter data tcd + php ; save the current processor flags sei ; disable interrupts _R0W1 tsc ; save the stack pointer @@ -95,7 +96,7 @@ blt_entry jml $000000 ; Jump into the blitter code $XX/YY00 blt_return _R0W0 stk_save lda #0000 ; load the stack tcs - cli ; re-enable interrupts + plp ; re-enable interrupts (maybe, if interrupts disabled when we are called, they are not re-endabled) pld ; restore the direct page sep #$20 @@ -106,3 +107,16 @@ stk_save lda #0000 ; load the stack plb ; restore the bank rts + +; External entry point. Can be called directly from another bank +BltRange + phd + phb + + ldal tool_direct_page + tcd + jsr _SetDataBank ; only affects accumulator + jsr _BltRange + plb + pld + rtl \ No newline at end of file diff --git a/src/blitter/PEISlammer.s b/src/blitter/PEISlammer.s index 4f1d199..adea5e4 100644 --- a/src/blitter/PEISlammer.s +++ b/src/blitter/PEISlammer.s @@ -14,17 +14,19 @@ ; Y = last line (exclusive), valid range >X up to 200 _PEISlam cpx #200 - bcc *+3 - rts + bcc *+4 + brk $14 +; rts cpy #201 - bcc *+3 - rts + bcc *+4 + brk $15 +; rts - txa + tya ; x must be less than y stal :screen_width_1 - tya + txa cmpl :screen_width_1 - bcs *+3 + bcc *+3 rts @@ -74,6 +76,10 @@ _PEISlam adcl :screen_width_1 tcs + cmp #$9D00 + bcc *+4 + brk $85 ; Kill if stack it out of range + dey ; decrement the total counter, if zero then we're done beq :exit @@ -115,3 +121,21 @@ _PEISlam :stk_save ds 2 :screen_width_1 ds 2 + +; A stashed memory location just in case we need it. This is filled in the GTEStartUp() +tool_direct_page ds 2 + +; External entry point. Can be called directly from another bank +PEISlam + phd + phb + + ldal tool_direct_page + tcd + jsr _SetDataBank ; only affects accumulator + jsr _PEISlam + plb + pld + rtl + +