iigs-game-engine/demos/smb/ppu.s
2023-05-24 14:31:49 -05:00

1094 lines
23 KiB
ArmAsm

; PPU simulator
;
; Any read/write to the PPU registers in the ROM is intercepted and passed here.
; Table-building helper macros (Merlin macro syntax: "mac" ... "<<<", ]1 = first argument).
;
; const8 <value> - emit <value> as eight consecutive bytes.
const8 mac
db ]1,]1,]1,]1,]1,]1,]1,]1
<<<
; const32 <value> - emit 32 bytes: eight each of value, value+1, value+2 and value+3.
; Used further down to build y2idx, where every run of eight entries shares one index.
const32 mac
const8 ]1
const8 ]1+1
const8 ]1+2
const8 ]1+3
<<<
; rep8 <operand> - repeat "db <operand>" eight times. The y2low/y2high tables invoke it with a
; comma-separated list of eight bytes; that list appears to be passed as the single argument ]1
; (the cond invocations in this file separate arguments with ';'), giving 64 bytes per invocation.
rep8 mac
db ]1
db ]1
db ]1
db ]1
db ]1
db ]1
db ]1
db ]1
<<<
; Simulated PPU register state. The register handlers below run with 8-bit accumulator and
; index registers (mx %11) unless they switch modes themselves.
mx %11
dw $a5a5 ; marker to find in memory
ppuaddr ENT
ds 2 ; 16-bit ppu address
w_bit dw 1 ; byte offset used by the next PPUSCROLL/PPUADDR write: 1 = first write (high byte), 0 = second write
vram_buff dw 0 ; latched data when reading VRAM ($0000 - $3EFF)
ppuincr dw 1 ; 1 or 32 depending on bit 2 of PPUCTRL
spadr dw $0000 ; Sprite pattern table ($0000 or $1000) depending on bit 3 of PPUCTRL
ntaddr dw $2000 ; Base nametable address ($2000, $2400, $2800, $2C00), bits 0 and 1 of PPUCTRL
bgadr dw $0000 ; Background pattern table address ($0000 or $1000) depending on bit 4 of PPUCTRL
ppuctrl dw 0 ; Copy of the ppu ctrl byte
ppumask dw 0 ; Copy of the ppu mask byte
ppustatus dw 0 ; PPU status byte; bit 7 (VBL flag) is cleared by the PPUSTATUS read handlers
oamaddr dw 0 ; Typically this will always be 0
ppuscroll dw 0 ; Y X coordinates (first write lands in the high byte, second in the low byte)
ntbase db $20,$24,$28,$2c ; high byte of the nametable base for each value of PPUCTRL bits 0-1
; assert_lt <limit>;<code> - debugging aid: execute "brk <code>" unless A < <limit> (unsigned compare).
; Only referenced from commented-out lines in this part of the file.
assert_lt mac
cmp ]1
bcc ok
brk ]2
ok
<<<
; assert_x_lt <limit>;<code> - same as assert_lt, but the comparison is made against X.
assert_x_lt mac
cpx ]1
bcc ok
brk ]2
ok
<<<
; cond <mask>;<value0>;<value1>;<dest> - test A against <mask> with BIT and store <value1> in <dest>
; when any masked bit is set, otherwise store <value0>. A is overwritten with the stored value and
; the flags are changed.
cond mac
bit ]1
beq cond_0
lda ]3
bra cond_s
cond_0 lda ]2
cond_s sta ]4
<<<
; $2000 - PPUCTRL (Write only)
;
; In:  A = byte written to PPUCTRL
; Out: A = the same byte; X, the data bank and the status flags are restored
;
; Stores the raw byte in ppuctrl and refreshes the values derived from it:
; ntaddr (bits 0-1), ppuincr (bit 2), spadr (bit 3) and bgadr (bit 4).
PPUCTRL_WRITE ENT
php
phb
phk
plb ; data bank = program bank, so the absolute accesses below reach this file's variables
sta ppuctrl
phx
; Set the base nametable address: bits 0-1 pick the high byte out of ntbase
and #$03
tax
lda ntbase,x
sta ntaddr+1
; Set the vram increment
lda ppuctrl
cond #$04;#$01;#$20;ppuincr
; Set the sprite table address
lda ppuctrl
cond #$08;#$00;#$10;spadr+1
; Set the background table address
lda ppuctrl
cond #$10;#$00;#$10;bgadr+1
plx
lda ppuctrl ; hand the written value back in A
plb
plp
rtl
; $2001 - PPUMASK (Write only)
;
; In: A = byte written to PPUMASK. It is only recorded in ppumask (long store, so the data bank
; does not need to be changed); nothing else is derived from it here.
PPUMASK_WRITE ENT
stal ppumask
rtl
; $2002 - PPUSTATUS For "ldx ppustatus"
;
; Out: X = ppustatus as it was before the VBL flag was cleared; A is preserved.
;      The status register is restored to its entry state and then N/Z are set from X.
; Side effects: w_bit is reset to 1 and bit 7 of ppustatus is cleared.
PPUSTATUS_READ_X ENT
php
pha
lda #1
stal w_bit ; Reset the address latch used by PPUSCROLL and PPUADDR
ldal ppustatus
tax
and #$7F ; Clear the VBL flag
stal ppustatus
pla ; Restore the accumulator (return value in X)
plp
phx ; re-read x to set any relevant flags
plx
rtl
; $2002 - PPUSTATUS for "lda ppustatus"
;
; Out: A = ppustatus as it was before the VBL flag was cleared.
;      The status register is restored to its entry state and then N/Z are set from A.
; Side effects: w_bit is reset to 1 and bit 7 of ppustatus is cleared.
PPUSTATUS_READ ENT
php
lda #1
stal w_bit ; Reset the address latch used by PPUSCROLL and PPUADDR
ldal ppustatus
pha
and #$7F ; Clear the VBL flag
stal ppustatus
pla ; pop the return value
plp
pha ; re-read accumulator to set any relevant flags
pla
rtl
; $2003 - OAMADDR (Write only)
;
; In: A = byte written to OAMADDR. The value is only recorded in oamaddr; no code in this
; part of the file reads it back.
OAMADDR_WRITE ENT
stal oamaddr
rtl
; $2005 - PPUSCROLL (two consecutive writes form one update)
;
; In:  A = scroll byte
; Out: A, X, the data bank and the status flags are all restored
;
; w_bit is the byte offset inside ppuscroll that receives this write (1 on the first write,
; 0 on the second); it is flipped after every write.
PPUSCROLL_WRITE ENT
         php
         phb
         phk
         plb                   ; data bank = program bank for the absolute accesses below
         pha
         phx

         ldx   w_bit
         sta   ppuscroll,x     ; A still holds the caller's byte

         lda   w_bit           ; flip the latch between 1 and 0
         eor   #$01
         sta   w_bit

         plx
         pla
         plb
         plp
         rtl
; $2006 - PPUADDR (two consecutive writes form one 16-bit address)
;
; In:  A = address byte
; Out: A, X, the data bank and the status flags are all restored
;
; w_bit is the byte offset inside ppuaddr that receives this write (1 = high byte on the first
; write, 0 = low byte on the second); it is flipped after every write. The high byte is always
; folded back into $00-$3F so that ppuaddr stays inside the mirrored 16 KB PPU address space.
PPUADDR_WRITE ENT
         php
         phb
         phk
         plb                   ; data bank = program bank for the absolute accesses below
         pha
         phx

         ldx   w_bit
         sta   ppuaddr,x       ; A still holds the caller's byte
; assert_lt #$40;$D0

         lda   #$3F            ; Stay within the mirrored memory space
         and   ppuaddr+1
         sta   ppuaddr+1

         lda   w_bit           ; flip the latch between 1 and 0
         eor   #$01
         sta   w_bit

         plx
         pla
         plb
         plp
         rtl
; 2007 - PPUDATA (Read/Write)
;
; If reading from the $0000 - $3EFF range, the value from vram_buff is returned and the actual data is loaded
; post-fetch.
;
; Out: A = byte read; X, the data bank and the status flags are restored, then N/Z are set from A.
; Side effects: ppuaddr advances by ppuincr (wrapped to $0000-$3FFF). For addresses below $3F00
;               vram_buff is replaced with the byte at the old address; for $3F00 and above the
;               byte is returned directly and vram_buff is left alone.
PPUDATA_READ ENT
php
phb
phk
plb
phx
rep #$30 ; do a 16-bit update of the address
ldx ppuaddr ; X keeps the pre-increment address for the actual read below
txa
; assert_lt #$4000;$d1
clc
adc ppuincr
and #$3FFF
sta ppuaddr
sep #$20 ; back to 8-bit acc for the read itself
cpx #$3F00 ; check which range of memory we are accessing?
bcc :buff_read
lda PPU_MEM,x
bra :out
:buff_read
lda vram_buff ; read from the buffer
pha
lda PPU_MEM,x ; put the data in the buffer for the next read
sta vram_buff
pla ; pop the return value
:out
sep #$10 ; back to 8-bit index registers so the plx matches the 8-bit phx above
plx
plb
plp
pha ; re-read accumulator to set N/Z for the caller
pla
rtl
; Debug write log. In this part of the file it is referenced only by the commented-out logging
; code at the top of PPUDATA_WRITE.
ppu_write_log_len dw 0
ppu_write_log ds 100 ; record the first 50 PPU write addresses in each frame
; Queue of nametable addresses changed through PPUDATA_WRITE. nt_queue_front and nt_queue_end
; are byte offsets into nt_queue (two bytes per entry); PPUDATA_WRITE advances nt_queue_end and
; wraps it with NT_QUEUE_MOD. NT_QUEUE_SIZE and NT_QUEUE_MOD are defined outside this view, as
; is the code that consumes entries from nt_queue_front.
nt_queue_front dw 0
nt_queue_end dw 0
nt_queue ds 2*{NT_QUEUE_SIZE}
; $2007 - PPUDATA (write)
;
; In:  A = byte to store at the current ppuaddr
; Out: A, X, the data bank and the status flags are restored
;
; Behaviour, in order:
;  - If PPU_MEM already holds the same byte, only ppuaddr is advanced (:nochange).
;  - Otherwise the byte is stored and ppuaddr is advanced by ppuincr (wrapped to $0000-$3FFF).
;  - Addresses in $2000-$2FFF are appended to nt_queue; if advancing nt_queue_end would make it
;    equal to nt_queue_front the entry is dropped (the PPU_MEM store has already happened).
;  - Addresses at $3F00 and above continue at :extra (palette sync), which leaves through its
;    own copy of the epilogue at no_pal.
PPUDATA_WRITE ENT
php
phb
phk
plb
pha
phx
rep #$10 ; 16-bit index registers: X carries the full PPU address
ldx ppuaddr
* cpx #$3F00 ; Just log nametable access, not palette info
* bcs :nolog
* phy
* pha
* ldy ppu_write_log_len
* cpy #50
* bcs :log_full
* rep #$20
* txa
* sta ppu_write_log,y
* lda 1,s
* and #$00FF
* sta ppu_write_log+50,y
* iny
* iny
* sty ppu_write_log_len
* sep #$20
* :log_full
* pla
* ply
* :nolog
; cmp #$47
; bne :nobrk
; cpx #$2308
; bne :nobrk
; brk $FD
;:nobrk
cmp PPU_MEM,x
beq :nochange
sta PPU_MEM,x
rep #$30
txa
clc
adc ppuincr
and #$3FFF
sta ppuaddr
; Anything between $2000 and $3000, we need to add to the queue. We can't reject updates here because we may not
; actually update the GTE tile store for several game frames and the position of the tile within the tile store
; may change if the screen is scrolling
cpx #$3000
bcs :nocache
cpx #$2000 ; Change to $2080 to ignore score field updates
bcc :nocache
phy
lda nt_queue_end
tay ; Y = slot to write, A becomes the advanced end offset
inc
inc
and #NT_QUEUE_MOD
cmp nt_queue_front
beq :full
sta nt_queue_end
txa
sta nt_queue,y
; NOTE(review): the setborder call below runs on every pass through this range check, i.e. after
; a successful enqueue as well as when the queue is full. If it is meant to flag overflow only,
; the success path needs to branch around it -- confirm the intent.
:full
lda #1
jsr setborder
ply
:nocache
cpx #$3F00
bcs :extra ; palette range: continue in the palette sync code
bra :done
:nochange
rep #$30
txa
clc
adc ppuincr
and #$3FFF
sta ppuaddr
:done
sep #$30 ; back to 8-bit registers so the pulls match the 8-bit pushes on entry
plx
pla
plb
plp
rtl
; setborder - set the screen border colour (debug aid)
;
; In:  A = colour; only the low nibble is used
; Out: low byte of A = value written to the register; the status register (including the
;      register widths) is restored to its entry state
;
; $E0C034 carries the border colour in its low nibble and clock control bits in its high nibble
; (see the Apple IIgs hardware reference). The eor / and / eor sequence merges the low nibble of
; A into the current register value while leaving the high nibble as it was:
;
;   ((A eor R) and $0F) eor R  ->  high nibble of R, low nibble of A
;
; The mask previously used here was $F0, which did the opposite: it kept the existing border
; colour and overwrote the clock control bits with the high nibble of A.
setborder
         php
         sep   #$20            ; the register is a single byte wide
         eorl  $E0C034
         and   #$0F            ; keep only the colour bits of (A eor R)
         eorl  $E0C034
         stal  $E0C034
         plp
         rts
; Do some extra work to keep palette data in sync
;
; Based on the palette data that SMB uses, we map the NES palette entries as
;
; NES Description IIgs Palette
; ----------------------------------------
; BG0 Background color 0
; BG0,1 Light Green 1
; BG0,2 Dark Green 2
; BG0,3 Black 3
; BG1,1 Peach 4
; BG1,2 Brown 5
; BG1,3 Black 3
; BG2,1 White 6
; BG2,2 Light Blue 7
; BG2,3 Black 3
; BG3,1 Cycle 8 ; Coins / Blocks
; BG3,2 Brown 5
; BG3,3 Black 3
; SP0 0
; SP0,1 Red 9
; SP0,2 Orange 10
; SP0,3 Olive 11
; SP1,1 Dark Green 2
; SP1,2 White 6
; SP1,3 Orange 10
; SP2,1 Red 9
; SP2,2 White 6
; SP2,3 Orange 10
; SP3,1 Black 3
; SP3,2 Peach 4
; SP3,3 Brown 5
;
; There are 4 colors to spare in case we need to add more entries. This mapping table is important because
; we need a custom tile rendering function and a custom sprite rendering function that dynamically
; map the 2-bit tile data into the proper palette range. This will likely be implemented with an 8-bit
; swizzle table. A possible later optimization is to pre-swizzle certain tiles, assuming that the palette
; assignments never change.
;
; BG Palette 2 can probably be ignored because it's just for the top of the screen and we can use a separate
; SCB palette for that line
; Palette sync, entered from PPUDATA_WRITE through :extra with 16-bit registers.
;
; In:  X = PPU address that was just written ($3F00 or above)
;      stack = the php / phb / pha / phx frame pushed by PPUDATA_WRITE
;
; The low five bits of the address select a handler through palTbl. Each handler loads the NES
; colour from PPU_MEM and the byte offset of a IIgs palette entry (offset = 2 * entry), then
; goes to extra_out. As implemented here the entries are assigned as follows:
;
;   $3F00 and $3F10            -> entry 0
;   $3F01-$3F03 (background 0) -> entries 1-3
;   $3F11-$3F13 (sprite 0)     -> entries 4-6
;   $3F15-$3F17 (sprite 1)     -> entries 7-9
;   $3F19-$3F1B (sprite 2)     -> entries 10-12
;   $3F1D-$3F1F (sprite 3)     -> entries 13-15
;
; All other addresses ($3F04-$3F0F, $3F14, $3F18, $3F1C) are stored in PPU_MEM only and skip
; the IIgs palette (no_pal).
mx %00
:extra
txa
and #$001F
asl
tax
jmp (palTbl,x)
palTbl dw ppu_3F00,ppu_3F01,ppu_3F02,ppu_3F03
dw ppu_3F04,ppu_3F05,ppu_3F06,ppu_3F07
dw ppu_3F08,ppu_3F09,ppu_3F0A,ppu_3F0B
dw ppu_3F0C,ppu_3F0D,ppu_3F0E,ppu_3F0F
dw ppu_3F10,ppu_3F11,ppu_3F12,ppu_3F13
dw ppu_3F14,ppu_3F15,ppu_3F16,ppu_3F17
dw ppu_3F18,ppu_3F19,ppu_3F1A,ppu_3F1B
dw ppu_3F1C,ppu_3F1D,ppu_3F1E,ppu_3F1F
; Background color
ppu_3F00
lda PPU_MEM+$3F00
ldx #0
brl extra_out
; Background Palette 0
ppu_3F01
lda PPU_MEM+$3F01
ldx #2
brl extra_out
ppu_3F02
lda PPU_MEM+$3F02
ldx #4
brl extra_out
ppu_3F03
lda PPU_MEM+$3F03
ldx #6
brl extra_out
; Shadow for background color
ppu_3F10
lda PPU_MEM+$3F10
ldx #0
brl extra_out
; Sprite Palette 0
ppu_3F11
lda PPU_MEM+$3F11
ldx #8
brl extra_out
ppu_3F12
lda PPU_MEM+$3F12
ldx #10
brl extra_out
ppu_3F13
lda PPU_MEM+$3F13
ldx #12
brl extra_out
; Sprite Palette 1
ppu_3F15
lda PPU_MEM+$3F15
ldx #14
brl extra_out
ppu_3F16
lda PPU_MEM+$3F16
ldx #16
brl extra_out
ppu_3F17
lda PPU_MEM+$3F17
ldx #18
brl extra_out
; Sprite Palette 2
ppu_3F19
lda PPU_MEM+$3F19
ldx #20
brl extra_out
ppu_3F1A
lda PPU_MEM+$3F1A
ldx #22
brl extra_out
ppu_3F1B
lda PPU_MEM+$3F1B
ldx #24
brl extra_out
; Sprite Palette 3
ppu_3F1D
lda PPU_MEM+$3F1D
ldx #26
brl extra_out
ppu_3F1E
lda PPU_MEM+$3F1E
ldx #28
brl extra_out
ppu_3F1F
lda PPU_MEM+$3F1F
ldx #30
brl extra_out
; Palette addresses that have no IIgs palette entry assigned
ppu_3F04
ppu_3F05
ppu_3F06
ppu_3F07
ppu_3F08
ppu_3F09
ppu_3F0A
ppu_3F0B
ppu_3F0C
ppu_3F0D
ppu_3F0E
ppu_3F0F
ppu_3F14
ppu_3F18
ppu_3F1C
brl no_pal
; Exit code to set a IIgs palette entry from the PPU memory
;
; A = NES palette value
; X = IIgs Palette index
;
; Only the low byte of A is used (the handlers do a 16-bit load). It indexes the word table
; nesPalette (defined outside this view) and the resulting word is stored at $E19E00+X.
extra_out
phy
and #$00FF
asl
tay
lda nesPalette,y
ply
stal $E19E00,x
; Same epilogue as :done in PPUDATA_WRITE: pull that routine's frame and return to its caller
no_pal
sep #$30
plx
pla
plb
plp
rtl
; Trigger a copy from a page of memory to OAM. Since this is a DMA operation, we can cheat and do a 16-bit copy
;
; In:  A = page number; the 256 bytes at ROMBase + (A * 256) are copied to PPU_OAM
; Out: A, X, the data bank and the status flags are restored
;
; The copy is unrolled into 128 word moves by the lup block (]n steps through 0, 2, ... 254).
PPUDMA_WRITE ENT
php
phb
phk
plb
phx
pha
rep #$30
xba ; move the page number into the high byte ...
and #$FF00 ; ... and drop whatever was in the high byte before
tax
]n equ 0
lup 128
ldal ROMBase+]n,x
sta PPU_OAM+]n
]n = ]n+2
--^
sep #$30 ; back to 8-bit registers so the pulls match the 8-bit pushes on entry
pla
plx
plb
plp
rtl
; Geometry of the visible playfield relative to PPU coordinates. Rows are 8 scanlines tall,
; so y_offset = 16 and y_height = 200.
y_offset_rows equ 2 ; rows skipped at the top of the PPU picture
y_height_rows equ 25 ; rows shown
y_offset equ {y_offset_rows*8}
y_height equ {y_height_rows*8}
x_offset equ 16 ; added to the screen address of every sprite in drawSprites
; Scan the OAM memory and copy the values of the sprites that need to be drawn. There are two reasons to do this
;
; 1. Freeze the OAM memory at this instant so that the NES ISR can keep running without changing values
; 2. We have to scan this list twice -- once to build up the shadow list and once to actually render the sprites
;
; OAM_COPY receives the filtered 4-byte OAM entries. spriteCount holds the number of bytes used
; in OAM_COPY (sprite count * 4) and is written with an 8-bit store by scanOAMSprites.
;
; spriteCount now reserves its own byte. It was previously declared with "ds 0", which made the
; byte below the counter itself and left no padding in front of the code that follows.
OAM_COPY    ds    256
spriteCount ds    1
            db    0            ; Pad in case we want to access using 16-bit instructions
; scanOAMSprites - copy the OAM entries that need to be drawn into OAM_COPY
;
; In:  16-bit registers (the routine switches to 8-bit internally)
; Out: spriteCount = number of bytes written to OAM_COPY (4 per sprite); 16-bit registers on
;      return; A, X and Y are overwritten
;
; Entry 0 is never copied. An entry is also skipped when
;   - its Y coordinate is below y_offset or at/above y_height+y_offset-9,
;   - its tile index is $FC, or
;   - its X coordinate is 241 or more.
mx %00
scanOAMSprites
stz Tmp5 ; only used by the commented-out debug code below
sep #$30
ldx #4 ; Always skip sprite 0
ldy #0
:loop
lda PPU_OAM,x ; Y-coordinate
cmp #y_height+y_offset-9
bcs :skip
cmp #y_offset
bcc :skip
lda PPU_OAM+1,x ; $FC is an empty tile, don't draw it
cmp #$FC
beq :skip
lda PPU_OAM+3,x ; If X-coordinate is off the edge skip it, too.
cmp #241
bcs :skip
rep #$20 ; copy the four bytes of the entry as two words
lda PPU_OAM,x
sta OAM_COPY,y
lda PPU_OAM+2,x
sta OAM_COPY+2,y
sep #$20
* ; Debug OAM values
* phy
* phx
* rep #$30
* ldx Tmp5
* cpx #{160*190}
* bcs :nodraw
* lda OAM_COPY+2,y
* pha
* lda OAM_COPY,y
* ldy #$FFFF
* jsr DrawWord
* lda Tmp5
* clc
* adc #128+16
* tax
* ldy #$FFFF
* pla
* jsr DrawWord
* lda Tmp5
* clc
* adc #8*160
* sta Tmp5
* :nodraw
* sep #$30
* plx
* ply
iny
iny
iny
iny
:skip
inx
inx
inx
inx
bne :loop ; the 8-bit X wraps to 0 after the last entry at offset 252
sty spriteCount ; Count * 4
rep #$30
rts
; Screen is 200 lines tall. It's worth it to be exact when building the list because one extra
; draw + shadow sequence takes at least 1,000 cycles.
;
; shadowBitmap holds one bit per PPU scanline (MSB first within each byte); a set bit means that
; at least one sprite covers that line. It is filled in by buildShadowBitmap.
shadowBitmap ds 32 ; Provide enough space for the full ppu range (240 lines) + 16 since the y coordinate can be off-screen
; A representation of the list as [top, bot) pairs
; (one byte per entry in each array, produced by shadowBitmapToList)
shadowListCount dw 0 ; Pad for 16-bit comparisons
shadowListTop ds 64
shadowListBot ds 64
; buildShadowBitmap - mark every scanline that is covered by a sprite in OAM_COPY
;
; In:  16-bit registers; OAM_COPY / spriteCount as left by scanOAMSprites
; Out: shadowBitmap rebuilt; 16-bit registers on return; A, X and Y are overwritten
;
; Each sprite contributes 8 consecutive bits starting at its Y coordinate + 1. The bits are
; split over two neighbouring bytes with the y2low / y2high tables.
mx %00
buildShadowBitmap
; zero out the bitmap (16-bit writes)
; Note: 15 words = bytes 0-29; the last two bytes of shadowBitmap are not cleared here.
]n equ 0
lup 15
stz shadowBitmap+]n
]n = ]n+2
--^
; Run through the list of visible sprites and ORA in the bits that represent them
sep #$30
ldx #0
cpx spriteCount
beq :exit
:loop
phx
; ldy PPU_OAM,x
ldy OAM_COPY,x
iny ; This is the y-coordinate of the top of the sprite
ldx y2idx,y ; Get the index into the shadowBitmap array for this y coordinate
lda y2low,y ; Get the bit pattern for the first byte
ora shadowBitmap,x
sta shadowBitmap,x
lda y2high,y ; Get the bit pattern for the second byte
ora shadowBitmap+1,x
sta shadowBitmap+1,x
plx
inx
inx
inx
inx
cpx spriteCount
bcc :loop
:exit
rep #$30
rts
; Lookup tables for the shadow bitmap.
;
; y2idx: for each y coordinate (0-255), the index of the shadowBitmap byte holding that line (y / 8)
y2idx const32 $00
const32 $04
const32 $08
const32 $0C ; 128 bytes
const32 $10
const32 $14
const32 $18
const32 $1C
; Repeating pattern of 8 consecutive 1 bits
; y2low is the part that falls into the byte selected by y2idx, y2high the part that spills into
; the following byte; both repeat with a period of 8 lines.
y2low rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
rep8 $FF,$7F,$3F,$1F,$0F,$07,$03,$01
y2high rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
rep8 $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
; Multiples of 8 (32 entries) that turn a byte step in the shadow bitmap into a scanline number
mul8 db $00,$08,$10,$18,$20,$28,$30,$38
db $40,$48,$50,$58,$60,$68,$70,$78
db $80,$88,$90,$98,$A0,$A8,$B0,$B8
db $C0,$C8,$D0,$D8,$E0,$E8,$F0,$F8
; Given a bit pattern, create a LUT that count to the first set bit (MSB -> LSB), e.g. $0F = 4, $3F = 2
; (256 entries, indexed by the byte value; entry 0 is 0)
offset db 0,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 ; 0, 1, 2, 4, 8, 16
db 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 ; 32
db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
; Scan the shadow list and call BltRange on each range
;
; In:  16-bit registers; shadowListCount / shadowListTop / shadowListBot as built by
;      shadowBitmapToList; FTblPtr must already point at the function table used by LngJmp
; Out: A, X and Y are overwritten
;
; For every entry, function 0 of the table is called with X = top and Y = bottom.
mx %00
drawShadowList
ldx #0
cpx shadowListCount
beq :exit
:loop
phx
lda shadowListBot,x ; 16-bit load of a byte-wide entry, so mask off the neighbour
and #$00FF
tay
; cpy #201
; bcc *+4
; brk $cc
lda shadowListTop,x
and #$00FF
tax
; cpx #200
; bcc *+4
; brk $dd
lda #0 ; Invoke the BltRange function
jsl LngJmp
plx
inx
cpx shadowListCount
bcc :loop
:exit
rts
; Alternate between BltRange and PEISlam to expose the screen
;
; In:  16-bit registers; shadow list as built by shadowBitmapToList; FTblPtr set up for LngJmp
; Out: A, X and Y are overwritten; Tmp0, Tmp1 and Tmp2 are used as scratch
;
; For every [top, bottom) entry, function 0 is called for the lines from the end of the previous
; entry up to top, then function 1 is called for top..bottom. A final call to function 0 covers
; the lines from the end of the last entry to y_height.
;
; Sanity checks: brk $44 if top >= 200, brk $66 if bottom >= 201, brk $55 if bottom < top.
exposeShadowList
:last equ Tmp0
:top equ Tmp1
:bottom equ Tmp2
ldx #0
stx :last
cpx shadowListCount
beq :exit
:loop
phx
lda shadowListTop,x
and #$00FF
sta :top
cmp #200
bcc *+4
brk $44
lda shadowListBot,x
and #$00FF
sta :bottom
cmp #201
bcc *+4
brk $66
cmp :top
bcs *+4
brk $55
ldx :last
ldy :top
lda #0
jsl LngJmp ; Draw the background up to this range
ldx :top
ldy :bottom
sty :last ; This is where we ended
lda #1
jsl LngJmp ; Expose the already-drawn sprites
plx
inx
cpx shadowListCount
bcc :loop
:exit
ldx :last ; Expose the final part
ldy #y_height
lda #0
jsl LngJmp
rts
; This routine needs to adjust the y-coordinates based on the offset of the GTE playfield within
; the PPU RAM
;
; shadowBitmapToList - convert shadowBitmap into the [top, bottom) list used for drawing
;
; In:  16-bit registers; shadowBitmap as built by buildShadowBitmap
; Out: shadowListCount / shadowListTop / shadowListBot rebuilt, in playfield scanlines (the
;      y_offset_rows rows at the top of the bitmap are skipped and subtracted);
;      16-bit registers on return; A, X and Y are overwritten; Tmp0 is used as scratch
;
; The bitmap is walked one byte (8 scanlines) at a time in two states: :zero_loop looks for the
; first byte that has a bit set and records the line of that bit as the top of a range;
; :one_loop then looks for the first byte that is not all ones and records the line of its
; first clear bit as the bottom. After a range is closed the walk resumes with the next byte.
shadowBitmapToList
:top equ Tmp0
:bottom equ Tmp2
sep #$30
ldx #y_offset_rows ; Start at the third row (y_offset = 16) walk the bitmap for 25 bytes (200 lines of height)
lda #0
sta shadowListCount ; zero out the shadow list count
; This loop is called when we are not tracking a sprite range
:zero_loop
ldy shadowBitmap,x
beq :zero_next
lda mul8-y_offset_rows,x ; This is the scanline we're on (offset by the starting byte)
clc
adc offset,y ; This is the first line defined by the bit pattern
sta :top
bra :one_next
:zero_next
inx
cpx #y_height_rows+y_offset_rows+1 ; End at byte 27
bcc :zero_loop
bra :exit ; ended while not tracking a sprite, so exit the function
; This loop is used while a range is open
:one_loop
lda shadowBitmap,x ; if the next byte is all sprite, just continue
eor #$FF
beq :one_next
tay ; Use the inverted bitfield in order to re-use the same lookup table
lda mul8-y_offset_rows,x
clc
adc offset,y
ldy shadowListCount
sta shadowListBot,y
lda :top
sta shadowListTop,y
iny
sty shadowListCount
bra :zero_next
:one_next
inx
cpx #y_height_rows+y_offset_rows+1
bcc :one_loop
; If we end while tracking a sprite, add to the list as the last item
ldx shadowListCount
lda :top
sta shadowListTop,x
lda #y_height
sta shadowListBot,x
inx
stx shadowListCount
:exit
rep #$30
lda shadowListCount ; sanity check: brk $13 if the list has 64 or more entries
cmp #64
bcc *+4
brk $13
rts
; Helper to bounce into the function in the FTblPtr. See IIgs TN #90
;
; In:  A = function number; the table at [FTblPtr] is read as 4-byte entries
;      X, Y = arguments for the target (X is untouched, Y is saved in FTblTmp and restored)
;      Must be entered with JSL and 16-bit registers.
; Out: control continues in the target; the target's own RTL returns to the JSL site.
;      A holds the low 16 bits of the target address minus one when the target starts.
;
; The target address minus one is built on the stack as three bytes so that the closing RTL
; "returns" into the target.
LngJmp
sty FTblTmp
asl
asl
tay
iny
lda [FTblPtr],y ; bytes 1-2 of the entry
pha
dey
lda [FTblPtr],y ; bytes 0-1 of the entry
dec ; RTL adds one to the address it pulls
phb ; reserve one more byte on the stack ...
sta 1,s ; ... and overwrite it together with the byte above it
ldy FTblTmp ; Restore the y register
rtl
; Callback entrypoint from the GTE renderer
;
; In:  A = phase (0 or non-zero), Y / X = low / high word of the function table pointer,
;      which is stored in FTblPtr for LngJmp. 16-bit registers.
; Out: the data bank and direct page registers are restored; A, X and Y are overwritten
;
; While the callback runs, the data bank is set to the program bank and the direct page is
; loaded from DPSave.
drawOAMSprites
phb
phd
phk
plb
pha
lda DPSave
tcd
; Save the pointer to the function table
sty FTblPtr
stx FTblPtr+2
pla
; Check what phase we're in
;
; Phase 1: A = 0
; Phase 2: A = 1
cmp #0
bne :phase2
; This is phase 1. We will build the sprite list and draw the background in the areas covered by
; sprites. This phase draws the sprites, too
; We need to "freeze" the OAM values, otherwise they can change between when we build the rendering pipeline
; NOTE(review): interrupts are re-enabled unconditionally by the cli below, whatever the
; interrupt state was on entry -- confirm that the callback is always entered with them enabled.
sei
ldal nmiCount
pha
jsr scanOAMSprites ; Filter out any sprites that don't need to be drawn
pla
cmpl nmiCount
beq *+4
brk $1F ; Should not have serviced the VBL interrupt here....
cli
jsr buildShadowBitmap ; Run through and quickly create a bitmap of lines with sprites
jsr shadowBitmapToList ; Scan the bitmap and create (top, bottom) pairs of ranges
jsr drawShadowList ; Draw the background lines that have sprites on them
jsr drawSprites ; Draw the sprites on top of the lines they occupy
bra :exit
; In Phase 2 we scan the shadow list and alternately blit the background in empty areas and
; PEI slam the sprite regions
:phase2
jsr exposeShadowList ; Show everything on the SHR screen
; Return from the callback
:exit
pld
plb
rtl
; drawSprites - draw every sprite in OAM_COPY through the routine patched in at drawTilePatch
;
; In:  OAM_COPY / spriteCount as left by scanOAMSprites
; Out: 16-bit registers on return; A, X and Y are overwritten; Tmp0 is used as scratch
;
; For each 4-byte entry the tile routine is called with
;   Y = (y + 1) * 160 + $2000 - y_offset * 160 + x_offset + x / 2
;   X = tile index * 128, plus 64 when the horizontal-flip bit ($40) of the attribute is set
;   A = (attribute byte << 1) and $0146
; NOTE(review): the operand of the jsl at drawTilePatch is $000000 here and is presumably
; patched at run time; the code that does so is outside this view.
drawSprites
:tmp equ Tmp0
sep #$30 ; 8-bit cpu
; Run through the copy of the OAM memory
ldx #0
cpx spriteCount
bne oam_loop
rep #$30
rts
mx %11
oam_loop
phx ; Save x
lda OAM_COPY,x ; Y-coordinate
inc ; Compensate for PPU delayed scanline
rep #$30
and #$00FF
asl ; five shifts: y * 32 ...
asl
asl
asl
asl
sta :tmp
asl ; ... two more: y * 128, and 128 + 32 = 160 bytes per line
asl
clc
adc :tmp
clc
adc #$2000-{y_offset*160}+x_offset
sta :tmp
lda OAM_COPY+3,x ; X-coordinate in the low byte
lsr
and #$007F ; x / 2, with the bit shifted in from the high byte masked off
clc
adc :tmp
tay
lda OAM_COPY+2,x ; attribute byte in the low byte
pha
bit #$0040 ; horizontal flip
bne :hflip
lda OAM_COPY,x ; Load the tile index into the high byte (x256)
and #$FF00
lsr ; multiple by 128
tax
bra :noflip
:hflip
lda OAM_COPY,x ; Load the tile index into the high byte (x256)
and #$FF00
lsr ; multiple by 128
adc #64 ; horizontal flip (carry is clear here: the lsr shifted out a zero bit)
tax
:noflip
pla
asl
and #$0146 ; Set the vflip bit, priority, and palette select bits
drawTilePatch
jsl $000000 ; Draw the tile on the graphics screen
sep #$30
plx ; Restore the counter
inx
inx
inx
inx
cpx spriteCount
bcc oam_loop
rep #$30
rts