Get the PEI Slammer working for exposing shadowed writes

This commit is contained in:
Lucas Scharenbroich 2021-07-19 22:42:51 -05:00
parent d9e3ee14e8
commit c5eb84ea37
8 changed files with 322 additions and 129 deletions

View File

@ -15,7 +15,8 @@
"debug": "%npm_package_config_crossrunner% src\\GTETestApp -Source src\\GTETestApp_S02_MAINSEG_Output.txt -Debug -CompatibilityLayer",
"build": "%npm_package_config_merlin32% -V %npm_package_config_macros% src\\App.s",
"build:watch": "watch \"npm run build\" src",
"build:assets": "node ./tools/pngtoiigs.js ./assets/donut-plains-2-8-color.png ./emu/bg1a.bin --start-index 6 && node ./tools/pngtoiigs.js ./assets/donut-plains-2-8-color-shift.png ./emu/bg1b.bin --start-index 6 && node ./tools/pngtoiigs.js ./assets/donut-plains-1-6-color.png ./emu/fg1.bin"
"build:assets-smw": "node ./tools/pngtoiigs.js ./assets/donut-plains-2-8-color.png ./emu/bg1a.bin --start-index 6 && node ./tools/pngtoiigs.js ./assets/donut-plains-2-8-color-shift.png ./emu/bg1b.bin --start-index 6 && node ./tools/pngtoiigs.js ./assets/donut-plains-1-6-color.png ./emu/fg1.bin",
"build:assets-fatdog": "node ./tools/pngtoiigs.js ./assets/armada-7-color.png ./emu/bg1a.bin --start-index 8 && node ./tools/pngtoiigs.js ./assets/armada-7-color-shift.png ./emu/bg1b.bin --start-index 8 && node ./tools/pngtoiigs.js ./assets/armada-7-color-shuffle.png ./emu/fg1.bin --start-index 1"
},
"repository": {
"type": "git",

View File

@ -63,9 +63,11 @@ Demo
lda OneSecondCounter
sta oldOneSecondCounter
stz frameCount
:loop
lda #1
jsr MoveLeft
inc frameCount
ldal KBD_STROBE_REG
@ -74,6 +76,8 @@ Demo
and #$007F
cmp #'s'
bne :nokey
pla
rts
:nokey
@ -109,3 +113,23 @@ FPSStr str 'FPS'

View File

@ -42,7 +42,7 @@ SHR_PALETTES equ $E19E00
tiledata ext
; Feature flags
NO_INTERRUPTS equ 0 ; turn off for crossrunner debugging
NO_INTERRUPTS equ 1 ; turn off for crossrunner debugging
; Typical init
@ -1284,6 +1284,8 @@ qtRec adrl $0000

View File

@ -61,9 +61,6 @@ Render
; byte, then we may have to change the CODE_ENTRY values or restore/set new OPCODE
; values, but not both.
jsr ShadowOff
jsr ShadowOn
; It's important to do _ApplyBG0YPos first because it calculates the value of StartY % 208 which is
; used in all of the other loops
@ -72,7 +69,21 @@ Render
jsr _ApplyBG1YPos ; Adjust the index values into the BG1 bank buffer
jsr _ApplyBG1XPos ; Adjust the direct page pointers to the BG1 bank
; The code fields are locked in now and ready to be rendered
jsr ShadowOff
ldx #0 ; Blit the full virtual buffer to the screen
ldy #8
jsr _BltRange
jsr ShadowOn
ldx #0 ; Expose the top 8 rows
ldy #8
jsr _PEISlam
ldx #8 ; Blit the full virtual buffer to the screen
ldy ScreenHeight
jsr _BltRange
@ -90,8 +101,3 @@ Render

View File

@ -62,22 +62,61 @@ _ApplyBG1XPos
lda BlitterDP ; blitter direct page space and fill in the addresses
tcd
tya
ldx #162
:loop
tya
clc
adc affine,x
sta 00,x ; store the value
dec
dec
bpl *+6
dey
dey
bpl :nowrap
tya
clc
adc #164
tay
:nowrap
dex
dex
bpl :loop
pld
rts
affine ds 164
; SetAffine
;
; Pass accumulator to set every (A / 256) pitch.
;
; Fills the `affine` table (164 bytes, one word per entry) with a running offset
; that is advanced by 256 ("move to next BG1 line") each time an 8-bit fractional
; accumulator overflows.  The fraction is advanced by the step size once per entry.
;
; In:       A = step size; only the low byte is used
; Out:      affine,0 .. affine,162 filled in
; Clobbers: A, X, Y
; Assumes:  16-bit accumulator and index registers
;
; Stack frame while the loop runs:
;   1,s = fractional accumulator (0..255)
;   3,s = step size (0..255)
SetAffine
                    ldx   #0                   ; byte index into the affine table
                    ldy   #0                   ; running offset, always a multiple of 256
                    and   #$00FF
                    pha                        ; step size
                    pea   $0000                ; fractional accumulator starts at zero

:loop               lda   1,s
                    clc
                    adc   3,s
                    cmp   #256                 ; C = 1 when the fraction overflowed
                    and   #$00FF               ; always clamp to 256 (AND does not touch C)
                    sta   1,s                  ; save the remainder before A is reused below,
                                               ; otherwise the fraction is lost on every overflow
                    bcc   :skip

                    tya
                    clc                        ; C is known to be set here
                    adc   #256                 ; Move to next BG1 line
                    tay

:skip               tya
                    sta   affine,x
                    inx
                    inx
                    cpx   #164
                    bcc   :loop

                    pla                        ; drop the fractional accumulator
                    pla                        ; drop the step size
                    rts
_ClearBG1Buffer
phb
pha
@ -266,6 +305,35 @@ CopyBG1YTableToBG1Addr

View File

@ -10,33 +10,33 @@
; on the SHR screen or the current value of StartY
_BltRange
:exit_ptr equ tmp0
:jmp_low_save equ tmp2
:exit_ptr equ tmp0
:jmp_low_save equ tmp2
phb ; preserve the bank register
clc`
phb ; preserve the bank register
clc`
dey
tya ; Get the address of the line that we want to return from
adc StartY ; and create a pointer to it
asl
tay
lda BTableLow,y
sta :exit_ptr
lda BTableHigh,y
sta :exit_ptr+2
dey
tya ; Get the address of the line that we want to return from
adc StartY ; and create a pointer to it
asl
tay
lda BTableLow,y
sta :exit_ptr
lda BTableHigh,y
sta :exit_ptr+2
txa ; get the first line (0 - 199)
adc StartY ; add in the virtual offset (0, 207) -- max value of 406
asl
tax ; this is the offset into the blitter table
txa ; get the first line (0 - 199)
adc StartY ; add in the virtual offset (0, 207) -- max value of 406
asl
tax ; this is the offset into the blitter table
sep #$20 ; 8-bit Acc
lda BTableHigh,x ; patch in the bank
sta blt_entry+3
sep #$20 ; 8-bit Acc
lda BTableHigh,x ; patch in the bank
sta blt_entry+3
lda BTableLow+1,x ; patch in the page
sta blt_entry+2
lda BTableLow+1,x ; patch in the page
sta blt_entry+2
; The way we patch the exit code is subtle, but very fast. The CODE_EXIT offset points to
; a JMP/JML instruction that transitions to the next line after all of the code has been
@ -45,49 +45,53 @@ _BltRange
; The trick we use is to patch the low byte to force the code to jump to a special return
; function (jml blt_return) in the *next* code field line.
ldy #CODE_EXIT+1 ; this is a JMP or JML instruction that points to the next line.
lda [:exit_ptr],y
sta :jmp_low_save
lda #FULL_RETURN ; this is the offset of the return code
sta [:exit_ptr],y ; patch out the low byte of the JMP/JML
ldy #CODE_EXIT+1 ; this is a JMP or JML instruction that points to the next line.
lda [:exit_ptr],y
sta :jmp_low_save
lda #FULL_RETURN ; this is the offset of the return code
sta [:exit_ptr],y ; patch out the low byte of the JMP/JML
; Now we need to set up the Bank, Stack Pointer and Direct Page registers for calling into
; the code field
lda StartX
bit #$01
beq :primary
lda BG1AltBank
bra :alt
:primary lda BG1DataBank
lda StartX
bit #$01
beq :primary
lda BG1AltBank
bra :alt
:primary lda BG1DataBank
:alt
pha
plb
rep #$20
pha
plb
rep #$20
phd ; Save the application direct page
lda BlitterDP ; Set the direct page to the blitter data
tcd
phd ; Save the application direct page
lda BlitterDP ; Set the direct page to the blitter data
tcd
sei ; disable interrupts
_R0W1
tsc ; save the stack pointer
stal stk_save+1
sei ; disable interrupts
_R0W1
tsc ; save the stack pointer
stal stk_save+1
blt_entry jml $000000 ; Jump into the blitter code $XX/YY00
blt_entry jml $000000 ; Jump into the blitter code $XX/YY00
blt_return _R0W0
stk_save lda #0000 ; load the stack
tcs
cli ; re-enable interrupts
pld ; restore the direct page
blt_return _R0W0
stk_save lda #0000 ; load the stack
tcs
cli ; re-enable interrupts
pld ; restore the direct page
sep #$20
ldy #CODE_EXIT+1
lda :jmp_low_save
sta [:exit_ptr],y
rep #$20
sep #$20
ldy #CODE_EXIT+1
lda :jmp_low_save
sta [:exit_ptr],y
rep #$20
plb ; restore the bank
rts
plb ; restore the bank
rts
; Placeholder for actual sprite drawing. The implementation will be simple because
; we don't do anything sprite related; just call function pointers provided to us.
;
; Currently a stub: it returns immediately without reading or modifying any
; registers or memory, so it is safe to call from any register-width state.
_RenderSprites
rts ; nothing to draw yet

View File

@ -10,82 +10,119 @@
; 12 additional instructions, so this is an optimization that is unlikely to lead to a net
; improvement.
;
; A = base address of top-left edge of the screen
; Y = number of scanlines to blit
; X = width of the screen in bytes
PEISlam
stx :screen_width ; save the width
; X = first line (inclusive), valid range of 0 to 199
; Y = last line (exclusive), valid range >X up to 200
_PEISlam
lda ScreenWidth
dec
sta :screen_width_1 ; save the width-1 outside of the direct page
phd ; save the current direct page and assign the base
tcd ; screen address to the direct page register
clc
adc :screen_width ; screen address of the right edge (will go in stack)
tax ; but cache in x register for a bit....
lda #:pei_end ; patch the PEI entry address
and #$FFFE ; should always be even, but....
sec
sbc ScreenWidth
sta :inner+1
tsc
sta :stk_save ; save the stack pointer to restore later
phx
tya
sec
sbc 1,s
ply
tay ; get the number of lines in the y register
lda #:pei_end ; patch the PEI entry address
sec
sbc :screen_width
sta :inner+1
txa
asl
tax
lda RTable,x ; This is the right visible byte, so add one to get the
tax ; left visible byte (cache in x-reg)
sec
sbc ScreenWidth
inc
clc ; clear before the loop -- nothing in the loop affect the carry bit
brl :outer ; hop into the entry point.
phd ; save the current direct page and assign the base
tcd ; screen address to the direct page register
]dp equ 158
lup 80 ; A full width screen is 160 bytes / 80 words
pei ]dp
]dp equ ]dp-2
--^
tsc
sta :stk_save ; save the stack pointer to restore later
clc ; clear before the loop -- nothing in the loop affect the carry bit
brl :outer ; hop into the entry point.
]dp equ 158
lup 80 ; A full width screen is 160 bytes / 80 words
pei ]dp
]dp equ ]dp-2
--^
:pei_end
tdc ; Move to the next line
adc #160
tcd
adc :screen_width
tcs
tdc ; Move to the next line
adc #160
tcd
adc :screen_width_1
tcs
dey ; decrement the total counter, if zero then we're done
beq :exit
dey ; decrement the total counter, if zero then we're done
beq :exit
dex ; decrement the inner counter. Both counters are set
beq :restore ; up so that they fall-through by default to save a cycle
; per loop iteration.
dex ; decrement the inner counter. Both counters are set
beq :restore ; up so that they fall-through by default to save a cycle
; per loop iteration.
:inner jmp $0000 ; 25 cycles of overhead per line. A full width slam executes all
; 80 of the PEI instructions which we expect to take 7 cycles
; since the direct page is not aligned. So total overhead is
; 25 / (25 + 7 * 80) = 4.27% of execution
;
; Without the interrupt breaks, we could remove the dex/beq test
; and save 4 cycles per loop which takes the overhead down to
; only 3.6%
:inner jmp $0000 ; 25 cycles of overhead per line. A full width slam executes all
; 80 of the PEI instructions which we expect to take 7 cycles
; since the direct page is not aligned. So total overhead is
; 25 / (25 + 7 * 80) = 4.27% of execution
;
; Without the interrupt breaks, we could remove the dex/beq test
; and save 4 cycles per loop which takes the overhead down to
; only 3.6%
:restore
tsx ; save the current stack
_R0W0 ; restore the execution environment and
lda :stk_save ; give a few cycles to catch some interrupts
tcs
cli ; fall through here -- saves a BRA instruction
tsx ; save the current stack
_R0W0 ; restore the execution environment and
lda :stk_save ; give a few cycles to catch some interrupts
tcs
cli ; fall through here -- saves a BRA instruction
:outer
sei
txs ; set the stack address to the right edge
ldx #8 ; Enable interrupts at least once every 8 lines
_R1W1
bra :inner
sei
txs ; set the stack address to the right edge
ldx #8 ; Enable interrupts at least once every 8 lines
_R1W1
bra :inner
:exit
_R0W0
lda :stk_save
tcs
cli
_R0W0
lda :stk_save
tcs
cli
pld
rts
:stk_save ds 2
:screen_width_1 ds 2
pld
rts
:stk_save ds 2
:screen_width ds 2

View File

@ -186,6 +186,57 @@ CopyTile
sta $7001,y
rts
; Primitives to render a dynamic tile
;
; LDA 00,x / PHA where the operand is fixed when the tile is rendered
; $B5 $00 $48
;
; A = dynamic tile id (must be an 8-bit value)
;
; Patches the same six bytes into each of the eight locations $0000,y, $1000,y,
; ... $7000,y.  After the routine runs, each location holds:
;
;   +0: $B5 id+2     LDA id+2,x
;   +2: $48          PHA
;   +3: $B5 id       LDA id,x
;   +5: $48          PHA
;
; Y = base offset used by every store (presumably the tile's position within the
;     code field -- TODO confirm against the caller)
;
; The word stores assume a 16-bit accumulator on entry.  The accumulator is
; switched to 8 bits for the opcode stores and back to 16 bits before returning;
; its value is not preserved.
;
; NOTE(review): the second operand is formed with a 16-bit INC / INC, so an id of
; $FE or $FF would carry into the high byte and corrupt the $48 (PHA) opcode.
; Presumably callers never pass such ids -- confirm.
:DynTile
and #$00FF ; keep only the 8-bit tile id (the direct page operand)
ora #$4800 ; high byte = $48 (PHA), so one word store writes operand + PHA
sta: $0004,y ; operand and PHA of the second LDA/PHA pair (bytes +4, +5)
sta $1004,y
sta $2004,y
sta $3004,y
sta $4004,y
sta $5004,y
sta $6004,y
sta $7004,y
inc ; advance the operand by two bytes for the first pair
inc
sta: $0001,y ; operand and PHA of the first LDA/PHA pair (bytes +1, +2)
sta $1001,y
sta $2001,y
sta $3001,y
sta $4001,y
sta $5001,y
sta $6001,y
sta $7001,y
sep #$20 ; 8-bit accumulator so that only the opcode bytes are written
lda #$B5 ; opcode for LDA dp,x
sta: $0000,y ; opcode bytes at +0 and +3 of each location
sta: $0003,y
sta $1000,y
sta $1003,y
sta $2000,y
sta $2003,y
sta $3000,y
sta $3003,y
sta $4000,y
sta $4003,y
sta $5000,y
sta $5003,y
sta $6000,y
sta $6003,y
sta $7000,y
sta $7003,y
rep #$20 ; back to a 16-bit accumulator for the caller
rts