chiptune: seem to have stripped-down rasterbars going

This commit is contained in:
Vince Weaver 2018-02-26 12:36:36 -05:00
parent 2ac79fa80a
commit 7481dbd72c
6 changed files with 194 additions and 133 deletions

View File

@ -46,6 +46,7 @@ hlin_double:
;=================================
; hlin_double_continue: width
;=================================
; GBASL has correct offset for row/col
; width in X
hlin_double_continue:

View File

@ -252,6 +252,51 @@ reasonable sized songs on our play list (KRW(3) in table at end).
For double buffer, then we need 256*2*14*2 (14k) for decompress
and 16k for file size which still works.
VISUALIZATION
~~~~~~~~~~~~~
Originally I had the volume bars and rasterbars in userspace running,
so it didn't matter how long they took to draw (they'd just get a worse
frame rate if the interrupts were taking a while).
But then I had to move the decompression to userspace, and the visualization
into the interrupt handler.
Then things got interesting. The visualization was taking so much time that
userspace was starved and decompression was not finishing in time, so the
sound was corrupted and finished early.
Thus it was time for some cycle analysis. Here's what I found.
Approx max 20,000 cycles in an interrupt
1,500 used by music decode
7,500 used by volume bars
16,200 (!) used by raster bars
2,000 for misc rest
So the problem can be seen here! That 16,200 for raster bars was
worst-case, it usually would have been a little less.
It takes roughly 700,000 cycles to LZ4 decode a block, so even with
no interrupts it can take 35 frames (0.7s) to finish.
I added a variable TIME_TAKEN ($88) that you can use to find out how
long the last LZ4 decode took.
With rasterbars turned off:
INTRO2: 60@19, 60@36, 62@50 61@1:03 61@1:32 60@2:05 61@2:32
So roughly $60 (96) frames, or about 2 seconds.
I went in and optimized the rasterbars code a lot and got it down to
about 10k cycles worst case (6k probably average case).
So now it takes $A0 (160) frames, or about 3 seconds. This seems to
be workable.
FIGURES/TABLES
~~~~~~~~~~~~~~
Memory Map
@ -321,29 +366,9 @@ Interesting bugs that were hard to debug:
one too far and it was writing A0 to the first byte of the
hlin routine, and A0 is a LDY # instruction.
Know the current problem, taking longer than 5s to decode file.
Thought it only took 1s max? Not in face of interrupts.
Every 20,000 an interrupt
1,500 for music
7,500 for volume bars
16,200 (!)for raster bars
2,000 for misc rest
Roughly 13,000 cycles, leaving only 7000 to userspace
If takes 700,000 cycles to decode a block, will take 100
Hz cycles, or 2s to finish? that should be doable.
why does it instead take 15?
Can play fine if I turn the raster bars off.
TIME_TAKEN ($88) stores how long took to decode
INTRO2: 60@19, 60@36, 62@50 61@1:03 61@1:32 60@2:05 61@2:32
+ Our old friend, forget the # so we're comparing against some random
zero page value rather than a constsant
+ Our old friend: forgetting the '#' so we're comparing against some random
zero page value rather than a constant
+ Related: I accidentally put in a $ when I meant for it to be decimal.
I was copying to $14 pages instead of 14, overwriting the DOS buffers
which I didn't notice until I tried to load the next file.

View File

@ -1,6 +1,3 @@
+ Loop support
+ Right/Left arrows on screen
+ Keyboard, Right/Left/Pause
+ Calculate maximum decode time for songs
+ Song loop support (hard if looping to the middle of a block)
+ Put graphics update in interrupt routine, put debug in normal space
+ Fix the dos33 tool, it has issues when the disk is nearly full

View File

@ -224,7 +224,6 @@ done_play:
jsr clear_bottoms
jsr new_song
cli ; re-enable interrupts

View File

@ -266,7 +266,7 @@ done_interrupt:
;============================
jsr clear_top
; jsr draw_rasters
jsr draw_rasters
jsr volume_bars
jsr page_flip

View File

@ -1,5 +1,12 @@
; Not quite a raster-bar, but why not
; OPTIMIZATION (as originally it was 16,200 cycles, a bit much
; for a max 20,000 cycle interrupt handler)
; -120 : Unroll the zero loop, saved 120 cycles
; -5000 : Inline the vlin_double code
;===========
; CONSTANTS
;===========
@ -20,46 +27,63 @@ draw_rasters:
; clear rows
ldy #(NUM_ROWS-1) ; 2
; ldy #(NUM_ROWS-1) ; 2
lda #0 ; 2
init_rows:
sta row_color,Y ; 5
dey ; 2
bpl init_rows ; 2nt/3
;==============
; 4+20*10 = 204
sta row_color+0 ; 4
sta row_color+1
sta row_color+2
sta row_color+3
sta row_color+4
sta row_color+5
sta row_color+6
sta row_color+7
sta row_color+8
sta row_color+9
sta row_color+10
sta row_color+11
sta row_color+12
sta row_color+13
sta row_color+14
sta row_color+15
sta row_color+16
sta row_color+17
sta row_color+18
sta row_color+19
; sta row_color,Y ; 5
; dey ; 2
; bpl init_rows ; 2nt/3
;==============
; Originally 4+20*10 = 204 cycles / 10 bytes
; now 2+4*20 = 82 cycles / 62 bytes
;================
; set colors
lda #COLOR_BOTH_AQUA ; aqua ; 2
ldy SCREEN_Y ; 3
lda #COLOR_BOTH_DARKBLUE ; dark blue ; 2
jsr set_row_color ; 6+136
lda #COLOR_BOTH_MEDIUMBLUE ; medium blue ; 2
lda #COLOR_BOTH_MEDIUMBLUE ; medium blue ; 2
jsr set_row_color ; 6+136
lda #COLOR_BOTH_LIGHTGREEN ; light green ; 2
lda #COLOR_BOTH_AQUA ; aqua ; 2
jsr set_row_color ; 6+136
lda #COLOR_BOTH_DARKGREEN ; green ; 2
jsr set_row_color ; 6+136
lda #COLOR_BOTH_YELLOW ; yellow ; 2
jsr set_row_color ; 6+136
lda #COLOR_BOTH_ORANGE ; orange ; 2
jsr set_row_color ; 6+136
lda #COLOR_BOTH_PINK ; pink ; 2
lda #COLOR_BOTH_PINK ; pink ; 2
jsr set_row_color ; 6+136
lda #COLOR_BOTH_RED ; red ; 2
jsr set_row_color ; 6+136
;==============
; 8 * 144
; 1152
; new = 5 * 142 = 710
; original = 1152
;=================
; draw rows
@ -71,23 +95,38 @@ draw_rows_loop:
sta COLOR ; 3
sty TEMPY ; 3
tya ; 2
pha ; 3
asl ; 2
tay ; 2
; hlin_setup inlined
lda gr_offsets,Y ; lookup low-res memory address ; 4
sta GBASL ; 3
iny ; 2
lda gr_offsets,Y ; 4
clc ; 2
adc DRAW_PAGE ; add in draw page offset ; 3
sta GBASH ; 3
ldy #39 ; 2
sty V2 ; 3
ldy #0 ; 2
jsr hlin_double ; hlin y,V2 at A ; 63+(40*16)
pla ; 4
tay ; 2
lda COLOR ; 3
double_loop:
sta (GBASL),Y ; 6
dey ; 2
bpl double_loop ; 2nt/3
ldy TEMPY ; 3
draw_rows_skip:
dey ; 2
bpl draw_rows_loop ; 3/2nt
;=============
; 20 * 741
; 14,820
;==============================
; Original: 20 * 741 = 14,820
; new = 2+ 20*(53+11*40)=9862
; (note, worst case)
;==================
; update y pointer
;==================
@ -104,8 +143,10 @@ not_there:
;===========
; 24
;=============================
; total= 16,200
;=====================================
; original total= 16,200
; new total (worst case)= 10,678
; (realistic) = 5,748
;===================
;===================
@ -125,9 +166,7 @@ set_row_color:
lda fine_sine,X ; lookup sine value ; 4
; pre-shifted right by 4, sign-extended
clc ; 2
adc #18 ; add in 18 to center on screen ; 2
; 18 added to center
sin_no_more:
@ -144,7 +183,7 @@ sin_no_more:
rts ; 6
;=============
; 136
; 132
;==================
; put_color
@ -187,74 +226,74 @@ row_color:
.byte $00,$00,$00,$00,$00, $00,$00,$00,$00,$00
.byte $00,$00,$00,$00,$00, $00,$00,$00,$00,$00
; arithmatically shifted right by 4
; arithmetically shifted right by 4, sign extended, added 18 to center
; FIXME: exploit symmetry and get rid of 3/4 of this table
; possibly not worth the extra code
fine_sine:
.byte $00 ; 0.000000
.byte $01 ; 0.098017
.byte $03 ; 0.195090
.byte $04 ; 0.290285
.byte $06 ; 0.382683
.byte $07 ; 0.471397
.byte $08 ; 0.555570
.byte $0A ; 0.634393
.byte $0B ; 0.707107
.byte $0C ; 0.773010
.byte $0D ; 0.831470
.byte $0E ; 0.881921
.byte $0E ; 0.923880
.byte $0F ; 0.956940
.byte $0F ; 0.980785
.byte $0F ; 0.995185
.byte $0F ; 1.000000
.byte $0F ; 0.995185
.byte $0F ; 0.980785
.byte $0F ; 0.956940
.byte $0E ; 0.923880
.byte $0E ; 0.881921
.byte $0D ; 0.831470
.byte $0C ; 0.773010
.byte $0B ; 0.707107
.byte $0A ; 0.634393
.byte $08 ; 0.555570
.byte $07 ; 0.471397
.byte $06 ; 0.382683
.byte $04 ; 0.290285
.byte $03 ; 0.195090
.byte $01 ; 0.098017
.byte $00 ; 0.000000
.byte $00+18 ; 0.000000
.byte $01+18 ; 0.098017
.byte $03+18 ; 0.195090
.byte $04+18 ; 0.290285
.byte $06+18 ; 0.382683
.byte $07+18 ; 0.471397
.byte $08+18 ; 0.555570
.byte $0A+18 ; 0.634393
.byte $0B+18 ; 0.707107
.byte $0C+18 ; 0.773010
.byte $0D+18 ; 0.831470
.byte $0E+18 ; 0.881921
.byte $0E+18 ; 0.923880
.byte $0F+18 ; 0.956940
.byte $0F+18 ; 0.980785
.byte $0F+18 ; 0.995185
.byte $0F+18 ; 1.000000
.byte $0F+18 ; 0.995185
.byte $0F+18 ; 0.980785
.byte $0F+18 ; 0.956940
.byte $0E+18 ; 0.923880
.byte $0E+18 ; 0.881921
.byte $0D+18 ; 0.831470
.byte $0C+18 ; 0.773010
.byte $0B+18 ; 0.707107
.byte $0A+18 ; 0.634393
.byte $08+18 ; 0.555570
.byte $07+18 ; 0.471397
.byte $06+18 ; 0.382683
.byte $04+18 ; 0.290285
.byte $03+18 ; 0.195090
.byte $01+18 ; 0.098017
.byte $00+18 ; 0.000000
.byte $FE ; -0.098017
.byte $FC ; -0.195090
.byte $FB ; -0.290285
.byte $F9 ; -0.382683
.byte $F8 ; -0.471397
.byte $F7 ; -0.555570
.byte $F5 ; -0.634393
.byte $F4 ; -0.707107
.byte $F3 ; -0.773010
.byte $F2 ; -0.831470
.byte $F1 ; -0.881921
.byte $F1 ; -0.923880
.byte $F0 ; -0.956940
.byte $F0 ; -0.980785
.byte $F0 ; -0.995185
.byte $F0 ; -1.000000
.byte $F0 ; -0.995185
.byte $F0 ; -0.980785
.byte $F0 ; -0.956940
.byte $F1 ; -0.923880
.byte $F1 ; -0.881921
.byte $F2 ; -0.831470
.byte $F3 ; -0.773010
.byte $F4 ; -0.707107
.byte $F5 ; -0.634393
.byte $F7 ; -0.555570
.byte $F8 ; -0.471397
.byte $F9 ; -0.382683
.byte $FB ; -0.290285
.byte $FC ; -0.195090
.byte $FE ; -0.098017
.byte ($FE+18)&$ff ; -0.098017
.byte ($FC+18)&$ff ; -0.195090
.byte ($FB+18)&$ff ; -0.290285
.byte ($F9+18)&$ff ; -0.382683
.byte ($F8+18)&$ff ; -0.471397
.byte ($F7+18)&$ff ; -0.555570
.byte ($F5+18)&$ff ; -0.634393
.byte ($F4+18)&$ff ; -0.707107
.byte ($F3+18)&$ff ; -0.773010
.byte ($F2+18)&$ff ; -0.831470
.byte ($F1+18)&$ff ; -0.881921
.byte ($F1+18)&$ff ; -0.923880
.byte ($F0+18)&$ff ; -0.956940
.byte ($F0+18)&$ff ; -0.980785
.byte ($F0+18)&$ff ; -0.995185
.byte ($F0+18)&$ff ; -1.000000
.byte ($F0+18)&$ff ; -0.995185
.byte ($F0+18)&$ff ; -0.980785
.byte ($F0+18)&$ff ; -0.956940
.byte ($F1+18)&$ff ; -0.923880
.byte ($F1+18)&$ff ; -0.881921
.byte ($F2+18)&$ff ; -0.831470
.byte ($F3+18)&$ff ; -0.773010
.byte ($F4+18)&$ff ; -0.707107
.byte ($F5+18)&$ff ; -0.634393
.byte ($F7+18)&$ff ; -0.555570
.byte ($F8+18)&$ff ; -0.471397
.byte ($F9+18)&$ff ; -0.382683
.byte ($FB+18)&$ff ; -0.290285
.byte ($FC+18)&$ff ; -0.195090
.byte ($FE+18)&$ff ; -0.098017