From 7481dbd72c0469f44348567aeb42fedbe82817ae Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Mon, 26 Feb 2018 12:36:36 -0500 Subject: [PATCH] chiptune: seem to have stripped-down rasterbars going --- asm_routines/gr_hlin.s | 1 + chiptune_player/README.chiptune | 73 ++++++--- chiptune_player/TODO | 7 +- chiptune_player/chiptune_player.s | 1 - chiptune_player/interrupt_handler.s | 2 +- chiptune_player/rasterbars.s | 243 ++++++++++++++++------------ 6 files changed, 194 insertions(+), 133 deletions(-) diff --git a/asm_routines/gr_hlin.s b/asm_routines/gr_hlin.s index fb9d816e..29efa598 100644 --- a/asm_routines/gr_hlin.s +++ b/asm_routines/gr_hlin.s @@ -46,6 +46,7 @@ hlin_double: ;================================= ; hlin_double_continue: width ;================================= + ; GBASL has correct offset for row/col ; width in X hlin_double_continue: diff --git a/chiptune_player/README.chiptune b/chiptune_player/README.chiptune index 5cb355e1..350a8a0f 100644 --- a/chiptune_player/README.chiptune +++ b/chiptune_player/README.chiptune @@ -252,6 +252,51 @@ reasonable sized songs on our play list (KRW(3) in table at end). For double buffer, then we need 256*2*14*2 (14k) for decompress and 16k for file size which still works. +VISUALIZATION +~~~~~~~~~~~~~ + + Originally I had the volume bars and rasterbars in userspace running, + so it didn't matter how long they took to draw (they'd just get a worse + frame rate if the interrupts were taking a while). + + But then I had to move the decompression to userspace, and the visualization + into the interrupt handler. + + Then things got interesting. The visualization was taking so much time that + userspace was starved and decompression was not finishing in time, so the + sound was corrupted and finished early. + + Thus it was time for some cycle analysis. Here's what I found. + + Approx max 20,000 cycles in an interrupt + 1,500 used by music decode + 7,500 used by volume bars + 16,200 (!) 
used by raster bars + 2,000 for misc rest + + So the problem can be seen here! That 16,200 for raster bars was + worst-case, it usually would have been a little less. + + It takes roughly 700,000 cycles to LZ4 decode a block, so even with + no interrupts it can take 35 frames (0.7s) to finish. + + I added a variable TIME_TAKEN ($88) that you can use to find out how + long the last LZ4 decode took. + + With rasterbars turned off: + INTRO2: 60@19, 60@36, 62@50 61@1:03 61@1:32 60@2:05 61@2:32 + + So roughly $60 (96) frames, or about 2 seconds. + + I went in and optimized the rasterbars code a lot and got it down to + about 10k cycles worst case (6k probably average case). + + So now it takes $A0 (160) frames, or about 3 seconds. This seems to + be workable. + + +FIGURES/TABLES +~~~~~~~~~~~~~~ Memory Map @@ -321,29 +366,9 @@ Interesting bugs that were hard to debug: one too far and it was writing A0 to the first byte of the hlin routine, and A0 is a LDY # instruction. - - - -Know the current problem, taking longer than 5s to decode file. -Thought it only took 1s max? Not in face of interrupts. - - Every 20,000 an interrupt - 1,500 for music - 7,500 for volume bars - 16,200 (!)for raster bars - 2,000 for misc rest - - Roughly 13,000 cycles, leaving only 7000 to userspace - - If takes 700,000 cycles to decode a block, will take 100 - Hz cycles, or 2s to finish? that should be doable. - why does it instead take 15? - Can play fine if I turn the raster bars off. - - TIME_TAKEN ($88) stores how long took to decode - INTRO2: 60@19, 60@36, 62@50 61@1:03 61@1:32 60@2:05 61@2:32 - -+ Our old friend, forget the # so we're comparing against some random - zero page value rather than a constsant ++ Our old friend: forgetting the '#' so we're comparing against some random + zero page value rather than a constant + Related, the accidentally put in a $ when I meant for it to be decimal. 
+ I was copying to $14 pages instead of 14, overwriting the DOS buffers + which I didn't notice until I tried to load the next file. diff --git a/chiptune_player/TODO b/chiptune_player/TODO index 020c4f33..cac34089 100644 --- a/chiptune_player/TODO +++ b/chiptune_player/TODO @@ -1,6 +1,3 @@ -+ Loop support -+ Right/Left arrows on screen -+ Keyboard, Right/Left/Pause -+ Calculate maximum decode time for songs ++ Song loop support (hard if looping to the middle of a block) -+ Put graphics update in interrupt routine, put debug in normal space ++ Fix the dos33 tool, it has issues when the disk is nearly full diff --git a/chiptune_player/chiptune_player.s b/chiptune_player/chiptune_player.s index 581b04b0..bf926fce 100644 --- a/chiptune_player/chiptune_player.s +++ b/chiptune_player/chiptune_player.s @@ -224,7 +224,6 @@ done_play: jsr clear_bottoms - jsr new_song cli ; re-enable interrupts diff --git a/chiptune_player/interrupt_handler.s b/chiptune_player/interrupt_handler.s index d66873ef..36d42fd5 100644 --- a/chiptune_player/interrupt_handler.s +++ b/chiptune_player/interrupt_handler.s @@ -266,7 +266,7 @@ done_interrupt: ;============================ jsr clear_top -; jsr draw_rasters + jsr draw_rasters jsr volume_bars jsr page_flip diff --git a/chiptune_player/rasterbars.s b/chiptune_player/rasterbars.s index 6608bc99..ea9d1314 100644 --- a/chiptune_player/rasterbars.s +++ b/chiptune_player/rasterbars.s @@ -1,5 +1,12 @@ ; Not quite a raster-bar, but why not + +; OPTIMIZATION (as originally it was 16,200 cycles, a bit much +; for a max 20,000 cycle interrupt handler) + +; -120 : Unroll the zero loop, saved 120 cycles +; -5000 : Inline the hlin_double code + ;=========== ; CONSTANTS ;=========== @@ -20,46 +27,63 @@ draw_rasters: ; clear rows - ldy #(NUM_ROWS-1) ; 2 +; ldy #(NUM_ROWS-1) ; 2 lda #0 ; 2 init_rows: - sta row_color,Y ; 5 - dey ; 2 - bpl init_rows ; 2nt/3 - ;============== - ; 4+20*10 = 204 + sta row_color+0 ; 4 + sta row_color+1 + sta row_color+2 + 
sta row_color+3 + sta row_color+4 + sta row_color+5 + sta row_color+6 + sta row_color+7 + sta row_color+8 + sta row_color+9 + sta row_color+10 + sta row_color+11 + sta row_color+12 + sta row_color+13 + sta row_color+14 + sta row_color+15 + sta row_color+16 + sta row_color+17 + sta row_color+18 + sta row_color+19 + + +; sta row_color,Y ; 5 +; dey ; 2 +; bpl init_rows ; 2nt/3 + ;============== + ; Originally 4+20*10 = 204 cycles / 10 bytes + ; now 2+4*20 = 82 cycles / 62 bytes + ;================ ; set colors - lda #COLOR_BOTH_AQUA ; aqua ; 2 ldy SCREEN_Y ; 3 + + lda #COLOR_BOTH_DARKBLUE ; dark blue ; 2 jsr set_row_color ; 6+136 - lda #COLOR_BOTH_MEDIUMBLUE ; medium blue ; 2 + lda #COLOR_BOTH_MEDIUMBLUE ; medium blue ; 2 jsr set_row_color ; 6+136 - lda #COLOR_BOTH_LIGHTGREEN ; light green ; 2 + lda #COLOR_BOTH_AQUA ; aqua ; 2 jsr set_row_color ; 6+136 - lda #COLOR_BOTH_DARKGREEN ; green ; 2 - jsr set_row_color ; 6+136 - - lda #COLOR_BOTH_YELLOW ; yellow ; 2 - jsr set_row_color ; 6+136 - - lda #COLOR_BOTH_ORANGE ; orange ; 2 - jsr set_row_color ; 6+136 - - lda #COLOR_BOTH_PINK ; pink ; 2 + lda #COLOR_BOTH_PINK ; pink ; 2 jsr set_row_color ; 6+136 lda #COLOR_BOTH_RED ; red ; 2 jsr set_row_color ; 6+136 + ;============== - ; 8 * 144 - ; 1152 + ; new = 5 * 142 = 710 + ; original = 1152 ;================= ; draw rows @@ -71,23 +95,38 @@ draw_rows_loop: sta COLOR ; 3 - + sty TEMPY ; 3 tya ; 2 - pha ; 3 asl ; 2 + tay ; 2 + + ; hlin_setup inlined + + lda gr_offsets,Y ; lookup low-res memory address ; 4 + sta GBASL ; 3 + iny ; 2 + lda gr_offsets,Y ; 4 + clc ; 2 + adc DRAW_PAGE ; add in draw page offset ; 3 + sta GBASH ; 3 ldy #39 ; 2 - sty V2 ; 3 - ldy #0 ; 2 - jsr hlin_double ; hlin y,V2 at A ; 63+(40*16) - pla ; 4 - tay ; 2 + lda COLOR ; 3 +double_loop: + sta (GBASL),Y ; 6 + dey ; 2 + bpl double_loop ; 2nt/3 + + ldy TEMPY ; 3 + draw_rows_skip: dey ; 2 bpl draw_rows_loop ; 3/2nt - ;============= - ; 20 * 741 - ; 14,820 + + ;============================== + ; Original: 20 * 741 = 
14,820 + ; 
new = 2+ 20*(53+11*40)=9862 + ; (note, worst case) ;================== ; update y pointer ;================== @@ -104,8 +143,10 @@ not_there: ;=========== ; 24 - ;============================= - ; total= 16,200 + ;===================================== + ; original total= 16,200 + ; new total (worst case)= 10,678 + ; (realistic) = 5,748 ;=================== ;=================== @@ -125,9 +166,7 @@ set_row_color: lda fine_sine,X ; lookup sine value ; 4 ; pre-shifted right by 4, sign-extended - - clc ; 2 - adc #18 ; add in 18 to center on screen ; 2 + ; 18 added to center sin_no_more: @@ -144,7 +183,7 @@ sin_no_more: rts ; 6 ;============= - ; 136 + ; 132 ;================== ; put_color @@ -187,74 +226,74 @@ row_color: .byte $00,$00,$00,$00,$00, $00,$00,$00,$00,$00 .byte $00,$00,$00,$00,$00, $00,$00,$00,$00,$00 -; arithmatically shifted right by 4 +; arithmatically shifted right by 4, sign extended, added 18 to center ; FIXME: exploit symmetry and get rid of 3/4 of this table ; possibly not worth the extra code fine_sine: -.byte $00 ; 0.000000 -.byte $01 ; 0.098017 -.byte $03 ; 0.195090 -.byte $04 ; 0.290285 -.byte $06 ; 0.382683 -.byte $07 ; 0.471397 -.byte $08 ; 0.555570 -.byte $0A ; 0.634393 -.byte $0B ; 0.707107 -.byte $0C ; 0.773010 -.byte $0D ; 0.831470 -.byte $0E ; 0.881921 -.byte $0E ; 0.923880 -.byte $0F ; 0.956940 -.byte $0F ; 0.980785 -.byte $0F ; 0.995185 -.byte $0F ; 1.000000 -.byte $0F ; 0.995185 -.byte $0F ; 0.980785 -.byte $0F ; 0.956940 -.byte $0E ; 0.923880 -.byte $0E ; 0.881921 -.byte $0D ; 0.831470 -.byte $0C ; 0.773010 -.byte $0B ; 0.707107 -.byte $0A ; 0.634393 -.byte $08 ; 0.555570 -.byte $07 ; 0.471397 -.byte $06 ; 0.382683 -.byte $04 ; 0.290285 -.byte $03 ; 0.195090 -.byte $01 ; 0.098017 -.byte $00 ; 0.000000 +.byte $00+18 ; 0.000000 +.byte $01+18 ; 0.098017 +.byte $03+18 ; 0.195090 +.byte $04+18 ; 0.290285 +.byte $06+18 ; 0.382683 +.byte $07+18 ; 0.471397 +.byte $08+18 ; 0.555570 +.byte $0A+18 ; 0.634393 +.byte $0B+18 ; 0.707107 +.byte $0C+18 
; 0.773010 +.byte $0D+18 ; 0.831470 +.byte $0E+18 ; 0.881921 +.byte $0E+18 ; 0.923880 +.byte $0F+18 ; 0.956940 +.byte $0F+18 ; 0.980785 +.byte $0F+18 ; 0.995185 +.byte $0F+18 ; 1.000000 +.byte $0F+18 ; 0.995185 +.byte $0F+18 ; 0.980785 +.byte $0F+18 ; 0.956940 +.byte $0E+18 ; 0.923880 +.byte $0E+18 ; 0.881921 +.byte $0D+18 ; 0.831470 +.byte $0C+18 ; 0.773010 +.byte $0B+18 ; 0.707107 +.byte $0A+18 ; 0.634393 +.byte $08+18 ; 0.555570 +.byte $07+18 ; 0.471397 +.byte $06+18 ; 0.382683 +.byte $04+18 ; 0.290285 +.byte $03+18 ; 0.195090 +.byte $01+18 ; 0.098017 +.byte $00+18 ; 0.000000 -.byte $FE ; -0.098017 -.byte $FC ; -0.195090 -.byte $FB ; -0.290285 -.byte $F9 ; -0.382683 -.byte $F8 ; -0.471397 -.byte $F7 ; -0.555570 -.byte $F5 ; -0.634393 -.byte $F4 ; -0.707107 -.byte $F3 ; -0.773010 -.byte $F2 ; -0.831470 -.byte $F1 ; -0.881921 -.byte $F1 ; -0.923880 -.byte $F0 ; -0.956940 -.byte $F0 ; -0.980785 -.byte $F0 ; -0.995185 -.byte $F0 ; -1.000000 -.byte $F0 ; -0.995185 -.byte $F0 ; -0.980785 -.byte $F0 ; -0.956940 -.byte $F1 ; -0.923880 -.byte $F1 ; -0.881921 -.byte $F2 ; -0.831470 -.byte $F3 ; -0.773010 -.byte $F4 ; -0.707107 -.byte $F5 ; -0.634393 -.byte $F7 ; -0.555570 -.byte $F8 ; -0.471397 -.byte $F9 ; -0.382683 -.byte $FB ; -0.290285 -.byte $FC ; -0.195090 -.byte $FE ; -0.098017 +.byte ($FE+18)&$ff ; -0.098017 +.byte ($FC+18)&$ff ; -0.195090 +.byte ($FB+18)&$ff ; -0.290285 +.byte ($F9+18)&$ff ; -0.382683 +.byte ($F8+18)&$ff ; -0.471397 +.byte ($F7+18)&$ff ; -0.555570 +.byte ($F5+18)&$ff ; -0.634393 +.byte ($F4+18)&$ff ; -0.707107 +.byte ($F3+18)&$ff ; -0.773010 +.byte ($F2+18)&$ff ; -0.831470 +.byte ($F1+18)&$ff ; -0.881921 +.byte ($F1+18)&$ff ; -0.923880 +.byte ($F0+18)&$ff ; -0.956940 +.byte ($F0+18)&$ff ; -0.980785 +.byte ($F0+18)&$ff ; -0.995185 +.byte ($F0+18)&$ff ; -1.000000 +.byte ($F0+18)&$ff ; -0.995185 +.byte ($F0+18)&$ff ; -0.980785 +.byte ($F0+18)&$ff ; -0.956940 +.byte ($F1+18)&$ff ; -0.923880 +.byte ($F1+18)&$ff ; -0.881921 +.byte ($F2+18)&$ff ; 
-0.831470 +.byte ($F3+18)&$ff ; -0.773010 +.byte ($F4+18)&$ff ; -0.707107 +.byte ($F5+18)&$ff ; -0.634393 +.byte ($F7+18)&$ff ; -0.555570 +.byte ($F8+18)&$ff ; -0.471397 +.byte ($F9+18)&$ff ; -0.382683 +.byte ($FB+18)&$ff ; -0.290285 +.byte ($FC+18)&$ff ; -0.195090 +.byte ($FE+18)&$ff ; -0.098017