Fix reorganized 6502 decompress_faster. Approx 3-4% faster, LZSA2 adds 1 byte.

John Brandwood 2021-11-22 17:02:37 -05:00
parent 978c44eca7
commit 4b046625e6
2 changed files with 159 additions and 165 deletions
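
For orientation, both files below implement the same command loop: read a token byte, copy literals from the compressed stream, resolve a match offset that points back into the already-decompressed output, copy the match, and loop. A rough C sketch of the LZSA1 flavour of that loop; the extended-length sentinels and the in-stream end-of-data marker are elided (this sketch simply stops at src_end), and all names are illustrative rather than taken from the commit:

#include <stdint.h>

/* Token: O|LLL|MMMM. O selects an 8-bit ($FFxx) or 16-bit match offset,
 * LLL is the literal count (7 = extended), MMMM+3 is the match length
 * (15 -> extended, hence the cmp #$12 in the code below). */
void lzsa1_sketch(const uint8_t *src, const uint8_t *src_end, uint8_t *dst)
{
    while (src < src_end) {
        uint8_t token = *src++;                  /* lzsa_cmdbuf            */

        unsigned lit = (token >> 4) & 7;         /* .cp_length             */
        if (lit == 7)
            lit += *src++;                       /* first .get_length step */
        while (lit--)
            *dst++ = *src++;                     /* .cp_byte loop          */

        uint16_t off = 0xFF00 | *src++;          /* .lz_offset: lo byte    */
        if (token & 0x80)                        /* long-offset bit        */
            off = (off & 0x00FF) | (uint16_t)(*src++ << 8);

        unsigned len = (token & 0x0F) + 3;       /* .lz_length             */
        if (len == 18)
            len += *src++;                       /* first .get_length step */

        const uint8_t *win = dst + (int16_t)off; /* lzsa_winptr, offset<0  */
        while (len--)
            *dst++ = *win++;                     /* .lz_byte loop          */
    }
}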

File 1 of 2: the LZSA1 decompressor (DECOMPRESS_LZSA1_FAST).

@@ -7,7 +7,7 @@
 ;
 ; This code is written for the ACME assembler.
 ;
-; The code is 167 bytes for the small version, and 192 bytes for the normal.
+; The code is 165 bytes for the small version, and 191 bytes for the normal.
 ;
 ; Copyright John Brandwood 2021.
 ;
@@ -37,15 +37,16 @@ LZSA_SMALL_SIZE = 0
 ; ***************************************************************************
 ; ***************************************************************************
 ;
-; Data usage is last 8 bytes of zero-page.
+; Data usage is last 7 bytes of zero-page.
 ;
-lzsa_length = $F8 ; 1 byte.
 lzsa_cmdbuf = $F9 ; 1 byte.
 lzsa_winptr = $FA ; 1 word.
 lzsa_srcptr = $FC ; 1 word.
 lzsa_dstptr = $FE ; 1 word.
+
+lzsa_offset = lzsa_winptr
 
 LZSA_SRC_LO = $FC
 LZSA_SRC_HI = $FD
 LZSA_DST_LO = $FE
@@ -60,7 +61,6 @@ LZSA_DST_HI = $FF
 ;
 ; Args: lzsa_srcptr = ptr to compessed data
 ; Args: lzsa_dstptr = ptr to output buffer
-; Uses: lots!
 ;
 
 DECOMPRESS_LZSA1_FAST:
@@ -95,19 +95,14 @@ lzsa1_unpack: ldy #0 ; Initialize source index.
 lsr
 lsr
 cmp #$07 ; Extended length?
-bcc .inc_cp_len
-jsr .get_length ; CS from CMP, X=0.
-ora #0 ; Check the lo-byte of length
-beq .put_cp_len ; without effecting CC.
-.inc_cp_len: inx ; Increment # of pages to copy.
-.put_cp_len: stx <lzsa_length
-tax
-.cp_page: lda (lzsa_srcptr),y ; CC throughout the execution of
+bcc .cp_got_len
+jsr .get_length ; X=0, CS from CMP, returns CC.
+stx .cp_npages + 1 ; Hi-byte of length.
+
+.cp_got_len: tax ; Lo-byte of length.
+
+.cp_byte: lda (lzsa_srcptr),y ; CC throughout the execution of
 sta (lzsa_dstptr),y ; of this .cp_page loop.
 inc <lzsa_srcptr + 0
 bne .cp_skip1
@@ -116,28 +111,24 @@ lzsa1_unpack: ldy #0 ; Initialize source index.
 bne .cp_skip2
 inc <lzsa_dstptr + 1
 .cp_skip2: dex
-bne .cp_page
-dec <lzsa_length ; Any full pages left to copy?
-bne .cp_page
+bne .cp_byte
+
+.cp_npages: lda #0 ; Any full pages left to copy?
+beq .lz_offset
+
+dec .cp_npages + 1 ; Unlikely, so can be slow.
+bcc .cp_byte ; Always true!
+
+!if LZSA_SMALL_SIZE {
 
 ;
 ; Copy bytes from decompressed window.
 ;
+; Shorter but slower version.
+;
 ; N.B. X=0 is expected and guaranteed when we get here.
 ;
 
-.lz_offset: !if LZSA_SMALL_SIZE {
-jsr .get_byte
-} else {
-lda (lzsa_srcptr),y
-inc <lzsa_srcptr + 0
-bne .offset_lo
-inc <lzsa_srcptr + 1
-}
+.lz_offset: jsr .get_byte ; Get offset-lo.
 
 .offset_lo: adc <lzsa_dstptr + 0 ; Always CC from .cp_page loop.
 sta <lzsa_winptr + 0
@@ -146,97 +137,107 @@ lzsa1_unpack: ldy #0 ; Initialize source index.
 bit <lzsa_cmdbuf
 bpl .offset_hi
 
-!if LZSA_SMALL_SIZE {
-jsr .get_byte
+jsr .get_byte ; Get offset-hi.
+
+.offset_hi: adc <lzsa_dstptr + 1 ; lzsa_winptr < lzsa_dstptr, so
+sta <lzsa_winptr + 1 ; always leaves CS.
+
+.lz_length: lda <lzsa_cmdbuf ; X=0 from previous loop.
+and #$0F
+adc #$03 - 1 ; CS from previous ADC.
+cmp #$12 ; Extended length?
+bcc .lz_got_len
+
+jsr .get_length ; CS from CMP, X=0, returns CC.
+stx .lz_npages + 1 ; Hi-byte of length.
+
+.lz_got_len: tax ; Lo-byte of length.
+
+.lz_byte: lda (lzsa_winptr),y ; CC throughout the execution of
+sta (lzsa_dstptr),y ; of this .lz_page loop.
+inc <lzsa_winptr + 0
+bne .lz_skip1
+inc <lzsa_winptr + 1
+.lz_skip1: inc <lzsa_dstptr + 0
+bne .lz_skip2
+inc <lzsa_dstptr + 1
+.lz_skip2: dex
+bne .lz_byte
+
+.lz_npages: lda #0 ; Any full pages left to copy?
+beq .cp_length
+
+dec .lz_npages + 1 ; Unlikely, so can be slow.
+bcc .lz_byte ; Always true!
 
 } else {
 
+;
+; Copy bytes from decompressed window.
+;
+; Longer but faster.
+;
+; N.B. X=0 is expected and guaranteed when we get here.
+;
+
+.lz_offset: lda (lzsa_srcptr),y ; Get offset-lo.
+inc <lzsa_srcptr + 0
+bne .offset_lo
+inc <lzsa_srcptr + 1
+
+.offset_lo: sta <lzsa_offset + 0
+
+lda #$FF ; Get offset-hi.
+bit <lzsa_cmdbuf
+bpl .offset_hi
+
 lda (lzsa_srcptr),y
 inc <lzsa_srcptr + 0
 bne .offset_hi
 inc <lzsa_srcptr + 1
-}
 
-.offset_hi: adc <lzsa_dstptr + 1 ; lzsa_winptr < lzsa_dstptr, so
-sta <lzsa_winptr + 1 ; always leaves CS.
-
-!if LZSA_SMALL_SIZE {
+.offset_hi: sta <lzsa_offset + 1
 
 .lz_length: lda <lzsa_cmdbuf ; X=0 from previous loop.
 and #$0F
-adc #$03 - 1 ; CS from previous ADC.
-cmp #$12 ; Extended length?
-bcc .inc_lz_len
-jsr .get_length ; CS from CMP, X=0, returns CC.
-ora #0 ; Check the lo-byte of length
-beq .put_lz_len ; without effecting CC.
-.inc_lz_len: inx ; Increment # of pages to copy.
-.put_lz_len: stx <lzsa_length
-tax
-.lz_page: lda (lzsa_winptr),y ; CC throughout the execution of
-sta (lzsa_dstptr),y ; of this .lz_page loop.
-inc <lzsa_winptr + 0
-bne .skip3
-inc <lzsa_winptr + 1
-.skip3: inc <lzsa_dstptr + 0
-bne .skip4
-inc <lzsa_dstptr + 1
-.skip4: dex
-bne .lz_page
-dec <lzsa_length ; Any full pages left to copy?
-bne .lz_page
-jmp .cp_length ; Loop around to the beginning.
-
-} else {
-
-.lz_length: lda <lzsa_cmdbuf ; X=0 from previous loop.
-and #$0F
-adc #$03 - 1 ; CS from previous ADC.
+adc #$03 ; Always CC from .cp_page loop.
 cmp #$12 ; Extended length?
 bcc .got_lz_len
-jsr .get_length ; CS from CMP, X=0, returns CC.
+jsr .get_length ; X=0, CS from CMP, returns CC.
 
-.got_lz_len: tay ; Check the lo-byte of length.
-beq .lz_page
-inx ; Increment # of pages to copy.
+.got_lz_len: inx ; Hi-byte of length+256.
 
-.get_lz_win: adc <lzsa_winptr + 0 ; Calc address of partial page.
-sta <lzsa_winptr + 0 ; Always CC from previous CMP.
-bcs .get_lz_dst
-dec <lzsa_winptr + 1
+eor #$FF ; Negate the lo-byte of length
+tay
+eor #$FF
 
-.get_lz_dst: tya ; Calc address of partial page.
-clc
-adc <lzsa_dstptr + 0
-sta <lzsa_dstptr + 0
-bcs .get_lz_idx
+.get_lz_dst: adc <lzsa_dstptr + 0 ; Calc address of partial page.
+sta <lzsa_dstptr + 0 ; Always CC from previous CMP.
+iny
+bcs .get_lz_win
+beq .get_lz_win ; Is lo-byte of length zero?
 dec <lzsa_dstptr + 1
 
-.get_lz_idx: tya ; Negate the lo-byte of length.
-eor #$FF
-tay
-iny
+.get_lz_win: clc ; Calc address of match.
+adc <lzsa_offset + 0 ; N.B. Offset is negative!
+sta <lzsa_winptr + 0
+lda <lzsa_dstptr + 1
+adc <lzsa_offset + 1
+sta <lzsa_winptr + 1
 
-.lz_page: lda (lzsa_winptr),y
+.lz_byte: lda (lzsa_winptr),y
 sta (lzsa_dstptr),y
 iny
-bne .lz_page
-inc <lzsa_winptr + 1
+bne .lz_byte
 inc <lzsa_dstptr + 1
 dex ; Any full pages left to copy?
-bne .lz_page
+bne .lz_more
 
 jmp .cp_length ; Loop around to the beginning.
+
+.lz_more: inc <lzsa_winptr + 1 ; Unlikely, so can be slow.
+bne .lz_byte ; Always true!
 
 }
 
 ;
@@ -252,30 +253,30 @@ lzsa1_unpack: ldy #0 ; Initialize source index.
 inc <lzsa_srcptr + 1
 .skip_inc: bcc .got_length ; No overflow means done.
 
-cmp #$01 ; Overflow to 256 or 257?
-bcc .extra_word
-.extra_byte: clc ; MUST return CC!
-inx
-bne .get_byte ; Always NZ from previous INX.
-.extra_word: jsr .get_byte ; So rare, this can be slow!
+clc ; MUST return CC!
+tax ; Preserve overflow value.
+
+.extra_byte: jsr .get_byte ; So rare, this can be slow!
 pha
-jsr .get_byte ; So rare, this can be slow!
-tax
-beq .finished ; Length-hi == 0 at EOF.
-pla ; Length-lo.
-rts
+txa ; Overflow to 256 or 257?
+beq .extra_word
 
-.get_byte: lda (lzsa_srcptr),y ; Subroutine version for when
-inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
-beq .next_page
+.check_length: pla ; Length-lo.
+bne .got_length ; Check for zero.
+dex ; Do one less page loop if so.
 .got_length: rts
 
-.next_page: inc <lzsa_srcptr + 1 ; Inc & test for bank overflow.
-rts
+.extra_word: jsr .get_byte ; So rare, this can be slow!
+tax
+bne .check_length ; Length-hi == 0 at EOF.
 
 .finished: pla ; Length-lo.
 pla ; Decompression completed, pop
 pla ; return address.
 rts
+
+.get_byte: lda (lzsa_srcptr),y ; Subroutine version for when
+inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
+bne .got_byte
+inc <lzsa_srcptr + 1 ; Inc & test for bank overflow.
+.got_byte: rts
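
The heart of the speed-up in the normal-size match copier: the length is split into a lo-byte partial page plus full pages, both pointers are biased backwards so the Y register can index the partial page until it wraps, and the iny/bne pair acts as both increment and loop test. That removes the lzsa_length counter from zero-page entirely; the page count is instead patched straight into the lda #0 operand at .cp_npages/.lz_npages, and .get_length's new "dex ; Do one less page loop if so." keeps the count right when the lo-byte of an extended length is zero. A rough C model of that indexing, with illustrative names:

#include <stdint.h>

/* Copy `len` match bytes to `dst` from `dst + offset` (offset < 0),
 * mirroring the .lz_byte/.lz_more structure of the fast path. */
void lz_copy_sketch(uint8_t *dst, int offset, unsigned len)
{
    unsigned partial = len & 255;                 /* lo-byte of length    */
    unsigned pages = (len >> 8) + (partial != 0); /* X register           */
    unsigned y = (256 - partial) & 255;           /* Y register, -len lo  */
    uint8_t *d = dst - y;                         /* .get_lz_dst bias     */
    const uint8_t *w = d + offset;                /* .get_lz_win          */

    for (;;) {
        d[y] = w[y];                              /* lda/sta (zp),y       */
        y = (y + 1) & 255;                        /* iny                  */
        if (y != 0) continue;                     /* bne .lz_byte         */
        d += 256;                                 /* inc <lzsa_dstptr + 1 */
        if (--pages == 0) break;                  /* dex / bne .lz_more   */
        w += 256;                                 /* .lz_more: inc winptr */
    }
}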

File 2 of 2: the LZSA2 decompressor (DECOMPRESS_LZSA2_FAST).

@@ -7,7 +7,7 @@
 ;
 ; This code is written for the ACME assembler.
 ;
-; The code is 240 bytes for the small version, and 255 bytes for the normal.
+; The code is 241 bytes for the small version, and 256 bytes for the normal.
 ;
 ; Copyright John Brandwood 2021.
 ;
@@ -66,7 +66,6 @@ LZSA_DST_HI = $FF
 ;
 ; Args: lzsa_srcptr = ptr to compessed data
 ; Args: lzsa_dstptr = ptr to output buffer
-; Uses: lots!
 ;
 
 DECOMPRESS_LZSA2_FAST:
@@ -101,34 +100,28 @@ lzsa2_unpack: ldx #$00 ; Hi-byte of length or offset.
 lsr
 lsr
 cmp #$03 ; Extended length?
-bcc .inc_cp_len
-inx
-jsr .get_length ; X=1 for literals, returns CC.
-ora #0 ; Check the lo-byte of length
-beq .put_cp_len ; without effecting CC.
-.inc_cp_len: inx ; Increment # of pages to copy.
-.put_cp_len: stx <lzsa_length
-tax
-.cp_page: lda (lzsa_srcptr),y ; CC throughout the execution of
+bcc .cp_got_len
+jsr .get_length ; X=0 for literals, returns CC.
+stx .cp_npages + 1 ; Hi-byte of length.
+
+.cp_got_len: tax ; Lo-byte of length.
+
+.cp_byte: lda (lzsa_srcptr),y ; CC throughout the execution of
 sta (lzsa_dstptr),y ; of this .cp_page loop.
 inc <lzsa_srcptr + 0
 bne .cp_skip1
 inc <lzsa_srcptr + 1
 .cp_skip1: inc <lzsa_dstptr + 0
 bne .cp_skip2
 inc <lzsa_dstptr + 1
 .cp_skip2: dex
-bne .cp_page
-dec <lzsa_length ; Any full pages left to copy?
-bne .cp_page
+bne .cp_byte
+
+.cp_npages: lda #0 ; Any full pages left to copy?
+beq .lz_offset
+
+dec .cp_npages + 1 ; Unlikely, so can be slow.
+bcc .cp_byte ; Always true!
 
 ;
 ; Copy bytes from decompressed window.
@@ -147,12 +140,13 @@ lzsa2_unpack: ldx #$00 ; Hi-byte of length or offset.
 .lz_offset: lda <lzsa_cmdbuf
 asl
 bcs .get_13_16_rep
-asl
-bcs .get_9_bits
 
-.get_5_bits: dex ; X=$FF
-.get_13_bits: asl
-php
+.get_5_9_bits: dex ; X=$FF for a 5-bit offset.
+asl
+bcs .get_9_bits ; Fall through if 5-bit.
+
+.get_13_bits: asl ; Both 5-bit and 13-bit read
+php ; a nibble.
 jsr .get_nibble
 plp
 rol ; Shift into position, clr C.
@ -162,11 +156,10 @@ lzsa2_unpack: ldx #$00 ; Hi-byte of length or offset.
sbc #2 ; 13-bit offset from $FE00. sbc #2 ; 13-bit offset from $FE00.
bne .set_hi_8 ; Always NZ from previous SBC. bne .set_hi_8 ; Always NZ from previous SBC.
.get_9_bits: dex ; X=$FF if CS, X=$FE if CC. .get_9_bits: asl ; X=$FF if CC, X=$FE if CS.
asl
bcc .get_lo_8 bcc .get_lo_8
dex dex
bcs .get_lo_8 ; Always VS from previous BIT. bcs .get_lo_8 ; Always CS from previous BCC.
.get_13_16_rep: asl .get_13_16_rep: asl
bcc .get_13_bits ; Shares code with 5-bit path. bcc .get_13_bits ; Shares code with 5-bit path.
@ -190,10 +183,10 @@ lzsa2_unpack: ldx #$00 ; Hi-byte of length or offset.
} }
.set_offset: stx <lzsa_offset + 1 ; Save new offset. .set_offset: sta <lzsa_offset + 0 ; Save new offset.
sta <lzsa_offset + 0 stx <lzsa_offset + 1
.lz_length: ldx #$00 ; Hi-byte of length. .lz_length: ldx #1 ; Hi-byte of length+256.
lda <lzsa_cmdbuf lda <lzsa_cmdbuf
and #$07 and #$07
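
The 5/9/13/16-bit paths being reshuffled here follow the LZSA2 mode bits XYZ at the top of the token. A rough C sketch of the selection, under my reading of the block format; get_nibble()/get_byte() are stand-ins for the stream readers (see the length sketch at the end), and the 111 case reuses the previous offset, which is why .set_offset preserves it:

#include <stdint.h>

unsigned get_nibble(void);             /* stream readers, defined in   */
unsigned get_byte(void);               /* the length sketch below.     */

/* Token bits 7-5 are XYZ; returns a negative 16-bit displacement.
 * `last` is the previous offset, kept for the rep-match case. */
int lzsa2_offset_sketch(unsigned token, int last)
{
    unsigned z = (token >> 5) & 1;     /* Z bit                        */
    uint16_t v;

    switch ((token >> 6) & 3) {        /* XY bits                      */
    case 0:                            /* 00Z: 5-bit, $FFE0-$FFFF      */
        v = 0xFFE0 | (get_nibble() << 1) | z;
        break;
    case 1:                            /* 01Z: 9-bit, hi = $FF - Z     */
        v = ((0xFF - z) << 8) | get_byte();
        break;
    case 2:                            /* 10Z: 13-bit, below $FE00     */
        v = (0xE0 | (get_nibble() << 1) | z) << 8;
        v = (v | get_byte()) - 512;    /* "13-bit offset from $FE00"   */
        break;
    default:                           /* 110: 16-bit, hi byte first   */
        if (z)
            return last;               /* 111: repeat previous offset  */
        v = get_byte() << 8;
        v |= get_byte();
        break;
    }
    return (int16_t)v;
}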
@@ -202,49 +195,49 @@ lzsa2_unpack: ldx #$00 ; Hi-byte of length or offset.
 cmp #$09 ; Extended length?
 bcc .got_lz_len
-jsr .get_length ; X=0 for match, returns CC.
-inx ; Hi-byte of length+256.
+jsr .get_length ; X=1 for match, returns CC.
 
-.got_lz_len: eor #$FF ; Negate the lo-byte of length
-tay ; and check for zero.
-iny
-beq .get_lz_win
+.got_lz_len: eor #$FF ; Negate the lo-byte of length.
+tay
 eor #$FF
-inx ; Increment # of pages to copy.
 
 .get_lz_dst: adc <lzsa_dstptr + 0 ; Calc address of partial page.
 sta <lzsa_dstptr + 0 ; Always CC from previous CMP.
+iny
 bcs .get_lz_win
+beq .get_lz_win ; Is lo-byte of length zero?
 dec <lzsa_dstptr + 1
 
 .get_lz_win: clc ; Calc address of match.
-lda <lzsa_dstptr + 0 ; N.B. Offset is negative!
-adc <lzsa_offset + 0
+adc <lzsa_offset + 0 ; N.B. Offset is negative!
 sta <lzsa_winptr + 0
 lda <lzsa_dstptr + 1
 adc <lzsa_offset + 1
 sta <lzsa_winptr + 1
 
-.lz_page: lda (lzsa_winptr),y
+.lz_byte: lda (lzsa_winptr),y
 sta (lzsa_dstptr),y
 iny
-bne .lz_page
-inc <lzsa_winptr + 1
+bne .lz_byte
 inc <lzsa_dstptr + 1
 dex ; Any full pages left to copy?
-bne .lz_page
+bne .lz_more
 
 jmp .cp_length ; Loop around to the beginning.
+
+.lz_more: inc <lzsa_winptr + 1 ; Unlikely, so can be slow.
+bne .lz_byte ; Always true!
 
 ;
 ; Lookup tables to differentiate literal and match lengths.
 ;
 
-.nibl_len_tbl: !byte 9 ; 2+7 (for match).
-!byte 3 ; 0+3 (for literal).
+.nibl_len_tbl: !byte 3 ; 0+3 (for literal).
+!byte 9 ; 2+7 (for match).
 
-.byte_len_tbl: !byte 24 - 1 ; 2+7+15 - CS (for match).
-!byte 18 - 1 ; 0+3+15 - CS (for literal).
+.byte_len_tbl: !byte 18 - 1 ; 0+3+15 - CS (for literal).
+!byte 24 - 1 ; 2+7+15 - CS (for match).
 
 ;
 ; Get 16-bit length in X:A register pair, return with CC.
@@ -263,21 +256,21 @@ lzsa2_unpack: ldx #$00 ; Hi-byte of length or offset.
 bcc .got_length
 beq .finished
 
-.word_length: jsr .get_byte ; So rare, this can be slow!
+.word_length: clc ; MUST return CC!
+jsr .get_byte ; So rare, this can be slow!
 pha
 jsr .get_byte ; So rare, this can be slow!
 tax
 pla
-clc ; MUST return CC!
-rts
+bne .got_word ; Check for zero lo-byte.
+dex ; Do one less page loop if so.
+.got_word: rts
 
 .get_byte: lda (lzsa_srcptr),y ; Subroutine version for when
 inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
-beq .next_page
-rts
-.next_page: inc <lzsa_srcptr + 1
+bne .got_byte
+inc <lzsa_srcptr + 1
+.got_byte: rts
 
 .finished: pla ; Decompression completed, pop
 pla ; return address.
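
For completeness, the escalation that .get_length and the reordered .nibl_len_tbl/.byte_len_tbl tables implement: a nibble extension first, then a byte, then an explicit 16-bit word, with end-of-data signalled when the byte stage lands on exactly 256. A rough C sketch with stub stream readers; the nibble ordering is an assumption, and the names are mine:

#include <stdint.h>

static const uint8_t *srcptr;          /* compressed stream (stub)     */
static int nibble_flag;                /* LZSA2 hands out nibbles in   */
static uint8_t nibble_store;           /* pairs; order assumed here.   */

unsigned get_byte(void) { return *srcptr++; }

unsigned get_nibble(void)
{
    if ((nibble_flag ^= 1)) {
        nibble_store = *srcptr++;
        return nibble_store >> 4;      /* high nibble first (assumed)  */
    }
    return nibble_store & 15;
}

/* base is 3 for literals (LL == 3) or 9 for matches (MMM + 2 == 9),
 * i.e. the .nibl_len_tbl entries; .byte_len_tbl holds base + 15 - 1
 * and the set carry supplies the missing +1. Returns -1 at end-of-data. */
long get_length_sketch(unsigned base)
{
    unsigned n = get_nibble();
    if (n != 15)
        return base + n;               /* nibble extension was enough  */

    unsigned len = base + 15 + get_byte();
    if (len < 256)
        return len;                    /* bcc .got_length              */
    if (len == 256)
        return -1;                     /* beq .finished (end-of-data)  */

    unsigned lo = get_byte();          /* .word_length: lo byte first, */
    return lo | (get_byte() << 8);     /* then hi byte.                */
}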