Fast 6502 LZSA2 depacker: smaller size, same speed

This commit is contained in:
Emmanuel Marty 2020-01-03 10:01:23 +01:00 committed by GitHub
parent 8721c11041
commit 410544f4e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -5,10 +5,12 @@
; ;
; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format. ; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
; ;
; This code is written for the ACME assembler.
;
; Optional code is presented for two minor 6502 optimizations that break ; Optional code is presented for two minor 6502 optimizations that break
; compatibility with the current LZSA2 format standard. ; compatibility with the current LZSA2 format standard.
; ;
; This code is written for the ACME assembler. ; The code is 241 bytes for the small version, and 256 bytes for the normal.
; ;
; Copyright John Brandwood 2019. ; Copyright John Brandwood 2019.
; ;
@ -46,12 +48,34 @@ LZSA_SWAP_LEN16 = 0
LZSA_SWAP_XZY = 0 LZSA_SWAP_XZY = 0
; ;
; Remove code inlining to save space? ; Choose size over speed (within sane limits)?
;
; This saves 15 bytes of code, but decompression is 7% slower.
; ;
LZSA_BEST_SIZE = 0 LZSA_SMALL_SIZE = 0
;
; Remove code inlining to save space?
;
; This saves 15 bytes of code at the cost of 7% speed.
;
!if LZSA_SMALL_SIZE {
LZSA_NO_INLINE = 1
} else {
LZSA_NO_INLINE = 0
}
;
; Use smaller code for copying literals?
;
; This saves 11 bytes of code at the cost of 5% speed.
;
!if LZSA_SMALL_SIZE {
LZSA_SHORT_CP = 1
} else {
LZSA_SHORT_CP = 0
}
; ;
; Assume that we're decompressing from a large multi-bank ; Assume that we're decompressing from a large multi-bank
@ -66,27 +90,23 @@ LZSA_FROM_BANK = 0
; ;
!if LZSA_FROM_BANK { !if LZSA_FROM_BANK {
!macro LZSA_INC_PAGE { !macro LZSA_INC_PAGE {
jsr .next_page jsr .next_page
} }
} else { } else {
!macro LZSA_INC_PAGE { !macro LZSA_INC_PAGE {
inc <lzsa_srcptr + 1 inc <lzsa_srcptr + 1
} }
} }
; ;
; Macro to read a byte from the compressed source data. ; Macro to read a byte from the compressed source data.
; ;
!if LZSA_BEST_SIZE { !if LZSA_NO_INLINE {
!macro LZSA_GET_SRC { !macro LZSA_GET_SRC {
jsr .get_byte jsr lzsa2_get_byte
} }
} else { } else {
@ -104,10 +124,13 @@ LZSA_FROM_BANK = 0
; ;
; Macro to speed up reading 50% of nibbles. ; Macro to speed up reading 50% of nibbles.
; ;
; This seems to save very few cycles compared to the
; increase in code size, and it isn't recommended.
;
LZSA_SLOW_NIBL = 1 LZSA_SLOW_NIBL = 1
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) { !if (LZSA_SLOW_NIBL | LZSA_SMALL_SIZE) {
!macro LZSA_GET_NIBL { !macro LZSA_GET_NIBL {
jsr lzsa2_get_nibble ; Always call a function. jsr lzsa2_get_nibble ; Always call a function.
@ -118,9 +141,9 @@ LZSA_SLOW_NIBL = 1
!macro LZSA_GET_NIBL { !macro LZSA_GET_NIBL {
lsr <lzsa_nibflg ; Is there a nibble waiting? lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble. lda <lzsa_nibble ; Extract the lo-nibble.
bcs .skip\@ bcs .skip
jsr .new_nibble ; Extract the hi-nibble. jsr lzsa2_new_nibble ; Extract the hi-nibble.
.skip\@: ora #$F0 .skip: ora #$F0
} }
} }
@ -141,15 +164,19 @@ lzsa_winptr = $FA ; 1 word.
lzsa_srcptr = $FC ; 1 word. lzsa_srcptr = $FC ; 1 word.
lzsa_dstptr = $FE ; 1 word. lzsa_dstptr = $FE ; 1 word.
lzsa_length = lzsa_winptr ; 1 word.
LZSA_SRC_LO = $FC LZSA_SRC_LO = $FC
LZSA_SRC_HI = $FD LZSA_SRC_HI = $FD
LZSA_DST_LO = $FE LZSA_DST_LO = $FE
LZSA_DST_HI = $FF LZSA_DST_HI = $FF
; *************************************************************************** ; ***************************************************************************
; *************************************************************************** ; ***************************************************************************
; ;
; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2b format. ; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2 format.
; ;
; Args: lzsa_srcptr = ptr to compressed data ; Args: lzsa_srcptr = ptr to compressed data
; Args: lzsa_dstptr = ptr to output buffer ; Args: lzsa_dstptr = ptr to output buffer
@ -182,20 +209,45 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
jsr .get_length ; X=0 table index for literals. jsr .get_length ; X=0 table index for literals.
!if LZSA_SHORT_CP {
.got_cp_len: cmp #0 ; Check the lo-byte of length.
beq .put_cp_len
inx ; Increment # of pages to copy.
.put_cp_len: stx <lzsa_length
tax
.cp_page: lda (lzsa_srcptr),y
sta (lzsa_dstptr),y
inc <lzsa_srcptr + 0
bne .skip1
inc <lzsa_srcptr + 1
.skip1: inc <lzsa_dstptr + 0
bne .skip2
inc <lzsa_dstptr + 1
.skip2: dex
bne .cp_page
dec <lzsa_length ; Any full pages left to copy?
bne .cp_page
} else {
.got_cp_len: tay ; Check the lo-byte of length. .got_cp_len: tay ; Check the lo-byte of length.
beq .cp_page beq .cp_page
inx ; Increment # of pages to copy. inx ; Increment # of pages to copy.
.get_cp_src: clc ; Calc source for partial .get_cp_src: clc ; Calc address of partial page.
adc <lzsa_srcptr + 0 ; page. adc <lzsa_srcptr + 0
sta <lzsa_srcptr + 0 sta <lzsa_srcptr + 0
bcs .get_cp_dst bcs .get_cp_dst
dec <lzsa_srcptr + 1 dec <lzsa_srcptr + 1
.get_cp_dst: tya .get_cp_dst: tya
clc ; Calc destination for partial clc ; Calc address of partial page.
adc <lzsa_dstptr + 0 ; page. adc <lzsa_dstptr + 0
sta <lzsa_dstptr + 0 sta <lzsa_dstptr + 0
bcs .get_cp_idx bcs .get_cp_idx
dec <lzsa_dstptr + 1 dec <lzsa_dstptr + 1
@ -214,6 +266,8 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
dex ; Any full pages left to copy? dex ; Any full pages left to copy?
bne .cp_page bne .cp_page
}
!if LZSA_SWAP_XZY { !if LZSA_SWAP_XZY {
; ;
@ -240,10 +294,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
bne .get_9_bits bne .get_9_bits
.get_5_bits: dex ; X=$FF .get_5_bits: dex ; X=$FF
.get_13_bits: LZSA_GET_NIBL ; Always returns with CS. .get_13_bits: +LZSA_GET_NIBL ; Always returns with CS.
bvc .get_5_skip bvc .get_5_skip
clc clc
.get_5_skip: rol a ; Shift into position, set C. .get_5_skip: rol ; Shift into position, set C.
cpx #$00 ; X=$FF for a 5-bit offset. cpx #$00 ; X=$FF for a 5-bit offset.
bne .set_offset bne .set_offset
sbc #2 ; Subtract 512 because 13-bit sbc #2 ; Subtract 512 because 13-bit
@ -264,7 +318,7 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; ;
; Slower and longer path with STD order of bits. ; Slower and longer path with STD order of bits.
; ;
; Z80 NES ; STD NEW
; ================================ ; ================================
; xyz xzy ; xyz xzy
; 00z 0z0 5-bit offset ; 00z 0z0 5-bit offset
@ -274,6 +328,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; 111 111 repeat offset ; 111 111 repeat offset
; NVZ for a BIT instruction ; NVZ for a BIT instruction
; ;
; N.B. Costs 3 bytes in code length.
; get5 and get13 are 8 cycles slower.
; get9, get16, and rep are 4 cycles slower.
;
.lz_offset: lda <lzsa_cmdbuf .lz_offset: lda <lzsa_cmdbuf
asl asl
@ -313,7 +371,7 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; N.B. X=0 is expected and guaranteed when we get here. ; N.B. X=0 is expected and guaranteed when we get here.
; ;
.get_16_bits: jsr .get_byte ; Get hi-byte of offset. .get_16_bits: jsr lzsa2_get_byte ; Get hi-byte of offset.
tax tax
.get_low8: +LZSA_GET_SRC ; Get lo-byte of offset. .get_low8: +LZSA_GET_SRC ; Get lo-byte of offset.
@ -379,8 +437,6 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; ;
; Get 16-bit length in X:A register pair. ; Get 16-bit length in X:A register pair.
; ;
; N.B. Requires reversal of bytes in 16-bit length.
;
.get_length: +LZSA_GET_NIBL .get_length: +LZSA_GET_NIBL
cmp #$FF ; Extended length? cmp #$FF ; Extended length?
@ -390,28 +446,29 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
.got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit .got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit
rts ; lengths. rts ; lengths.
.byte_length: jsr .get_byte ; So rare, this can be slow! .byte_length: jsr lzsa2_get_byte ; So rare, this can be slow!
adc .byte_len_tbl,x ; Always CS from previous CMP. adc .byte_len_tbl,x ; Always CS from previous CMP.
bcc .got_length bcc .got_length
beq .finished beq .finished
!if LZSA_SWAP_LEN16 { !if LZSA_SWAP_LEN16 {
.word_length: jsr .get_byte ; So rare, this can be slow! .word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
tax tax
} else { } else {
.word_length: jsr .get_byte ; So rare, this can be slow! .word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
pha pha
jsr .get_byte ; So rare, this can be slow! jsr lzsa2_get_byte ; So rare, this can be slow!
tax tax
pla pla
rts rts
} }
.get_byte: lda (lzsa_srcptr),y ; Subroutine version for when lzsa2_get_byte:
lda (lzsa_srcptr),y ; Subroutine version for when
inc <lzsa_srcptr + 0 ; inlining isn't advantageous. inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
beq .next_page beq .next_page
rts rts
@ -430,9 +487,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; Get a nibble value from compressed data in A. ; Get a nibble value from compressed data in A.
; ;
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) { !if (LZSA_SLOW_NIBL | LZSA_SMALL_SIZE) {
lzsa2_get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting? lzsa2_get_nibble:
lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble. lda <lzsa_nibble ; Extract the lo-nibble.
bcs .got_nibble bcs .got_nibble
@ -453,13 +511,14 @@ lzsa2_get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting
} else { } else {
.new_nibble: inc <lzsa_nibflg ; Reset the flag. lzsa2_new_nibble:
LZSA_GET_SRC inc <lzsa_nibflg ; Reset the flag.
+LZSA_GET_SRC
sta <lzsa_nibble ; Preserve for next time. sta <lzsa_nibble ; Preserve for next time.
lsr a ; Extract the hi-nibble. lsr ; Extract the hi-nibble.
lsr a lsr
lsr a lsr
lsr a lsr
!if LZSA_SWAP_XZY { !if LZSA_SWAP_XZY {
sec ; Offset code relies on CS. sec ; Offset code relies on CS.