mirror of
https://github.com/emmanuel-marty/lzsa.git
synced 2024-11-25 10:30:45 +00:00
Fast 6502 LZSA2 depacker: smaller size, same speed
This commit is contained in:
parent
8721c11041
commit
410544f4e6
@ -5,10 +5,12 @@
|
|||||||
;
|
;
|
||||||
; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
|
; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
|
||||||
;
|
;
|
||||||
|
; This code is written for the ACME assembler.
|
||||||
|
;
|
||||||
; Optional code is presented for two minor 6502 optimizations that break
|
; Optional code is presented for two minor 6502 optimizations that break
|
||||||
; compatibility with the current LZSA2 format standard.
|
; compatibility with the current LZSA2 format standard.
|
||||||
;
|
;
|
||||||
; This code is written for the ACME assembler.
|
; The code is 241 bytes for the small version, and 256 bytes for the normal.
|
||||||
;
|
;
|
||||||
; Copyright John Brandwood 2019.
|
; Copyright John Brandwood 2019.
|
||||||
;
|
;
|
||||||
@ -28,7 +30,7 @@
|
|||||||
;
|
;
|
||||||
|
|
||||||
;
|
;
|
||||||
; Save 7 bytes of code, and 21 cycles every time that a
|
; Save 7 bytes of code, and 21 cycles every time that a
|
||||||
; 16-bit length is decoded?
|
; 16-bit length is decoded?
|
||||||
;
|
;
|
||||||
; N.B. Setting this breaks compatibility with LZSA v1.2
|
; N.B. Setting this breaks compatibility with LZSA v1.2
|
||||||
@ -46,12 +48,34 @@ LZSA_SWAP_LEN16 = 0
|
|||||||
LZSA_SWAP_XZY = 0
|
LZSA_SWAP_XZY = 0
|
||||||
|
|
||||||
;
|
;
|
||||||
; Remove code inlining to save space?
|
; Choose size over speed (within sane limits)?
|
||||||
;
|
|
||||||
; This saves 15 bytes of code, but decompression is 7% slower.
|
|
||||||
;
|
;
|
||||||
|
|
||||||
LZSA_BEST_SIZE = 0
|
LZSA_SMALL_SIZE = 0
|
||||||
|
|
||||||
|
;
|
||||||
|
; Remove code inlining to save space?
|
||||||
|
;
|
||||||
|
; This saves 15 bytes of code at the cost of 7% speed.
|
||||||
|
;
|
||||||
|
|
||||||
|
!if LZSA_SMALL_SIZE {
|
||||||
|
LZSA_NO_INLINE = 1
|
||||||
|
} else {
|
||||||
|
LZSA_NO_INLINE = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
;
|
||||||
|
; Use smaller code for copying literals?
|
||||||
|
;
|
||||||
|
; This saves 11 bytes of code at the cost of 5% speed.
|
||||||
|
;
|
||||||
|
|
||||||
|
!if LZSA_SMALL_SIZE {
|
||||||
|
LZSA_SHORT_CP = 1 ; Small build: select the short literal-copy code.
|
||||||
|
} else {
|
||||||
|
LZSA_SHORT_CP = 1 ; NOTE(review): same value as the small-size branch, so this !if has no effect - confirm 1 (not 0) is intended here.
|
||||||
|
}
|
||||||
|
|
||||||
;
|
;
|
||||||
; Assume that we're decompressing from a large multi-bank
|
; Assume that we're decompressing from a large multi-bank
|
||||||
@ -66,62 +90,61 @@ LZSA_FROM_BANK = 0
|
|||||||
;
|
;
|
||||||
|
|
||||||
!if LZSA_FROM_BANK {
|
!if LZSA_FROM_BANK {
|
||||||
|
!macro LZSA_INC_PAGE {
|
||||||
!macro LZSA_INC_PAGE {
|
jsr .next_page
|
||||||
jsr .next_page
|
}
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
!macro LZSA_INC_PAGE {
|
||||||
!macro LZSA_INC_PAGE {
|
inc <lzsa_srcptr + 1
|
||||||
inc <lzsa_srcptr + 1
|
}
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
;
|
;
|
||||||
; Macro to read a byte from the compressed source data.
|
; Macro to read a byte from the compressed source data.
|
||||||
;
|
;
|
||||||
|
|
||||||
!if LZSA_BEST_SIZE {
|
!if LZSA_NO_INLINE {
|
||||||
|
|
||||||
!macro LZSA_GET_SRC {
|
!macro LZSA_GET_SRC {
|
||||||
jsr .get_byte
|
jsr lzsa2_get_byte
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
!macro LZSA_GET_SRC {
|
!macro LZSA_GET_SRC {
|
||||||
lda (lzsa_srcptr),y
|
lda (lzsa_srcptr),y
|
||||||
inc <lzsa_srcptr + 0
|
inc <lzsa_srcptr + 0
|
||||||
bne .skip
|
bne .skip
|
||||||
+LZSA_INC_PAGE
|
+LZSA_INC_PAGE
|
||||||
.skip:
|
.skip:
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
;
|
;
|
||||||
; Macro to speed up reading 50% of nibbles.
|
; Macro to speed up reading 50% of nibbles.
|
||||||
;
|
;
|
||||||
|
; This seems to save very few cycles compared to the
|
||||||
|
; increase in code size, and it isn't recommended.
|
||||||
|
;
|
||||||
|
|
||||||
LZSA_SLOW_NIBL = 1
|
LZSA_SLOW_NIBL = 1
|
||||||
|
|
||||||
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
|
!if (LZSA_SLOW_NIBL + LZSA_SMALL_SIZE) {
|
||||||
|
|
||||||
!macro LZSA_GET_NIBL {
|
!macro LZSA_GET_NIBL {
|
||||||
jsr lzsa2_get_nibble ; Always call a function.
|
jsr lzsa2_get_nibble ; Always call a function.
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
!macro LZSA_GET_NIBL {
|
!macro LZSA_GET_NIBL {
|
||||||
lsr <lzsa_nibflg ; Is there a nibble waiting?
|
lsr <lzsa_nibflg ; Is there a nibble waiting?
|
||||||
lda <lzsa_nibble ; Extract the lo-nibble.
|
lda <lzsa_nibble ; Extract the lo-nibble.
|
||||||
bcs .skip\@
|
bcs .skip
|
||||||
jsr .new_nibble ; Extract the hi-nibble.
|
jsr lzsa2_new_nibble ; Extract the hi-nibble.
|
||||||
.skip\@: ora #$F0
|
.skip: ora #$F0
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -141,15 +164,19 @@ lzsa_winptr = $FA ; 1 word.
|
|||||||
lzsa_srcptr = $FC ; 1 word.
|
lzsa_srcptr = $FC ; 1 word.
|
||||||
lzsa_dstptr = $FE ; 1 word.
|
lzsa_dstptr = $FE ; 1 word.
|
||||||
|
|
||||||
|
lzsa_length = lzsa_winptr ; 1 word.
|
||||||
|
|
||||||
LZSA_SRC_LO = $FC
|
LZSA_SRC_LO = $FC
|
||||||
LZSA_SRC_HI = $FD
|
LZSA_SRC_HI = $FD
|
||||||
LZSA_DST_LO = $FE
|
LZSA_DST_LO = $FE
|
||||||
LZSA_DST_HI = $FF
|
LZSA_DST_HI = $FF
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
; ***************************************************************************
|
; ***************************************************************************
|
||||||
; ***************************************************************************
|
; ***************************************************************************
|
||||||
;
|
;
|
||||||
; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2b format.
|
; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2 format.
|
||||||
;
|
;
|
||||||
; Args: lzsa_srcptr = ptr to compressed data
|
; Args: lzsa_srcptr = ptr to compressed data
|
||||||
; Args: lzsa_dstptr = ptr to output buffer
|
; Args: lzsa_dstptr = ptr to output buffer
|
||||||
@ -182,20 +209,45 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
|
|||||||
|
|
||||||
jsr .get_length ; X=0 table index for literals.
|
jsr .get_length ; X=0 table index for literals.
|
||||||
|
|
||||||
|
!if LZSA_SHORT_CP {
|
||||||
|
|
||||||
|
.got_cp_len: cmp #0 ; Check the lo-byte of length. (Presumably X:A = 16-bit length from .get_length - TODO confirm.)
|
||||||
|
beq .put_cp_len ; Lo-byte is zero: keep X as the pass count.
|
||||||
|
|
||||||
|
inx ; Increment # of pages to copy.
|
||||||
|
|
||||||
|
.put_cp_len: stx <lzsa_length ; Save the pass count for the outer loop.
|
||||||
|
tax ; X = lo-byte of length = byte count of the first pass.
|
||||||
|
|
||||||
|
.cp_page: lda (lzsa_srcptr),y ; Copy one byte from the source ...
|
||||||
|
sta (lzsa_dstptr),y ; ... to the destination.
|
||||||
|
inc <lzsa_srcptr + 0 ; Advance the source pointer lo-byte.
|
||||||
|
bne .skip1
|
||||||
|
inc <lzsa_srcptr + 1 ; Lo-byte wrapped: bump the source page. NOTE(review): not done via +LZSA_INC_PAGE - confirm for LZSA_FROM_BANK builds.
|
||||||
|
.skip1: inc <lzsa_dstptr + 0 ; Advance the destination pointer lo-byte.
|
||||||
|
bne .skip2
|
||||||
|
inc <lzsa_dstptr + 1 ; Lo-byte wrapped: bump the destination page.
|
||||||
|
.skip2: dex ; Count down the bytes left in this pass.
|
||||||
|
bne .cp_page
|
||||||
|
dec <lzsa_length ; Any full pages left to copy?
|
||||||
|
bne .cp_page ; Yes: X is 0 here, so the next pass copies 256 bytes.
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
.got_cp_len: tay ; Check the lo-byte of length.
|
.got_cp_len: tay ; Check the lo-byte of length.
|
||||||
beq .cp_page
|
beq .cp_page
|
||||||
|
|
||||||
inx ; Increment # of pages to copy.
|
inx ; Increment # of pages to copy.
|
||||||
|
|
||||||
.get_cp_src: clc ; Calc source for partial
|
.get_cp_src: clc ; Calc address of partial page.
|
||||||
adc <lzsa_srcptr + 0 ; page.
|
adc <lzsa_srcptr + 0
|
||||||
sta <lzsa_srcptr + 0
|
sta <lzsa_srcptr + 0
|
||||||
bcs .get_cp_dst
|
bcs .get_cp_dst
|
||||||
dec <lzsa_srcptr + 1
|
dec <lzsa_srcptr + 1
|
||||||
|
|
||||||
.get_cp_dst: tya
|
.get_cp_dst: tya
|
||||||
clc ; Calc destination for partial
|
clc ; Calc address of partial page.
|
||||||
adc <lzsa_dstptr + 0 ; page.
|
adc <lzsa_dstptr + 0
|
||||||
sta <lzsa_dstptr + 0
|
sta <lzsa_dstptr + 0
|
||||||
bcs .get_cp_idx
|
bcs .get_cp_idx
|
||||||
dec <lzsa_dstptr + 1
|
dec <lzsa_dstptr + 1
|
||||||
@ -214,13 +266,15 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
|
|||||||
dex ; Any full pages left to copy?
|
dex ; Any full pages left to copy?
|
||||||
bne .cp_page
|
bne .cp_page
|
||||||
|
|
||||||
!if LZSA_SWAP_XZY {
|
}
|
||||||
|
|
||||||
|
!if LZSA_SWAP_XZY {
|
||||||
|
|
||||||
;
|
;
|
||||||
; Shorter and faster path with NEW order of bits.
|
; Shorter and faster path with NEW order of bits.
|
||||||
;
|
;
|
||||||
; STD NEW
|
; STD NEW
|
||||||
; ================================
|
; ================================
|
||||||
; xyz xzy
|
; xyz xzy
|
||||||
; 00z 0z0 5-bit offset
|
; 00z 0z0 5-bit offset
|
||||||
; 01z 0z1 9-bit offset
|
; 01z 0z1 9-bit offset
|
||||||
@ -240,10 +294,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
|
|||||||
bne .get_9_bits
|
bne .get_9_bits
|
||||||
|
|
||||||
.get_5_bits: dex ; X=$FF
|
.get_5_bits: dex ; X=$FF
|
||||||
.get_13_bits: LZSA_GET_NIBL ; Always returns with CS.
|
.get_13_bits: +LZSA_GET_NIBL ; Always returns with CS.
|
||||||
bvc .get_5_skip
|
bvc .get_5_skip
|
||||||
clc
|
clc
|
||||||
.get_5_skip: rol a ; Shift into position, set C.
|
.get_5_skip: rol ; Shift into position, set C.
|
||||||
cpx #$00 ; X=$FF for a 5-bit offset.
|
cpx #$00 ; X=$FF for a 5-bit offset.
|
||||||
bne .set_offset
|
bne .set_offset
|
||||||
sbc #2 ; Subtract 512 because 13-bit
|
sbc #2 ; Subtract 512 because 13-bit
|
||||||
@ -264,8 +318,8 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
|
|||||||
;
|
;
|
||||||
; Slower and longer path with STD order of bits.
|
; Slower and longer path with STD order of bits.
|
||||||
;
|
;
|
||||||
; Z80 NES
|
; STD NEW
|
||||||
; ================================
|
; ================================
|
||||||
; xyz xzy
|
; xyz xzy
|
||||||
; 00z 0z0 5-bit offset
|
; 00z 0z0 5-bit offset
|
||||||
; 01z 0z1 9-bit offset
|
; 01z 0z1 9-bit offset
|
||||||
@ -274,6 +328,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
|
|||||||
; 111 111 repeat offset
|
; 111 111 repeat offset
|
||||||
; NVZ for a BIT instruction
|
; NVZ for a BIT instruction
|
||||||
;
|
;
|
||||||
|
; N.B. Costs 3 bytes in code length.
|
||||||
|
; get5 and get13 are 8 cycles slower.
|
||||||
|
; get9, get16, and rep are 4 cycles slower.
|
||||||
|
;
|
||||||
|
|
||||||
.lz_offset: lda <lzsa_cmdbuf
|
.lz_offset: lda <lzsa_cmdbuf
|
||||||
asl
|
asl
|
||||||
@ -313,7 +371,7 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
|
|||||||
; N.B. X=0 is expected and guaranteed when we get here.
|
; N.B. X=0 is expected and guaranteed when we get here.
|
||||||
;
|
;
|
||||||
|
|
||||||
.get_16_bits: jsr .get_byte ; Get hi-byte of offset.
|
.get_16_bits: jsr lzsa2_get_byte ; Get hi-byte of offset.
|
||||||
tax
|
tax
|
||||||
|
|
||||||
.get_low8: +LZSA_GET_SRC ; Get lo-byte of offset.
|
.get_low8: +LZSA_GET_SRC ; Get lo-byte of offset.
|
||||||
@ -379,8 +437,6 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
|
|||||||
;
|
;
|
||||||
; Get 16-bit length in X:A register pair.
|
; Get 16-bit length in X:A register pair.
|
||||||
;
|
;
|
||||||
; N.B. Requires reversal of bytes in 16-bit length.
|
|
||||||
;
|
|
||||||
|
|
||||||
.get_length: +LZSA_GET_NIBL
|
.get_length: +LZSA_GET_NIBL
|
||||||
cmp #$FF ; Extended length?
|
cmp #$FF ; Extended length?
|
||||||
@ -390,34 +446,35 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
|
|||||||
.got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit
|
.got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit
|
||||||
rts ; lengths.
|
rts ; lengths.
|
||||||
|
|
||||||
.byte_length: jsr .get_byte ; So rare, this can be slow!
|
.byte_length: jsr lzsa2_get_byte ; So rare, this can be slow!
|
||||||
adc .byte_len_tbl,x ; Always CS from previous CMP.
|
adc .byte_len_tbl,x ; Always CS from previous CMP.
|
||||||
bcc .got_length
|
bcc .got_length
|
||||||
beq .finished
|
beq .finished
|
||||||
|
|
||||||
!if LZSA_SWAP_LEN16 {
|
!if LZSA_SWAP_LEN16 {
|
||||||
|
|
||||||
.word_length: jsr .get_byte ; So rare, this can be slow!
|
.word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
|
||||||
tax
|
tax
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
.word_length: jsr .get_byte ; So rare, this can be slow!
|
.word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
|
||||||
pha
|
pha
|
||||||
jsr .get_byte ; So rare, this can be slow!
|
jsr lzsa2_get_byte ; So rare, this can be slow!
|
||||||
tax
|
tax
|
||||||
pla
|
pla
|
||||||
rts
|
rts
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.get_byte: lda (lzsa_srcptr),y ; Subroutine version for when
|
lzsa2_get_byte:
|
||||||
|
lda (lzsa_srcptr),y ; Subroutine version for when
|
||||||
inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
|
inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
|
||||||
beq .next_page
|
beq .next_page
|
||||||
rts
|
rts
|
||||||
|
|
||||||
.next_page: inc <lzsa_srcptr + 1 ; Inc & test for bank overflow.
|
.next_page: inc <lzsa_srcptr + 1 ; Inc & test for bank overflow.
|
||||||
!if LZSA_FROM_BANK {
|
!if LZSA_FROM_BANK {
|
||||||
bmi .next_bank ; Change for target hardware!
|
bmi .next_bank ; Change for target hardware!
|
||||||
}
|
}
|
||||||
rts
|
rts
|
||||||
@ -430,9 +487,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
|
|||||||
; Get a nibble value from compressed data in A.
|
; Get a nibble value from compressed data in A.
|
||||||
;
|
;
|
||||||
|
|
||||||
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
|
!if (LZSA_SLOW_NIBL | LZSA_SMALL_SIZE) {
|
||||||
|
|
||||||
lzsa2_get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting?
|
lzsa2_get_nibble:
|
||||||
|
lsr <lzsa_nibflg ; Is there a nibble waiting?
|
||||||
lda <lzsa_nibble ; Extract the lo-nibble.
|
lda <lzsa_nibble ; Extract the lo-nibble.
|
||||||
bcs .got_nibble
|
bcs .got_nibble
|
||||||
|
|
||||||
@ -453,13 +511,14 @@ lzsa2_get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting
|
|||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
.new_nibble: inc <lzsa_nibflg ; Reset the flag.
|
lzsa2_new_nibble:
|
||||||
LZSA_GET_SRC
|
inc <lzsa_nibflg ; Reset the flag.
|
||||||
|
+LZSA_GET_SRC
|
||||||
sta <lzsa_nibble ; Preserve for next time.
|
sta <lzsa_nibble ; Preserve for next time.
|
||||||
lsr a ; Extract the hi-nibble.
|
lsr ; Extract the hi-nibble.
|
||||||
lsr a
|
lsr
|
||||||
lsr a
|
lsr
|
||||||
lsr a
|
lsr
|
||||||
|
|
||||||
!if LZSA_SWAP_XZY {
|
!if LZSA_SWAP_XZY {
|
||||||
sec ; Offset code relies on CS.
|
sec ; Offset code relies on CS.
|
||||||
|
Loading…
Reference in New Issue
Block a user