lzsa: update benchmarks

This commit is contained in:
Vince Weaver 2020-03-04 23:22:17 -05:00
parent 8d654dce84
commit 012d449534
14 changed files with 1578 additions and 9 deletions

View File

@ -8,10 +8,15 @@ B2D = ../bmp2dhr/b2d
all: gr_lzsa.dsk
gr_lzsa.dsk: HELLO GR_LZSA
cp empty.dsk mist.dsk
$(DOS33) -y mist.dsk SAVE A HELLO
$(DOS33) -y mist.dsk BSAVE -a 0x1000 GR_LZSA
gr_lzsa.dsk: HELLO GR_LZSA LZSA2_SMALL_BENCH LZSA2_FAST_BENCH \
LZ4_BENCH RLE_BENCH
cp empty.dsk lzsa_test.dsk
$(DOS33) -y lzsa_test.dsk SAVE A HELLO
$(DOS33) -y lzsa_test.dsk BSAVE -a 0x1000 GR_LZSA
$(DOS33) -y lzsa_test.dsk BSAVE -a 0x1000 LZSA2_SMALL_BENCH
$(DOS33) -y lzsa_test.dsk BSAVE -a 0x1000 LZSA2_FAST_BENCH
$(DOS33) -y lzsa_test.dsk BSAVE -a 0x1000 RLE_BENCH
$(DOS33) -y lzsa_test.dsk BSAVE -a 0x1000 LZ4_BENCH
###
@ -21,6 +26,46 @@ GR_LZSA: gr_lzsa.o
gr_lzsa.o: gr_lzsa.s decompress_small_v2.s
ca65 -o gr_lzsa.o gr_lzsa.s -l gr_lzsa.lst
###
LZSA2_SMALL_BENCH: lzsa2_small_bench.o
ld65 -o LZSA2_SMALL_BENCH lzsa2_small_bench.o -C ../linker_scripts/apple2_1000.inc
lzsa2_small_bench.o: lzsa2_small_bench.s decompress_small_v2.s
ca65 -o lzsa2_small_bench.o lzsa2_small_bench.s -l lzsa2_small_bench.lst
###
LZSA2_FAST_BENCH: lzsa2_fast_bench.o
ld65 -o LZSA2_FAST_BENCH lzsa2_fast_bench.o -C ../linker_scripts/apple2_1000.inc
lzsa2_fast_bench.o: lzsa2_fast_bench.s decompress_fast_v2.s
ca65 -o lzsa2_fast_bench.o lzsa2_fast_bench.s -l lzsa2_fast_bench.lst
###
RLE_BENCH: rle_bench.o
ld65 -o RLE_BENCH rle_bench.o -C ../linker_scripts/apple2_1000.inc
rle_bench.o: rle_bench.s gr_unrle.s
ca65 -o rle_bench.o rle_bench.s -l rle_bench.lst
###
LZ4_BENCH: lz4_bench.o
ld65 -o LZ4_BENCH lz4_bench.o -C ../linker_scripts/apple2_1000.inc
lz4_bench.o: lz4_bench.s lz4_decode.s spaceship_far_n.lz4
ca65 -o lz4_bench.o lz4_bench.s -l lz4_bench.lst
###
# Build the raw LZ4 stream that lz4_bench.s pulls in with .incbin.
# lz4 leaves a complete frame in spaceship_far_n.gr.lz4; the 6502 decoder
# only wants the block data, so the frame wrapper is stripped:
#   - dd skips the first 11 bytes: magic (4) + FLG (1) + max block size (1)
#     + header checksum (1) + block length (4)
#   - truncate drops the last 8 bytes: end-of-block marker (4) + checksum (4)
# NOTE(review): no prerequisite is listed, so editing spaceship_far_n.gr
# does not trigger a rebuild -- confirm whether that is intended.
spaceship_far_n.lz4:
lz4 -f -16 spaceship_far_n.gr
dd if=spaceship_far_n.gr.lz4 of=spaceship_far_n.lz4 bs=1 skip=11
truncate spaceship_far_n.lz4 -s -8
###
@ -30,4 +75,5 @@ HELLO: hello.bas
####
clean:
rm -f *~ *.o *.lst HELLO GR_LZSA
rm -f *~ *.o *.lst HELLO GR_LZSA RLE_BENCH LZSA2_FAST_BENCH LZSA2_SMALL_BENCH LZ4_BENCH

View File

@ -0,0 +1,367 @@
; note -- modified by Vince Weaver to assemble with ca65
; -----------------------------------------------------------------------------
; Decompress raw LZSA2 block.
; Create one with lzsa -r -f2 <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * A contains the destination page (high byte of the output address); the
;   entry code stores it in LZSA_DST_HI and sets LZSA_DST_LO to 0
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
;
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty, Peter Ferrie
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
;NIBCOUNT = $FC ; zero-page location for temp offset
; decompress_lzsa2_fast -- entry point
; In:  A = page (high byte) of the destination buffer; the low byte is
;      forced to 0, so output always starts on a page boundary
;      LZSA_SRC_LO/LZSA_SRC_HI = address of the compressed data
; LZSA_DST_LO/LZSA_DST_HI are the operand bytes of the STA in putdst, so
; the stores below patch that instruction.
; Execution falls through into decode_token.
decompress_lzsa2_fast:
sta LZSA_DST_HI
ldy #$00 ; Y holds the high byte of the copy counts; starts at 0
sty LZSA_DST_LO
sty NIBCOUNT ; no nibble buffered yet
decode_token:
jsr getsrc ; read token byte: XYZ|LL|MMM
pha ; preserve token on stack
and #$18 ; isolate literals count (LL)
beq no_literals ; skip if no literals to copy
cmp #$18 ; LITERALS_RUN_LEN_V2?
bcc prepare_copy_literals ; if less, count is directly embedded in token
jsr getnibble ; get extra literals length nibble
; add nibble to len from token
adc #$02 ; (LITERALS_RUN_LEN_V2) minus carry
cmp #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
bcc prepare_copy_literals_direct ; if less, literals count is complete
jsr getsrc ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
sbc #$EE ; overflow?
jmp prepare_copy_literals_direct
prepare_copy_literals_large:
; handle 16 bits literals count
; literals count = directly these 16 bits
jsr getlargesrc ; grab low 8 bits in X, high 8 bits in A
tay ; put high 8 bits in Y
bcs prepare_copy_literals_high ; (*same as JMP PREPARE_COPY_LITERALS_HIGH but shorter)
prepare_copy_literals:
lsr ; shift literals count into place
lsr
lsr
prepare_copy_literals_direct:
tax
bcs prepare_copy_literals_large ; if so, literals count is large
prepare_copy_literals_high:
txa
beq copy_literals
iny
copy_literals:
jsr getput ; copy one byte of literals
dex
bne copy_literals
dey
bne copy_literals
no_literals:
pla ; retrieve token from stack
pha ; preserve token again
asl
bcs repmatch_or_large_offset ; 1YZ: rep-match or 13/16 bit offset
asl ; 0YZ: 5 or 9 bit offset
bcs offset_9_bit
; 00Z: 5 bit offset
ldx #$FF ; set offset bits 15-8 to 1
jsr getcombinedbits ; rotate Z bit into bit 0, read nibble for bits 4-1
ora #$E0 ; set bits 7-5 to 1
bne got_offset_lo ; go store low byte of match offset and prepare match
offset_9_bit: ; 01Z: 9 bit offset
;;asl ; shift Z (offset bit 8) in place
rol
rol
and #$01
eor #$FF ; set offset bits 15-9 to 1
bne got_offset_hi ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
repmatch_or_large_offset:
asl ; 13 bit offset?
bcs repmatch_or_16bit ; handle rep-match or 16-bit offset if not
; 10Z: 13 bit offset
jsr getcombinedbits ; rotate Z bit into bit 8, read nibble for bits 12-9
adc #$DE ; set bits 15-13 to 1 and substract 2 (to substract 512)
bne got_offset_hi ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
repmatch_or_16bit: ; rep-match or 16 bit offset
;;ASL ; XYZ=111?
bmi rep_match ; reuse previous offset if so (rep-match)
; 110: handle 16 bit offset
jsr getsrc ; grab high 8 bits
got_offset_hi:
tax
jsr getsrc ; grab low 8 bits
got_offset_lo:
sta OFFSLO ; store low byte of match offset
stx OFFSHI ; store high byte of match offset
rep_match:
.ifdef BACKWARD_DECOMPRESS
; Backward decompression - substract match offset
sec ; add dest + match offset
lda putdst+1 ; low 8 bits
OFFSLO = *+1
sbc #$AA
sta copy_match_loop+1 ; store back reference address
lda putdst+2
OFFSHI = *+1
sbc #$AA ; high 8 bits
sta copy_match_loop+2 ; store high 8 bits of address
sec
.else
; Forward decompression - add match offset
clc ; add dest + match offset
lda putdst+1 ; low 8 bits
OFFSLO = *+1
adc #$AA
sta copy_match_loop+1 ; store back reference address
OFFSHI = *+1
lda #$AA ; high 8 bits
adc putdst+2
sta copy_match_loop+2 ; store high 8 bits of address
.endif
pla ; retrieve token from stack again
and #$07 ; isolate match len (MMM)
adc #$01 ; add MIN_MATCH_SIZE_V2 and carry
cmp #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
bcc prepare_copy_match ; if less, length is directly embedded in token
jsr getnibble ; get extra match length nibble
; add nibble to len from token
adc #$08 ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
cmp #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
bcc prepare_copy_match ; if less, match length is complete
jsr getsrc ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
sbc #$E8 ; overflow?
prepare_copy_match:
tax
bcc prepare_copy_match_y ; if not, the match length is complete
beq decompression_done ; if EOD code, bail
; Handle 16 bits match length
jsr getlargesrc ; grab low 8 bits in X, high 8 bits in A
tay ; put high 8 bits in Y
prepare_copy_match_y:
txa
beq copy_match_loop
iny
copy_match_loop:
lda $AAAA ; get one byte of backreference
jsr putdst ; copy to destination
.ifdef BACKWARD_DECOMPRESS
; Backward decompression -- put backreference bytes backward
lda copy_match_loop+1
beq getmatch_adj_hi
getmatch_done:
dec copy_match_loop+1
.else
; Forward decompression -- put backreference bytes forward
inc copy_match_loop+1
beq getmatch_adj_hi
getmatch_done:
.endif
dex
bne copy_match_loop
dey
bne copy_match_loop
jmp decode_token
.ifdef BACKWARD_DECOMPRESS
getmatch_adj_hi:
dec copy_match_loop+2
jmp getmatch_done
.else
getmatch_adj_hi:
inc copy_match_loop+2
jmp getmatch_done
.endif
; getcombinedbits -- merge the token's Z bit with the next nibble
; In:  A bit 7 = Z bit of the token (the callers shift it there with ASL)
; Out: A = (nibble << 1) | (Z XOR 1), carry clear
; Calls getnibble, which may consume one byte of compressed data.
getcombinedbits:
eor #$80 ; invert Z (bit 7)
asl ; carry = inverted Z
php ; getnibble changes the carry, so park the flags
jsr getnibble ; get nibble into bits 0-3 (for offset bits 1-4)
plp ; merge Z bit as the carry bit (for offset bit 0)
combinedbitz:
rol ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
decompression_done:
rts ; the end-of-data test in the match-length code also branches here
; getnibble -- return the next 4-bit value of the nibble stream
; Out: A = nibble (0-15), carry SET on both exit paths; the length
;      decoders fold that extra 1 into the constant of the ADC that
;      follows the call
; NIBCOUNT is 1 while the low nibble of the last fetched byte is still
; unused and 0 otherwise. NIBBLES names the immediate operand of the
; LDA below, so the fetched byte is kept inside the instruction.
getnibble:
NIBBLES = *+1
lda #$AA ; operand is overwritten with the last nibble pair
lsr NIBCOUNT ; carry = 1 if a low nibble is pending; NIBCOUNT becomes 0
bcc need_nibbles
and #$0F ; isolate low 4 bits of nibble
rts
need_nibbles:
inc NIBCOUNT ; 0 -> 1: the low nibble of the new byte is now pending
jsr getsrc ; get 2 nibbles
sta NIBBLES
lsr ; return the high nibble first
lsr
lsr
lsr
sec ; same carry state as the other exit
rts
.ifdef BACKWARD_DECOMPRESS
; Backward decompression -- get and put bytes backward
getput:
jsr getsrc
putdst:
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
sta $AAAA
lda putdst+1
beq putdst_adj_hi
dec putdst+1
rts
putdst_adj_hi:
dec putdst+2
dec putdst+1
rts
getlargesrc:
jsr getsrc ; grab low 8 bits
tax ; move to X
; fall through grab high 8 bits
getsrc:
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
lda $AAAA
pha
lda getsrc+1
beq getsrc_adj_hi
dec getsrc+1
pla
rts
getsrc_adj_hi:
dec getsrc+2
dec getsrc+1
pla
rts
.else
; Forward decompression -- get and put bytes forward
; getput -- copy one byte from the compressed stream to the output
; putdst -- store A at the output pointer, then advance the pointer
; LZSA_DST_LO/LZSA_DST_HI name the operand bytes of the STA below, i.e.
; the output pointer lives inside the instruction.
getput:
jsr getsrc
putdst:
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
sta $AAAA ; operand is the live output pointer
inc putdst+1
beq putdst_adj_hi ; low byte wrapped: bump the high byte
rts
putdst_adj_hi:
inc putdst+2
rts
; getlargesrc -- read a 16-bit value: low byte into X, high byte into A
; getsrc      -- read one byte into A, then advance the source pointer
; Neither routine touches the carry (only LDA, TAX and INC are used);
; callers depend on that.
; LZSA_SRC_LO/LZSA_SRC_HI name the operand bytes of the LDA below.
getlargesrc:
jsr getsrc ; grab low 8 bits
tax ; move to X
; fall through grab high 8 bits
getsrc:
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
lda $AAAA ; operand is the live source pointer
inc getsrc+1
beq getsrc_adj_hi ; low byte wrapped: bump the high byte
rts
getsrc_adj_hi:
inc getsrc+2
rts
.endif

View File

@ -0,0 +1,637 @@
; ***************************************************************************
; ***************************************************************************
;
; lzsa2_6502.s
;
; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
;
; This code is written for the ACME assembler.
;
; Optional code is presented for two minor 6502 optimizations that break
; compatibility with the current LZSA2 format standard.
;
; The code is 241 bytes for the small version, and 267 bytes for the normal.
;
; Copyright John Brandwood 2019.
;
; Distributed under the Boost Software License, Version 1.0.
; (See accompanying file LICENSE_1_0.txt or copy at
; http://www.boost.org/LICENSE_1_0.txt)
;
; ***************************************************************************
; ***************************************************************************
; ***************************************************************************
; ***************************************************************************
;
; Decompression Options & Macros
;
;
; Save 7 bytes of code, and 21 cycles every time that a
; 16-bit length is decoded?
;
; N.B. Setting this breaks compatibility with LZSA v1.2
;
LZSA_SWAP_LEN16 = 0
;
; Save 3 bytes of code, and 4 or 8 cycles when decoding
; an offset?
;
; N.B. Setting this breaks compatibility with LZSA v1.2
;
LZSA_SWAP_XZY = 0
;
; Choose size over speed (within sane limits)?
;
LZSA_SMALL_SIZE = 0
;
; Remove code inlining to save space?
;
; This saves 15 bytes of code at the cost of 7% speed.
;
!if LZSA_SMALL_SIZE {
LZSA_NO_INLINE = 1
} else {
LZSA_NO_INLINE = 0
}
;
; Use smaller code for copying literals?
;
; This saves 11 bytes of code at the cost of 5% speed.
;
!if LZSA_SMALL_SIZE {
LZSA_SHORT_CP = 1
} else {
LZSA_SHORT_CP = 0
}
;
; Assume that we're decompressing from a large multi-bank
; compressed data file, and that the next bank may need to
; be paged in when a page-boundary is crossed.
;
LZSA_FROM_BANK = 0
;
; We will read from or write to $FFFF. This prevents the
; use of the "INC ptrhi / BNE" trick and reduces speed.
;
LZSA_USE_FFFF = 0
;
; Macro to increment the source pointer to the next page.
;
!if LZSA_FROM_BANK {
!macro LZSA_INC_PAGE {
jsr lzsa2_next_page
}
} else {
!macro LZSA_INC_PAGE {
inc <lzsa_srcptr + 1
}
}
;
; Macro to read a byte from the compressed source data.
;
!if LZSA_NO_INLINE {
!macro LZSA_GET_SRC {
jsr lzsa2_get_byte
}
} else {
!macro LZSA_GET_SRC {
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
bne .skip
+LZSA_INC_PAGE
.skip:
}
}
;
; Macro to speed up reading 50% of nibbles.
;
; This seems to save very few cycles compared to the
; increase in code size, and it isn't recommended.
;
LZSA_SLOW_NIBL = 1
!if (LZSA_SLOW_NIBL + LZSA_SMALL_SIZE) {
!macro LZSA_GET_NIBL {
jsr lzsa2_get_nibble ; Always call a function.
}
} else {
!macro LZSA_GET_NIBL {
lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble.
bcs .skip
jsr lzsa2_new_nibble ; Extract the hi-nibble.
.skip: ora #$F0
}
}
; ***************************************************************************
; ***************************************************************************
;
; Data usage is last 11 bytes of zero-page.
;
lzsa_cmdbuf = $F5 ; 1 byte.
lzsa_nibflg = $F6 ; 1 byte.
lzsa_nibble = $F7 ; 1 byte.
lzsa_offset = $F8 ; 1 word.
lzsa_winptr = $FA ; 1 word.
lzsa_srcptr = $FC ; 1 word.
lzsa_dstptr = $FE ; 1 word.
lzsa_length = lzsa_winptr ; 1 word.
LZSA_SRC_LO = $FC
LZSA_SRC_HI = $FD
LZSA_DST_LO = $FE
LZSA_DST_HI = $FF
; ***************************************************************************
; ***************************************************************************
;
; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2 format.
;
; Args: lzsa_srcptr = ptr to compressed data
; Args: lzsa_dstptr = ptr to output buffer
; Uses: lots!
;
; If compiled with LZSA_FROM_BANK, then lzsa_srcptr should be within the bank
; window range.
;
DECOMPRESS_LZSA2_FAST:
lzsa2_unpack: ldy #0 ; Initialize source index.
sty <lzsa_nibflg ; Initialize nibble buffer.
!if (LZSA_FROM_BANK | LZSA_NO_INLINE | LZSA_USE_FFFF) = 0 {
beq .cp_length ; always taken
.incsrc1:
inc <lzsa_srcptr + 1
bne .resume_src1 ; always taken
!if LZSA_SHORT_CP {
.incsrc2:
inc <lzsa_srcptr + 1
bne .resume_src2 ; always taken
.incdst:
inc <lzsa_dstptr + 1
bne .resume_dst ; always taken
}
}
;
; Copy bytes from compressed source data.
;
.cp_length: ldx #$00 ; Hi-byte of length or offset.
!if (LZSA_FROM_BANK | LZSA_NO_INLINE | LZSA_USE_FFFF) {
+LZSA_GET_SRC
} else {
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
beq .incsrc1
}
.resume_src1:
sta <lzsa_cmdbuf ; Preserve this for later.
and #$18 ; Extract literal length.
beq .lz_offset ; Skip directly to match?
lsr ; Get 2-bit literal length.
lsr
lsr
cmp #$03 ; Extended length?
bne .got_cp_len
jsr .get_length ; X=0 table index for literals.
!if LZSA_SHORT_CP {
.got_cp_len: cmp #0 ; Check the lo-byte of length.
beq .put_cp_len
inx ; Increment # of pages to copy.
.put_cp_len: stx <lzsa_length
tax
.cp_page: lda (lzsa_srcptr),y
sta (lzsa_dstptr),y
inc <lzsa_srcptr + 0
!if (LZSA_FROM_BANK | LZSA_NO_INLINE | LZSA_USE_FFFF) {
bne .skip1
inc <lzsa_srcptr + 1
.skip1: inc <lzsa_dstptr + 0
bne .skip2
inc <lzsa_dstptr + 1
.skip2:
} else {
beq .incsrc2
.resume_src2:
inc <lzsa_dstptr + 0
beq .incdst
.resume_dst:
}
dex
bne .cp_page
dec <lzsa_length ; Any full pages left to copy?
bne .cp_page
} else {
.got_cp_len: tay ; Check the lo-byte of length.
beq .cp_page
inx ; Increment # of pages to copy.
.get_cp_src: clc ; Calc address of partial page.
adc <lzsa_srcptr + 0
sta <lzsa_srcptr + 0
bcs .get_cp_dst
dec <lzsa_srcptr + 1
.get_cp_dst: tya
clc ; Calc address of partial page.
adc <lzsa_dstptr + 0
sta <lzsa_dstptr + 0
bcs .get_cp_idx
dec <lzsa_dstptr + 1
.get_cp_idx: tya ; Negate the lo-byte of length.
eor #$FF
tay
iny
.cp_page: lda (lzsa_srcptr),y
sta (lzsa_dstptr),y
iny
bne .cp_page
inc <lzsa_srcptr + 1
inc <lzsa_dstptr + 1
dex ; Any full pages left to copy?
bne .cp_page
}
!if LZSA_SWAP_XZY {
;
; Shorter and faster path with NEW order of bits.
;
; STD NEW
; ================================
; xyz xzy
; 00z 0z0 5-bit offset
; 01z 0z1 9-bit offset
; 10z 1z0 13-bit offset
; 110 101 16-bit offset
; 111 111 repeat offset
; NVZ for a BIT instruction
;
; N.B. Saves 3 bytes in code length.
; get5 and get13 are 8 cycles faster.
; get9, get16, and rep are 4 cycles faster.
;
.lz_offset: lda #$20 ; Y bit in lzsa_cmdbuf.
bit <lzsa_cmdbuf
bmi .get_13_16_rep
bne .get_9_bits
.get_5_bits: dex ; X=$FF
.get_13_bits: +LZSA_GET_NIBL ; Always returns with CS.
bvc .get_5_skip
clc
.get_5_skip: rol ; Shift into position, set C.
cpx #$00 ; X=$FF for a 5-bit offset.
bne .set_offset
sbc #2 ; Subtract 512 because 13-bit
tax ; offset starts at $FE00.
bne .get_low8 ; Always NZ from previous TAX.
.get_9_bits: dex ; X=$FF if VC, X=$FE if VS.
bvc .get_low8
dex
bvs .get_low8 ; Always VS from previous BIT.
.get_13_16_rep: beq .get_13_bits ; Shares code with 5-bit path.
.get_16_rep: bvs .lz_length ; Repeat previous offset.
} else {
;
; Slower and longer path with STD order of bits.
;
; STD NEW
; ================================
; xyz xzy
; 00z 0z0 5-bit offset
; 01z 0z1 9-bit offset
; 10z 1z0 13-bit offset
; 110 101 16-bit offset
; 111 111 repeat offset
; NVZ for a BIT instruction
;
; N.B. Costs 3 bytes in code length.
; get5 and get13 are 8 cycles slower.
; get9, get16, and rep are 4 cycles slower.
;
; Decode the match offset from the XYZ bits (7-5) of the saved token.
; Relies on X = 0 on entry: a single DEX then gives the $FF hi-byte.
; Leaves via .set_offset (A = lo-byte, X = hi-byte), .get_low8x
; (A = hi-byte), .get_low8 (X = hi-byte), .lz_length (repeat offset),
; or by falling through into the 16-bit offset code that follows.
.lz_offset: lda <lzsa_cmdbuf
asl ; C = X bit.
bcs .get_13_16_rep
asl ; C = Y bit.
bcs .get_9_bits
.get_5_bits: dex ; X=$FF
.get_13_bits: asl ; C = Z bit.
php ; Keep Z while the nibble is read.
+LZSA_GET_NIBL ; Always returns with CS.
plp
rol ; Shift into position, set C.
eor #$01 ; Invert the Z bit now in bit 0.
cpx #$00 ; X=$FF for a 5-bit offset.
bne .set_offset
sbc #2 ; Subtract 512 because 13-bit
; offset starts at $FE00.
bne .get_low8x ; Always NZ from previous SBC.
.get_9_bits: dex ; X=$FF if CC, X=$FE if CS (C = Z bit from the ASL below).
asl
bcc .get_low8
dex
bcs .get_low8 ; Always CS here: the BCC above fell through.
.get_13_16_rep: asl ; C = Y bit, N = Z bit.
bcc .get_13_bits ; Shares code with 5-bit path.
.get_16_rep: bmi .lz_length ; Repeat previous offset.
}
;
; Copy bytes from decompressed window.
;
; N.B. X=0 is expected and guaranteed when we get here.
;
.get_16_bits: jsr lzsa2_get_byte ; Get hi-byte of offset.
.get_low8x: tax
.get_low8:
!if (LZSA_FROM_BANK | LZSA_NO_INLINE | LZSA_USE_FFFF) {
+LZSA_GET_SRC ; Get lo-byte of offset.
} else {
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
beq .incsrc3
.resume_src3:
}
.set_offset: stx <lzsa_offset + 1 ; Save new offset.
sta <lzsa_offset + 0
.lz_length: ldx #$00 ; Hi-byte of length.
lda <lzsa_cmdbuf
and #$07
clc
adc #$02
cmp #$09 ; Extended length?
bne .got_lz_len
inx
jsr .get_length ; X=1 table index for match.
.got_lz_len: eor #$FF ; Negate the lo-byte of length
tay ; and check for zero.
iny
beq .calc_lz_addr
eor #$FF
inx ; Increment # of pages to copy.
clc ; Calc destination for partial
adc <lzsa_dstptr + 0 ; page.
sta <lzsa_dstptr + 0
bcs .calc_lz_addr
dec <lzsa_dstptr + 1
.calc_lz_addr: clc ; Calc address of match.
lda <lzsa_dstptr + 0 ; N.B. Offset is negative!
adc <lzsa_offset + 0
sta <lzsa_winptr + 0
lda <lzsa_dstptr + 1
adc <lzsa_offset + 1
sta <lzsa_winptr + 1
.lz_page: lda (lzsa_winptr),y
sta (lzsa_dstptr),y
iny
bne .lz_page
inc <lzsa_winptr + 1
inc <lzsa_dstptr + 1
dex ; Any full pages left to copy?
bne .lz_page
jmp .cp_length ; Loop around to the beginning.
!if (LZSA_FROM_BANK | LZSA_NO_INLINE | LZSA_USE_FFFF) = 0 {
.incsrc3:
inc <lzsa_srcptr + 1
bne .resume_src3 ; always taken
}
;
; Lookup tables to differentiate literal and match lengths.
;
.nibl_len_tbl: !byte 3 + $10 ; 0+3 (for literal).
!byte 9 + $10 ; 2+7 (for match).
.byte_len_tbl: !byte 18 - 1 ; 0+3+15 - CS (for literal).
!byte 24 - 1 ; 2+7+15 - CS (for match).
;
; Get 16-bit length in X:A register pair.
;
.get_length: +LZSA_GET_NIBL
cmp #$FF ; Extended length?
bcs .byte_length
adc .nibl_len_tbl,x ; Always CC from previous CMP.
.got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit
rts ; lengths.
.byte_length: jsr lzsa2_get_byte ; So rare, this can be slow!
adc .byte_len_tbl,x ; Always CS from previous CMP.
bcc .got_length
beq .finished
!if LZSA_SWAP_LEN16 {
.word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
tax
} else {
.word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
pha
jsr lzsa2_get_byte ; So rare, this can be slow!
tax
pla
rts
}
;
; lzsa2_get_byte - read one byte of compressed data into A and advance
; lzsa_srcptr. Indexes with Y, which the decompressor holds at 0 between
; copies. Leaves the carry flag unchanged.
;
lzsa2_get_byte:
lda (lzsa_srcptr),y ; Subroutine version for when
inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
beq lzsa2_next_page
rts
;
; lzsa2_next_page - step the source pointer to the next 256-byte page.
; With LZSA_FROM_BANK set, a negative hi-byte jumps to lzsa2_next_bank,
; which is not defined in this file and must be supplied by the user.
;
lzsa2_next_page:
inc <lzsa_srcptr + 1 ; Inc & test for bank overflow.
!if LZSA_FROM_BANK {
bmi lzsa2_next_bank ; Change for target hardware!
}
rts
;
; .finished - reached from .get_length on the end-of-data code. The two
; PLAs discard the return address of the JSR .get_length, so the RTS
; returns to whoever called lzsa2_unpack.
;
.finished: pla ; Decompression completed, pop
pla ; return address.
rts
;
; Get a nibble value from compressed data in A.
;
!if (LZSA_SLOW_NIBL | LZSA_SMALL_SIZE) {
; Returns A = $F0 | nibble. lzsa_nibflg is 1 while the lo-nibble of the
; byte cached in lzsa_nibble is still unread, 0 otherwise.
lzsa2_get_nibble:
lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble.
bcs .got_nibble ; Yes: the ORA below hides the hi-nibble bits.
inc <lzsa_nibflg ; Reset the flag.
!if (LZSA_FROM_BANK | LZSA_NO_INLINE | LZSA_USE_FFFF) {
+LZSA_GET_SRC
} else {
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
beq .incsrc4 ; Lo-byte wrapped, go bump the page.
.resume_src4:
}
sta <lzsa_nibble ; Preserve for next time.
lsr ; Extract the hi-nibble.
lsr
lsr
lsr
!if LZSA_SWAP_XZY {
sec ; Offset code relies on CS.
}
.got_nibble: ora #$F0 ; Force the top four bits to 1.
rts
} else {
lzsa2_new_nibble:
inc <lzsa_nibflg ; Reset the flag.
!if (LZSA_FROM_BANK | LZSA_NO_INLINE | LZSA_USE_FFFF) {
+LZSA_GET_SRC
} else {
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
beq .incsrc4
.resume_src4:
}
sta <lzsa_nibble ; Preserve for next time.
lsr ; Extract the hi-nibble.
lsr
lsr
lsr
!if LZSA_SWAP_XZY {
sec ; Offset code relies on CS.
}
rts
}
!if (LZSA_FROM_BANK | LZSA_NO_INLINE | LZSA_USE_FFFF) = 0 {
.incsrc4:
inc <lzsa_srcptr + 1
bne .resume_src4 ; always taken
}

View File

@ -1,3 +1,5 @@
; Note: modified by Vince Weaver to assemble with ca65
; -----------------------------------------------------------------------------
; Decompress raw LZSA2 block.
; Create one with lzsa -r -f2 <original_file> <compressed_file>
@ -44,7 +46,12 @@
;NIBCOUNT = $FC ; zero-page location for temp offset
decompress_lzsa2:
; page to decompress to in a
sta LZSA_DST_HI
ldy #$00
sty LZSA_DST_LO
sty NIBCOUNT
decode_token:

View File

@ -13,10 +13,7 @@
lda #>graphic_start
sta LZSA_SRC_HI
lda #$0
sta LZSA_DST_LO
lda #$c
sta LZSA_DST_HI
jsr decompress_lzsa2

112
lzsa_test/gr_unrle.s Normal file
View File

@ -0,0 +1,112 @@
;=================
; load RLE image
;=================
; Output is A:00 (assume page aligned)
; Input is in GBASH/GBASL
; format: first byte=xsize
; A0,X,Y means run of X bytes of Y color
; A1 means end of file
; A2-AF,X means run of low nibble, X color
; if high nibble not A: just display color
; CV = current Y
; CH = max xsize (usually 40)
; TEMP = page
; TEMPY= current X
; load_rle_gr -- unpack an RLE image, one screen row at a time
; In:   A = destination page (high byte), GBASL/GBASH = compressed data
; Uses: TEMP, CV, CH, TEMPY, BASL/BASH, A, X, Y
; Out:  CV = $15
load_rle_gr:
sec
sbc #4 ; adjust page to write to
; to match gr_offsets
sta TEMP
ldy #$0 ; init Y to 0
sty CV
jsr load_and_increment ; load xsize
sta CH
jsr unrle_new_y ; set BASL/BASH and TEMPY for the first row
rle_loop:
jsr load_and_increment
tax ; keep the raw byte; A is about to be masked
cmp #$A1 ; if 0xa1
beq rle_done ; we are done
and #$f0 ; mask
cmp #$a0 ; see if special AX
beq decompress_special
; not special, just color
txa ; put color back in A
ldx #$1 ; only want to print 1
bne decompress_run ; always taken, X is 1
decompress_special:
txa ; put read value back in A
and #$0f ; check if was A0
bne decompress_color ; A2-AF: the low nibble is the run length
decompress_large:
jsr load_and_increment ; A0: run length is the next byte, now in A
decompress_color:
tax ; put runlen into X
jsr load_and_increment ; get color into A
decompress_run:
rle_run_loop:
sta (BASL),y ; write out the value
inc BASL
dec TEMPY ; one less column left in this row
bne rle_not_eol ; if less then keep going
; row is full: move on to the next one
inc CV ; gr_offsets entries are two bytes each,
inc CV ; so the row index advances by 2
pha ; unrle_new_y clobbers A, keep the color
jsr unrle_new_y
pla
rle_not_eol:
dex
bne rle_run_loop ; if not zero, keep looping
beq rle_loop ; and branch always
rle_done:
lda #$15 ; move the cursor somewhere sane
sta CV
rts
; load_and_increment -- read the byte at (GBASL),Y into A, then advance
; the 16-bit pointer GBASL/GBASH by one. Callers keep Y at 0.
load_and_increment:
lda (GBASL),Y
inc GBASL
bne lai_no_oflo ; no wrap, high byte is unchanged
inc GBASH
lai_no_oflo:
rts
; unrle_new_y -- start a new output row
; In:  CV = byte index into gr_offsets (two bytes per row),
;      TEMP = page adjustment, CH = row width
; Out: BASL/BASH = address of the row, TEMPY = CH, Y = 0; A is clobbered
unrle_new_y:
ldy CV
lda gr_offsets,Y ; low byte of the row address
sta BASL
lda gr_offsets+1,Y ; high byte of the row address
clc
adc TEMP ; adjust for page
sta BASH
lda CH
sta TEMPY ; reload the per-row column counter
ldy #0
rts

View File

@ -1,2 +1,2 @@
5 HOME
120 PRINT CHR$(4);"BRUN GR_LZSA"
120 PRINT CHR$(4);"CATALOG"

50
lzsa_test/lz4_bench.s Normal file
View File

@ -0,0 +1,50 @@
.include "zp.inc"
.include "hardware.inc"
; LZ4 benchmark: wait for a key, unpack the embedded image to $0C00,
; hand it to gr_copy_to_current, then spin forever.
; DRAW_PAGE, SET_GR, PAGE0, KEYRESET and KEYPRESS come from zp.inc and
; hardware.inc, which are not shown here -- the notes on them below are
; assumptions from the names. TODO confirm
lda #0
sta DRAW_PAGE
bit SET_GR ; presumably: graphics mode on
bit PAGE0 ; presumably: display page 0
bit KEYRESET ; presumably: clear any pending keypress
pause_loop:
lda KEYPRESS
bpl pause_loop ; loop until bit 7 of KEYPRESS is set
lda #<graphic_start ; LZ4_SRC = start of the compressed data
sta LZ4_SRC
lda #>graphic_start
sta LZ4_SRC+1
lda #<graphic_end ; LZ4_END = first byte after the compressed data
sta LZ4_END
lda #>graphic_end
sta LZ4_END+1
lda #$0 ; LZ4_DST = $0C00
sta LZ4_DST
lda #$c
sta LZ4_DST+1
; before/after bracket the call under test (presumably the points
; between which cycles are counted)
before:
jsr lz4_decode
after:
jsr gr_copy_to_current ; defined outside this file, presumably gr_copy.s
blah:
jmp blah ; done: hang here
.include "lz4_decode.s"
.include "gr_copy.s"
.include "gr_offsets.s"
graphic_start:
.incbin "spaceship_far_n.lz4"
graphic_end:

201
lzsa_test/lz4_decode.s Normal file
View File

@ -0,0 +1,201 @@
; LZ4 data decompressor for Apple II
; Code by Peter Ferrie (qkumba) (peter.ferrie@gmail.com)
; "LZ4 unpacker in 143 bytes (6502 version) (2013)"
; http://pferrie.host22.com/misc/appleii.htm
; This is that code, but with comments and labels added for clarity.
; I also found a bug when decoding with runs of multiples of 256
; which has since been fixed upstream.
; For LZ4 reference see
; https://github.com/lz4/lz4/wiki/lz4_Frame_format.md
; LZ4 summary:
;
; HEADER:
; Should: check for magic number 04 22 4d 18
; FLG: 64 in our case (01=version, block.index=1, block.checksum=0
; size=0, checksum=1, reserved
; MAX Blocksize: 40 (64kB)
; HEADER CHECKSUM: a7
; BLOCK HEADER: 4 bytes (le) length If highest bit set, uncompressed!
; data (see below), followed by checksum?
; BLOCKS:
; Token byte. High 4-bits literal length, low 4-bits copy length
; + If literal length==15, then following byte gets added to length
; If that byte was 255, then keep adding bytes until not 255
; + The literal bytes follow. There may be zero of them
; + Next is block copy info. little-endian 2-byte offset to
; be subtracted from current read position indicating source
; + The low 4-bits of the token are the copy length, which needs
; 4 added to it. As with the literal length, if it is 15 then
; you read a byte and add (and if that byte is 255, keep adding)
; At end you have 4 byte end-of-block marker (all zeros?) then
; 4 bytes of checksum (if marked in flags)
; our code does that, so be sure to set end -8
;LZ4_SRC EQU $00
;LZ4_DST EQU $02
;LZ4_END EQU $04
;COUNT EQU $06
;DELTA EQU $08
;======================
; LZ4 decode
;======================
; input buffer in LZ4_SRC
; end of input in LZ4_END
; output buffer in LZ4_DST
; lz4_decode -- unpack a raw LZ4 block
; In:   LZ4_SRC = first byte of the compressed data
;       LZ4_END = address at which decoding stops (tested after each
;                 literal run)
;       LZ4_DST = output buffer
; Uses: A, X, Y (held at 0), COUNT, DELTA
; Copy counts are handed to docopy as X = low byte, COUNT+1 = high
; byte + 1. With X = 0 docopy's first pass already moves 256 bytes, so a
; count that is an exact multiple of 256 must arrive with COUNT+1 one
; lower. Both the literal path and the match path take care of that.
lz4_decode:
unpmain:
ldy #0 ; used to index, always zero
parsetoken:
jsr getsrc ; get next token
pha ; save for later (need bottom 4 bits)
lsr ; number of literals in top 4 bits
lsr ; so shift into place
lsr
lsr
beq copymatches ; if zero, then no literals
; jump ahead and copy
jsr buildcount ; add up all the literal sizes
; result is in ram[count+1]-1:A
tax ; now in ram[count+1]-1:X
bne lz4_literal_count_ok
; BUGFIX: a low byte of 0 means the literal run is an exact multiple
; of 0x100. buildcount bumped COUNT+1 on the overflow, but docopy
; copies 256 bytes when X = 0, so undo that bump to avoid copying
; 256 bytes too many.
dec COUNT+1
lz4_literal_count_ok:
jsr docopy ; copy the literals
lda LZ4_SRC ; 16-bit compare
cmp LZ4_END ; to see if we have reached the end
lda LZ4_SRC+1
sbc LZ4_END+1
bcs done
copymatches:
jsr getsrc ; get 16-bit delta value
sta DELTA
jsr getsrc
sta DELTA+1
pla ; restore token
and #$0f ; get bottom 4 bits
; match count. 0 means 4
; 15 means 19+, must be calculated
jsr buildcount ; add up count bits, in ram[count+1]-1:A
clc
adc #4 ; adjust count by 4 (minmatch)
tax ; now in ram[count+1]-1:X
beq copy_no_adjust ; BUGFIX, don't increment if
; exactly a multiple of 0x100
bcc copy_no_adjust
inc COUNT+1 ; increment if we overflowed
copy_no_adjust:
lda LZ4_SRC+1 ; save src on stack
pha
lda LZ4_SRC
pha
sec ; subtract delta
lda LZ4_DST ; from destination, make new src
sbc DELTA
sta LZ4_SRC
lda LZ4_DST+1
sbc DELTA+1
sta LZ4_SRC+1
jsr docopy ; do the copy
pla ; restore the src
sta LZ4_SRC
pla
sta LZ4_SRC+1
jmp parsetoken ; back to parsing tokens
done:
pla ; drop the token saved at parsetoken
rts
;=========
; getsrc
;=========
; gets byte from src into A, increments pointer
; In:  LZ4_SRC = 16-bit source pointer, Y = 0 (index)
; Out: A = byte read, LZ4_SRC advanced by one
; The carry flag is not touched.
getsrc:
lda (LZ4_SRC), Y ; get a byte from src
inc LZ4_SRC ; increment pointer
bne done_getsrc ; update 16-bit pointer
inc LZ4_SRC+1 ; on 8-bit overflow
done_getsrc:
rts
;============
; buildcount
;============
; Expands a 4-bit count from the token into a 16-bit count.
; In:  A = 4-bit count
; Out: A = low byte of the count, COUNT+1 = high byte + 1
; Clobbers X, COUNT and the carry.
; A value below 15 is returned unchanged. Otherwise extension bytes are
; read and added for as long as the byte just read was 255.
buildcount:
ldx #1 ; high count starts at 1
stx COUNT+1 ; (loops at zero?)
cmp #$0f ; if LITERAL_COUNT < 15, we are done
bne done_buildcount
buildcount_loop:
sta COUNT ; save LITERAL_COUNT (15)
jsr getsrc ; get the next byte
tax ; put in X
clc
adc COUNT ; add new byte to old value
bcc bc_8bit_oflow ; if overflow, increment high byte
inc COUNT+1
bc_8bit_oflow:
inx ; check if read value was 255
beq buildcount_loop ; if it was, keep looping and adding
done_buildcount:
rts
;============
; getput
;============
; gets a byte, then puts the byte
; (one byte from LZ4_SRC to LZ4_DST, both pointers advance by one)
getput:
jsr getsrc
; fallthrough to putdst
;=============
; putdst
;=============
; store A into destination
; In:  A = byte to store, LZ4_DST = 16-bit pointer, Y = 0 (index)
; Out: LZ4_DST advanced by one; A, X and the carry are unchanged
putdst:
sta (LZ4_DST), Y ; store A into destination
inc LZ4_DST ; increment 16-bit pointer
bne putdst_end ; if overflow, increment top byte
inc LZ4_DST+1
putdst_end:
rts
;=============================
; docopy
;=============================
; copies ram[count+1]-1:X bytes
; from src to dst
; In:  X = low byte of the count, COUNT+1 = high byte of the count + 1
; Out: X = 0, COUNT+1 = 0, LZ4_SRC and LZ4_DST advanced past the data
; Note: X = 0 on entry makes the first pass run 256 times before
; COUNT+1 is looked at.
docopy:
docopy_loop:
jsr getput ; get/put byte
dex ; decrement count
bne docopy_loop ; if not zero, loop
dec COUNT+1 ; if zero, decrement high byte
bne docopy_loop ; if not zero, loop
rts

View File

@ -0,0 +1,40 @@
.include "zp.inc"
.include "hardware.inc"
; LZSA2 benchmark using decompress_fast_v2.s: wait for a key, unpack the
; embedded image to page $0C, hand it to gr_copy_to_current, then spin.
; DRAW_PAGE, SET_GR, PAGE0, KEYRESET and KEYPRESS come from zp.inc and
; hardware.inc, which are not shown here -- the notes on them below are
; assumptions from the names. TODO confirm
lda #0
sta DRAW_PAGE
bit SET_GR ; presumably: graphics mode on
bit PAGE0 ; presumably: display page 0
bit KEYRESET ; presumably: clear any pending keypress
pause_loop:
lda KEYPRESS
bpl pause_loop ; loop until bit 7 of KEYPRESS is set
lda #<graphic_start ; source pointer = start of the compressed data
sta LZSA_SRC_LO
lda #>graphic_start
sta LZSA_SRC_HI
; before/after bracket the call under test (presumably the points
; between which cycles are counted)
before:
lda #$c ; destination page, output starts at $0C00
jsr decompress_lzsa2_fast
after:
jsr gr_copy_to_current ; defined outside this file, presumably gr_copy.s
blah:
jmp blah ; done: hang here
.include "decompress_fast_v2.s"
.include "gr_copy.s"
.include "gr_offsets.s"
graphic_start:
.incbin "spaceship_far_n.gr.small_v2"
graphic_end:

View File

@ -0,0 +1,40 @@
.include "zp.inc"
.include "hardware.inc"
; LZSA2 benchmark using decompress_small_v2.s: wait for a key, unpack
; the embedded image to page $0C, hand it to gr_copy_to_current, then
; spin.
; DRAW_PAGE, SET_GR, PAGE0, KEYRESET and KEYPRESS come from zp.inc and
; hardware.inc, which are not shown here -- the notes on them below are
; assumptions from the names. TODO confirm
lda #0
sta DRAW_PAGE
bit SET_GR ; presumably: graphics mode on
bit PAGE0 ; presumably: display page 0
bit KEYRESET ; presumably: clear any pending keypress
pause_loop:
lda KEYPRESS
bpl pause_loop ; loop until bit 7 of KEYPRESS is set
lda #<graphic_start ; source pointer = start of the compressed data
sta LZSA_SRC_LO
lda #>graphic_start
sta LZSA_SRC_HI
; before/after bracket the call under test (presumably the points
; between which cycles are counted)
before:
lda #$c ; destination page passed in A
jsr decompress_lzsa2
after:
jsr gr_copy_to_current ; defined outside this file, presumably gr_copy.s
blah:
jmp blah ; done: hang here
.include "decompress_small_v2.s"
.include "gr_copy.s"
.include "gr_offsets.s"
graphic_start:
.incbin "spaceship_far_n.gr.small_v2"
graphic_end:

View File

@ -5,3 +5,10 @@ lz4 323
lzsa -r -f1 -- small_v1 -- 252 bytes
lzsa -r -f2 -- small_v2 -- 228 bytes
speed:
rle: 8AE7 cycles 687 byte exe
lzsa_small_v2: E8EF cycles 782 byte exe
lzsa_fast_v2: DFBF cycles 793 byte exe
lz4: 1170A cycles 767 byte exe

65
lzsa_test/rle_bench.s Normal file
View File

@ -0,0 +1,65 @@
.include "zp.inc"
.include "hardware.inc"
; RLE benchmark: wait for a key, unpack the embedded image with
; load_rle_gr (page $0C passed in A), hand it to gr_copy_to_current,
; then spin.
; DRAW_PAGE, SET_GR, PAGE0, KEYRESET and KEYPRESS come from zp.inc and
; hardware.inc, which are not shown here -- the notes on them below are
; assumptions from the names. TODO confirm
lda #0
sta DRAW_PAGE
bit SET_GR ; presumably: graphics mode on
bit PAGE0 ; presumably: display page 0
bit KEYRESET ; presumably: clear any pending keypress
pause_loop:
lda KEYPRESS
bpl pause_loop ; loop until bit 7 of KEYPRESS is set
lda #<graphic_start ; GBASL/GBASH = start of the RLE data
sta GBASL
lda #>graphic_start
sta GBASH
; before/after bracket the call under test (presumably the points
; between which cycles are counted)
before:
lda #$c ; destination page passed in A
jsr load_rle_gr
after:
jsr gr_copy_to_current ; defined outside this file, presumably gr_copy.s
blah:
jmp blah ; done: hang here
.include "gr_unrle.s"
.include "gr_copy.s"
.include "gr_offsets.s"
graphic_start:
spaceship_far_n_rle: .byte $28 ; xsize=40 (first byte of the RLE format)
.byte $A0,$FF,$FF, $AF,$FF, $A7,$5F, $A3,$F5, $A0,$1B,$FF, $5F, $05,$05
.byte $A4,$00, $0F,$0F, $A0,$1C,$FF, $50, $5F, $05, $A5,$00
.byte $F0, $A0,$11,$FF, $0F, $9F,$9F, $AB,$99, $55, $0A
.byte $90,$90, $A3,$00, $F0, $AE,$FF, $0F, $9F, $D9
.byte $00, $A6,$DD, $A4,$88, $A3,$DD, $88, $55, $D9
.byte $00, $A3,$D9, $D0, $9F, $A8,$FF, $A5,$0F, $00,$00
.byte $8D,$8D, $00, $A6,$8D, $A4,$88, $A4,$8D, $55, $8D
.byte $00, $A7,$8D, $AB,$FF, $F0, $00, $88,$88, $00
.byte $AB,$88, $08,$08, $88, $55, $88, $00, $A6,$88
.byte $F8, $AE,$FF, $F8, $F0, $A6,$08, $A5,$88, $99,$99
.byte $88, $55, $08, $A7,$00, $0F,$0F, $A0,$13,$FF, $A7,$50
.byte $59,$59, $58, $55, $5F, $A0,$01,$AF, $A3,$FF, $A9,$F0
.byte $FF, $AC,$7F, $4F, $45,$45, $AA,$75, $55, $05
.byte $0A, $AC,$7F, $AC,$77, $A3,$44, $00,$00, $44, $54
.byte $A3,$55, $A3,$44, $A3,$00, $A0,$18,$77, $44, $FF, $44
.byte $00,$00, $44, $A4,$55, $A3,$44, $00, $FF, $00
.byte $A0,$18,$77, $A3,$44, $00,$00, $44, $A4,$55, $A3,$44, $A3,$00
.byte $A0,$18,$77, $A3,$44, $00,$00, $A6,$55, $44,$44, $A3,$00, $A0,$18,$77
.byte $A3,$44, $00,$00, $A6,$55, $54, $44, $A3,$00, $A4,$77
.byte $07, $A3,$77, $A3,$07, $AD,$77, $A3,$44, $00, $A8,$55
.byte $44, $A3,$00, $A3,$77, $07, $00, $88,$88, $07
.byte $A3,$00, $07, $AC,$77, $A3,$44, $00, $A9,$55, $A3,$00
.byte $A3,$77, $A3,$00, $A6,$88, $AC,$77, $A3,$44, $AA,$55, $A3,$00
.byte $77,$77, $57, $A3,$00, $A6,$88
.byte $A1
; cycles=7669

Binary file not shown.

After

Width:  |  Height:  |  Size: 416 B