diff --git a/lzsa_test/Makefile b/lzsa_test/Makefile index 2f48a2eb..2d896430 100644 --- a/lzsa_test/Makefile +++ b/lzsa_test/Makefile @@ -8,10 +8,15 @@ B2D = ../bmp2dhr/b2d all: gr_lzsa.dsk -gr_lzsa.dsk: HELLO GR_LZSA - cp empty.dsk mist.dsk - $(DOS33) -y mist.dsk SAVE A HELLO - $(DOS33) -y mist.dsk BSAVE -a 0x1000 GR_LZSA +gr_lzsa.dsk: lzsa_test.dsk # old target name kept as an alias; the real file target is lzsa_test.dsk +lzsa_test.dsk: HELLO GR_LZSA LZSA2_SMALL_BENCH LZSA2_FAST_BENCH LZ4_BENCH RLE_BENCH + cp empty.dsk $@ + $(DOS33) -y $@ SAVE A HELLO + $(DOS33) -y $@ BSAVE -a 0x1000 GR_LZSA + $(DOS33) -y $@ BSAVE -a 0x1000 LZSA2_SMALL_BENCH + $(DOS33) -y $@ BSAVE -a 0x1000 LZSA2_FAST_BENCH + $(DOS33) -y $@ BSAVE -a 0x1000 RLE_BENCH + $(DOS33) -y $@ BSAVE -a 0x1000 LZ4_BENCH ### @@ -21,6 +26,46 @@ GR_LZSA: gr_lzsa.o gr_lzsa.o: gr_lzsa.s decompress_small_v2.s ca65 -o gr_lzsa.o gr_lzsa.s -l gr_lzsa.lst +### + +LZSA2_SMALL_BENCH: lzsa2_small_bench.o + ld65 -o LZSA2_SMALL_BENCH lzsa2_small_bench.o -C ../linker_scripts/apple2_1000.inc + +lzsa2_small_bench.o: lzsa2_small_bench.s decompress_small_v2.s + ca65 -o lzsa2_small_bench.o lzsa2_small_bench.s -l lzsa2_small_bench.lst + +### + +LZSA2_FAST_BENCH: lzsa2_fast_bench.o + ld65 -o LZSA2_FAST_BENCH lzsa2_fast_bench.o -C ../linker_scripts/apple2_1000.inc + +lzsa2_fast_bench.o: lzsa2_fast_bench.s decompress_fast_v2.s + ca65 -o lzsa2_fast_bench.o lzsa2_fast_bench.s -l lzsa2_fast_bench.lst + +### + +RLE_BENCH: rle_bench.o + ld65 -o RLE_BENCH rle_bench.o -C ../linker_scripts/apple2_1000.inc + +rle_bench.o: rle_bench.s gr_unrle.s + ca65 -o rle_bench.o rle_bench.s -l rle_bench.lst + +### + +LZ4_BENCH: lz4_bench.o + ld65 -o LZ4_BENCH lz4_bench.o -C ../linker_scripts/apple2_1000.inc + +lz4_bench.o: lz4_bench.s lz4_decode.s spaceship_far_n.lz4 + ca65 -o lz4_bench.o lz4_bench.s -l lz4_bench.lst + +### + + +spaceship_far_n.lz4: spaceship_far_n.gr + lz4 -f -16 spaceship_far_n.gr + dd if=spaceship_far_n.gr.lz4 of=spaceship_far_n.lz4 bs=1 skip=11 + truncate 
spaceship_far_n.lz4 -s -8 + ### @@ -30,4 +75,5 @@ HELLO: hello.bas #### clean: - rm -f *~ *.o *.lst HELLO GR_LZSA + rm -f *~ *.o *.lst HELLO GR_LZSA RLE_BENCH LZSA2_FAST_BENCH LZSA2_SMALL_BENCH LZ4_BENCH + diff --git a/lzsa_test/decompress_fast_v2.s b/lzsa_test/decompress_fast_v2.s new file mode 100644 index 00000000..e4ee4ff6 --- /dev/null +++ b/lzsa_test/decompress_fast_v2.s @@ -0,0 +1,367 @@ +; note -- modified by Vince Weaver to assemble with ca65 + +; ----------------------------------------------------------------------------- +; Decompress raw LZSA2 block. +; Create one with lzsa -r -f2 +; +; in: +; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address +; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address +; +; out: +; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1 +; +; ----------------------------------------------------------------------------- +; Backward decompression is also supported, use lzsa -r -b -f2 +; To use it, also define BACKWARD_DECOMPRESS=1 before including this code! +; +; in: +; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data +; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer +; +; out: +; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1 +; +; ----------------------------------------------------------------------------- +; +; Copyright (C) 2019 Emmanuel Marty, Peter Ferrie +; +; This software is provided 'as-is', without any express or implied +; warranty. In no event will the authors be held liable for any damages +; arising from the use of this software. +; +; Permission is granted to anyone to use this software for any purpose, +; including commercial applications, and to alter it and redistribute it +; freely, subject to the following restrictions: +; +; 1. 
The origin of this software must not be misrepresented; you must not +; claim that you wrote the original software. If you use this software +; in a product, an acknowledgment in the product documentation would be +; appreciated but is not required. +; 2. Altered source versions must be plainly marked as such, and must not be +; misrepresented as being the original software. +; 3. This notice may not be removed or altered from any source distribution. +; ----------------------------------------------------------------------------- + +;NIBCOUNT = $FC ; zero-page location for temp offset + +decompress_lzsa2_fast: + + sta LZSA_DST_HI + + ldy #$00 + sty LZSA_DST_LO + sty NIBCOUNT + +decode_token: + jsr getsrc ; read token byte: XYZ|LL|MMM + pha ; preserve token on stack + + and #$18 ; isolate literals count (LL) + beq no_literals ; skip if no literals to copy + cmp #$18 ; LITERALS_RUN_LEN_V2? + bcc prepare_copy_literals ; if less, count is directly embedded in token + + jsr getnibble ; get extra literals length nibble + ; add nibble to len from token + adc #$02 ; (LITERALS_RUN_LEN_V2) minus carry + cmp #$12 ; LITERALS_RUN_LEN_V2 + 15 ? + bcc prepare_copy_literals_direct ; if less, literals count is complete + + jsr getsrc ; get extra byte of variable literals count + ; the carry is always set by the CMP above + ; GETSRC doesn't change it + sbc #$EE ; overflow? 
+ jmp prepare_copy_literals_direct + +prepare_copy_literals_large: + ; handle 16 bits literals count + ; literals count = directly these 16 bits + jsr getlargesrc ; grab low 8 bits in X, high 8 bits in A + tay ; put high 8 bits in Y + bcs prepare_copy_literals_high ; (*same as JMP PREPARE_COPY_LITERALS_HIGH but shorter) + +prepare_copy_literals: + lsr ; shift literals count into place + lsr + lsr + +prepare_copy_literals_direct: + tax + bcs prepare_copy_literals_large ; if so, literals count is large + +prepare_copy_literals_high: + txa + beq copy_literals + iny + +copy_literals: + jsr getput ; copy one byte of literals + dex + bne copy_literals + dey + bne copy_literals + +no_literals: + pla ; retrieve token from stack + pha ; preserve token again + asl + bcs repmatch_or_large_offset ; 1YZ: rep-match or 13/16 bit offset + + asl ; 0YZ: 5 or 9 bit offset + bcs offset_9_bit + + ; 00Z: 5 bit offset + + ldx #$FF ; set offset bits 15-8 to 1 + + jsr getcombinedbits ; rotate Z bit into bit 0, read nibble for bits 4-1 + ora #$E0 ; set bits 7-5 to 1 + bne got_offset_lo ; go store low byte of match offset and prepare match + +offset_9_bit: ; 01Z: 9 bit offset + ;;asl ; shift Z (offset bit 8) in place + rol + rol + and #$01 + eor #$FF ; set offset bits 15-9 to 1 + bne got_offset_hi ; go store high byte, read low byte of match offset and prepare match + ; (*same as JMP GOT_OFFSET_HI but shorter) + +repmatch_or_large_offset: + asl ; 13 bit offset? + bcs repmatch_or_16bit ; handle rep-match or 16-bit offset if not + + ; 10Z: 13 bit offset + + jsr getcombinedbits ; rotate Z bit into bit 8, read nibble for bits 12-9 + adc #$DE ; set bits 15-13 to 1 and substract 2 (to substract 512) + bne got_offset_hi ; go store high byte, read low byte of match offset and prepare match + ; (*same as JMP GOT_OFFSET_HI but shorter) + +repmatch_or_16bit: ; rep-match or 16 bit offset + ;;ASL ; XYZ=111? 
+ bmi rep_match ; reuse previous offset if so (rep-match) + + ; 110: handle 16 bit offset + jsr getsrc ; grab high 8 bits +got_offset_hi: + tax + jsr getsrc ; grab low 8 bits +got_offset_lo: + sta OFFSLO ; store low byte of match offset + stx OFFSHI ; store high byte of match offset + +rep_match: +.ifdef BACKWARD_DECOMPRESS + + ; Backward decompression - substract match offset + + sec ; add dest + match offset + lda putdst+1 ; low 8 bits +OFFSLO = *+1 + sbc #$AA + sta copy_match_loop+1 ; store back reference address + lda putdst+2 +OFFSHI = *+1 + sbc #$AA ; high 8 bits + sta copy_match_loop+2 ; store high 8 bits of address + sec + +.else + + ; Forward decompression - add match offset + + clc ; add dest + match offset + lda putdst+1 ; low 8 bits +OFFSLO = *+1 + adc #$AA + sta copy_match_loop+1 ; store back reference address +OFFSHI = *+1 + lda #$AA ; high 8 bits + adc putdst+2 + sta copy_match_loop+2 ; store high 8 bits of address +.endif + + pla ; retrieve token from stack again + and #$07 ; isolate match len (MMM) + adc #$01 ; add MIN_MATCH_SIZE_V2 and carry + cmp #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2? + bcc prepare_copy_match ; if less, length is directly embedded in token + + jsr getnibble ; get extra match length nibble + ; add nibble to len from token + adc #$08 ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry + cmp #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15? + bcc prepare_copy_match ; if less, match length is complete + + jsr getsrc ; get extra byte of variable match length + ; the carry is always set by the CMP above + ; GETSRC doesn't change it + sbc #$E8 ; overflow? 
+ +prepare_copy_match: + tax + bcc prepare_copy_match_y ; if not, the match length is complete + beq decompression_done ; if EOD code, bail + + ; Handle 16 bits match length + jsr getlargesrc ; grab low 8 bits in X, high 8 bits in A + tay ; put high 8 bits in Y + +prepare_copy_match_y: + txa + beq copy_match_loop + iny + +copy_match_loop: + lda $AAAA ; get one byte of backreference + jsr putdst ; copy to destination + +.ifdef BACKWARD_DECOMPRESS + + ; Backward decompression -- put backreference bytes backward + + lda copy_match_loop+1 + beq getmatch_adj_hi +getmatch_done: + dec copy_match_loop+1 + +.else + + ; Forward decompression -- put backreference bytes forward + + inc copy_match_loop+1 + beq getmatch_adj_hi +getmatch_done: + +.endif + + dex + bne copy_match_loop + dey + bne copy_match_loop + jmp decode_token + +.ifdef BACKWARD_DECOMPRESS + +getmatch_adj_hi: + dec copy_match_loop+2 + jmp getmatch_done + +.else + +getmatch_adj_hi: + inc copy_match_loop+2 + jmp getmatch_done +.endif + +getcombinedbits: + eor #$80 + asl + php + + jsr getnibble ; get nibble into bits 0-3 (for offset bits 1-4) + plp ; merge Z bit as the carry bit (for offset bit 0) +combinedbitz: + rol ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared +decompression_done: + rts + +getnibble: +NIBBLES = *+1 + lda #$AA + lsr NIBCOUNT + bcc need_nibbles + and #$0F ; isolate low 4 bits of nibble + rts + +need_nibbles: + inc NIBCOUNT + jsr getsrc ; get 2 nibbles + sta NIBBLES + lsr + lsr + lsr + lsr + sec + rts + +.ifdef BACKWARD_DECOMPRESS + + ; Backward decompression -- get and put bytes backward + +getput: + jsr getsrc +putdst: +LZSA_DST_LO = *+1 +LZSA_DST_HI = *+2 + sta $AAAA + lda putdst+1 + beq putdst_adj_hi + dec putdst+1 + rts + +putdst_adj_hi: + dec putdst+2 + dec putdst+1 + rts + +getlargesrc: + jsr getsrc ; grab low 8 bits + tax ; move to X + ; fall through grab high 8 bits + +getsrc: +LZSA_SRC_LO = *+1 +LZSA_SRC_HI = *+2 + lda $AAAA + pha + lda getsrc+1 + beq getsrc_adj_hi + dec 
getsrc+1 + pla + rts + +getsrc_adj_hi: + dec getsrc+2 + dec getsrc+1 + pla + rts + +.else + + ; Forward decompression -- get and put bytes forward + +getput: + jsr getsrc +putdst: +LZSA_DST_LO = *+1 +LZSA_DST_HI = *+2 + sta $AAAA + inc putdst+1 + beq putdst_adj_hi + rts + +putdst_adj_hi: + inc putdst+2 + rts + +getlargesrc: + jsr getsrc ; grab low 8 bits + tax ; move to X + ; fall through grab high 8 bits + +getsrc: +LZSA_SRC_LO = *+1 +LZSA_SRC_HI = *+2 + lda $AAAA + inc getsrc+1 + beq getsrc_adj_hi + rts + +getsrc_adj_hi: + inc getsrc+2 + rts +.endif + diff --git a/lzsa_test/decompress_faster_v2.s b/lzsa_test/decompress_faster_v2.s new file mode 100644 index 00000000..89e028c4 --- /dev/null +++ b/lzsa_test/decompress_faster_v2.s @@ -0,0 +1,637 @@ +; *************************************************************************** +; *************************************************************************** +; +; lzsa2_6502.s +; +; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format. +; +; This code is written for the ACME assembler. +; +; Optional code is presented for two minor 6502 optimizations that break +; compatibility with the current LZSA2 format standard. +; +; The code is 241 bytes for the small version, and 267 bytes for the normal. +; +; Copyright John Brandwood 2019. +; +; Distributed under the Boost Software License, Version 1.0. +; (See accompanying file LICENSE_1_0.txt or copy at +; http://www.boost.org/LICENSE_1_0.txt) +; +; *************************************************************************** +; *************************************************************************** + + + +; *************************************************************************** +; *************************************************************************** +; +; Decompression Options & Macros +; + + ; + ; Save 7 bytes of code, and 21 cycles every time that a + ; 16-bit length is decoded? + ; + ; N.B. 
Setting this breaks compatibility with LZSA v1.2 + ; + +LZSA_SWAP_LEN16 = 0 + + ; + ; Save 3 bytes of code, and 4 or 8 cycles when decoding + ; an offset? + ; + ; N.B. Setting this breaks compatibility with LZSA v1.2 + ; + +LZSA_SWAP_XZY = 0 + + ; + ; Choose size over space (within sane limits)? + ; + +LZSA_SMALL_SIZE = 0 + + ; + ; Remove code inlining to save space? + ; + ; This saves 15 bytes of code at the cost of 7% speed. + ; + + !if LZSA_SMALL_SIZE { +LZSA_NO_INLINE = 1 + } else { +LZSA_NO_INLINE = 0 + } + + ; + ; Use smaller code for copying literals? + ; + ; This saves 11 bytes of code at the cost of 5% speed. + ; + + !if LZSA_SMALL_SIZE { +LZSA_SHORT_CP = 1 + } else { +LZSA_SHORT_CP = 0 + } + + ; + ; Assume that we're decompressing from a large multi-bank + ; compressed data file, and that the next bank may need to + ; paged in when a page-boundary is crossed. + ; + +LZSA_FROM_BANK = 0 + + ; + ; We will read from or write to $FFFF. This prevents the + ; use of the "INC ptrhi / BNE" trick and reduces speed. + ; + +LZSA_USE_FFFF = 0 + + ; + ; Macro to increment the source pointer to the next page. 
+ ; + + !if LZSA_FROM_BANK { + !macro LZSA_INC_PAGE { + jsr lzsa2_next_page + } + } else { + !macro LZSA_INC_PAGE { + inc @@ -44,7 +46,12 @@ ;NIBCOUNT = $FC ; zero-page location for temp offset decompress_lzsa2: + + ; page to decompress to in a + + sta LZSA_DST_HI ldy #$00 + sty LZSA_DST_LO sty NIBCOUNT decode_token: diff --git a/lzsa_test/gr_lzsa.s b/lzsa_test/gr_lzsa.s index e572c3cb..de7c9fbc 100644 --- a/lzsa_test/gr_lzsa.s +++ b/lzsa_test/gr_lzsa.s @@ -13,10 +13,7 @@ lda #>graphic_start sta LZSA_SRC_HI - lda #$0 - sta LZSA_DST_LO lda #$c - sta LZSA_DST_HI jsr decompress_lzsa2 diff --git a/lzsa_test/gr_unrle.s b/lzsa_test/gr_unrle.s new file mode 100644 index 00000000..5e0b5dd5 --- /dev/null +++ b/lzsa_test/gr_unrle.s @@ -0,0 +1,112 @@ + ;================= + ; load RLE image + ;================= + ; Output is A:00 (assume page aligned) + ; Input is in GBASH/GBASL + + ; format: first byte=xsize + ; A0,X,Y means run of X bytes of Y color + ; A1 means end of file + ; A2-AF,X means run of low nibble, X color + ; if high nibble not A: just display color + + ; CV = current Y + ; CH = max xsize (usually 40) + ; TEMP = page + ; TEMPY= current X + + +load_rle_gr: + sec + sbc #4 ; adjust page to write to + ; to match gr_offsets + sta TEMP + + ldy #$0 ; init Y to 0 + sty CV + + jsr load_and_increment ; load xsize + sta CH + + jsr unrle_new_y + + +rle_loop: + jsr load_and_increment + + tax + + cmp #$A1 ; if 0xa1 + beq rle_done ; we are done + + and #$f0 ; mask + cmp #$a0 ; see if special AX + beq decompress_special + + ; not special, just color + + txa ; put color back in A + ldx #$1 ; only want to print 1 + bne decompress_run + +decompress_special: + txa ; put read value back in A + + and #$0f ; check if was A0 + + bne decompress_color ; if A0 need to read run, color + +decompress_large: + jsr load_and_increment ; run length now in A + +decompress_color: + tax ; put runlen into X + jsr load_and_increment ; get color into A + +decompress_run: +rle_run_loop: + sta (BASL),y 
; write out the value + inc BASL + dec TEMPY + bne rle_not_eol ; if less then keep going + + ; if here, we are > max_X + + inc CV + inc CV + pha + jsr unrle_new_y + pla + +rle_not_eol: + dex + bne rle_run_loop ; if not zero, keep looping + + beq rle_loop ; and branch always + +rle_done: + lda #$15 ; move the cursor somewhere sane + sta CV + rts + + +load_and_increment: + lda (GBASL),Y + inc GBASL + bne lai_no_oflo + inc GBASH +lai_no_oflo: + rts + +unrle_new_y: + ldy CV + lda gr_offsets,Y + sta BASL + lda gr_offsets+1,Y + clc + adc TEMP ; adjust for page + sta BASH + lda CH + sta TEMPY + ldy #0 + rts diff --git a/lzsa_test/hello.bas b/lzsa_test/hello.bas index 617228a8..f255b312 100644 --- a/lzsa_test/hello.bas +++ b/lzsa_test/hello.bas @@ -1,2 +1,2 @@ 5 HOME -120 PRINT CHR$(4);"BRUN GR_LZSA" +120 PRINT CHR$(4);"CATALOG" diff --git a/lzsa_test/lz4_bench.s b/lzsa_test/lz4_bench.s new file mode 100644 index 00000000..520db0db --- /dev/null +++ b/lzsa_test/lz4_bench.s @@ -0,0 +1,50 @@ + +.include "zp.inc" +.include "hardware.inc" + + lda #0 + sta DRAW_PAGE + + bit SET_GR + bit PAGE0 + + bit KEYRESET +pause_loop: + lda KEYPRESS + bpl pause_loop + + lda #<graphic_start ; low byte of compressed data address + sta LZ4_SRC + lda #>graphic_start ; high byte of compressed data address + sta LZ4_SRC+1 + + lda #<graphic_end ; low byte of end-of-input address + sta LZ4_END + lda #>graphic_end ; high byte of end-of-input address + sta LZ4_END+1 + + lda #$0 + sta LZ4_DST + lda #$c + sta LZ4_DST+1 + +before: + + jsr lz4_decode +after: + + jsr gr_copy_to_current + + +blah: + jmp blah + + + .include "lz4_decode.s" + .include "gr_copy.s" + .include "gr_offsets.s" + +graphic_start: + + .incbin "spaceship_far_n.lz4" +graphic_end: diff --git a/lzsa_test/lz4_decode.s b/lzsa_test/lz4_decode.s new file mode 100644 index 00000000..598430da --- /dev/null +++ b/lzsa_test/lz4_decode.s @@ -0,0 +1,201 @@ +; LZ4 data decompressor for Apple II + +; Code by Peter Ferrie (qkumba) (peter.ferrie@gmail.com) +; "LZ4 unpacker in 143 bytes (6502 version) (2013)" +; http://pferrie.host22.com/misc/appleii.htm +; This is that code, but with comments and labels added for clarity. 
+; I also found a bug when decoding with runs of multiples of 256 +; which has since been fixed upstream. + +; For LZ4 reference see +; https://github.com/lz4/lz4/wiki/lz4_Frame_format.md + +; LZ4 summary: +; +; HEADER: +; Should: check for magic number 04 22 4d 18 +; FLG: 64 in our case (01=version, block.index=1, block.checksum=0 +; size=0, checksum=1, reserved) +; MAX Blocksize: 40 (64kB) +; HEADER CHECKSUM: a7 +; BLOCK HEADER: 4 bytes (le) length. If highest bit set, uncompressed! +; data (see below), followed by checksum? +; BLOCKS: +; Token byte. High 4-bits literal length, low 4-bits copy length +; + If literal length==15, then following byte gets added to length +; If that byte was 255, then keep adding bytes until not 255 +; + The literal bytes follow. There may be zero of them +; + Next is block copy info. Little-endian 2-byte offset to +; be subtracted from current output (write) position, giving the copy source +; + The low 4-bits of the token are the copy length, which needs +; 4 added to it. As with the literal length, if it is 15 then +; you read a byte and add (and if that byte is 255, keep adding) + +; At end you have 4 byte end-of-block marker (all zeros?) 
then +; 4 bytes of checksum (if marked in flags) +; our code does that, so be sure to set end -8 + + +;LZ4_SRC EQU $00 +;LZ4_DST EQU $02 +;LZ4_END EQU $04 +;COUNT EQU $06 +;DELTA EQU $08 + + + ;====================== + ; LZ4 decode + ;====================== + ; input buffer in LZ4_SRC + ; end of input in LZ4_END + ; output buffer in LZ4_DST + + +lz4_decode: + + +unpmain: + ldy #0 ; used to index, always zero + +parsetoken: + jsr getsrc ; get next token + pha ; save for later (need bottom 4 bits) + + lsr ; number of literals in top 4 bits + lsr ; so shift into place + lsr + lsr + beq copymatches ; if zero, then no literals + ; jump ahead and copy + + jsr buildcount ; add up all the literal sizes + ; result is in ram[count+1]-1:A + tax ; now in ram[count+1]-1:X + jsr docopy ; copy the literals + + lda LZ4_SRC ; 16-bit compare + cmp LZ4_END ; to see if we have reached the end + lda LZ4_SRC+1 + sbc LZ4_END+1 + bcs done + +copymatches: + jsr getsrc ; get 16-bit delta value + sta DELTA + jsr getsrc + sta DELTA+1 + + pla ; restore token + and #$0f ; get bottom 4 bits + ; match count. 
0 means 4 + ; 15 means 19+, must be calculated + + jsr buildcount ; add up count bits, in ram[count+1]-:A + + clc + adc #4 ; adjust count by 4 (minmatch) + + tax ; now in ramp[count+1]-1:X + + beq copy_no_adjust ; BUGFIX, don't increment if + ; exactly a multiple of 0x100 + bcc copy_no_adjust + + inc COUNT+1 ; increment if we overflowed +copy_no_adjust: + + lda LZ4_SRC+1 ; save src on stack + pha + lda LZ4_SRC + pha + + sec ; subtract delta + lda LZ4_DST ; from destination, make new src + sbc DELTA + sta LZ4_SRC + lda LZ4_DST+1 + sbc DELTA+1 + sta LZ4_SRC+1 + + jsr docopy ; do the copy + + pla ; restore the src + sta LZ4_SRC + pla + sta LZ4_SRC+1 + + jmp parsetoken ; back to parsing tokens + +done: + pla + rts + + ;========= + ; getsrc + ;========= + ; gets byte from src into A, increments pointer +getsrc: + lda (LZ4_SRC), Y ; get a byte from src + inc LZ4_SRC ; increment pointer + bne done_getsrc ; update 16-bit pointer + inc LZ4_SRC+1 ; on 8-bit overflow +done_getsrc: + rts + + ;============ + ; buildcount + ;============ +buildcount: + ldx #1 ; high count starts at 1 + stx COUNT+1 ; (loops at zero?) 
+ cmp #$0f ; if LITERAL_COUNT < 15, we are done + bne done_buildcount +buildcount_loop: + sta COUNT ; save LITERAL_COUNT (15) + jsr getsrc ; get the next byte + tax ; put in X + clc + adc COUNT ; add new byte to old value + bcc bc_8bit_oflow ; if overflow, increment high byte + inc COUNT+1 +bc_8bit_oflow: + inx ; check if read value was 255 + beq buildcount_loop ; if it was, keep looping and adding +done_buildcount: + rts + + ;============ + ; getput + ;============ + ; gets a byte, then puts the byte +getput: + jsr getsrc + ; fallthrough to putdst + + ;============= + ; putdst + ;============= + ; store A into destination +putdst: + sta (LZ4_DST), Y ; store A into destination + inc LZ4_DST ; increment 16-bit pointer + bne putdst_end ; if overflow, increment top byte + inc LZ4_DST+1 +putdst_end: + rts + + ;============================= + ; docopy + ;============================= + ; copies ram[count+1]-1:X bytes + ; from src to dst +docopy: + +docopy_loop: + jsr getput ; get/put byte + dex ; decrement count + bne docopy_loop ; if not zero, loop + dec COUNT+1 ; if zero, decrement high byte + bne docopy_loop ; if not zero, loop + + rts diff --git a/lzsa_test/lzsa2_fast_bench.s b/lzsa_test/lzsa2_fast_bench.s new file mode 100644 index 00000000..029eec98 --- /dev/null +++ b/lzsa_test/lzsa2_fast_bench.s @@ -0,0 +1,40 @@ + +.include "zp.inc" +.include "hardware.inc" + + lda #0 + sta DRAW_PAGE + + bit SET_GR + bit PAGE0 + + bit KEYRESET +pause_loop: + lda KEYPRESS + bpl pause_loop + + lda #<graphic_start ; low byte of compressed data address + sta LZSA_SRC_LO + lda #>graphic_start ; high byte of compressed data address + sta LZSA_SRC_HI + +before: + lda #$c + jsr decompress_lzsa2_fast +after: + + jsr gr_copy_to_current + + +blah: + jmp blah + + + .include "decompress_fast_v2.s" + .include "gr_copy.s" + .include "gr_offsets.s" + +graphic_start: + + .incbin "spaceship_far_n.gr.small_v2" +graphic_end: diff --git a/lzsa_test/lzsa2_small_bench.s b/lzsa_test/lzsa2_small_bench.s new file mode 100644 index 00000000..1064fe9f --- /dev/null +++ b/lzsa_test/lzsa2_small_bench.s @@ -0,0 +1,40 @@ + 
+.include "zp.inc" +.include "hardware.inc" + + lda #0 + sta DRAW_PAGE + + bit SET_GR + bit PAGE0 + + bit KEYRESET +pause_loop: + lda KEYPRESS + bpl pause_loop + + lda #<graphic_start ; low byte of compressed data address + sta LZSA_SRC_LO + lda #>graphic_start ; high byte of compressed data address + sta LZSA_SRC_HI + +before: + lda #$c + jsr decompress_lzsa2 +after: + + jsr gr_copy_to_current + + +blah: + jmp blah + + + .include "decompress_small_v2.s" + .include "gr_copy.s" + .include "gr_offsets.s" + +graphic_start: + + .incbin "spaceship_far_n.gr.small_v2" +graphic_end: diff --git a/lzsa_test/notes b/lzsa_test/notes index f9b2bb32..400dd37d 100644 --- a/lzsa_test/notes +++ b/lzsa_test/notes @@ -5,3 +5,10 @@ lz4 323 lzsa -r -f1 -- small_v1 -- 252 bytes lzsa -r -f2 -- small_v2 -- 228 bytes + + +speed: + rle: 8AE7 cycles 687 byte exe + lzsa_small_v2: E8EF cycles 782 byte exe + lzsa_fast_v2: DFBF cycles 793 byte exe + lz4: 1170A cycles 767 byte exe diff --git a/lzsa_test/rle_bench.s b/lzsa_test/rle_bench.s new file mode 100644 index 00000000..b4ed9290 --- /dev/null +++ b/lzsa_test/rle_bench.s @@ -0,0 +1,65 @@ + +.include "zp.inc" +.include "hardware.inc" + + lda #0 + sta DRAW_PAGE + + bit SET_GR + bit PAGE0 + + bit KEYRESET +pause_loop: + lda KEYPRESS + bpl pause_loop + + lda #<graphic_start ; low byte of RLE data address + sta GBASL + lda #>graphic_start ; high byte of RLE data address + sta GBASH + +before: + lda #$c + jsr load_rle_gr + +after: + + jsr gr_copy_to_current + + +blah: + jmp blah + + + .include "gr_unrle.s" + .include "gr_copy.s" + .include "gr_offsets.s" + +graphic_start: +spaceship_far_n_rle: .byte $28 ; ysize=48 + .byte $A0,$FF,$FF, $AF,$FF, $A7,$5F, $A3,$F5, $A0,$1B,$FF, $5F, $05,$05 + .byte $A4,$00, $0F,$0F, $A0,$1C,$FF, $50, $5F, $05, $A5,$00 + .byte $F0, $A0,$11,$FF, $0F, $9F,$9F, $AB,$99, $55, $0A + .byte $90,$90, $A3,$00, $F0, $AE,$FF, $0F, $9F, $D9 + .byte $00, $A6,$DD, $A4,$88, $A3,$DD, $88, $55, $D9 + .byte $00, $A3,$D9, $D0, $9F, $A8,$FF, $A5,$0F, $00,$00 + .byte $8D,$8D, $00, $A6,$8D, $A4,$88, $A4,$8D, $55, $8D + .byte $00, $A7,$8D, $AB,$FF, $F0, $00, $88,$88, $00 + .byte $AB,$88, $08,$08, $88, $55, $88, $00, $A6,$88 + .byte $F8, $AE,$FF, $F8, 
$F0, $A6,$08, $A5,$88, $99,$99 + .byte $88, $55, $08, $A7,$00, $0F,$0F, $A0,$13,$FF, $A7,$50 + .byte $59,$59, $58, $55, $5F, $A0,$01,$AF, $A3,$FF, $A9,$F0 + .byte $FF, $AC,$7F, $4F, $45,$45, $AA,$75, $55, $05 + .byte $0A, $AC,$7F, $AC,$77, $A3,$44, $00,$00, $44, $54 + .byte $A3,$55, $A3,$44, $A3,$00, $A0,$18,$77, $44, $FF, $44 + .byte $00,$00, $44, $A4,$55, $A3,$44, $00, $FF, $00 + .byte $A0,$18,$77, $A3,$44, $00,$00, $44, $A4,$55, $A3,$44, $A3,$00 + .byte $A0,$18,$77, $A3,$44, $00,$00, $A6,$55, $44,$44, $A3,$00, $A0,$18,$77 + .byte $A3,$44, $00,$00, $A6,$55, $54, $44, $A3,$00, $A4,$77 + .byte $07, $A3,$77, $A3,$07, $AD,$77, $A3,$44, $00, $A8,$55 + .byte $44, $A3,$00, $A3,$77, $07, $00, $88,$88, $07 + .byte $A3,$00, $07, $AC,$77, $A3,$44, $00, $A9,$55, $A3,$00 + .byte $A3,$77, $A3,$00, $A6,$88, $AC,$77, $A3,$44, $AA,$55, $A3,$00 + .byte $77,$77, $57, $A3,$00, $A6,$88 + .byte $A1 +; cycles=7669 diff --git a/lzsa_test/spaceship_far_n.png b/lzsa_test/spaceship_far_n.png new file mode 100644 index 00000000..3dd2df0c Binary files /dev/null and b/lzsa_test/spaceship_far_n.png differ