Merge pull request #5 from emmanuel-marty/master

Catch up with the changes in main
introspec 2020-01-02 13:51:29 +00:00 committed by GitHub
commit 3b37a0bb70
16 changed files with 2709 additions and 1050 deletions


@ -5,6 +5,8 @@ LZSA is a collection of byte-aligned compression formats that are specifically e
Check out [The Hollow](https://www.pouet.net/prod.php?which=81909) by Darklite and Offense, winner of the Solskogen 2019 wild compo, that uses LZSA on Z80.
[Gabba](https://www.pouet.net/prod.php?which=83539) by Stardust ranked 2nd in the ZX Spectrum demo compo at CAFe demoparty 2019 and also used LZSA on Z80.
The LZSA compression tool uses an aggressive optimal packing strategy to try to find the sequence of commands that gives the smallest packed file that decompresses to the original while maintaining the maximum possible decompression speed.
The compression formats give the user a choice of trade-offs, ranging from decompressing faster than LZ4 on 8-bit systems (with better compression) to compressing as well as ZX7 (with much faster decompression). In 8-bit scenarios, LZSA1 is designed to replace LZ4, and LZSA2 to replace ZX7.


@ -0,0 +1,305 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
;
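; Example forward-decompression call (a minimal sketch; COMPDATA and DSTBUF
; are hypothetical labels for the compressed block and the output buffer):
;
;    LDA #<COMPDATA
;    STA LZSA_SRC_LO
;    LDA #>COMPDATA
;    STA LZSA_SRC_HI
;    LDA #<DSTBUF
;    STA LZSA_DST_LO
;    LDA #>DSTBUF
;    STA LZSA_DST_HI
;    JSR DECOMPRESS_LZSA1_FAST
;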
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty, Peter Ferrie
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
DECOMPRESS_LZSA1_FAST
LDY #$00
DECODE_TOKEN
JSR GETSRC ; read token byte: O|LLL|MMMM
PHA ; preserve token on stack
AND #$70 ; isolate literals count
BEQ NO_LITERALS ; skip if no literals to copy
CMP #$70 ; LITERALS_RUN_LEN?
BNE PREPARE_COPY_LITERALS ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$F9 ; (LITERALS_RUN_LEN)
BCC PREPARE_COPY_LITERALS_DIRECT
BEQ LARGE_VARLEN_LITERALS ; if adding up to zero, go grab 16-bit count
JSR GETSRC ; get single extended byte of variable literals count
INY ; add 256 to literals count
BCS PREPARE_COPY_LITERALS_DIRECT ; (*like JMP PREPARE_COPY_LITERALS_DIRECT but shorter)
LARGE_VARLEN_LITERALS ; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
TXA
JMP PREPARE_COPY_LARGE_LITERALS
PREPARE_COPY_LITERALS
TAX
LDA SHIFT_TABLE-1,X ; shift literals length into place
; -1 because position 00 is reserved
PREPARE_COPY_LITERALS_DIRECT
TAX
PREPARE_COPY_LARGE_LITERALS
BEQ COPY_LITERALS
INY
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
BMI GET_LONG_OFFSET ; $80: 16 bit offset
JSR GETSRC ; get 8 bit offset from stream in A
TAX ; save for later
LDA #$FF ; high 8 bits
BNE GOT_OFFSET ; go prepare match
; (*like JMP GOT_OFFSET but shorter)
SHORT_VARLEN_MATCHLEN
JSR GETSRC ; get single extended byte of variable match len
INY ; add 256 to match length
PREPARE_COPY_MATCH
TAX
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
JSR PUTDST ; copy to destination
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
LDA COPY_MATCH_LOOP+1
BEQ GETMATCH_ADJ_HI
GETMATCH_DONE
DEC COPY_MATCH_LOOP+1
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1
BEQ GETMATCH_ADJ_HI
GETMATCH_DONE
}
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
BEQ DECODE_TOKEN ; (*like JMP DECODE_TOKEN but shorter)
!ifdef BACKWARD_DECOMPRESS {
GETMATCH_ADJ_HI
DEC COPY_MATCH_LOOP+2
JMP GETMATCH_DONE
} else {
GETMATCH_ADJ_HI
INC COPY_MATCH_LOOP+2
JMP GETMATCH_DONE
}
GET_LONG_OFFSET ; handle 16 bit offset:
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
GOT_OFFSET
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - subtract match offset
STA OFFSHI ; store high 8 bits of offset
STX OFFSLO
SEC ; subtract match offset: dest - match offset
LDA PUTDST+1
OFFSLO = *+1
SBC #$AA ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
LDA PUTDST+2
OFFSHI = *+1
SBC #$AA ; high 8 bits
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
SEC
} else {
; Forward decompression - add match offset
STA OFFSHI ; store high 8 bits of offset
TXA
CLC ; add dest + match offset
ADC PUTDST+1 ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
}
PLA ; retrieve token from stack again
AND #$0F ; isolate match len (MMMM)
ADC #$02 ; plus carry which is always set by the high ADC
CMP #$12 ; MATCH_RUN_LEN?
BCC PREPARE_COPY_MATCH ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
BCC PREPARE_COPY_MATCH
BNE SHORT_VARLEN_MATCHLEN
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
; large match length with zero high byte?
BNE PREPARE_COPY_MATCH_Y ; if not, continue
DECOMPRESSION_DONE
RTS
SHIFT_TABLE
!BYTE $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00
!BYTE $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01
!BYTE $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02
!BYTE $03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03
!BYTE $04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04
!BYTE $05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05
!BYTE $06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06
!BYTE $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
LDA PUTDST+1
BEQ PUTDST_ADJ_HI
DEC PUTDST+1
RTS
PUTDST_ADJ_HI
DEC PUTDST+2
DEC PUTDST+1
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
PHA
LDA GETSRC+1
BEQ GETSRC_ADJ_HI
DEC GETSRC+1
PLA
RTS
GETSRC_ADJ_HI
DEC GETSRC+2
DEC GETSRC+1
PLA
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
INC PUTDST+1
BEQ PUTDST_ADJ_HI
RTS
PUTDST_ADJ_HI
INC PUTDST+2
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
INC GETSRC+1
BEQ GETSRC_ADJ_HI
RTS
GETSRC_ADJ_HI
INC GETSRC+2
RTS
}


@ -0,0 +1,363 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA2 block.
; Create one with lzsa -r -f2 <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
;
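; Calling is the same as for the LZSA1 fast decompressor (a sketch): store
; the compressed block address into LZSA_SRC_LO/LZSA_SRC_HI, store the
; destination buffer address into LZSA_DST_LO/LZSA_DST_HI, then
; JSR DECOMPRESS_LZSA2_FAST.
;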
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty, Peter Ferrie
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
NIBCOUNT = $FC ; zero-page location for the nibble-buffer flag
DECOMPRESS_LZSA2_FAST
LDY #$00
STY NIBCOUNT
DECODE_TOKEN
JSR GETSRC ; read token byte: XYZ|LL|MMM
PHA ; preserve token on stack
AND #$18 ; isolate literals count (LL)
BEQ NO_LITERALS ; skip if no literals to copy
CMP #$18 ; LITERALS_RUN_LEN_V2?
BCC PREPARE_COPY_LITERALS ; if less, count is directly embedded in token
JSR GETNIBBLE ; get extra literals length nibble
; add nibble to len from token
ADC #$02 ; (LITERALS_RUN_LEN_V2) minus carry
CMP #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
BCC PREPARE_COPY_LITERALS_DIRECT ; if less, literals count is complete
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; overflow?
JMP PREPARE_COPY_LITERALS_DIRECT
PREPARE_COPY_LITERALS_LARGE
; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
BCS PREPARE_COPY_LITERALS_HIGH ; (*same as JMP PREPARE_COPY_LITERALS_HIGH but shorter)
PREPARE_COPY_LITERALS
LSR ; shift literals count into place
LSR
LSR
PREPARE_COPY_LITERALS_DIRECT
TAX
BCS PREPARE_COPY_LITERALS_LARGE ; if so, literals count is large
PREPARE_COPY_LITERALS_HIGH
TXA
BEQ COPY_LITERALS
INY
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
ASL
BCS REPMATCH_OR_LARGE_OFFSET ; 1YZ: rep-match or 13/16 bit offset
ASL ; 0YZ: 5 or 9 bit offset
BCS OFFSET_9_BIT
; 00Z: 5 bit offset
LDX #$FF ; set offset bits 15-8 to 1
JSR GETCOMBINEDBITS ; rotate Z bit into bit 0, read nibble for bits 4-1
ORA #$E0 ; set bits 7-5 to 1
BNE GOT_OFFSET_LO ; go store low byte of match offset and prepare match
OFFSET_9_BIT ; 01Z: 9 bit offset
;;ASL ; shift Z (offset bit 8) in place
ROL
ROL
AND #$01
EOR #$FF ; set offset bits 15-9 to 1
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_LARGE_OFFSET
ASL ; 13 bit offset?
BCS REPMATCH_OR_16_BIT ; handle rep-match or 16-bit offset if not
; 10Z: 13 bit offset
JSR GETCOMBINEDBITS ; rotate Z bit into bit 8, read nibble for bits 12-9
ADC #$DE ; set bits 15-13 to 1 and subtract 2 (to subtract 512)
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_16_BIT ; rep-match or 16 bit offset
;;ASL ; XYZ=111?
BMI REP_MATCH ; reuse previous offset if so (rep-match)
; 110: handle 16 bit offset
JSR GETSRC ; grab high 8 bits
GOT_OFFSET_HI
TAX
JSR GETSRC ; grab low 8 bits
GOT_OFFSET_LO
STA OFFSLO ; store low byte of match offset
STX OFFSHI ; store high byte of match offset
REP_MATCH
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - subtract match offset
SEC ; subtract match offset from dest
LDA PUTDST+1 ; low 8 bits
OFFSLO = *+1
SBC #$AA
STA COPY_MATCH_LOOP+1 ; store back reference address
LDA PUTDST+2
OFFSHI = *+1
SBC #$AA ; high 8 bits
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
SEC
} else {
; Forward decompression - add match offset
CLC ; add dest + match offset
LDA PUTDST+1 ; low 8 bits
OFFSLO = *+1
ADC #$AA
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
}
PLA ; retrieve token from stack again
AND #$07 ; isolate match len (MMM)
ADC #$01 ; add MIN_MATCH_SIZE_V2 and carry
CMP #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
BCC PREPARE_COPY_MATCH ; if less, length is directly embedded in token
JSR GETNIBBLE ; get extra match length nibble
; add nibble to len from token
ADC #$08 ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
CMP #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
BCC PREPARE_COPY_MATCH ; if less, match length is complete
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$E8 ; overflow?
PREPARE_COPY_MATCH
TAX
BCC PREPARE_COPY_MATCH_Y ; if not, the match length is complete
BEQ DECOMPRESSION_DONE ; if EOD code, bail
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
JSR PUTDST ; copy to destination
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
LDA COPY_MATCH_LOOP+1
BEQ GETMATCH_ADJ_HI
GETMATCH_DONE
DEC COPY_MATCH_LOOP+1
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1
BEQ GETMATCH_ADJ_HI
GETMATCH_DONE
}
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
JMP DECODE_TOKEN
!ifdef BACKWARD_DECOMPRESS {
GETMATCH_ADJ_HI
DEC COPY_MATCH_LOOP+2
JMP GETMATCH_DONE
} else {
GETMATCH_ADJ_HI
INC COPY_MATCH_LOOP+2
JMP GETMATCH_DONE
}
GETCOMBINEDBITS
EOR #$80
ASL
PHP
JSR GETNIBBLE ; get nibble into bits 0-3 (for offset bits 1-4)
PLP ; merge Z bit as the carry bit (for offset bit 0)
COMBINEDBITZ
ROL ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
DECOMPRESSION_DONE
RTS
GETNIBBLE
NIBBLES = *+1
LDA #$AA ; (self-modified) byte holding two buffered nibbles
LSR NIBCOUNT ; carry set if a nibble is still buffered
BCC NEED_NIBBLES
AND #$0F ; isolate low 4 bits of nibble
RTS
NEED_NIBBLES
INC NIBCOUNT ; flag that the low nibble stays buffered
JSR GETSRC ; get 2 nibbles
STA NIBBLES ; buffer both; low nibble is served on the next call
LSR ; serve the high nibble now, shifted down to bits 0-3
LSR
LSR
LSR
SEC ; return with the carry set, as in the buffered path
RTS
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
LDA PUTDST+1
BEQ PUTDST_ADJ_HI
DEC PUTDST+1
RTS
PUTDST_ADJ_HI
DEC PUTDST+2
DEC PUTDST+1
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
PHA
LDA GETSRC+1
BEQ GETSRC_ADJ_HI
DEC GETSRC+1
PLA
RTS
GETSRC_ADJ_HI
DEC GETSRC+2
DEC GETSRC+1
PLA
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
INC PUTDST+1
BEQ PUTDST_ADJ_HI
RTS
PUTDST_ADJ_HI
INC PUTDST+2
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
INC GETSRC+1
BEQ GETSRC_ADJ_HI
RTS
GETSRC_ADJ_HI
INC GETSRC+2
RTS
}


@ -0,0 +1,470 @@
; ***************************************************************************
; ***************************************************************************
;
; lzsa2_6502.s
;
; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
;
; Optional code is presented for two minor 6502 optimizations that break
; compatibility with the current LZSA2 format standard.
;
; This code is written for the ACME assembler.
;
; Copyright John Brandwood 2019.
;
; Distributed under the Boost Software License, Version 1.0.
; (See accompanying file LICENSE_1_0.txt or copy at
; http://www.boost.org/LICENSE_1_0.txt)
;
; ***************************************************************************
; ***************************************************************************
; ***************************************************************************
; ***************************************************************************
;
; Decompression Options & Macros
;
;
; Save 7 bytes of code, and 21 cycles every time that a
; 16-bit length is decoded?
;
; N.B. Setting this breaks compatibility with LZSA v1.2
;
LZSA_SWAP_LEN16 = 0
;
; Save 3 bytes of code, and 4 or 8 cycles when decoding
; an offset?
;
; N.B. Setting this breaks compatibility with LZSA v1.2
;
LZSA_SWAP_XZY = 0
;
; Remove code inlining to save space?
;
; This saves 15 bytes of code, but decompression is 7% slower.
;
LZSA_BEST_SIZE = 0
;
; Assume that we're decompressing from a large multi-bank
; compressed data file, and that the next bank may need to be
; paged in when a page-boundary is crossed.
;
LZSA_FROM_BANK = 0
;
; Macro to increment the source pointer to the next page.
;
!if LZSA_FROM_BANK {
!macro LZSA_INC_PAGE {
jsr .next_page
}
} else {
!macro LZSA_INC_PAGE {
inc <lzsa_srcptr + 1
}
}
;
; Macro to read a byte from the compressed source data.
;
!if LZSA_BEST_SIZE {
!macro LZSA_GET_SRC {
jsr .get_byte
}
} else {
!macro LZSA_GET_SRC {
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
bne .skip
+LZSA_INC_PAGE
.skip:
}
}
;
; Macro to speed up reading 50% of the nibbles.
;
LZSA_SLOW_NIBL = 1
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
!macro LZSA_GET_NIBL {
jsr lzsa2_get_nibble ; Always call a function.
}
} else {
!macro LZSA_GET_NIBL {
lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble.
bcs .skip\@
jsr .new_nibble ; Extract the hi-nibble.
.skip\@: ora #$F0
}
}
; ***************************************************************************
; ***************************************************************************
;
; Data usage is last 11 bytes of zero-page.
;
lzsa_cmdbuf = $F5 ; 1 byte.
lzsa_nibflg = $F6 ; 1 byte.
lzsa_nibble = $F7 ; 1 byte.
lzsa_offset = $F8 ; 1 word.
lzsa_winptr = $FA ; 1 word.
lzsa_srcptr = $FC ; 1 word.
lzsa_dstptr = $FE ; 1 word.
LZSA_SRC_LO = $FC
LZSA_SRC_HI = $FD
LZSA_DST_LO = $FE
LZSA_DST_HI = $FF
; ***************************************************************************
; ***************************************************************************
;
; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2 format.
;
; Args: lzsa_srcptr = ptr to compressed data
; Args: lzsa_dstptr = ptr to output buffer
; Uses: lots!
;
; If compiled with LZSA_FROM_BANK, then lzsa_srcptr should be within the bank
; window range.
;
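; A minimal call sketch (SRCDATA and DSTBUF are hypothetical labels for the
; compressed data and the output buffer):
;
;    lda #<SRCDATA
;    sta <lzsa_srcptr + 0
;    lda #>SRCDATA
;    sta <lzsa_srcptr + 1
;    lda #<DSTBUF
;    sta <lzsa_dstptr + 0
;    lda #>DSTBUF
;    sta <lzsa_dstptr + 1
;    jsr lzsa2_unpack
;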
DECOMPRESS_LZSA2_FAST:
lzsa2_unpack: ldy #0 ; Initialize source index.
sty <lzsa_nibflg ; Initialize nibble buffer.
;
; Copy bytes from compressed source data.
;
.cp_length: ldx #$00 ; Hi-byte of length or offset.
+LZSA_GET_SRC
sta <lzsa_cmdbuf ; Preserve this for later.
and #$18 ; Extract literal length.
beq .lz_offset ; Skip directly to match?
lsr ; Get 2-bit literal length.
lsr
lsr
cmp #$03 ; Extended length?
bne .got_cp_len
jsr .get_length ; X=0 table index for literals.
.got_cp_len: tay ; Check the lo-byte of length.
beq .cp_page
inx ; Increment # of pages to copy.
.get_cp_src: clc ; Calc source for partial
adc <lzsa_srcptr + 0 ; page.
sta <lzsa_srcptr + 0
bcs .get_cp_dst
dec <lzsa_srcptr + 1
.get_cp_dst: tya
clc ; Calc destination for partial
adc <lzsa_dstptr + 0 ; page.
sta <lzsa_dstptr + 0
bcs .get_cp_idx
dec <lzsa_dstptr + 1
.get_cp_idx: tya ; Negate the lo-byte of length.
eor #$FF
tay
iny
.cp_page: lda (lzsa_srcptr),y
sta (lzsa_dstptr),y
iny
bne .cp_page
inc <lzsa_srcptr + 1
inc <lzsa_dstptr + 1
dex ; Any full pages left to copy?
bne .cp_page
!if LZSA_SWAP_XZY {
;
; Shorter and faster path with NEW order of bits.
;
; STD NEW
; ================================
; xyz xzy
; 00z 0z0 5-bit offset
; 01z 0z1 9-bit offset
; 10z 1z0 13-bit offset
; 110 101 16-bit offset
; 111 111 repeat offset
; NVZ for a BIT instruction
;
; N.B. Saves 3 bytes in code length.
; get5 and get13 are 8 cycles faster.
; get9, get16, and rep are 4 cycles faster.
;
.lz_offset: lda #$20 ; Y bit in lzsa_cmdbuf.
bit <lzsa_cmdbuf
bmi .get_13_16_rep
bne .get_9_bits
.get_5_bits: dex ; X=$FF
.get_13_bits: +LZSA_GET_NIBL ; Always returns with CS.
bvc .get_5_skip
clc
.get_5_skip: rol a ; Shift into position, set C.
cpx #$00 ; X=$FF for a 5-bit offset.
bne .set_offset
sbc #2 ; Subtract 512 because 13-bit
tax ; offset starts at $FE00.
bne .get_low8 ; Always NZ from previous TAX.
.get_9_bits: dex ; X=$FF if VC, X=$FE if VS.
bvc .get_low8
dex
bvs .get_low8 ; Always VS from previous BIT.
.get_13_16_rep: beq .get_13_bits ; Shares code with 5-bit path.
.get_16_rep: bvs .lz_length ; Repeat previous offset.
} else {
;
; Slower and longer path with STD order of bits.
;
; STD NEW
; ================================
; xyz xzy
; 00z 0z0 5-bit offset
; 01z 0z1 9-bit offset
; 10z 1z0 13-bit offset
; 110 101 16-bit offset
; 111 111 repeat offset
; NVZ for a BIT instruction
;
.lz_offset: lda <lzsa_cmdbuf
asl
bcs .get_13_16_rep
asl
bcs .get_9_bits
.get_5_bits: dex ; X=$FF
.get_13_bits: asl
php
+LZSA_GET_NIBL ; Always returns with CS.
plp
rol ; Shift into position, set C.
eor #$01
cpx #$00 ; X=$FF for a 5-bit offset.
bne .set_offset
sbc #2 ; Subtract 512 because 13-bit
tax ; offset starts at $FE00.
bne .get_low8 ; Always NZ from previous TAX.
.get_9_bits: dex ; X=$FF if CC, X=$FE if CS.
asl
bcc .get_low8
dex
bcs .get_low8 ; Always CS from previous ASL.
.get_13_16_rep: asl
bcc .get_13_bits ; Shares code with 5-bit path.
.get_16_rep: bmi .lz_length ; Repeat previous offset.
}
;
; Copy bytes from decompressed window.
;
; N.B. X=0 is expected and guaranteed when we get here.
;
.get_16_bits: jsr .get_byte ; Get hi-byte of offset.
tax
.get_low8: +LZSA_GET_SRC ; Get lo-byte of offset.
.set_offset: stx <lzsa_offset + 1 ; Save new offset.
sta <lzsa_offset + 0
.lz_length: ldx #$00 ; Hi-byte of length.
lda <lzsa_cmdbuf
and #$07
clc
adc #$02
cmp #$09 ; Extended length?
bne .got_lz_len
inx
jsr .get_length ; X=1 table index for match.
.got_lz_len: eor #$FF ; Negate the lo-byte of length
tay ; and check for zero.
iny
beq .calc_lz_addr
eor #$FF
inx ; Increment # of pages to copy.
clc ; Calc destination for partial
adc <lzsa_dstptr + 0 ; page.
sta <lzsa_dstptr + 0
bcs .calc_lz_addr
dec <lzsa_dstptr + 1
.calc_lz_addr: clc ; Calc address of match.
lda <lzsa_dstptr + 0 ; N.B. Offset is negative!
adc <lzsa_offset + 0
sta <lzsa_winptr + 0
lda <lzsa_dstptr + 1
adc <lzsa_offset + 1
sta <lzsa_winptr + 1
.lz_page: lda (lzsa_winptr),y
sta (lzsa_dstptr),y
iny
bne .lz_page
inc <lzsa_winptr + 1
inc <lzsa_dstptr + 1
dex ; Any full pages left to copy?
bne .lz_page
jmp .cp_length ; Loop around to the beginning.
;
; Lookup tables to differentiate literal and match lengths.
;
.nibl_len_tbl: !byte 3 + $10 ; 0+3 (for literal).
!byte 9 + $10 ; 2+7 (for match).
.byte_len_tbl: !byte 18 - 1 ; 0+3+15 - CS (for literal).
!byte 24 - 1 ; 2+7+15 - CS (for match).
;
; Get 16-bit length in X:A register pair.
;
; N.B. Requires reversal of bytes in 16-bit length.
;
.get_length: +LZSA_GET_NIBL
cmp #$FF ; Extended length?
bcs .byte_length
adc .nibl_len_tbl,x ; Always CC from previous CMP.
.got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit
rts ; lengths.
.byte_length: jsr .get_byte ; So rare, this can be slow!
adc .byte_len_tbl,x ; Always CS from previous CMP.
bcc .got_length
beq .finished
!if LZSA_SWAP_LEN16 {
.word_length: jsr .get_byte ; So rare, this can be slow!
tax
} else {
.word_length: jsr .get_byte ; So rare, this can be slow!
pha
jsr .get_byte ; So rare, this can be slow!
tax
pla
rts
}
.get_byte: lda (lzsa_srcptr),y ; Subroutine version for when
inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
beq .next_page
rts
.next_page: inc <lzsa_srcptr + 1 ; Inc & test for bank overflow.
!if LZSA_FROM_BANK {
bmi .next_bank ; Change for target hardware!
}
rts
.finished: pla ; Decompression completed, pop
pla ; return address.
rts
;
; Get a nibble value from compressed data in A.
;
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
lzsa2_get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble.
bcs .got_nibble
inc <lzsa_nibflg ; Reset the flag.
+LZSA_GET_SRC
sta <lzsa_nibble ; Preserve for next time.
lsr ; Extract the hi-nibble.
lsr
lsr
lsr
!if LZSA_SWAP_XZY {
sec ; Offset code relies on CS.
}
.got_nibble: ora #$F0
rts
} else {
.new_nibble: inc <lzsa_nibflg ; Reset the flag.
+LZSA_GET_SRC
sta <lzsa_nibble ; Preserve for next time.
lsr a ; Extract the hi-nibble.
lsr a
lsr a
lsr a
!if LZSA_SWAP_XZY {
sec ; Offset code relies on CS.
}
rts
}


@ -1,270 +1,270 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
;
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
DECOMPRESS_LZSA1
LDY #$00
DECODE_TOKEN
JSR GETSRC ; read token byte: O|LLL|MMMM
PHA ; preserve token on stack
AND #$70 ; isolate literals count
BEQ NO_LITERALS ; skip if no literals to copy
LSR ; shift literals count into place
LSR
LSR
LSR
CMP #$07 ; LITERALS_RUN_LEN?
BCC PREPARE_COPY_LITERALS ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$F9 ; (LITERALS_RUN_LEN)
BCC PREPARE_COPY_LITERALS
BEQ LARGE_VARLEN_LITERALS ; if adding up to zero, go grab 16-bit count
JSR GETSRC ; get single extended byte of variable literals count
INY ; add 256 to literals count
BCS PREPARE_COPY_LITERALS ; (*like JMP PREPARE_COPY_LITERALS but shorter)
LARGE_VARLEN_LITERALS ; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
TXA
PREPARE_COPY_LITERALS
TAX
BEQ COPY_LITERALS
INY
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
BMI GET_LONG_OFFSET ; $80: 16 bit offset
JSR GETSRC ; get 8 bit offset from stream in A
TAX ; save for later
LDA #$FF ; high 8 bits
BNE GOT_OFFSET ; go prepare match
; (*like JMP GOT_OFFSET but shorter)
SHORT_VARLEN_MATCHLEN
JSR GETSRC ; get single extended byte of variable match len
INY ; add 256 to match length
PREPARE_COPY_MATCH
TAX
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
JSR PUTDST ; copy to destination
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
LDA COPY_MATCH_LOOP+1
BNE GETMATCH_DONE
DEC COPY_MATCH_LOOP+2
GETMATCH_DONE
DEC COPY_MATCH_LOOP+1
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1
BNE GETMATCH_DONE
INC COPY_MATCH_LOOP+2
GETMATCH_DONE
}
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
BEQ DECODE_TOKEN ; (*like JMP DECODE_TOKEN but shorter)
GET_LONG_OFFSET ; handle 16 bit offset:
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
GOT_OFFSET
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - subtract match offset
STA OFFSHI ; store high 8 bits of offset
STX OFFSLO
SEC ; subtract match offset: dest - match offset
LDA PUTDST+1
OFFSLO = *+1
SBC #$AA ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
LDA PUTDST+2
OFFSHI = *+1
SBC #$AA ; high 8 bits
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
SEC
} else {
; Forward decompression - add match offset
STA OFFSHI ; store high 8 bits of offset
TXA
CLC ; add dest + match offset
ADC PUTDST+1 ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
}
PLA ; retrieve token from stack again
AND #$0F ; isolate match len (MMMM)
ADC #$02 ; plus carry which is always set by the high ADC
CMP #$12 ; MATCH_RUN_LEN?
BCC PREPARE_COPY_MATCH ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
BCC PREPARE_COPY_MATCH
BNE SHORT_VARLEN_MATCHLEN
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
; large match length with zero high byte?
BNE PREPARE_COPY_MATCH_Y ; if not, continue
DECOMPRESSION_DONE
RTS
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
LDA PUTDST+1
BNE PUTDST_DONE
DEC PUTDST+2
PUTDST_DONE
DEC PUTDST+1
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
PHA
LDA GETSRC+1
BNE GETSRC_DONE
DEC GETSRC+2
GETSRC_DONE
DEC GETSRC+1
PLA
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
INC PUTDST+1
BNE PUTDST_DONE
INC PUTDST+2
PUTDST_DONE
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
INC GETSRC+1
BNE GETSRC_DONE
INC GETSRC+2
GETSRC_DONE
RTS
}


@ -1,336 +1,336 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA2 block.
; Create one with lzsa -r -f2 <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
;
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
NIBCOUNT = $FC ; zero-page location for the nibble-buffer flag
DECOMPRESS_LZSA2
LDY #$00
STY NIBCOUNT
DECODE_TOKEN
JSR GETSRC ; read token byte: XYZ|LL|MMM
PHA ; preserve token on stack
AND #$18 ; isolate literals count (LL)
BEQ NO_LITERALS ; skip if no literals to copy
LSR ; shift literals count into place
LSR
LSR
CMP #$03 ; LITERALS_RUN_LEN_V2?
BCC PREPARE_COPY_LITERALS ; if less, count is directly embedded in token
JSR GETNIBBLE ; get extra literals length nibble
; add nibble to len from token
ADC #$02 ; (LITERALS_RUN_LEN_V2) minus carry
CMP #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
BCC PREPARE_COPY_LITERALS ; if less, literals count is complete
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; overflow?
PREPARE_COPY_LITERALS
TAX
BCC PREPARE_COPY_LITERALS_HIGH ; if not, literals count is complete
; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
PREPARE_COPY_LITERALS_HIGH
TXA
BEQ COPY_LITERALS
INY
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
ASL
BCS REPMATCH_OR_LARGE_OFFSET ; 1YZ: rep-match or 13/16 bit offset
ASL ; 0YZ: 5 or 9 bit offset
BCS OFFSET_9_BIT
; 00Z: 5 bit offset
LDX #$FF ; set offset bits 15-8 to 1
JSR GETCOMBINEDBITS ; rotate Z bit into bit 0, read nibble for bits 4-1
ORA #$E0 ; set bits 7-5 to 1
BNE GOT_OFFSET_LO ; go store low byte of match offset and prepare match
OFFSET_9_BIT ; 01Z: 9 bit offset
;;ASL ; shift Z (offset bit 8) in place
ROL
ROL
AND #$01
EOR #$FF ; set offset bits 15-9 to 1
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_LARGE_OFFSET
ASL ; 13 bit offset?
BCS REPMATCH_OR_16_BIT ; handle rep-match or 16-bit offset if not
; 10Z: 13 bit offset
JSR GETCOMBINEDBITS ; rotate Z bit into bit 8, read nibble for bits 12-9
ADC #$DE ; set bits 15-13 to 1 and subtract 2 (to subtract 512)
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_16_BIT ; rep-match or 16 bit offset
;;ASL ; XYZ=111?
BMI REP_MATCH ; reuse previous offset if so (rep-match)
; 110: handle 16 bit offset
JSR GETSRC ; grab high 8 bits
GOT_OFFSET_HI
TAX
JSR GETSRC ; grab low 8 bits
GOT_OFFSET_LO
STA OFFSLO ; store low byte of match offset
STX OFFSHI ; store high byte of match offset
REP_MATCH
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - subtract match offset
SEC ; subtract match offset from dest
LDA PUTDST+1 ; low 8 bits
OFFSLO = *+1
SBC #$AA
STA COPY_MATCH_LOOP+1 ; store back reference address
LDA PUTDST+2
OFFSHI = *+1
SBC #$AA ; high 8 bits
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
SEC
} else {
; Forward decompression - add match offset
CLC ; add dest + match offset
LDA PUTDST+1 ; low 8 bits
OFFSLO = *+1
ADC #$AA
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
}
PLA ; retrieve token from stack again
AND #$07 ; isolate match len (MMM)
ADC #$01 ; add MIN_MATCH_SIZE_V2 and carry
CMP #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
BCC PREPARE_COPY_MATCH ; if less, length is directly embedded in token
JSR GETNIBBLE ; get extra match length nibble
; add nibble to len from token
ADC #$08 ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
CMP #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
BCC PREPARE_COPY_MATCH ; if less, match length is complete
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$E8 ; overflow?
PREPARE_COPY_MATCH
TAX
BCC PREPARE_COPY_MATCH_Y ; if not, the match length is complete
BEQ DECOMPRESSION_DONE ; if EOD code, bail
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
JSR PUTDST ; copy to destination
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
LDA COPY_MATCH_LOOP+1
BNE GETMATCH_DONE
DEC COPY_MATCH_LOOP+2
GETMATCH_DONE
DEC COPY_MATCH_LOOP+1
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1
BNE GETMATCH_DONE
INC COPY_MATCH_LOOP+2
GETMATCH_DONE
}
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
JMP DECODE_TOKEN
GETCOMBINEDBITS
EOR #$80
ASL
PHP
JSR GETNIBBLE ; get nibble into bits 0-3 (for offset bits 1-4)
PLP ; merge Z bit as the carry bit (for offset bit 0)
COMBINEDBITZ
ROL ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
DECOMPRESSION_DONE
RTS
GETNIBBLE
NIBBLES = *+1
LDA #$AA
LSR NIBCOUNT
BCS HAS_NIBBLES
INC NIBCOUNT
JSR GETSRC ; get 2 nibbles
STA NIBBLES
LSR
LSR
LSR
LSR
SEC
HAS_NIBBLES
AND #$0F ; isolate low 4 bits of nibble
RTS
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
LDA PUTDST+1
BNE PUTDST_DONE
DEC PUTDST+2
PUTDST_DONE
DEC PUTDST+1
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
PHA
LDA GETSRC+1
BNE GETSRC_DONE
DEC GETSRC+2
GETSRC_DONE
DEC GETSRC+1
PLA
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
INC PUTDST+1
BNE PUTDST_DONE
INC PUTDST+2
PUTDST_DONE
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
INC GETSRC+1
BNE GETSRC_DONE
INC GETSRC+2
GETSRC_DONE
RTS
}


@ -1,32 +1,125 @@
; lzsa1fta.asm time-efficient decompressor implementation for 808x CPUs.
; Turbo Assembler IDEAL mode dialect.
; (Is supposed to also assemble with NASM's IDEAL mode support, but YMMV.)
;
; This code assembles to about 3K of lookup tables and unrolled code,
; but the tradeoff for that size is the absolute fastest decompressor
; of LZSA1 block data for 808x CPUs.
; If you need moderately fast code with less size, see LZSA1FTA.ASM.
; If you need the smallest decompression code, see decompress_small_v1.S.
;
; Usual DOS assembler SMALL model assumptions apply. This code:
; - Assumes it was invoked via NEAR call (change RET to RETF for FAR calls)
; - Is interrupt-safe
; - Is not re-entrant (do not decompress while already running decompression)
; - Trashes all data and segment registers
;
; Copyright (C) 2019 Jim Leonard, Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; ===========================================================================
;
; The key area to concentrate on when optimizing LZSA1 decompression speed is
; reducing time spent handling the shortest matches. This is for two reasons:
; 1. shorter matches are more common
; 2. short matches are least efficient in terms of decomp speed per byte
; You can confirm #1 using the --stats mode of the compressor.
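; (Illustrative invocation only; exact flags may differ by tool version:
; something like "lzsa --stats <original_file> <compressed_file>" prints the
; command statistics that show the distribution of match lengths.)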
;
; Branches are costly on 8086. To ensure we branch as little as possible, a
; jumptable will be used to branch directly to as many direct decode paths as
; possible. This will burn up 512 bytes of RAM for a jumptable, and a few
; hundred bytes of duplicated program code (rather than JMP/CALL common code
; blocks, we inline them to avoid the branch overhead).
;
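As a rough C analogue of the dispatch strategy described above (a sketch only: the handler names and the function-pointer table are illustrative, while the assembly below uses a 512-byte table of 16-bit code offsets and a single `jmp [cs:jtbl+bx]`):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical handler type: one entry per possible token shape. */
typedef void (*decode_path_t)(uint8_t token);

static void short_literals(uint8_t token)    { printf("LLL=%u\n", (token >> 4) & 7u); }
static void extended_literals(uint8_t token) { (void)token; printf("LLL=7: extended literal length\n"); }

int main(void) {
    decode_path_t jump_table[256];
    /* Build the table once, up front: tokens with LLL=7 need the extended path. */
    for (int t = 0; t < 256; t++)
        jump_table[t] = (((t >> 4) & 7) == 7) ? extended_literals : short_literals;

    uint8_t tokens[] = { 0x23, 0x7F, 0xE1 };
    for (size_t i = 0; i < sizeof tokens; i++)
        jump_table[tokens[i]](tokens[i]);  /* one indirect jump, no bit testing */
    return 0;
}

Building the table once moves all of the per-token bit testing out of the hot loop; the only per-token cost left is one indirect jump.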
; ===========================================================================
;
; === LZSA1 block reference:
;
; Blocks encoded as LZSA1 are composed from consecutive commands.
; Each command follows this format:
;
; token: <O|LLL|MMMM>
; optional extra literal length
; literal values
; match offset low
; optional match offset high
; optional extra encoded match length
;
;
; === LZSA1 Token Reference:
;
; 7 6 5 4 3 2 1 0
; O L L L M M M M
;
; L: 3-bit literals length (0-6, or 7 if extended). If the number of literals for
; this command is 0 to 6, the length is encoded in the token and no extra bytes
; are required. Otherwise, a value of 7 is encoded and extra bytes follow as
; 'optional extra literal length'
;
; M: 4-bit encoded match length (0-14, or 15 if extended). Likewise, if the
; encoded match length for this command is 0 to 14, it is directly stored,
; otherwise 15 is stored and extra bytes follow as 'optional extra encoded match
; length'. Except for the last command in a block, a command always contains a
; match, so the encoded match length is the actual match length, offset by the
; minimum match size of 3 bytes. For instance, an actual match length of 10
; bytes is encoded as 7.
;
; O: set for a 2-byte match offset, clear for a 1-byte match offset
;
;
; === Decoding extended literal length:
;
; If the literals length is 7 or more, then an extra byte follows here, with
; three possible values:
;
; 0-248: the value is added to the 7 stored in the token.
; 250: a second byte follows. The final literals value is 256 + the second byte.
; 249: a little-endian 16-bit value follows, forming the final literals value.
;
;
; === Decoding match offsets:
;
; match offset low: The low 8 bits of the match offset follows.
;
; optional match offset high: If the 'O' bit (bit 7) is set in the token, the
; high 8 bits of the match offset follow, otherwise they are understood to be all
; set to 1. For instance, a short offset of 0x70 is interpreted as 0xff70.
;
;
; === Decoding extra encoded match length:
;
; optional extra encoded match length: If the encoded match length is 15 or more,
; the 'M' bits in the token form the value 15, and an extra byte follows here,
; with three possible types of value.
;
; 0-237: the value is added to the 15 stored in the token. The final value is 3 + 15 + this byte.
; 239: a second byte follows. The final match length is 256 + the second byte.
; 238: a second and third byte follow, forming a little-endian 16-bit value.
; The final encoded match length is that 16-bit value.
;
; ===========================================================================
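To make the reference above concrete, here is a hedged C sketch of decoding a single LZSA1 command (the function name is illustrative; bounds checks and the no-match final command are deliberately omitted, and the shipped decompressors remain the authoritative implementations):

#include <stdint.h>
#include <string.h>

/* Decode one LZSA1 command; returns 0 when the EOD marker is hit. */
static int lzsa1_decode_command(const uint8_t **src, uint8_t **dst) {
    const uint8_t *s = *src;
    uint8_t *d = *dst;

    uint8_t token = *s++;                 /* O|LLL|MMMM */

    unsigned lit = (token >> 4) & 7;      /* LLL: 0-6 direct, 7 = extended */
    if (lit == 7) {
        unsigned extra = *s++;
        if (extra == 249)      { lit = s[0] | (s[1] << 8); s += 2; }  /* 16-bit */
        else if (extra == 250) { lit = 256 + *s++; }
        else                   { lit = 7 + extra; }                   /* 0-248 */
    }
    memcpy(d, s, lit); d += lit; s += lit;

    unsigned offset = *s++;               /* match offset, low byte */
    if (token & 0x80) offset |= ((unsigned)*s++) << 8;  /* O set: high byte follows */
    else              offset |= 0xFF00;   /* O clear: high byte implicitly FFh */
    unsigned dist = 0x10000 - offset;     /* e.g. short offset 0x70 -> 0xff70 -> 144 back */

    unsigned mlen = (token & 0x0F) + 3;   /* MMMM + minimum match size */
    if ((token & 0x0F) == 15) {
        unsigned extra = *s++;
        if (extra == 238) {               /* 16-bit match length follows */
            mlen = s[0] | (s[1] << 8); s += 2;
            if (mlen == 0) { *src = s; *dst = d; return 0; }  /* EOD marker */
        }
        else if (extra == 239) { mlen = 256 + *s++; }
        else                   { mlen = 3 + 15 + extra; }     /* 0-237 */
    }
    for (unsigned i = 0; i < mlen; i++, d++)
        *d = *(d - dist);                 /* byte-by-byte copy is overlap-safe */

    *src = s; *dst = d;
    return 1;
}

Driven in a loop until it returns 0, this walks tokens exactly as the jump-table code below does, just without the dispatch trick.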
IDEAL
P8086
IDEAL ; Use Turbo Assembler IDEAL syntax checking
P8086 ; Restrict code generation to the 808x and later
JUMPS ; Perform fixups for out-of-bound conditional jumps
; This is required for the (L=07 & M=0Fh) decode paths as they
; have the most code, but these are uncommon paths so the
; tiny speed loss in just these paths is not a concern.
SEGMENT CODE para public
@@ -34,203 +127,385 @@ ASSUME cs:CODE, ds:CODE
PUBLIC lzsa1_decompress_speed_jumptable
; ---------------------------------------------------------------------------
; Decompress raw LZSA1 block
; inputs:
; * ds:si: raw LZSA1 block
; * es:di: output buffer
; output:
; * ax: decompressed size
; ---------------------------------------------------------------------------
; EQU helper statements (so we can construct a jump table without going crazy)
;Jump table for handling LLL bits in initial LZSA1 tokens.
;Previous code would SHR val,4 to get a count from 0 to 7, then rep movsb.
;We can overload the shift operation into a jump table that jumps directly
;to an optimized copying routine for 0-7 bytes. It must be declared in the
;code segment.
;Note: If this looks strange for declaring a jump table, that's because it
;is a workaround for the Turbo Pascal harness that tests it. Turbo Pascal
;treats OFFSET (label) as a relocatable item and throws an error, so we fool
;it by building the table with absolute EQU/literals instead.
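;In other words (illustrative): a plain "copytable DW OFFSET copy1b,..." makes
;the harness complain about a relocatable operand, while the equivalent
;"L1b EQU OFFSET copy1b" followed by "copytable DW L1b,..." assembles cleanly.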
L0b EQU OFFSET check_offset_size
L1b EQU OFFSET copy1b
L2b EQU OFFSET copy2b
L3b EQU OFFSET copy3b
L4b EQU OFFSET copy4b
L5b EQU OFFSET copy5b
L6b EQU OFFSET copy6b
L7b EQU OFFSET need_length_byte
copytable DW L0b,L0b,L0b,L0b,L0b,L0b,L0b,L0b
DW L1b,L1b,L1b,L1b,L1b,L1b,L1b,L1b
DW L2b,L2b,L2b,L2b,L2b,L2b,L2b,L2b
DW L3b,L3b,L3b,L3b,L3b,L3b,L3b,L3b
DW L4b,L4b,L4b,L4b,L4b,L4b,L4b,L4b
DW L5b,L5b,L5b,L5b,L5b,L5b,L5b,L5b
DW L6b,L6b,L6b,L6b,L6b,L6b,L6b,L6b
DW L7b,L7b,L7b,L7b,L7b,L7b,L7b,L7b
minmatch EQU 3
litrunlen EQU 7
leml1 EQU OFFSET lit_ext_mat_len_1b
leme1 EQU OFFSET lit_ext_mat_ext_1b
leml2 EQU OFFSET lit_ext_mat_len_2b
leme2 EQU OFFSET lit_ext_mat_ext_2b
;short-circuit special cases for 0 through 6 literal copies:
l6ml1 EQU OFFSET lit_len_mat_len_1b
l6me1 EQU OFFSET lit_len_mat_ext_1b
l6ml2 EQU OFFSET lit_len_mat_len_2b
l6me2 EQU OFFSET lit_len_mat_ext_2b
l5ml1 EQU OFFSET lit_len_mat_len_1b + 1
l5me1 EQU OFFSET lit_len_mat_ext_1b + 1
l5ml2 EQU OFFSET lit_len_mat_len_2b + 1
l5me2 EQU OFFSET lit_len_mat_ext_2b + 1
l4ml1 EQU OFFSET lit_len_mat_len_1b + 2
l4me1 EQU OFFSET lit_len_mat_ext_1b + 2
l4ml2 EQU OFFSET lit_len_mat_len_2b + 2
l4me2 EQU OFFSET lit_len_mat_ext_2b + 2
l3ml1 EQU OFFSET lit_len_mat_len_1b + 3
l3me1 EQU OFFSET lit_len_mat_ext_1b + 3
l3ml2 EQU OFFSET lit_len_mat_len_2b + 3
l3me2 EQU OFFSET lit_len_mat_ext_2b + 3
l2ml1 EQU OFFSET lit_len_mat_len_1b + 4
l2me1 EQU OFFSET lit_len_mat_ext_1b + 4
l2ml2 EQU OFFSET lit_len_mat_len_2b + 4
l2me2 EQU OFFSET lit_len_mat_ext_2b + 4
l1ml1 EQU OFFSET lit_len_mat_len_1b + 5
l1me1 EQU OFFSET lit_len_mat_ext_1b + 5
l1ml2 EQU OFFSET lit_len_mat_len_2b + 5
l1me2 EQU OFFSET lit_len_mat_ext_2b + 5
l0ml1 EQU OFFSET lit_len_mat_len_1b + 6 ; MMMM handling comes after LLL code
l0me1 EQU OFFSET lit_len_mat_ext_1b + 6 ; MMMM handling comes after LLL code
l0ml2 EQU OFFSET lit_len_mat_len_2b + 6 ; MMMM handling comes after LLL code
l0me2 EQU OFFSET lit_len_mat_ext_2b + 6 ; MMMM handling comes after LLL code
; === Hand-written (!) jumptable actually begins here.
; Locating it before the program code costs an extra JMP and 3 wasted bytes,
; but it makes the code easier to follow in this location.
; Relocate the jump table after the ENDP directive to save those 3 bytes.
;
; 7 6 5 4 3 2 1 0
; O L L L M M M M
;
; 0 1 2 3 4 5 6 7 8 9 a b c d e f
jtbl DW l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0me1 ;0
DW l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1me1 ;1
DW l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2me1 ;2
DW l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3me1 ;3
DW l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4me1 ;4
DW l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5me1 ;5
DW l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6me1 ;6
DW leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leme1 ;7
DW l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0me2 ;8
DW l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1me2 ;9
DW l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2me2 ;a
DW l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3me2 ;b
DW l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4me2 ;c
DW l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5me2 ;d
DW l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6me2 ;e
DW leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leme2 ;f
PROC lzsa1_decompress_speed_jumptable NEAR
; ---------------------------------------------------------------------------
; Decompress raw LZSA1 block
; inputs:
; * ds:si: raw LZSA1 block
; * es:di: output buffer
; output:
; * ax: decompressed size
; ---------------------------------------------------------------------------
MACRO get_byte_match_offset
mov ah,0ffh ;O=0, so set up offset's high byte
lodsb ;load low byte; ax=match offset
xchg bp,ax ;bp=match offset ax=00 + original token
ENDM
MACRO get_word_match_offset
lodsw ;ax=match offset
xchg bp,ax ;bp=match offset ax=00 + original token
ENDM
MACRO do_match_copy_long
LOCAL do_run, do_run_w
; Copies a long match as optimally as possible.
; requirements: cx=length, bp=negative offset, ds:si=compdata, es:di=output
; trashes: ax, bx
; must leave cx=0 at exit
mov bx,ds ;save ds
mov ax,es
mov ds,ax ;ds=es
xchg ax,si ;save si
lea si,[bp+di] ;si = output buffer + negative match offset
cmp bp,-2 ;do we have a byte/word run to optimize?
jae do_run ;perform a run if so, otherwise fall through
;You may be tempted to change "jae" to "jge" because BP holds a signed number.
;Don't! The total window is 64k, so if you treat this as a signed comparison,
;you will get incorrect results for offsets over 32K.
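;(Concrete example: a match offset of 40000 back is stored as -40000 = 063C0h,
;which reads as +25536 when signed, so "jge" would wrongly take the run path;
;"jae" compares 063C0h against 0FFFEh unsigned and correctly falls through.)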
;If we're here, we have a long copy and it isn't byte-overlapping (if it
;overlapped, we'd be in do_run). So, let's copy faster with REP MOVSW.
;This affects 8088 only slightly, but is a bigger win on 8086 and higher.
shr cx,1
rep movsw
adc cl,0
rep movsb
xchg si,ax ;restore si
mov ds,bx ;restore ds
jmp decode_token
do_run:
je do_run_w ;if applicable, handle word-sized value faster
xchg dx,ax ;save si into dx, as ax is getting trashed
lodsb ;load first byte of run into al
mov ah,al
shr cx,1
rep stosw ;perform word run
adc cl,0
rep stosb ;finish word run
mov si,dx ;restore si
mov ds,bx ;restore ds
jmp decode_token
do_run_w:
xchg dx,ax ;save si into dx, as ax is getting trashed
lodsw ;load first word of run
shr cx,1
rep stosw ;perform word run
adc cl,0 ;despite 2-byte offset, compressor might
rep stosb ;output odd length. better safe than sorry.
mov si,dx ;restore si
mov ds,bx ;restore ds
jmp decode_token
ENDM
MACRO do_match_copy
; Copies a shorter match with as little overhead as possible.
; requirements: cx=length, bp=negative offset, ds:si=compdata, es:di=output
; trashes: ax, bx
; must leave cx=0 at exit
mov bx,ds ;save ds
mov ax,es
mov ds,ax ;ds=es
xchg ax,si ;save si
lea si,[bp+di] ;si = output buffer + negative match offset
rep movsb
xchg si,ax ;restore si
mov ds,bx ;restore ds
jmp decode_token
ENDM
MACRO do_literal_copy
; Copies a literal sequence using words.
; Meant for longer lengths; for 128 bytes or less, use REP MOVSB.
; requirements: cx=length, ds:si=compdata, es:di=output
; must leave cx=0 at exit
shr cx,1
rep movsw
adc cl,0
rep movsb
ENDM
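For readers less fluent in 808x string instructions, here is a hedged C rendering of the SHR/REP MOVSW/ADC/REP MOVSB idiom this macro uses (the name is illustrative; the raw pointer casts mirror the 16-bit code and assume unaligned access is acceptable, as it is on the 808x):

#include <stdint.h>
#include <stddef.h>

/* Copy len bytes as len/2 words plus at most one trailing byte. */
static void copy_words_then_byte(uint8_t *dst, const uint8_t *src, size_t len) {
    uint16_t *d = (uint16_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    for (size_t n = len >> 1; n; n--)      /* shr cx,1 ; rep movsw */
        *d++ = *s++;
    if (len & 1)                           /* adc cl,0 ; rep movsb */
        *(uint8_t *)d = *(const uint8_t *)s;
}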
MACRO copy_small_match_len
and al,0FH ;isolate length in token (MMMM)
add al,minmatch ;ax=match length
xchg cx,ax ;cx=match length
do_match_copy ;copy match with cx=length, bp=offset
ENDM
MACRO copy_large_match_len
LOCAL val239, val238, EOD
; Handle MMMM=Fh
; Assumptions: ah=0 from get_????_match_offset's xchg
lodsb ;grab extra match length byte
add al,0Fh+minmatch ;add MATCH_RUN_LEN + MIN_MATCH_SIZE
jz val238 ;if zf & cf, 238: get 16-bit match length
jc val239 ;if cf, 239: get extra match length byte
xchg cx,ax ;otherwise, we have our match length
do_match_copy_long ;copy match with cx=length, bp=offset
val239:
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
do_match_copy_long ;copy match with cx=length, bp=offset
val238:
lodsw ;grab 16-bit length
xchg cx,ax
jcxz EOD ;is it the EOD marker? Exit if so
do_match_copy_long ;copy match with cx=length, bp=offset
EOD:
jmp done_decompressing
ENDM
lzsa1_start:
push di ;remember decompression offset
cld ;ensure string ops move forward
xor cx,cx
@@decode_token:
xchg cx,ax ;clear ah (cx = 0 from match copy's rep movsb)
decode_token:
xchg cx,ax ;clear ah (cx = 0 from match copy's REP)
lodsb ;read token byte: O|LLL|MMMM
mov dx,ax ;copy our token to dl for later MMMM handling
mov bp,ax ;preserve 0+token in bp for later MMMM handling
mov bx,ax ;prep for table lookup
shl bx,1 ;adjust for offset word size
jmp [cs:jtbl+bx] ;jump directly to relevant decode path
and al,070H ;isolate literals length in token (LLL)
jz check_offset_size ;if LLL=0, we have no literals; goto match
; There are eight basic decode paths for an LZSA1 token. Each of these
; paths perform only the necessary actions to decode the token and then
; fetch the next token. This results in a lot of code duplication, but
; it is the only way to get down to two branches per token (jump to unique
; decode path, then jump back to next token) for the most common cases.
; Jump to short copy routine for LLL=1 through 6, need_length_byte for LLL=7
mov bx,ax ;prep for table lookup (must copy, don't XCHG!)
jmp [cs:copytable+bx]
; Path #1: LLL=0-6, MMMM=0-Eh, O=0 (1-byte match offset)
; Handle LLL=0-6 by jumping directly into # of bytes to copy (6 down to 1)
lit_len_mat_len_1b:
movsb
movsb
movsb
movsb
movsb
movsb
get_byte_match_offset
copy_small_match_len
need_length_byte:
lodsb ;grab extra length byte
add al,07H ;add LITERALS_RUN_LEN
jnc @@got_literals_exact ;if no overflow, we have full count
je @@big_literals
@@mid_literals:
lodsb ;grab single extra length byte
inc ah ;add 256
xchg cx,ax ;with longer counts, we can save some time
shr cx,1 ;by doing a word copy instead of a byte copy.
rep movsw ;We don't need to account for overlap because
adc cx,0 ;source for literals isn't the output buffer.
rep movsb
jmp check_offset_size
; Path #2: LLL=0-6, MMMM=Fh, O=0 (1-byte match offset)
lit_len_mat_ext_1b:
movsb
movsb
movsb
movsb
movsb
movsb
get_byte_match_offset
copy_large_match_len
@@big_literals:
lodsw ;grab 16-bit extra length
xchg cx,ax ;with longer counts, we can save some time
shr cx,1 ;by doing a word copy instead of a byte copy.
rep movsw
adc cx,0
rep movsb
jmp check_offset_size
; Used for counts 7-248. In test data, average value around 1Ah. YMMV.
@@got_literals_exact:
; Path #3: LLL=7, MMMM=0-Eh, O=0 (1-byte match offset)
lit_ext_mat_len_1b:
; on entry: ax=0 + token, bp=ax
lodsb ;grab extra literal length byte
add al,litrunlen ;add 7h literal run length
jz @@val249_3 ;if zf & cf, 249: get 16-bit literal length
jc @@val250_3 ;if cf, 250: get extra literal length byte
xchg cx,ax ;otherwise, we have our literal length
do_literal_copy ;this might be better as rep movsw !!! benchmark
get_byte_match_offset
copy_small_match_len
@@val250_3:
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
rep movsb ;copy cx literals from ds:si to es:di
jmp check_offset_size
;Literal copy sequence for lengths 1-6:
copy6b: movsb
copy5b: movsb
copy4b: movsb
copy3b: movsb
copy2b: movsb
copy1b: movsb
;Literals done; fall through to match offset determination
check_offset_size:
test dl,dl ;check match offset size in token (O bit)
js @@get_long_offset ;load absolute 16-bit match offset
mov ah,0ffh ;set up high byte
lodsb ;load low byte
@@get_match_length:
xchg dx,ax ;dx: match offset ax: original token
and al,0FH ;isolate match length in token (MMMM)
cmp al,0FH ;MATCH_RUN_LEN?
jne @@got_matchlen_short ;no, we have the full match length from the token, go copy
lodsb ;grab extra length byte
add al,012H ;add MIN_MATCH_SIZE + MATCH_RUN_LEN
jnc @@do_long_copy ;if no overflow, we have the entire length
jne @@mid_matchlen
do_literal_copy
get_byte_match_offset
copy_small_match_len
@@val249_3:
lodsw ;grab 16-bit length
xchg cx,ax ;get ready to do a long copy
jcxz @@done_decompressing ;wait, is it the EOD marker? Exit if so
jmp @@copy_len_preset ;otherwise, do the copy
xchg cx,ax
do_literal_copy
get_byte_match_offset
copy_small_match_len
@@got_matchlen_short:
add al,3 ;add MIN_MATCH_SIZE
xchg cx,ax ;copy match length into cx
mov bp,ds ;save ds
mov ax,es
mov ds,ax ;ds=es
xchg ax,si ;save si
mov si,di ;ds:si now points at back reference in output data
add si,dx
rep movsb ;copy match
xchg si,ax ;restore si
mov ds,bp ;restore ds
jmp @@decode_token ;go decode another token
@@done_decompressing:
; Path #4: LLL=7, MMMM=Fh, O=0 (1-byte match offset)
lit_ext_mat_ext_1b:
; on entry: ax=0 + token, bp=ax
lodsb ;grab extra literal length byte
add al,litrunlen ;add 7h literal run length
jz @@val249_4 ;if zf & cf, 249: get 16-bit literal length
jc @@val250_4 ;if cf, 250: get extra literal length byte
xchg cx,ax ;otherwise, we have our literal length
do_literal_copy ;this might be better as rep movsw !!! benchmark
get_byte_match_offset
copy_large_match_len
@@val250_4:
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
do_literal_copy
get_byte_match_offset
copy_large_match_len
@@val249_4:
lodsw ;grab 16-bit length
xchg cx,ax
do_literal_copy
get_byte_match_offset
copy_large_match_len
; Path #5: LLL=0-6, MMMM=0-Eh, O=1 (2-byte match offset)
; Handle LLL=0-6 by jumping directly into # of bytes to copy (6 down to 1)
lit_len_mat_len_2b:
movsb
movsb
movsb
movsb
movsb
movsb
get_word_match_offset
copy_small_match_len
; Path #6: LLL=0-6, MMMM=Fh, O=1 (2-byte match offset)
lit_len_mat_ext_2b:
movsb
movsb
movsb
movsb
movsb
movsb
get_word_match_offset
copy_large_match_len
; Path #7: LLL=7, MMMM=0-Eh, O=1 (2-byte match offset)
lit_ext_mat_len_2b:
; on entry: ax=0 + token, bp=ax
lodsb ;grab extra literal length byte
add al,litrunlen ;add 7h literal run length
jz @@val249_7 ;if zf & cf, 249: get 16-bit literal length
jc @@val250_7 ;if cf, 250: get extra literal length byte
xchg cx,ax ;otherwise, we have our literal length
do_literal_copy ;this might be better as rep movsw !!! benchmark
get_word_match_offset
copy_small_match_len
@@val250_7:
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
do_literal_copy
get_word_match_offset
copy_small_match_len
@@val249_7:
lodsw ;grab 16-bit length
xchg cx,ax
do_literal_copy
get_word_match_offset
copy_small_match_len
; Path #8: LLL=7, MMMM=Fh, O=1 (2-byte match offset)
lit_ext_mat_ext_2b:
; on entry: ax=0 + token, bp=ax
lodsb ;grab extra literal length byte
add al,litrunlen ;add 7h literal run length
jz @@val249_8 ;if zf & cf, 249: get 16-bit literal length
jc @@val250_8 ;if cf, 250: get extra literal length byte
xchg cx,ax ;otherwise, we have our literal length
do_literal_copy ;this might be better as rep movsw !!! benchmark
get_word_match_offset
copy_large_match_len
@@val250_8:
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
do_literal_copy
get_word_match_offset
copy_large_match_len
@@val249_8:
lodsw ;grab 16-bit length
xchg cx,ax
do_literal_copy
get_word_match_offset
copy_large_match_len
done_decompressing:
;return # of decompressed bytes in ax
pop ax ;retrieve the original decompression offset
xchg di,ax ;compute decompressed size
sub ax,di
sub di,ax ;adjust for original offset
xchg di,ax ;return adjusted value in ax
ret ;done decompressing, exit to caller
;These are called less often; moved here to optimize the fall-through case
@@get_long_offset:
lodsw ;Get 2-byte match offset
jmp @@get_match_length
;With a confirmed longer match length, we have an opportunity to optimize for
;the case where a single byte is repeated long enough that we can benefit
;from rep movsw to perform the run (instead of rep movsb).
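;(Illustrative: with offset -1 and a 100-byte match, the byte just written is
;loaded once, duplicated into AH, and emitted as 50 STOSW stores plus at most
;one STOSB, instead of 100 dependent byte-by-byte copies.)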
@@mid_matchlen:
lodsb ;grab single extra length byte
inc ah ;add 256
@@do_long_copy:
xchg cx,ax ;copy match length into cx
@@copy_len_preset:
push ds ;save ds
mov bp,es
mov ds,bp ;ds=es
mov bp,si ;save si
mov si,di ;ds:si now points at back reference in output data
add si,dx
cmp dx,-2 ;do we have a byte/word run to optimize?
jae @@do_run ;perform a run
;You may be tempted to change "jae" to "jge" because DX is a signed number.
;Don't! The total window is 64k, so if you treat this as a signed comparison,
;you will get incorrect results for offsets over 32K.
;If we're here, we have a long copy and it isn't byte-overlapping (if it
;overlapped, we'd be in @@do_run_1). So, let's copy faster with REP MOVSW.
;This won't affect 8088 that much, but it speeds up 8086 and higher.
shr cx,1
rep movsw
adc cx,0
rep movsb
mov si,bp ;restore si
pop ds
jmp @@decode_token ;go decode another token
@@do_run:
je @@do_run_2 ;fall through to byte (common) if not word run
@@do_run_1:
lodsb ;load first byte of run into al
mov ah,al
shr cx,1
rep stosw ;perform word run
adc cx,0
rep stosb ;finish word run
mov si,bp ;restore si
pop ds
jmp @@decode_token ;go decode another token
@@do_run_2:
lodsw ;load first word of run
shr cx,1
rep stosw ;perform word run
adc cx,0 ;despite 2-byte offset, compressor might
rep stosb ;output odd length. better safe than sorry.
mov si,bp ;restore si
pop ds
jmp @@decode_token ;go decode another token
ENDP lzsa1_decompress_speed_jumptable
ENDS CODE
@@ -238,37 +513,11 @@ ENDS CODE
END
;Speed optimization history (decompression times in microseconds @ 4.77 MHz):
; original E. Marty code shuttle 123208 alice 65660 robotron 407338 ***
; table for shr al,4 shuttle 120964 alice 63230 robotron 394733 +++
; push/pop to mov/mov shuttle 118176 alice 61835 robotron 386762 +++
; movsw for literalcpys shuttle 124102 alice 64908 robotron 400220 --- rb
; stosw for byte runs shuttle 118897 alice 65040 robotron 403518 --- rb
; better stosw for runs shuttle 117712 alice 65040 robotron 403343 +--
; disable RLE by default shuttle 116924 alice 60783 robotron 381226 +++
; optimize got_matchlen shuttle 115294 alice 59588 robotron 374330 +++
; fall through to getML shuttle 113258 alice 59572 robotron 372004 +++
; fall through to midLI shuttle 113258 alice 59572 robotron 375060 ..- rb
; fall through midMaLen shuttle 113247 alice 59572 robotron 372004 +.+
; movsw for litlen > 255 shuttle 113247 alice 59572 robotron 371612 ..+
; rep stosw for long runs shuttle 113247 alice 59572 robotron 371612 ...
; rep movsw for long cpys shuttle 113247 alice 59572 robotron 371035 ..+
; xchg/dec ah -> mov ah,val shuttle 112575 alice 59272 robotron 369198 +++
; force >12h len.to longcpy shuttle 101998 alice 59266 robotron 364459 +.+
; more efficient run branch shuttle 102239 alice 59297 robotron 364716 --- rb
; even more eff. run branch shuttle 101998 alice 59266 robotron 364459 ***
; BUGFIX - bad sign compare shuttle 101955 alice 59225 robotron 364117 +++
; reverse 16-bit len compar shuttle 102000 alice 59263 robotron 364460 --- rb
; jcxz for EOD detection no change to speed, but is 1 byte shorter +++
; force movsw for literals shuttle 107183 alice 62555 robotron 379524 --- rb
; defer shr4 until necessry shuttle 102069 alice 60236 robotron 364096 ---
; skip literals if LLL=0 shuttle 98655 alice 57849 robotron 363358 ---
; fall through to mid_liter shuttle 98595 alice 57789 robotron 361998 +++
; == jumptable experiments begin ==
; jumptable for small copys shuttle 101594 alice 61078 robotron 386018 ---
; start:xchg instead of mov shuttle 100948 alice 60467 robotron 381112 +++
; use table for LLL=0 check shuttle 106972 alice 63333 robotron 388304 --- rb
; jmptbl to fallthrough mov shuttle 102532 alice 60760 robotron 383070 ---
; cpy fallthrough check_ofs shuttle 98939 alice 58917 robotron 371019 +**
; single jumptable jump shuttle 97528 alice 57264 robotron 362194 ++*
; conditional check for L=7 shuttle 98610 alice 58521 robotron 368153 --- rb
; defer add MIN_MATCH_SIZE shuttle 97207 alice 57200 robotron 362884 ++*
; jumptable rewrite, no RLE shuttle 97744 alice 46905 robotron 309032 -++
; adc cx,0 -> adc cl,0 shuttle 97744 alice 46893 robotron 309032 .+.!
; jumptable rewrite w/RLE shuttle 88776 alice 50433 robotron 319222 +--
; short match copies movsb shuttle 97298 alice 49769 robotron 326282 ---rb
; long match copy #1 16-bit shuttle 92490 alice 46905 robotron 308722 +*+
; long match copy #2 extraB shuttle 92464 alice 46905 robotron 308371 +.+
; long match copy #3 0f->ed shuttle 86765 alice 46864 robotron 303895 +++!

View File

@@ -146,7 +146,7 @@ int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockS
}
}
if ((pInBlock + 1) < pInBlockEnd) { /* The last token in the block does not include match information */
if (pInBlock < pInBlockEnd) { /* The last token in the block does not include match information */
unsigned char nOffsetMode = token & 0xc0;
unsigned int nValue;
@@ -185,6 +185,7 @@ int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockS
if ((token & 0x20) == 0) {
/* 16 bit offset */
nMatchOffset = (((unsigned int)(*pInBlock++)) << 8);
if (pInBlock >= pInBlockEnd) return -1;
nMatchOffset |= (unsigned int)(*pInBlock++);
nMatchOffset ^= 0xffff;
nMatchOffset++;

View File

@@ -48,7 +48,7 @@
#define OPT_RAW_BACKWARD 8
#define OPT_STATS 16
#define TOOL_VERSION "1.1.2"
#define TOOL_VERSION "1.2.0"
/*---------------------------------------------------------------------------*/
@@ -512,7 +512,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
size_t nDataSizeStep = 128;
float fProbabilitySizeStep = 0.0005f;
for (nGeneratedDataSize = 1024; nGeneratedDataSize <= ((nOptions & OPT_RAW) ? BLOCK_SIZE : (4 * BLOCK_SIZE)); nGeneratedDataSize += nDataSizeStep) {
for (nGeneratedDataSize = 1024; nGeneratedDataSize <= ((size_t)((nOptions & OPT_RAW) ? BLOCK_SIZE : (4 * BLOCK_SIZE))); nGeneratedDataSize += nDataSizeStep) {
float fMatchProbability;
fprintf(stdout, "size %zd", nGeneratedDataSize);
@@ -530,7 +530,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
/* Try to compress it, expected to succeed */
size_t nActualCompressedSize = lzsa_compress_inmem(pGeneratedData, pCompressedData, nGeneratedDataSize, lzsa_get_max_compressed_size_inmem(nGeneratedDataSize),
nFlags, nMinMatchSize, nFormatVersion);
if (nActualCompressedSize == -1 || nActualCompressedSize < (lzsa_get_header_size() + lzsa_get_frame_size() + lzsa_get_frame_size() /* footer */)) {
if (nActualCompressedSize == -1 || (int)nActualCompressedSize < (lzsa_get_header_size() + lzsa_get_frame_size() + lzsa_get_frame_size() /* footer */)) {
free(pTmpDecompressedData);
pTmpDecompressedData = NULL;
free(pTmpCompressedData);

View File

@@ -91,7 +91,7 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
int nMinMatchSize = pCompressor->min_match_size;
if (pCompressor->format_version >= 2) {
for (i = 1; i < nInWindowSize - 1; i++) {
for (i = 1; i < nInWindowSize; i++) {
int nIndex = (int)(intervals[i] & POS_MASK);
int nLen = PLCP[nIndex];
if (nLen < nMinMatchSize)
@@ -105,7 +105,7 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
}
}
else {
for (i = 1; i < nInWindowSize - 1; i++) {
for (i = 1; i < nInWindowSize; i++) {
int nIndex = (int)(intervals[i] & POS_MASK);
int nLen = PLCP[nIndex];
if (nLen < nMinMatchSize)
@@ -116,9 +116,6 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
}
}
if (i < nInWindowSize)
intervals[i] &= POS_MASK;
/**
* Build intervals for finding matches
*
@@ -195,16 +192,18 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
* @param nOffset offset to find matches at, in the input window
* @param pMatches pointer to returned matches
* @param nMaxMatches maximum number of matches to return (0 for none)
* @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
*
* @return number of matches
*/
int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches) {
int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches, const int nInWindowSize) {
unsigned int *intervals = pCompressor->intervals;
unsigned int *pos_data = pCompressor->pos_data;
unsigned int ref;
unsigned int super_ref;
unsigned int match_pos;
lzsa_match *matchptr;
int nPrevOffset = 0;
/**
* Find matches using intervals
@@ -238,7 +237,40 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
/* Ascend indirectly via pos_data[] links. */
match_pos = super_ref & EXCL_VISITED_MASK;
matchptr = pMatches;
if (pCompressor->format_version >= 2 && nInWindowSize < 65536) {
if ((matchptr - pMatches) < nMaxMatches) {
int nMatchOffset = (int)(nOffset - match_pos);
if (nMatchOffset <= MAX_OFFSET) {
matchptr->length = (unsigned short)(ref >> (LCP_SHIFT + TAG_BITS));
matchptr->offset = (unsigned short)nMatchOffset;
matchptr++;
nPrevOffset = nMatchOffset;
}
}
}
for (;;) {
if ((super_ref = pos_data[match_pos]) > ref) {
match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
if (pCompressor->format_version >= 2 && nInWindowSize < 65536) {
if ((matchptr - pMatches) < nMaxMatches) {
int nMatchOffset = (int)(nOffset - match_pos);
if (nMatchOffset <= MAX_OFFSET && nMatchOffset != nPrevOffset) {
matchptr->length = ((unsigned short)(ref >> (LCP_SHIFT + TAG_BITS))) | 0x8000;
matchptr->offset = (unsigned short)nMatchOffset;
matchptr++;
nPrevOffset = nMatchOffset;
}
}
}
}
while ((super_ref = pos_data[match_pos]) > ref)
match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
intervals[ref & POS_MASK] = nOffset | VISITED_FLAG;
@@ -247,7 +279,7 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
if ((matchptr - pMatches) < nMaxMatches) {
int nMatchOffset = (int)(nOffset - match_pos);
if (nMatchOffset <= MAX_OFFSET) {
if (nMatchOffset <= MAX_OFFSET && nMatchOffset != nPrevOffset) {
if (pCompressor->format_version >= 2) {
matchptr->length = (unsigned short)(ref >> (LCP_SHIFT + TAG_BITS));
}
@ -263,6 +295,23 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
break;
ref = super_ref;
match_pos = intervals[ref & POS_MASK] & EXCL_VISITED_MASK;
if (pCompressor->format_version >= 2 && nInWindowSize < 65536) {
if ((matchptr - pMatches) < nMaxMatches) {
int nMatchOffset = (int)(nOffset - match_pos);
if (nMatchOffset <= MAX_OFFSET && nMatchOffset != nPrevOffset) {
matchptr->length = ((unsigned short)(ref >> (LCP_SHIFT + TAG_BITS))) | 0x8000;
matchptr->offset = (unsigned short)nMatchOffset;
if ((matchptr->length & 0x7fff) > 2) {
matchptr++;
nPrevOffset = nMatchOffset;
}
}
}
}
}
return (int)(matchptr - pMatches);
@@ -282,7 +331,7 @@ void lzsa_skip_matches(lzsa_compressor *pCompressor, const int nStartOffset, con
/* Skipping still requires scanning for matches, as this also performs a lazy update of the intervals. However,
* we don't store the matches. */
for (i = nStartOffset; i < nEndOffset; i++) {
lzsa_find_matches_at(pCompressor, i, &match, 0);
lzsa_find_matches_at(pCompressor, i, &match, 0, 0);
}
}
@@ -295,11 +344,11 @@ void lzsa_skip_matches(lzsa_compressor *pCompressor, const int nStartOffset, con
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
*/
void lzsa_find_all_matches(lzsa_compressor *pCompressor, const int nMatchesPerOffset, const int nStartOffset, const int nEndOffset) {
lzsa_match *pMatch = pCompressor->match + (nStartOffset * nMatchesPerOffset);
lzsa_match *pMatch = pCompressor->match;
int i;
for (i = nStartOffset; i < nEndOffset; i++) {
int nMatches = lzsa_find_matches_at(pCompressor, i, pMatch, nMatchesPerOffset);
int nMatches = lzsa_find_matches_at(pCompressor, i, pMatch, nMatchesPerOffset, nEndOffset - nStartOffset);
while (nMatches < nMatchesPerOffset) {
pMatch[nMatches].length = 0;

View File

@@ -59,10 +59,11 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
* @param nOffset offset to find matches at, in the input window
* @param pMatches pointer to returned matches
* @param nMaxMatches maximum number of matches to return (0 for none)
* @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
*
* @return number of matches
*/
int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches);
int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches, const int nInWindowSize);
/**
* Skip previously compressed bytes

View File

@@ -157,24 +157,26 @@ static inline int lzsa_get_offset_cost_v1(const unsigned int nMatchOffset) {
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
*/
static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce) {
lzsa_arrival *arrival = pCompressor->arrival;
lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT);
const int nMinMatchSize = pCompressor->min_match_size;
const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0;
const int nDisableScore = nReduce ? 0 : (2 * BLOCK_SIZE);
int i, j, n;
memset(arrival + (nStartOffset << MATCHES_PER_OFFSET_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset) << MATCHES_PER_OFFSET_SHIFT));
if ((nEndOffset - nStartOffset) > BLOCK_SIZE) return;
arrival[nStartOffset << MATCHES_PER_OFFSET_SHIFT].from_slot = -1;
memset(arrival + (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT));
for (i = nStartOffset; i != (nEndOffset - 1); i++) {
arrival[nStartOffset << MATCHES_PER_ARRIVAL_SHIFT].from_slot = -1;
for (i = nStartOffset; i != nEndOffset; i++) {
int m;
for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost;
for (j = 0; j < NMATCHES_PER_ARRIVAL_V1 && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost;
int nCodingChoiceCost = nPrevCost + 8 /* literal */;
int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + 1;
int nNumLiterals = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals + 1;
int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 1;
int nNumLiterals = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals + 1;
if (nNumLiterals == LITERALS_RUN_LEN_V1 || nNumLiterals == 256 || nNumLiterals == 512) {
nCodingChoiceCost += 8;
@@ -183,15 +185,15 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
if (!nFavorRatio && nNumLiterals == 1)
nCodingChoiceCost += MODESWITCH_PENALTY;
for (n = 0; n < NMATCHES_PER_OFFSET /* we only need the literals + short match cost + long match cost cases */; n++) {
lzsa_arrival *pDestArrival = &arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n];
for (n = 0; n < NMATCHES_PER_ARRIVAL_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
lzsa_arrival *pDestArrival = &arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n];
if (pDestArrival->from_slot == 0 ||
nCodingChoiceCost < pDestArrival->cost ||
(nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
memmove(&arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n + 1],
&arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n],
sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
memmove(&arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n + 1],
&arrival[((i + 1) << MATCHES_PER_ARRIVAL_SHIFT) + n],
sizeof(lzsa_arrival) * (NMATCHES_PER_ARRIVAL_V1 - n - 1));
pDestArrival->cost = nCodingChoiceCost;
pDestArrival->from_pos = i;
@@ -200,15 +202,15 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
pDestArrival->match_len = 0;
pDestArrival->num_literals = nNumLiterals;
pDestArrival->score = nScore;
pDestArrival->rep_offset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
pDestArrival->rep_offset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
break;
}
}
}
const lzsa_match *match = pCompressor->match + (i << 3);
const lzsa_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V1);
for (m = 0; m < 8 && match[m].length; m++) {
for (m = 0; m < NMATCHES_PER_INDEX_V1 && match[m].length; m++) {
int nMatchLen = match[m].length;
int nMatchOffsetCost = lzsa_get_offset_cost_v1(match[m].offset);
int nStartingMatchLen, k;
@@ -223,33 +225,33 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
for (k = nStartingMatchLen; k <= nMatchLen; k++) {
int nMatchLenCost = lzsa_get_match_varlen_size_v1(k - MIN_MATCH_SIZE_V1);
for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost;
for (j = 0; j < NMATCHES_PER_ARRIVAL_V1 && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost;
int nCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchOffsetCost + nMatchLenCost;
int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + 5;
int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 5;
int exists = 0;
if (!nFavorRatio && !arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals)
if (!nFavorRatio && !arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals)
nCodingChoiceCost += MODESWITCH_PENALTY;
for (n = 0;
n < NMATCHES_PER_OFFSET && arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n].from_slot && arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n].cost <= nCodingChoiceCost;
n < NMATCHES_PER_ARRIVAL_V1 && arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].from_slot && arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].cost <= nCodingChoiceCost;
n++) {
if (lzsa_get_offset_cost_v1(arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n].rep_offset) == lzsa_get_offset_cost_v1(match[m].offset)) {
if (lzsa_get_offset_cost_v1(arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n].rep_offset) == lzsa_get_offset_cost_v1(match[m].offset)) {
exists = 1;
break;
}
}
for (n = 0; !exists && n < NMATCHES_PER_OFFSET /* we only need the literals + short match cost + long match cost cases */; n++) {
lzsa_arrival *pDestArrival = &arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n];
for (n = 0; !exists && n < NMATCHES_PER_ARRIVAL_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
lzsa_arrival *pDestArrival = &arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n];
if (pDestArrival->from_slot == 0 ||
nCodingChoiceCost < pDestArrival->cost ||
(nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
memmove(&arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n + 1],
&arrival[((i + k) << MATCHES_PER_OFFSET_SHIFT) + n],
sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
memmove(&arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n + 1],
&arrival[((i + k) << MATCHES_PER_ARRIVAL_SHIFT) + n],
sizeof(lzsa_arrival) * (NMATCHES_PER_ARRIVAL_V1 - n - 1));
pDestArrival->cost = nCodingChoiceCost;
pDestArrival->from_pos = i;
@@ -267,15 +269,14 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
}
}
lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_OFFSET_SHIFT) + 0];
pBestMatch[i].length = 0;
pBestMatch[i].offset = 0;
lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + 0];
while (end_arrival->from_slot > 0 && end_arrival->from_pos >= 0) {
if (end_arrival->from_pos >= nEndOffset) return;
pBestMatch[end_arrival->from_pos].length = end_arrival->match_len;
pBestMatch[end_arrival->from_pos].offset = end_arrival->match_offset;
end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_OFFSET_SHIFT) + (end_arrival->from_slot - 1)];
end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_ARRIVAL_SHIFT) + (end_arrival->from_slot - 1)];
}
}
@@ -284,13 +285,14 @@ static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, lzsa_match *p
* impacting the compression ratio
*
* @param pCompressor compression context
* @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
* @param pBestMatch optimal matches to emit
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
*
* @return non-zero if the number of tokens was reduced, 0 if it wasn't
*/
static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset) {
static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, const unsigned char *pInWindow, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset) {
int i;
int nNumLiterals = 0;
int nDidReduce = 0;
@@ -298,6 +300,28 @@ static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, lzsa_mat
for (i = nStartOffset; i < nEndOffset; ) {
lzsa_match *pMatch = pBestMatch + i;
if (pMatch->length == 0 &&
(i + 1) < (nEndOffset - LAST_LITERALS) &&
pBestMatch[i + 1].length >= MIN_MATCH_SIZE_V1 &&
pBestMatch[i + 1].length < MAX_VARLEN &&
pBestMatch[i + 1].offset &&
i >= pBestMatch[i + 1].offset &&
(i + pBestMatch[i + 1].length + 1) <= (nEndOffset - LAST_LITERALS) &&
!memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) {
int nCurLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length - MIN_MATCH_SIZE_V1);
int nReducedLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length + 1 - MIN_MATCH_SIZE_V1);
if ((nReducedLenSize - nCurLenSize) <= 8) {
/* Merge */
pBestMatch[i].length = pBestMatch[i + 1].length + 1;
pBestMatch[i].offset = pBestMatch[i + 1].offset;
pBestMatch[i + 1].length = 0;
pBestMatch[i + 1].offset = 0;
nDidReduce = 1;
continue;
}
}
if (pMatch->length >= MIN_MATCH_SIZE_V1) {
if (pMatch->length <= 9 /* Don't waste time considering large matches, they will always win over literals */ &&
(i + pMatch->length) < nEndOffset /* Don't consider the last token in the block, we can only reduce a match in between other tokens */) {
@@ -326,17 +350,33 @@ static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, lzsa_mat
}
}
if ((i + pMatch->length) < nEndOffset && pMatch->length >= LCP_MAX &&
pMatch->offset && pMatch->offset <= 32 && pBestMatch[i + pMatch->length].offset == pMatch->offset && (pMatch->length % pMatch->offset) == 0 &&
(pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN) {
int nMatchLen = pMatch->length;
if ((i + pMatch->length) <= nEndOffset && pMatch->offset > 0 && pMatch->length >= MIN_MATCH_SIZE_V1 &&
pBestMatch[i + pMatch->length].offset > 0 &&
pBestMatch[i + pMatch->length].length >= MIN_MATCH_SIZE_V1 &&
(pMatch->length + pBestMatch[i + pMatch->length].length) >= LEAVE_ALONE_MATCH_SIZE &&
(pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN &&
(i + pMatch->length) > pMatch->offset &&
(i + pMatch->length) > pBestMatch[i + pMatch->length].offset &&
(i + pMatch->length + pBestMatch[i + pMatch->length].length) <= nEndOffset &&
!memcmp(pInWindow + i - pMatch->offset + pMatch->length,
pInWindow + i + pMatch->length - pBestMatch[i + pMatch->length].offset,
pBestMatch[i + pMatch->length].length)) {
/* Join */
int nCurPartialSize = lzsa_get_match_varlen_size_v1(pMatch->length - MIN_MATCH_SIZE_V1);
nCurPartialSize += 8 /* token */ + lzsa_get_literals_varlen_size_v1(0) + ((pBestMatch[i + pMatch->length].offset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size_v1(pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V1);
pMatch->length += pBestMatch[i + nMatchLen].length;
pBestMatch[i + nMatchLen].offset = 0;
pBestMatch[i + nMatchLen].length = -1;
continue;
int nReducedPartialSize = lzsa_get_match_varlen_size_v1(pMatch->length + pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V1);
if (nCurPartialSize >= nReducedPartialSize) {
int nMatchLen = pMatch->length;
/* Join */
pMatch->length += pBestMatch[i + nMatchLen].length;
pBestMatch[i + nMatchLen].offset = 0;
pBestMatch[i + nMatchLen].length = -1;
continue;
}
}
i += pMatch->length;
@@ -620,34 +660,36 @@ int lzsa_optimize_and_write_block_v1(lzsa_compressor *pCompressor, const unsigne
/* Compress optimally without breaking ties in favor of less tokens */
lzsa_optimize_forward_v1(pCompressor, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */);
memset(pCompressor->best_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
lzsa_optimize_forward_v1(pCompressor, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */);
int nDidReduce;
int nPasses = 0;
do {
nDidReduce = lzsa_optimize_command_count_v1(pCompressor, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
nDidReduce = lzsa_optimize_command_count_v1(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
nPasses++;
} while (nDidReduce && nPasses < 20);
nBaseCompressedSize = lzsa_get_compressed_size_v1(pCompressor, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
lzsa_match *pBestMatch = pCompressor->best_match;
nBaseCompressedSize = lzsa_get_compressed_size_v1(pCompressor, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
lzsa_match *pBestMatch = pCompressor->best_match - nPreviousBlockSize;
if (nBaseCompressedSize > 0 && nInDataSize < 65536) {
int nReducedCompressedSize;
/* Compress optimally and do break ties in favor of less tokens */
lzsa_optimize_forward_v1(pCompressor, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */);
memset(pCompressor->improved_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
lzsa_optimize_forward_v1(pCompressor, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */);
nPasses = 0;
do {
nDidReduce = lzsa_optimize_command_count_v1(pCompressor, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
nDidReduce = lzsa_optimize_command_count_v1(pCompressor, pInWindow, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
nPasses++;
} while (nDidReduce && nPasses < 20);
nReducedCompressedSize = lzsa_get_compressed_size_v1(pCompressor, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
nReducedCompressedSize = lzsa_get_compressed_size_v1(pCompressor, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
if (nReducedCompressedSize > 0 && nReducedCompressedSize <= nBaseCompressedSize) {
/* Pick the parse with the reduced number of tokens as it didn't negatively affect the size */
pBestMatch = pCompressor->improved_match;
pBestMatch = pCompressor->improved_match - nPreviousBlockSize;
}
}

View File

@@ -174,36 +174,110 @@ static inline int lzsa_write_match_varlen_v2(unsigned char *pOutData, int nOutOf
return nOutOffset;
}
/**
* Insert forward rep candidate
*
* @param pCompressor compression context
* @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
* @param i input data window position whose matches are being considered
* @param nMatchOffset match offset to use as rep candidate
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
* @param nMatchesPerArrival number of arrivals to record per input buffer position
* @param nDepth current insertion depth
*/
static void lzsa_insert_forward_match_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int i, const int nMatchOffset, const int nStartOffset, const int nEndOffset, const int nMatchesPerArrival, int nDepth) {
lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT);
int j;
if (nDepth >= 10) return;
for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
int nRepOffset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
if (nMatchOffset != nRepOffset && nRepOffset && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_len >= MIN_MATCH_SIZE_V2) {
int nRepPos = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_pos;
int nRepLen = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_len;
if (nRepPos > nMatchOffset &&
(nRepPos - nMatchOffset + nRepLen) <= (nEndOffset - LAST_LITERALS) &&
!memcmp(pInWindow + nRepPos - nRepOffset, pInWindow + nRepPos - nMatchOffset, nRepLen)) {
int nCurRepLen = nRepLen;
int nMaxRepLen = nEndOffset - nRepPos;
if (nMaxRepLen > LCP_MAX)
nMaxRepLen = LCP_MAX;
while ((nCurRepLen + 8) < nMaxRepLen && !memcmp(pInWindow + nRepPos + nCurRepLen, pInWindow + nRepPos - nMatchOffset + nCurRepLen, 8))
nCurRepLen += 8;
while ((nCurRepLen + 4) < nMaxRepLen && !memcmp(pInWindow + nRepPos + nCurRepLen, pInWindow + nRepPos - nMatchOffset + nCurRepLen, 4))
nCurRepLen += 4;
while (nCurRepLen < nMaxRepLen && pInWindow[nRepPos + nCurRepLen] == pInWindow[nRepPos - nMatchOffset + nCurRepLen])
nCurRepLen++;
lzsa_match *fwd_match = pCompressor->match + ((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V2);
int exists = 0;
int r;
for (r = 0; r < NMATCHES_PER_INDEX_V2 && fwd_match[r].length >= MIN_MATCH_SIZE_V2; r++) {
if (fwd_match[r].offset == nMatchOffset) {
exists = 1;
if (fwd_match[r].length < nCurRepLen) {
fwd_match[r].length = nCurRepLen;
lzsa_insert_forward_match_v2(pCompressor, pInWindow, nRepPos, nMatchOffset, nStartOffset, nEndOffset, nMatchesPerArrival, nDepth + 1);
}
break;
}
}
if (!exists && r < NMATCHES_PER_INDEX_V2) {
fwd_match[r].offset = nMatchOffset;
fwd_match[r].length = nCurRepLen;
lzsa_insert_forward_match_v2(pCompressor, pInWindow, nRepPos, nMatchOffset, nStartOffset, nEndOffset, nMatchesPerArrival, nDepth + 1);
}
}
}
}
}
/**
* Attempt to pick optimal matches using a forward arrivals parser, so as to produce the smallest possible output that decompresses to the same input
*
* @param pCompressor compression context
* @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
* @param pBestMatch pointer to buffer for outputting optimal matches
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
* @param nReduce non-zero to reduce the number of tokens when the path costs are equal, zero not to
* @param nInsertForwardReps non-zero to insert forward repmatch candidates, zero to use the previously inserted candidates
* @param nMatchesPerArrival number of arrivals to record per input buffer position
*/
static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce, const int nInsertForwardReps) {
lzsa_arrival *arrival = pCompressor->arrival;
static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int nReduce, const int nInsertForwardReps, const int nMatchesPerArrival) {
lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT);
const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0;
const int nMinMatchSize = pCompressor->min_match_size;
const int nDisableScore = nReduce ? 0 : (2 * BLOCK_SIZE);
const int nLeaveAloneMatchSize = (nMatchesPerArrival == NMATCHES_PER_ARRIVAL_V2_SMALL) ? LEAVE_ALONE_MATCH_SIZE_SMALL : LEAVE_ALONE_MATCH_SIZE;
int i, j, n;
memset(arrival + (nStartOffset << MATCHES_PER_OFFSET_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset) << MATCHES_PER_OFFSET_SHIFT));
if ((nEndOffset - nStartOffset) > BLOCK_SIZE) return;
for (i = (nStartOffset << MATCHES_PER_OFFSET_SHIFT); i != (nEndOffset << MATCHES_PER_OFFSET_SHIFT); i++) {
memset(arrival + (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT));
for (i = (nStartOffset << MATCHES_PER_ARRIVAL_SHIFT); i != ((nEndOffset + 1) << MATCHES_PER_ARRIVAL_SHIFT); i++) {
arrival[i].cost = 0x40000000;
}
arrival[nStartOffset << MATCHES_PER_OFFSET_SHIFT].from_slot = -1;
arrival[nStartOffset << MATCHES_PER_ARRIVAL_SHIFT].from_slot = -1;
for (i = nStartOffset; i != (nEndOffset - 1); i++) {
for (i = nStartOffset; i != nEndOffset; i++) {
int m;
for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
const int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost & 0x3fffffff;
for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
const int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost & 0x3fffffff;
int nCodingChoiceCost = nPrevCost + 8 /* literal */;
int nNumLiterals = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals + 1;
int nNumLiterals = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals + 1;
if (nNumLiterals == LITERALS_RUN_LEN_V2) {
nCodingChoiceCost += 4;
@@ -218,29 +292,36 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
if (!nFavorRatio && nNumLiterals == 1)
nCodingChoiceCost += MODESWITCH_PENALTY;
lzsa_arrival *pDestSlots = &arrival[(i + 1) << MATCHES_PER_OFFSET_SHIFT];
if (nCodingChoiceCost <= pDestSlots[NMATCHES_PER_OFFSET - 1].cost) {
lzsa_arrival *pDestSlots = &arrival[(i + 1) << MATCHES_PER_ARRIVAL_SHIFT];
if (nCodingChoiceCost <= pDestSlots[nMatchesPerArrival - 1].cost) {
int exists = 0;
for (n = 0;
n < NMATCHES_PER_OFFSET && pDestSlots[n].cost <= nCodingChoiceCost;
n < nMatchesPerArrival && pDestSlots[n].cost <= nCodingChoiceCost;
n++) {
if (pDestSlots[n].rep_offset == arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset) {
if (pDestSlots[n].rep_offset == arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset) {
exists = 1;
break;
}
}
if (!exists) {
int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + 1;
for (n = 0; n < NMATCHES_PER_OFFSET; n++) {
int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 1;
for (n = 0; n < nMatchesPerArrival; n++) {
lzsa_arrival *pDestArrival = &pDestSlots[n];
if (nCodingChoiceCost < pDestArrival->cost ||
(nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
if (pDestArrival->from_slot) {
int z;
for (z = n; z < nMatchesPerArrival - 1; z++) {
if (pDestSlots[z].rep_offset == arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset)
break;
}
memmove(&pDestSlots[n + 1],
&pDestSlots[n],
sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
sizeof(lzsa_arrival) * (z - n));
}
pDestArrival->cost = nCodingChoiceCost;
@@ -250,9 +331,9 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
pDestArrival->match_len = 0;
pDestArrival->num_literals = nNumLiterals;
pDestArrival->score = nScore;
- pDestArrival->rep_offset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
- pDestArrival->rep_pos = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_pos;
- pDestArrival->rep_len = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_len;
+ pDestArrival->rep_offset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
+ pDestArrival->rep_pos = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_pos;
+ pDestArrival->rep_len = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_len;
break;
}
}
@ -260,125 +341,143 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
}
}
- lzsa_match *match = pCompressor->match + (i << 5);
+ lzsa_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V2);
- for (m = 0; m < 32 && match[m].length; m++) {
- int nMatchLen = match[m].length;
+ int nMinRepLen[NMATCHES_PER_ARRIVAL_V2_BIG];
+ memset(nMinRepLen, 0, nMatchesPerArrival * sizeof(int));
+ for (m = 0; m < NMATCHES_PER_INDEX_V2 && match[m].length; m++) {
+ int nMatchLen = match[m].length & 0x7fff;
int nMatchOffset = match[m].offset;
+ int nScorePenalty = ((match[m].length & 0x8000) >> 15);
int nNoRepmatchOffsetCost = (nMatchOffset <= 32) ? 4 : ((nMatchOffset <= 512) ? 8 : ((nMatchOffset <= (8192 + 512)) ? 12 : 16));
int nStartingMatchLen, k;
- int nMaxRepLen[NMATCHES_PER_OFFSET];
+ int nMaxRepLen[NMATCHES_PER_ARRIVAL_V2_BIG];
if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
nMatchLen = nEndOffset - LAST_LITERALS - i;
- for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
- int nRepOffset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
+ for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
+ int nRepOffset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
int nCurMaxRepLen = 0;
- if (nMatchOffset != nRepOffset &&
- nRepOffset &&
- i > nRepOffset &&
- (i - nRepOffset + nMatchLen) <= (nEndOffset - LAST_LITERALS)) {
- while (nCurMaxRepLen < nMatchLen && pInWindow[i - nRepOffset + nCurMaxRepLen] == pInWindow[i - nMatchOffset + nCurMaxRepLen])
- nCurMaxRepLen++;
+ if (nRepOffset) {
+ if (nMatchOffset == nRepOffset)
+ nCurMaxRepLen = nMatchLen;
+ else {
+ if (i > nRepOffset &&
+ (i - nRepOffset + nMatchLen) <= (nEndOffset - LAST_LITERALS)) {
+ nCurMaxRepLen = nMinRepLen[j];
+ while ((nCurMaxRepLen + 8) < nMatchLen && !memcmp(pInWindow + i - nRepOffset + nCurMaxRepLen, pInWindow + i + nCurMaxRepLen, 8))
+ nCurMaxRepLen += 8;
+ while ((nCurMaxRepLen + 4) < nMatchLen && !memcmp(pInWindow + i - nRepOffset + nCurMaxRepLen, pInWindow + i + nCurMaxRepLen, 4))
+ nCurMaxRepLen += 4;
+ while (nCurMaxRepLen < nMatchLen && pInWindow[i - nRepOffset + nCurMaxRepLen] == pInWindow[i + nCurMaxRepLen])
+ nCurMaxRepLen++;
+ nMinRepLen[j] = nCurMaxRepLen;
+ }
+ }
+ }
nMaxRepLen[j] = nCurMaxRepLen;
}
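/* The 8/4/1-byte stepping above is a faster equivalent of extending the
   candidate rep match one byte at a time, and the nMinRepLen[j] memo lets
   later, longer matches at the same position resume where the previous scan
   stopped. Standalone sketch of the stepping itself (names illustrative): */
#include <string.h>

static int extend_match_len(const unsigned char *pA, const unsigned char *pB, const int nMaxLen) {
   int nLen = 0;
   while ((nLen + 8) < nMaxLen && !memcmp(pA + nLen, pB + nLen, 8)) nLen += 8;
   while ((nLen + 4) < nMaxLen && !memcmp(pA + nLen, pB + nLen, 4)) nLen += 4;
   while (nLen < nMaxLen && pA[nLen] == pB[nLen]) nLen++;
   return nLen;
}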
- while (j < NMATCHES_PER_OFFSET)
+ while (j < nMatchesPerArrival)
nMaxRepLen[j++] = 0;
- for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
- int nRepOffset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
- if (nMatchOffset != nRepOffset && nRepOffset && nInsertForwardReps && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_len >= MIN_MATCH_SIZE_V2) {
- int nRepPos = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_pos;
- int nRepLen = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_len;
- if (nRepPos > nMatchOffset &&
- (nRepPos - nMatchOffset + nRepLen) <= (nEndOffset - LAST_LITERALS) &&
- !memcmp(pInWindow + nRepPos - nRepOffset, pInWindow + nRepPos - nMatchOffset, nRepLen)) {
- lzsa_match *fwd_match = pCompressor->match + (nRepPos << 5);
- int exists = 0;
- int r;
- for (r = 0; r < 32 && fwd_match[r].length >= MIN_MATCH_SIZE_V2; r++) {
- if (fwd_match[r].offset == nMatchOffset) {
- exists = 1;
- break;
- }
- }
- if (!exists && r < 32) {
- fwd_match[r].offset = nMatchOffset;
- fwd_match[r].length = nRepLen;
- }
- }
- }
- }
+ if (nInsertForwardReps)
+ lzsa_insert_forward_match_v2(pCompressor, pInWindow, i, nMatchOffset, nStartOffset, nEndOffset, nMatchesPerArrival, 0);
- if (nMatchLen >= LEAVE_ALONE_MATCH_SIZE)
- nStartingMatchLen = nMatchLen;
- else
- nStartingMatchLen = nMinMatchSize;
+ int nMatchLenCost = 0;
+ if (nMatchLen >= nLeaveAloneMatchSize) {
+ nStartingMatchLen = nMatchLen;
+ nMatchLenCost = 4 + 24;
+ }
+ else {
+ nStartingMatchLen = nMinMatchSize;
+ nMatchLenCost = 0;
+ }
for (k = nStartingMatchLen; k <= nMatchLen; k++) {
- int nMatchLenCost = lzsa_get_match_varlen_size_v2(k - MIN_MATCH_SIZE_V2);
+ if (k == (MATCH_RUN_LEN_V2 + MIN_MATCH_SIZE_V2)) {
+ nMatchLenCost = 4;
+ }
+ else {
+ if (k == (MATCH_RUN_LEN_V2 + 15 + MIN_MATCH_SIZE_V2))
+ nMatchLenCost = 4 + 8;
+ else {
+ if (k == 256)
+ nMatchLenCost = 4 + 24;
+ }
+ }
- lzsa_arrival *pDestSlots = &arrival[(i + k) << MATCHES_PER_OFFSET_SHIFT];
+ lzsa_arrival *pDestSlots = &arrival[(i + k) << MATCHES_PER_ARRIVAL_SHIFT];
+ int nInsertedNoRepMatchCandidate = 0;
- for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
+ for (j = 0; j < nMatchesPerArrival && arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].from_slot; j++) {
- const int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost & 0x3fffffff;
+ const int nPrevCost = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].cost & 0x3fffffff;
- int nRepOffset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
- int nMatchOffsetCost = (nMatchOffset == nRepOffset) ? 0 : nNoRepmatchOffsetCost;
int nRepCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchLenCost;
- int nCodingChoiceCost = nRepCodingChoiceCost + nMatchOffsetCost;
- if (!nFavorRatio && !arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals)
- nCodingChoiceCost += MODESWITCH_PENALTY;
- if (nRepCodingChoiceCost <= pDestSlots[NMATCHES_PER_OFFSET - 1].cost) {
- if (nCodingChoiceCost <= pDestSlots[NMATCHES_PER_OFFSET - 1].cost) {
- int exists = 0;
- for (n = 0;
- n < NMATCHES_PER_OFFSET && pDestSlots[n].cost <= nCodingChoiceCost;
- n++) {
- if (pDestSlots[n].rep_offset == nMatchOffset) {
- exists = 1;
- break;
- }
- }
- if (!exists) {
- int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + ((nMatchOffset == nRepOffset) ? 2 : 3);
- for (n = 0; n < NMATCHES_PER_OFFSET; n++) {
- lzsa_arrival *pDestArrival = &pDestSlots[n];
- if (nCodingChoiceCost < pDestArrival->cost ||
- (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
- if (pDestArrival->from_slot) {
- memmove(&pDestSlots[n + 1],
- &pDestSlots[n],
- sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
- }
- pDestArrival->cost = nCodingChoiceCost;
- pDestArrival->from_pos = i;
- pDestArrival->from_slot = j + 1;
- pDestArrival->match_offset = nMatchOffset;
- pDestArrival->match_len = k;
- pDestArrival->num_literals = 0;
- pDestArrival->score = nScore;
- pDestArrival->rep_offset = nMatchOffset;
- pDestArrival->rep_pos = i;
- pDestArrival->rep_len = k;
+ if (nRepCodingChoiceCost <= pDestSlots[nMatchesPerArrival - 1].cost) {
+ int nRepOffset = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].rep_offset;
+ if (nMatchOffset != nRepOffset && !nInsertedNoRepMatchCandidate) {
+ int nCodingChoiceCost = nRepCodingChoiceCost + nNoRepmatchOffsetCost;
+ if (!nFavorRatio && !arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].num_literals)
+ nCodingChoiceCost += MODESWITCH_PENALTY;
+ if (nCodingChoiceCost <= pDestSlots[nMatchesPerArrival - 1].cost) {
+ int exists = 0;
+ int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 3 + nScorePenalty;
+ for (n = 0;
+ n < nMatchesPerArrival && pDestSlots[n].cost <= nCodingChoiceCost;
+ n++) {
+ if (pDestSlots[n].rep_offset == nMatchOffset &&
+ (!nInsertForwardReps || pDestSlots[n].cost != nCodingChoiceCost || pDestSlots[n].rep_pos >= i || nScore >= (pDestSlots[n].score + nDisableScore) ||
+ pDestSlots[nMatchesPerArrival - 1].from_slot)) {
+ exists = 1;
+ break;
+ }
+ }
+ if (!exists) {
+ for (n = 0; n < nMatchesPerArrival - 1; n++) {
+ lzsa_arrival *pDestArrival = &pDestSlots[n];
+ if (nCodingChoiceCost < pDestArrival->cost ||
+ (nCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
+ if (pDestArrival->from_slot) {
+ int z;
+ for (z = n; z < nMatchesPerArrival - 1; z++) {
+ if (pDestSlots[z].rep_offset == nMatchOffset)
+ break;
+ }
+ if (z == (nMatchesPerArrival - 1) && pDestSlots[z].from_slot && pDestSlots[z].match_len < MIN_MATCH_SIZE_V2)
+ z--;
+ memmove(&pDestSlots[n + 1],
+ &pDestSlots[n],
+ sizeof(lzsa_arrival) * (z - n));
+ }
+ pDestArrival->cost = nCodingChoiceCost;
+ pDestArrival->from_pos = i;
+ pDestArrival->from_slot = j + 1;
+ pDestArrival->match_offset = nMatchOffset;
+ pDestArrival->match_len = k;
+ pDestArrival->num_literals = 0;
+ pDestArrival->score = nScore;
+ pDestArrival->rep_offset = nMatchOffset;
+ pDestArrival->rep_pos = i;
+ pDestArrival->rep_len = k;
+ nInsertedNoRepMatchCandidate = 1;
break;
}
}
}
}
}
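/* The two inline cost expressions used throughout this function, written out
   as standalone helpers for reference. A sketch: the thresholds mirror this
   hunk, while MATCH_RUN_LEN_V2 == 7 and MIN_MATCH_SIZE_V2 == 2 are assumed
   from the LZSA2 format rather than shown in this diff. */

/* Extra bits to encode a match offset outside the token (nNoRepmatchOffsetCost). */
static int offset_cost_bits(const int nMatchOffset) {
   if (nMatchOffset <= 32) return 4;
   if (nMatchOffset <= 512) return 8;
   if (nMatchOffset <= (8192 + 512)) return 12;
   return 16;
}

/* Extra bits to encode match length k, matching the cumulative k == ...
   thresholds applied in the loop above. */
static int match_len_cost_bits(const int k) {
   if (k < (7 + 2)) return 0;            /* fits in the token's match field */
   if (k < (7 + 15 + 2)) return 4;       /* + nibble */
   if (k < 256) return 4 + 8;            /* + nibble + byte */
   return 4 + 24;                        /* + nibble + byte + 16-bit length */
}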
@@ -392,7 +491,7 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
/* A match is possible at the rep offset; insert the extra coding choice. */
for (n = 0;
- n < NMATCHES_PER_OFFSET && pDestSlots[n].cost <= nRepCodingChoiceCost;
+ n < nMatchesPerArrival && pDestSlots[n].cost <= nRepCodingChoiceCost;
n++) {
if (pDestSlots[n].rep_offset == nRepOffset) {
exists = 1;
@@ -401,17 +500,24 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
}
if (!exists) {
- int nScore = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].score + 2;
+ int nScore = arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + j].score + 2;
- for (n = 0; n < NMATCHES_PER_OFFSET; n++) {
+ for (n = 0; n < nMatchesPerArrival; n++) {
lzsa_arrival *pDestArrival = &pDestSlots[n];
if (nRepCodingChoiceCost < pDestArrival->cost ||
(nRepCodingChoiceCost == pDestArrival->cost && nScore < (pDestArrival->score + nDisableScore))) {
if (pDestArrival->from_slot) {
+ int z;
+ for (z = n; z < nMatchesPerArrival - 1; z++) {
+ if (pDestSlots[z].rep_offset == nRepOffset)
+ break;
+ }
memmove(&pDestSlots[n + 1],
&pDestSlots[n],
- sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
+ sizeof(lzsa_arrival) * (z - n));
}
pDestArrival->cost = nRepCodingChoiceCost;
@@ -430,19 +536,24 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
}
}
}
+ else {
+ break;
+ }
}
}
+ if (nMatchLen >= LCP_MAX && ((m + 1) >= NMATCHES_PER_INDEX_V2 || match[m + 1].length < LCP_MAX))
+ break;
}
}
- lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_OFFSET_SHIFT) + 0];
- pBestMatch[i].length = 0;
- pBestMatch[i].offset = 0;
+ lzsa_arrival *end_arrival = &arrival[(i << MATCHES_PER_ARRIVAL_SHIFT) + 0];
while (end_arrival->from_slot > 0 && end_arrival->from_pos >= 0) {
+ if (end_arrival->from_pos >= nEndOffset) return;
pBestMatch[end_arrival->from_pos].length = end_arrival->match_len;
pBestMatch[end_arrival->from_pos].offset = end_arrival->match_offset;
- end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_OFFSET_SHIFT) + (end_arrival->from_slot - 1)];
+ end_arrival = &arrival[(end_arrival->from_pos << MATCHES_PER_ARRIVAL_SHIFT) + (end_arrival->from_slot - 1)];
}
}
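/* The parse is recovered by walking the from_pos/from_slot chain backwards
   from the arrival at the end of the block; slot 0 holds the cheapest cost.
   Isolated sketch of the same traversal, with types simplified from
   lzsa_arrival: */
typedef struct { int from_pos; short from_slot; int match_len; int match_offset; } arr_t;

static void backtrack_parse(const arr_t *pArrival, const int nShift, const int nEndPos,
                            int *pLenOut, int *pOffOut) {
   const arr_t *cur = &pArrival[(nEndPos << nShift) + 0];
   while (cur->from_slot > 0 && cur->from_pos >= 0) {
      pLenOut[cur->from_pos] = cur->match_len;     /* 0 = literal at this position */
      pOffOut[cur->from_pos] = cur->match_offset;
      cur = &pArrival[(cur->from_pos << nShift) + (cur->from_slot - 1)];
   }
}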
@@ -470,6 +581,28 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
for (i = nStartOffset; i < nEndOffset; ) {
lzsa_match *pMatch = pBestMatch + i;
+ if (pMatch->length == 0 &&
+ (i + 1) < (nEndOffset - LAST_LITERALS) &&
+ pBestMatch[i + 1].length >= MIN_MATCH_SIZE_V2 &&
+ pBestMatch[i + 1].length < MAX_VARLEN &&
+ pBestMatch[i + 1].offset &&
+ i >= pBestMatch[i + 1].offset &&
+ (i + pBestMatch[i + 1].length + 1) <= (nEndOffset - LAST_LITERALS) &&
+ !memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) {
+ int nCurLenSize = lzsa_get_match_varlen_size_v2(pBestMatch[i + 1].length - MIN_MATCH_SIZE_V2);
+ int nReducedLenSize = lzsa_get_match_varlen_size_v2(pBestMatch[i + 1].length + 1 - MIN_MATCH_SIZE_V2);
+ if ((nReducedLenSize - nCurLenSize) <= 8) {
+ /* Merge */
+ pBestMatch[i].length = pBestMatch[i + 1].length + 1;
+ pBestMatch[i].offset = pBestMatch[i + 1].offset;
+ pBestMatch[i + 1].length = 0;
+ pBestMatch[i + 1].offset = 0;
+ nDidReduce = 1;
+ continue;
+ }
+ }
if (pMatch->length >= MIN_MATCH_SIZE_V2) {
if ((i + pMatch->length) < nEndOffset /* Don't consider the last match in the block, we can only reduce a match in between other tokens */) {
int nNextIndex = i + pMatch->length;
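/* The new block above folds a lone literal into the match that follows it
   when the match source also covers the literal's position (hence the memcmp
   over length + 1 bytes), keeping the result only if the longer length field
   costs no more than the 8 bits the literal occupied. In terms of the
   match_len_cost_bits() sketch given earlier: */
static int worth_merging_literal(const int nMatchLen) {
   /* growing the match by one byte replaces an 8-bit literal */
   return (match_len_cost_bits(nMatchLen + 1) - match_len_cost_bits(nMatchLen)) <= 8;
}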
@@ -583,18 +716,51 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
}
}
- if ((i + pMatch->length) < nEndOffset && pMatch->length >= LCP_MAX &&
- pMatch->offset && pMatch->offset <= 32 && pBestMatch[i + pMatch->length].offset == pMatch->offset && (pMatch->length % pMatch->offset) == 0 &&
- (pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN) {
- int nMatchLen = pMatch->length;
- /* Join */
- pMatch->length += pBestMatch[i + nMatchLen].length;
- pBestMatch[i + nMatchLen].offset = 0;
- pBestMatch[i + nMatchLen].length = -1;
- nDidReduce = 1;
- continue;
+ if ((i + pMatch->length) <= nEndOffset && pMatch->offset > 0 && pMatch->length >= MIN_MATCH_SIZE_V2 &&
+ pBestMatch[i + pMatch->length].offset > 0 &&
+ pBestMatch[i + pMatch->length].length >= MIN_MATCH_SIZE_V2 &&
+ (pMatch->length + pBestMatch[i + pMatch->length].length) >= LEAVE_ALONE_MATCH_SIZE &&
+ (pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN &&
+ (i + pMatch->length) > pMatch->offset &&
+ (i + pMatch->length) > pBestMatch[i + pMatch->length].offset &&
+ (i + pMatch->length + pBestMatch[i + pMatch->length].length) <= nEndOffset &&
+ !memcmp(pInWindow + i - pMatch->offset + pMatch->length,
+ pInWindow + i + pMatch->length - pBestMatch[i + pMatch->length].offset,
+ pBestMatch[i + pMatch->length].length)) {
+ int nNextIndex = i + pMatch->length;
+ int nNextLiterals = 0;
+ while (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length < MIN_MATCH_SIZE_V2) {
+ nNextLiterals++;
+ nNextIndex++;
+ }
+ int nCurPartialSize = lzsa_get_match_varlen_size_v2(pMatch->length - MIN_MATCH_SIZE_V2);
+ nCurPartialSize += 8 /* token */ + lzsa_get_literals_varlen_size_v2(0) + lzsa_get_match_varlen_size_v2(pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V2);
+ if (pBestMatch[i + pMatch->length].offset != pMatch->offset)
+ nCurPartialSize += (pBestMatch[i + pMatch->length].offset <= 32) ? 4 : ((pBestMatch[i + pMatch->length].offset <= 512) ? 8 : ((pBestMatch[i + pMatch->length].offset <= (8192 + 512)) ? 12 : 16));
+ if (pBestMatch[nNextIndex].offset != pBestMatch[i + pMatch->length].offset)
+ nCurPartialSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
+ int nReducedPartialSize = lzsa_get_match_varlen_size_v2(pMatch->length + pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V2);
+ if (pBestMatch[nNextIndex].offset != pMatch->offset)
+ nReducedPartialSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
+ if (nCurPartialSize >= nReducedPartialSize) {
+ int nMatchLen = pMatch->length;
+ /* Join */
+ pMatch->length += pBestMatch[i + nMatchLen].length;
+ pBestMatch[i + nMatchLen].offset = 0;
+ pBestMatch[i + nMatchLen].length = -1;
+ nDidReduce = 1;
+ continue;
+ }
}
nPrevRepMatchOffset = nRepMatchOffset;
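/* Worked example for the join test above, using the cost sketches given
   earlier (and assuming a zero-literal run adds no length bits): two
   back-to-back length-150 matches at the same offset cost 12 (len) +
   8 (second token) + 0 (no literals) + 12 (len) = 32 bits of command
   overhead, while one joined length-300 match costs 4 + 24 = 28 bits, so
   nCurPartialSize >= nReducedPartialSize holds and the matches are joined;
   300 also meets the LEAVE_ALONE_MATCH_SIZE floor required by the new
   condition. */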
@@ -971,37 +1137,40 @@ static int lzsa_write_raw_uncompressed_block_v2(lzsa_compressor *pCompressor, co
*/
int lzsa_optimize_and_write_block_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
int nResult, nBaseCompressedSize;
+ int nMatchesPerArrival = (nInDataSize < 65536) ? NMATCHES_PER_ARRIVAL_V2_BIG : NMATCHES_PER_ARRIVAL_V2_SMALL;
/* Compress optimally without breaking ties in favor of less tokens */
- lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */, (nInDataSize < 65536) ? 1 : 0 /* insert forward reps */);
+ memset(pCompressor->best_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
+ lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */, (nInDataSize < 65536) ? 1 : 0 /* insert forward reps */, nMatchesPerArrival);
int nDidReduce;
int nPasses = 0;
do {
- nDidReduce = lzsa_optimize_command_count_v2(pCompressor, pInWindow, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+ nDidReduce = lzsa_optimize_command_count_v2(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
nPasses++;
} while (nDidReduce && nPasses < 20);
- nBaseCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->best_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
- lzsa_match *pBestMatch = pCompressor->best_match;
+ nBaseCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+ lzsa_match *pBestMatch = pCompressor->best_match - nPreviousBlockSize;
if (nBaseCompressedSize > 0 && nInDataSize < 65536) {
int nReducedCompressedSize;
/* Compress optimally and do break ties in favor of less tokens */
- lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */, 0 /* use forward reps */);
+ memset(pCompressor->improved_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
+ lzsa_optimize_forward_v2(pCompressor, pInWindow, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */, 0 /* use forward reps */, nMatchesPerArrival);
nPasses = 0;
do {
- nDidReduce = lzsa_optimize_command_count_v2(pCompressor, pInWindow, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+ nDidReduce = lzsa_optimize_command_count_v2(pCompressor, pInWindow, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
nPasses++;
} while (nDidReduce && nPasses < 20);
- nReducedCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->improved_match, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+ nReducedCompressedSize = lzsa_get_compressed_size_v2(pCompressor, pCompressor->improved_match - nPreviousBlockSize, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
if (nReducedCompressedSize > 0 && nReducedCompressedSize <= nBaseCompressedSize) {
/* Pick the parse with the reduced number of tokens as it didn't negatively affect the size */
- pBestMatch = pCompressor->improved_match;
+ pBestMatch = pCompressor->improved_match - nPreviousBlockSize;
}
}

View File

@@ -89,19 +89,19 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
pCompressor->open_intervals = (unsigned int *)malloc((LCP_AND_TAG_MAX + 1) * sizeof(unsigned int));
if (pCompressor->open_intervals) {
- pCompressor->arrival = (lzsa_arrival *)malloc(nMaxWindowSize * NMATCHES_PER_OFFSET * sizeof(lzsa_arrival));
+ pCompressor->arrival = (lzsa_arrival *)malloc(((BLOCK_SIZE + 1) << MATCHES_PER_ARRIVAL_SHIFT) * sizeof(lzsa_arrival));
if (pCompressor->arrival) {
- pCompressor->best_match = (lzsa_match *)malloc(nMaxWindowSize * sizeof(lzsa_match));
+ pCompressor->best_match = (lzsa_match *)malloc(BLOCK_SIZE * sizeof(lzsa_match));
if (pCompressor->best_match) {
- pCompressor->improved_match = (lzsa_match *)malloc(nMaxWindowSize * sizeof(lzsa_match));
+ pCompressor->improved_match = (lzsa_match *)malloc(BLOCK_SIZE * sizeof(lzsa_match));
if (pCompressor->improved_match) {
if (pCompressor->format_version == 2)
- pCompressor->match = (lzsa_match *)malloc(nMaxWindowSize * 32 * sizeof(lzsa_match));
+ pCompressor->match = (lzsa_match *)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V2 * sizeof(lzsa_match));
else
- pCompressor->match = (lzsa_match *)malloc(nMaxWindowSize * 8 * sizeof(lzsa_match));
+ pCompressor->match = (lzsa_match *)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V1 * sizeof(lzsa_match));
if (pCompressor->match)
return 0;
}
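/* The fixed-size allocations above rely on the shift and count constants
   staying in sync; a sanity-check sketch, not present in the source (assumes
   the constants from shrink_context.h below are in scope): */
#include <assert.h>

static void check_table_sizing(void) {
   assert((1 << MATCHES_PER_ARRIVAL_SHIFT) >= NMATCHES_PER_ARRIVAL_V2_BIG);
   assert((1 << MATCHES_PER_INDEX_SHIFT_V2) == NMATCHES_PER_INDEX_V2);
   assert((1 << MATCHES_PER_INDEX_SHIFT_V1) == NMATCHES_PER_INDEX_V1);
}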
@@ -185,7 +185,7 @@ int lzsa_compressor_shrink_block(lzsa_compressor *pCompressor, unsigned char *pI
if (nPreviousBlockSize) {
lzsa_skip_matches(pCompressor, 0, nPreviousBlockSize);
}
- lzsa_find_all_matches(pCompressor, (pCompressor->format_version == 2) ? 32 : 8, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+ lzsa_find_all_matches(pCompressor, (pCompressor->format_version == 2) ? NMATCHES_PER_INDEX_V2 : NMATCHES_PER_INDEX_V1, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
if (pCompressor->format_version == 1) {
nCompressedSize = lzsa_optimize_and_write_block_v1(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize);

View File

@@ -40,8 +40,8 @@ extern "C" {
#endif
#define LCP_BITS 14
- #define TAG_BITS 3
- #define LCP_MAX (1U<<(LCP_BITS - TAG_BITS - 1))
+ #define TAG_BITS 4
+ #define LCP_MAX ((1U<<(LCP_BITS - TAG_BITS)) - 1)
#define LCP_AND_TAG_MAX (1U<<(LCP_BITS - 1))
#define LCP_SHIFT (31-LCP_BITS)
#define LCP_MASK (((1U<<LCP_BITS) - 1) << LCP_SHIFT)
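/* Effect of the TAG_BITS change above, worked out: with LCP_BITS == 14 the
   old layout allowed prefix lengths up to 1 << (14 - 3 - 1) = 1024, while the
   new one allows (1 << (14 - 4)) - 1 = 1023, freeing a fourth tag bit within
   the same 14-bit packed field. */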
@@ -49,13 +49,21 @@ extern "C" {
#define VISITED_FLAG 0x80000000
#define EXCL_VISITED_MASK 0x7fffffff
- #define NMATCHES_PER_OFFSET 8
- #define MATCHES_PER_OFFSET_SHIFT 3
+ #define NMATCHES_PER_ARRIVAL_V1 8
+ #define NMATCHES_PER_ARRIVAL_V2_SMALL 9
+ #define NMATCHES_PER_ARRIVAL_V2_BIG 32
+ #define MATCHES_PER_ARRIVAL_SHIFT 5
+ #define NMATCHES_PER_INDEX_V1 8
+ #define MATCHES_PER_INDEX_SHIFT_V1 3
+ #define NMATCHES_PER_INDEX_V2 64
+ #define MATCHES_PER_INDEX_SHIFT_V2 6
- #define LEAVE_ALONE_MATCH_SIZE 1000
+ #define LEAVE_ALONE_MATCH_SIZE 300
+ #define LEAVE_ALONE_MATCH_SIZE_SMALL 1000
#define LAST_MATCH_OFFSET 4
- #define LAST_LITERALS 1
+ #define LAST_LITERALS 0
#define MODESWITCH_PENALTY 3
@@ -68,10 +76,10 @@ typedef struct _lzsa_match {
/** Forward arrival slot */
typedef struct {
int cost;
- int from_pos;
+ unsigned short rep_offset;
short from_slot;
- unsigned short rep_offset;
+ int from_pos;
unsigned short rep_len;
int rep_pos;
int num_literals;

View File

@@ -142,7 +142,7 @@ size_t lzsa_compress_inmem(unsigned char *pInputData, unsigned char *pOutBuffer,
if (nBlockheaderSize < 0)
nError = LZSA_ERROR_COMPRESSION;
else {
- if (nInDataSize > (nMaxOutBufferSize - (nCompressedSize + nBlockheaderSize)))
+ if ((size_t)nInDataSize > (nMaxOutBufferSize - (nCompressedSize + nBlockheaderSize)))
nError = LZSA_ERROR_DST;
else {
memcpy(pOutBuffer + nBlockheaderSize + nCompressedSize, pInputData + nOriginalSize, nInDataSize);
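/* The (size_t) cast above makes the implicit signed-to-unsigned conversion
   explicit: the comparison already happens in size_t, so a negative
   nInDataSize would wrap to a huge value either way; the cast documents this
   and silences sign-compare warnings. Minimal illustration with hypothetical
   values: */
#include <stdio.h>
#include <stddef.h>

int main(void) {
   int nInDataSize = -1;                 /* hypothetical bad value */
   size_t nBudget = 100;
   /* -1 converts to SIZE_MAX, so the bounds check trips: prints 1 */
   printf("%d\n", (size_t)nInDataSize > nBudget);
   return 0;
}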