Compare commits

...

326 Commits

Author SHA1 Message Date
Emmanuel Marty
15ee2dfe11
Bump version 2023-02-27 18:23:00 +01:00
Emmanuel Marty
35ec6d73da
Optimal LZSA1 compression 2023-02-27 08:26:42 +01:00
Emmanuel Marty
6b08bc3990
Update README 2023-02-13 10:37:25 +01:00
Emmanuel Marty
9350d977bf
Add consts 2023-02-10 17:08:03 +01:00
Emmanuel Marty
82f03b55e3
Faster LZSA1 compression 2023-02-02 11:11:14 +01:00
Emmanuel Marty
583e4db62e
Small improvements 2023-01-30 13:21:43 +01:00
Emmanuel Marty
398885a52d
Small simplifications in matchfinder 2023-01-30 13:19:03 +01:00
Emmanuel Marty
21a0dc70c8
Fix CppCheck warnings 2023-01-30 13:17:30 +01:00
Emmanuel Marty
8cea101625
Bump version 2022-10-21 13:59:48 +02:00
Emmanuel Marty
b86ccf8f7b
Small LZSA1 improvements; remove unneeded tests 2022-10-20 17:16:34 +02:00
Emmanuel Marty
185ea0cbf2
Compress LZSA1 another 25% faster; minor cleanup 2022-10-19 10:39:40 +02:00
Emmanuel Marty
ed81dd69df
Fix C99 warning 2022-10-18 07:56:51 +02:00
Emmanuel Marty
bea90736d5
Avoid forward declarations 2022-10-17 18:37:06 +02:00
Emmanuel Marty
3eaf926c1a
Fix some documentation comments 2022-10-17 09:43:56 +02:00
Emmanuel Marty
1bca5b995a
Add documentation comments 2022-10-17 09:42:39 +02:00
Emmanuel Marty
5484395465
Add more missing constants; more minor cleanup 2022-10-16 18:39:24 +02:00
Emmanuel Marty
34ed06abfb
Add missing consts; remove unneeded code; clean up 2022-10-15 12:10:41 +02:00
Emmanuel Marty
930383a18a
Bump version 2022-05-04 15:03:32 +02:00
Emmanuel Marty
eeec526eeb
Compress LZSA1 another 35% faster 2022-05-04 11:32:21 +02:00
Emmanuel Marty
613f3ef0d7
Small cleanup 2022-04-28 08:02:27 +02:00
Emmanuel Marty
38bfea7ecf
Small LZSA2 improvement 2022-04-20 14:06:19 +02:00
Emmanuel Marty
a5f3691d4f
Small cleanup; compress LZSA2 another 1% faster 2022-04-19 09:18:09 +02:00
Emmanuel Marty
55101e8ac7
Merge pull request #67 from emmanuel-marty/spke/lzsa2_z80_fast
New fast decompressor for LZSA2 (-6 bytes, +1% speed)
2022-04-08 16:18:39 +02:00
specke
b1a87b55da New fast decompressor for LZSA2 (-6 bytes, +1% speed) 2022-04-08 13:34:56 +01:00
Emmanuel Marty
120bd71ed9
Bump version 2022-04-06 10:33:31 +02:00
Emmanuel Marty
86b7fe0738
Merge pull request #66 from emmanuel-marty/spke/new_z80_small
New small decompressor for LZSA2 (-5 bytes, +1% speed)
2022-04-05 18:22:18 +02:00
specke
efc19e9d93 New small decompressor for LZSA2 (-5 bytes, +1% speed) 2022-04-05 11:41:52 +01:00
Emmanuel Marty
ae942d4eec
Small LZSA2 speedup in forward arrivals parser 2022-04-05 07:13:03 +02:00
Emmanuel Marty
81fd3af0b3
Small improvement 2022-04-04 13:58:00 +02:00
Emmanuel Marty
34de880080
Compress LZSA2 a bit faster again; small cleanup 2022-04-03 20:23:55 +02:00
Emmanuel Marty
9e11c0893a
Pack LZSA1 35% faster, and LZSA2 another 5% faster 2022-04-02 08:49:26 +02:00
Emmanuel Marty
acfa11d733
Bump version 2022-01-02 07:52:10 +01:00
Emmanuel Marty
6b3dff18a5
Compress ~5% faster; small ratio increase 2022-01-02 07:15:23 +01:00
Emmanuel Marty
9a8a04f9b2
Bump version 2021-11-30 10:54:48 +01:00
Emmanuel Marty
79fadb350e
Tiny token reduction, ratio increase for some data 2021-11-28 10:01:11 +01:00
Emmanuel Marty
61fb2b881e
Tiny LZSA1 ratio increase 2021-11-26 15:41:50 +01:00
Emmanuel Marty
6389895e41
Free, tiny LZSA2 ratio increase for some files 2021-11-26 11:24:44 +01:00
Emmanuel Marty
a48db51134
Merge pull request #64 from jbrandwood/master
Fix reorganized 6502 decompress_faster. Approx 3-4% faster, LZSA2 add…
2021-11-23 12:16:28 +01:00
John Brandwood
4b046625e6 Fix reorganized 6502 decompress_faster. Approx 3-4% faster, LZSA2 adds 1 byte. 2021-11-22 17:02:37 -05:00
Emmanuel Marty
978c44eca7
Revert 2021-11-21 18:03:03 +01:00
Emmanuel Marty
bed1006a6d
Merge pull request #63 from jbrandwood/master
Make 6502 decompress_faster_v2.asm (& v1) 3-4% faster at the cost of 1 byte extra.
2021-11-21 12:14:22 +01:00
John Brandwood
7610a965a5 Make 6502 decompress_faster_v2.asm (& v1) 3-4% faster at the cost of 1 byte. 2021-11-20 15:10:56 -05:00
Emmanuel Marty
4e59375048
Bump version 2021-10-13 11:49:49 +02:00
Emmanuel Marty
854c03bd53
Another small LZSA2 speedup 2021-10-12 21:02:16 +02:00
Emmanuel Marty
6aee0031ed
Increase LZSA2 ratio a little further 2021-10-12 15:52:11 +02:00
Emmanuel Marty
bb1b4fda14
Compress LZSA2 a bit faster again 2021-10-10 07:52:03 +02:00
Emmanuel Marty
c6a93601cf
Compress a little faster again 2021-10-05 12:11:39 +02:00
Emmanuel Marty
f665e8307d
Merge pull request #62 from Mistranger/master
Allow piping files to stdin and stdout.
2021-09-10 12:42:19 +02:00
cybermind
c1a2e9a82c Allow piping files to stdin and stdout 2021-07-26 22:06:54 +05:00
Emmanuel Marty
c0259a77b4
Bump version 2021-06-06 11:40:58 +02:00
Emmanuel Marty
5e404e93d1
Small LZSA2 ratio increase for some files 2021-06-04 19:11:22 +02:00
Emmanuel Marty
65d6972f2c
Add link to streamed LZSA2 depacker for 8088 2021-04-30 14:48:23 +02:00
Emmanuel Marty
bbf782ced8
Update README 2021-04-30 14:42:43 +02:00
Emmanuel Marty
5cfec00d87
Add Apple II+/IIe demo link to README 2021-04-15 13:21:26 +02:00
Emmanuel Marty
48f64a1d20
Merge pull request #60 from specke/master
5% faster decompressor for LZSA1 on Z80
2021-04-08 09:17:17 +02:00
introspec
e9a85e92dc
Minor edit
Make returned values more predictable
2021-04-07 19:54:23 +01:00
introspec
004db30296
Improved "fast" Z80 decompressor for LZSA1
113(+4) bytes, +5% speed
2021-04-07 19:50:12 +01:00
introspec
4eae728e56
Merge pull request #7 from emmanuel-marty/master
Synchronize with the head repository
2021-04-07 19:47:08 +01:00
Emmanuel Marty
42aad36b4d
Merge pull request #59 from jbrandwood/master
Reorganize 6502 decompress_faster depackers for smaller size and grea…
2021-03-26 14:22:42 +01:00
John Brandwood
03fd8751b0 Reorganize 6502 decompress_faster depackers for smaller size and greater speed. 2021-03-25 19:01:35 -04:00
Emmanuel Marty
95a189b1ed
Merge pull request #58 from specke/master
Added a good game that uses LZSA
2021-03-24 13:47:26 +01:00
introspec
be29323516
Update README.md
Added Marsmare: Alienation
2021-03-24 10:35:19 +00:00
introspec
9cd6c554c5
Merge pull request #6 from emmanuel-marty/master
Synchronize with the head
2021-03-24 10:30:25 +00:00
Emmanuel Marty
8075b5ab68
Fix #54 (LZSA2 spec typo) reported by remy-luisant 2021-01-13 11:30:17 +01:00
Emmanuel Marty
488c288d8f
Bump version 2020-12-17 16:27:58 +01:00
Emmanuel Marty
e5538544d5
Another small LZSA2 compression speedup 2020-12-17 14:47:25 +01:00
Emmanuel Marty
6bebfb75bd
Compress LZSA2 a little bit faster 2020-12-16 10:03:18 +01:00
Emmanuel Marty
71493cf889
Keep debug info when building executable 2020-12-16 10:02:09 +01:00
Emmanuel Marty
4402f1b2b8
Update README 2020-12-07 10:13:44 +01:00
Emmanuel Marty
d1f067ef15
Don't create empty directory in Makefile 2020-11-25 18:21:53 +01:00
Emmanuel Marty
710257970e
Merge pull request #51 from dougmasten/dev
Space optimizations for backward HD6309 depackers
2020-11-22 12:36:00 +01:00
Doug Masten
8689a42ff9 Save 10 bytes for backward HD6309 LZSA2 depacker 2020-11-20 16:55:53 -06:00
Doug Masten
4d20175edd Save 8 bytes for backward HD6309 LZSA1 depacker 2020-11-20 16:14:18 -06:00
Emmanuel Marty
c304c03978
Bump version 2020-11-07 09:39:29 +01:00
Emmanuel Marty
d85fc4e034
Save 1 byte/2 cycles for fast 6502 LZSA2 depacker 2020-11-07 08:41:31 +01:00
Emmanuel Marty
7ea6b63db8
Save 1 byte/2 cycles for small 6502 LZSA2 depacker 2020-11-07 00:30:35 +01:00
Emmanuel Marty
b98f8410fe
Remove unneeded code 2020-11-06 16:26:37 +01:00
Emmanuel Marty
fc927c783c
Small LZSA2 compression speedup 2020-11-02 15:24:08 +01:00
Emmanuel Marty
34715950fd
Add backward depackers for HD6309 2020-11-01 22:57:50 +01:00
Emmanuel Marty
e00eb2bd62
Update README 2020-10-19 15:07:14 +02:00
Emmanuel Marty
27b04e8bd4
Merge pull request #50 from dougmasten/dev
Optimization to 6809 depacker and added new H6309 version
2020-10-18 13:54:25 +02:00
Emmanuel Marty
fd36bf0545
Clean up 2020-10-13 16:32:58 +02:00
Emmanuel Marty
4fadd7bbef
Clean up 2020-10-13 16:31:30 +02:00
Emmanuel Marty
441ea21d4f
Compress LZSA2 a little faster 2020-10-13 16:10:18 +02:00
Emmanuel Marty
b979d29eb1
Don't use bool in command-line tool (C99 only) 2020-10-13 16:09:02 +02:00
Emmanuel Marty
6d96a8275d
Don't use bool in command-line tool (C99 only) 2020-10-13 16:08:35 +02:00
Emmanuel Marty
a35fc03a27
Merge pull request #49 from peterferrie/master
not-fast 65816 version
2020-09-11 08:38:57 +02:00
Peter Ferrie
2442d6e489 not-fast 65816 version 2020-09-10 13:12:17 -07:00
Emmanuel Marty
01228f3eeb
Bump version 2020-08-18 11:57:25 +02:00
Emmanuel Marty
e3fd315541
Faster LZSA1 compression 2020-08-18 11:51:24 +02:00
Emmanuel Marty
5a0da16874
Increase LZSA2 ratio for some input files 2020-08-18 09:13:54 +02:00
Emmanuel Marty
028007b57c
Bump version 2020-08-02 09:40:31 +02:00
Emmanuel Marty
4682b2e917
Small simplification 2020-07-29 15:23:22 +02:00
Emmanuel Marty
060f5d3350
Simplify code, compress LZSA2 another 15% faster 2020-07-29 13:01:24 +02:00
Emmanuel Marty
33eec56b9b
Bump version 2020-07-27 15:36:01 +02:00
Emmanuel Marty
90fa770458
Compress another 8% faster 2020-07-27 13:25:16 +02:00
Emmanuel Marty
b2971da2b4
Nicer code 2020-07-26 16:38:22 +02:00
Emmanuel Marty
3fb9dc54b1
Compress LZSA2 another 3% faster 2020-07-26 10:07:03 +02:00
Emmanuel Marty
00d1d95625
Small improvement 2020-07-24 19:18:46 +02:00
Emmanuel Marty
e4f013f2db
Compress LZSA2 17% faster 2020-07-24 17:14:01 +02:00
Emmanuel Marty
703ff19a3a
Bump version 2020-07-14 23:50:44 +02:00
Emmanuel Marty
fc5081fb1a
Rename confusing definitions 2020-07-14 22:36:38 +02:00
Emmanuel Marty
61698b5036
Another LZSA2 compression speedup 2020-07-14 17:01:07 +02:00
Emmanuel Marty
cf49af5cda
Faster LZSA2 compression 2020-07-14 12:36:56 +02:00
Emmanuel Marty
c39158eea8
Compress LZSA2 faster, clean code up 2020-07-13 19:34:07 +02:00
Emmanuel Marty
4864f3c184
Compress LZSA1 a little faster 2020-07-10 17:45:13 +02:00
Emmanuel Marty
8ed768aafc
Nicer and faster code 2020-07-10 08:55:45 +02:00
Emmanuel Marty
9c7495f458
Compress LZSA2 ~12% faster 2020-07-06 12:47:56 +02:00
Doug Masten
81f49fe430 Fix byte count for unlzsa2-6309.s in 6809 LZSA depackers 2020-07-03 22:17:20 -05:00
Doug Masten
eca750b485 Add enhanced Hitachi 6309 version to 6809 LZSA depackers 2020-07-03 22:09:11 -05:00
Doug Masten
ecb6bc5106 Tiny speed improvement to 6809 LZSA depackers
Same space used but improved match length code by 1 cycle for M6809. On H6309 the clock cycles are the same.
2020-07-03 01:29:02 -05:00
Doug Masten
31d5dc2419 Slight speed and space improvement to nibble processing in 6809 LZSA2 depackers 2020-06-30 14:05:59 -05:00
Doug Masten
214e03555f Slight speed improvement in 6809 LZSA depackers
Switch instruction "tfr x,u" to "leau ,x" which is 2 clock cycles faster on M6809. On H6309 both instructions have the same clock cycles.
2020-06-29 00:36:49 -05:00
Emmanuel Marty
afbb1de16c
Merge pull request #48 from dougmasten/dev
More optimizations to the 6809 LZSA depackers
2020-06-28 15:46:17 +02:00
Doug Masten
078edef880 Optimize match offset code in 6809 LZSA2 depacker 2020-06-27 04:17:05 -05:00
Doug Masten
03692fca2c Update code byte counts for 6809 LZSA depackers 2020-06-27 02:02:33 -05:00
Doug Masten
39e11422ec Delay clearing high part of literals count until we really have to in 6809 LZSA1 depacker 2020-06-27 02:02:33 -05:00
Doug Masten
fde15d3fb0 Move instruction before branch to save one byte in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
Doug Masten
fc8120f0da Optimize handling of 9 bits offset in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
Doug Masten
c7b3ffc067 Delay clearing high part of literals count until we really have to in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
Doug Masten
137c6201be One byte saving for setting reg A to $FF in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
Doug Masten
e397428c1f Remove trailing whitespaces in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
Doug Masten
b8cfbbbc7b Optimize handling of token's Z flag bit in 6809 LZSA2 depacker 2020-06-27 02:02:33 -05:00
Doug Masten
28ca829924 delay clearing high part of literals count until we really have to in 6809 LZSA1 depacker 2020-06-27 02:02:33 -05:00
Doug Masten
27562f4761 Restructure code to eliminate one BRA instruction from loop in 6809 LZSA1 depacker 2020-06-27 02:02:33 -05:00
Doug Masten
0307d228a0
Merge pull request #3 from dougmasten/master
Merge pull request
2020-06-26 17:13:02 -05:00
Doug Masten
e8b2ebb89f
Merge pull request #2 from emmanuel-marty/master
Merge pull request
2020-06-26 17:08:44 -05:00
Doug Masten
f72133f4cf Move instruction before branch to save one byte in 6809 LZSA2 depacker 2020-06-26 16:48:58 -05:00
Doug Masten
56ba563794 One byte saving for setting reg A to $FF in 6809 LZSA2 depacker 2020-06-26 15:59:24 -05:00
Doug Masten
c0f09db364 Delay clearing high part of literals count until we really have to in 6809 LZSA2 depacker 2020-06-26 15:56:28 -05:00
Doug Masten
99db30a732 Optimize handling of 9 bits offset in 6809 LZSA2 depacker 2020-06-26 15:18:53 -05:00
Doug Masten
061ca99838 Optimize handling of token's Z flag bit in 6809 LZSA2 depacker 2020-06-26 14:52:09 -05:00
Doug Masten
7b96368469 Optimize match offset code in 6809 LZSA2 depacker 2020-06-26 14:42:36 -05:00
Doug Masten
e9540b2e3d Remove unnecessary "ADDB #$18" as register B will always have this value from 6809 LZSA2 depacker 2020-06-24 00:49:09 -05:00
Emmanuel Marty
40212975c2
Bump version 2020-06-22 10:08:40 +02:00
Emmanuel Marty
07c3969432
Compress LZSA2 raw files one byte shorter 2020-06-22 00:13:14 +02:00
Doug Masten
6a47ed7f41 Remove unnecessary "ADDB #$18" as B register will always have this value from 6809 LZSA depacker 2020-06-21 12:32:54 -05:00
Doug Masten
06d63de9d7 Rearrange "CLRA" instruction for slight speed optimization in 6809 LZSA depacker 2020-06-21 12:16:07 -05:00
Doug Masten
b5b8ca556a Rearrange match offset code to save 2 bytes in 6809 LZSA depacker 2020-06-21 01:09:15 -05:00
Emmanuel Marty
f724663ba8
Update README some more 2020-06-21 00:04:03 +02:00
Emmanuel Marty
798c07f6e0
Update README again 2020-06-20 23:59:01 +02:00
Emmanuel Marty
2f79779beb
Update README 2020-06-20 23:57:22 +02:00
Emmanuel Marty
a9b5a7511c
Merge pull request #47 from dougmasten/dom
Various optimizations to the 6809 LZSA depackers
2020-06-20 21:24:28 +02:00
Doug Masten
7f0316b81c Update code byte counts for 6809 LZSA depackers 2020-06-20 12:45:51 -05:00
Doug Masten
e8edc3242d Restructure code to eliminate one BRA instruction in unlzsa1.s 6809 LZSA depacker 2020-06-20 12:27:31 -05:00
Doug Masten
1dd65731c4 Remove unnecessary "ADDB #$12" as B register will always have this value from 6809 LZSA depackers 2020-06-20 12:27:22 -05:00
Doug Masten
f233d552ca Remove unnecessary "TSTB" instruction from 6809 LZSA depackers 2020-06-20 12:11:35 -05:00
Doug Masten
8c99570b06
Merge pull request #1 from emmanuel-marty/master
Add backward depackers for 6809
2020-06-20 12:03:30 -05:00
Emmanuel Marty
ee969968c1
Add backward depackers for 6809 2020-06-20 14:41:25 +02:00
Emmanuel Marty
f920485899
-3 bytes for 6809 LZSA2 depacker, slightly faster 2020-06-19 20:08:09 +02:00
Emmanuel Marty
2cdeda4784
-2 bytes for 6809 LZSA1 depacker, slightly faster 2020-06-19 20:07:41 +02:00
Emmanuel Marty
f198431a71
-2 bytes for 6809 LZSA2 depacker, speed unchanged 2020-06-19 18:26:25 +02:00
Emmanuel Marty
a6a69ebe17
Mention 6809 decompressors 2020-06-19 12:55:21 +02:00
Emmanuel Marty
f80706ff7e
-5 bytes and faster 6809 LZSA2 depacker 2020-06-19 12:50:10 +02:00
Emmanuel Marty
c3b111cea6
-2 bytes and faster 6809 LZSA1 depacker 2020-06-19 12:49:46 +02:00
Emmanuel Marty
e253d3e628
-1 byte for 6809 LZSA2 depacker 2020-06-19 08:57:01 +02:00
Emmanuel Marty
26bf944d9f
Add 6809 depacker for LZSA2 2020-06-18 17:48:08 +02:00
Emmanuel Marty
b88d3465e3
Compress LZSA2 a tiny bit faster 2020-06-18 17:47:31 +02:00
Emmanuel Marty
1d0427ecae
Make 6809 depacker for LZSA1 smaller/faster 2020-06-18 17:46:42 +02:00
Emmanuel Marty
3ce9a2b36e
Add 6809 depacker for LZSA1 2020-06-18 13:26:25 +02:00
Emmanuel Marty
930fe453eb
Bump version 2020-05-29 19:03:17 +02:00
Emmanuel Marty
193f9f467b
Fix C99-only feature 2020-05-28 20:00:30 +02:00
Emmanuel Marty
278fcc0256
Faster LZSA2 compression 2020-05-28 18:38:19 +02:00
Emmanuel Marty
231f6580c5
Another small LZSA2 compression speedup 2020-05-28 15:10:38 +02:00
Emmanuel Marty
dc413164ad
Compress a little faster 2020-05-24 15:19:59 +02:00
Emmanuel Marty
07104538b7
Fix #46 2020-05-06 19:36:39 +02:00
Emmanuel Marty
c173a5130f
Fix issue #45 and some extra warnings 2020-04-26 10:24:15 +02:00
Emmanuel Marty
d6c43f507c
Add reference to RomWBW that uses LZSA2 2020-04-21 23:39:05 +02:00
Emmanuel Marty
b0e2e7df75
Add reference to the Commander X16 2020-04-20 18:14:13 +02:00
Emmanuel Marty
b61ed6a229
Add reference to Gameboy depacker 2020-04-20 16:32:28 +02:00
Emmanuel Marty
7d59fe3325
Bump version 2020-04-11 15:38:43 +02:00
Emmanuel Marty
668204d953
Merge optimizations by Pavel Zagrebin
Manually merge PR #44
2020-04-04 13:29:25 +02:00
Emmanuel Marty
47e54ac110
Remove unused code 2020-03-24 20:18:22 +01:00
Emmanuel Marty
26a64de95e
Compress LZSA2 faster 2020-03-24 12:04:25 +01:00
Emmanuel Marty
f27ac9ae25
Remove code that is now unnecessary 2020-03-14 15:12:02 +01:00
Emmanuel Marty
236df36f2b
Another small compression speedup 2020-03-14 13:13:01 +01:00
Emmanuel Marty
78276c18da
Update README.md 2020-03-12 08:19:17 +01:00
Emmanuel Marty
0852f337bd
Add link to Motorola 68K decompressors 2020-03-06 18:54:09 +01:00
Emmanuel Marty
3efb9928a1
Merge pull request #42 from odzhan/master
compact decompressors for x86
2020-01-13 23:38:08 +01:00
odzhan
5fddaca0f8 compact decompressors for x86 2020-01-13 18:14:11 +00:00
Emmanuel Marty
693618f100
More asm fixes 2020-01-08 13:11:43 +01:00
Emmanuel Marty
aa122d1e05
Fix assembly when not using LZSA_SHORT_CP 2020-01-08 10:40:26 +01:00
Emmanuel Marty
315cda7b4f
Merge pull request #40 from peterferrie/master
cut one byte
2020-01-08 10:26:29 +01:00
Peter Ferrie
9f020b3605 fix typos in original code 2020-01-07 17:02:43 -08:00
Peter Ferrie
22e2f80ae7 cut one byte 2020-01-07 16:57:49 -08:00
Emmanuel Marty
925e435e53
Merge pull request #39 from peterferrie/master
fasterer v2
2020-01-07 19:49:34 +01:00
Peter Ferrie
45b91ddaa2 fasterer v2 2020-01-06 22:24:07 -08:00
Emmanuel Marty
fb7e03030f
Merge pull request #38 from specke/master
-2 bytes
2020-01-03 10:37:28 +01:00
Emmanuel Marty
29e8960e6f
Add faster LZSA1 6502 depacker by jbrandwood 2020-01-03 10:31:54 +01:00
Emmanuel Marty
410544f4e6
Fast 6502 LZSA2 depacker: smaller size, same speed 2020-01-03 10:01:23 +01:00
introspec
96b9933bd3
remove temporary label 2020-01-02 14:32:30 +00:00
introspec
9cd9fa5939
-2 bytes
(same speed)
2020-01-02 14:30:00 +00:00
introspec
3b37a0bb70
Merge pull request #5 from emmanuel-marty/master
Catch up with the changes in main
2020-01-02 13:51:29 +00:00
Emmanuel Marty
8721c11041
Add faster LZSA2 depacker by jbrandwood 2019-12-24 12:02:34 +01:00
Emmanuel Marty
a46796b6a9
Fix assembling of 6502 fast v1 depacker 2019-12-24 10:16:01 +01:00
Emmanuel Marty
64e641411e
Small compression speedup 2019-12-21 18:59:56 +01:00
Emmanuel Marty
7068c258bd
Bump version 2019-12-15 23:37:20 +01:00
Emmanuel Marty
a0ac24d105
Tiny LZSA1 ratio increase; small improvements 2019-12-15 23:36:51 +01:00
Emmanuel Marty
e9ca5032bc
Ratio increase 2019-12-15 18:04:16 +01:00
Emmanuel Marty
fc5f540a68
Bump version 2019-12-09 09:55:50 +01:00
Emmanuel Marty
4c566286f5
Increase ratio 2019-12-09 09:54:56 +01:00
Emmanuel Marty
65a262ec95
Bump version 2019-11-27 15:26:34 +01:00
Emmanuel Marty
63da0eb49d
Small speedup 2019-11-27 09:47:32 +01:00
Emmanuel Marty
78ad147799
Don't systematically encode last byte as literal 2019-11-26 20:48:13 +01:00
Emmanuel Marty
88f563d84c
Secure last token decompression for non-raw blocks 2019-11-26 17:35:11 +01:00
Emmanuel Marty
d37589cfdb
Expand forward repmatch candidates 2019-11-26 13:33:08 +01:00
Emmanuel Marty
0b5e915d83
Split non-rep from repmatch candidates 2019-11-26 11:58:34 +01:00
Emmanuel Marty
a38e8b126c
Small LZSA2 compression speedup 2019-11-20 15:40:11 +01:00
Emmanuel Marty
5f4cf4dfc7
Fix for LZSA1 as well 2019-11-19 19:53:46 +01:00
Emmanuel Marty
8e4e7c06c7
Fix calculation for promoting literal+match seqs 2019-11-19 19:48:39 +01:00
Emmanuel Marty
b1738b4003
Promote some literal+match sequences to a match 2019-11-18 12:10:23 +01:00
Emmanuel Marty
e328f63feb
Bump version 2019-11-13 00:57:31 +01:00
Emmanuel Marty
e0c42afac9
Increase LZSA2 ratio 2019-11-13 00:57:09 +01:00
Emmanuel Marty
ce7fc33646
Reduce memory use 2019-11-12 00:30:24 +01:00
Emmanuel Marty
53b2013b73
Small improvement to merging large matches 2019-11-11 18:41:08 +01:00
Emmanuel Marty
df9690a949
Merge pull request #36 from peterferrie/master
fast v1
2019-11-09 17:55:19 +01:00
Peter Ferrie
302234a91b fast v2 2019-10-31 23:01:00 -07:00
Peter Ferrie
9fc9a49d67 fast v1 2019-10-31 22:20:02 -07:00
Emmanuel Marty
7371486513
Bump version 2019-10-29 12:10:13 +01:00
Emmanuel Marty
f249597dfd
Increase LZSA2 ratio 2019-10-29 12:09:14 +01:00
Emmanuel Marty
78f588a833
Clarify defines 2019-10-29 10:45:57 +01:00
Emmanuel Marty
c790fb8ebe
Add link to Gabba ZX Spectrum demo, that uses LZSA 2019-10-28 09:21:57 +01:00
Emmanuel Marty
3c4f535e0b
Increase LZSA2 ratio by ~0.02% 2019-10-27 14:55:39 +01:00
Emmanuel Marty
8551c3ff8a
Merge pull request #35 from MobyGamer/decompressor/8086_speed_jumptable
Rewrite 8088 jumptable decompressor for maximum speed
2019-10-27 10:28:48 +01:00
mobygamer
30192238ea Rewrite 8088 jumptable decompressor for maximum speed
This is a rewrite of LZSA1JMP.ASM to use a 256-element jumptable, which
allows the code to handle all of the hot paths (common cases) without
any branching.  This not only reduces branches (which are very costly on
x86) to a bare minimum, but also grants us foreknowledge in a decode
path of what steps can be skipped.

The new code is 12.7% faster than the old code, and assembles to less
than 3K of object code and data.
2019-10-26 23:34:24 -05:00
Emmanuel Marty
53fcd3b1a8
Generalize merging very large matches 2019-10-24 13:05:32 +02:00
Emmanuel Marty
f4cf97f176
Merge pull request #34 from specke/master
Added option for unrolled copying of long matches
2019-10-22 21:52:48 +02:00
introspec
d5d788946e
Added an option for unrolling long match copying
Usually useless and costing +57 bytes, this option can bring dramatic performance improvements on very compressible data dominated by long matches
2019-10-22 20:11:46 +01:00
introspec
e1e1276c96
Merge pull request #4 from emmanuel-marty/master
Re-sync with the main
2019-10-22 20:09:00 +01:00
Emmanuel Marty
16ac8c75af
Add link to PDP-11 depackers by Ivan Gorodetsky 2019-10-22 17:13:05 +02:00
Emmanuel Marty
05d77095ca
Bump version 2019-10-22 12:39:27 +02:00
Emmanuel Marty
b84fe7c332
Further increase LZSA2 ratio by ~0.1% on average 2019-10-22 12:37:46 +02:00
Emmanuel Marty
7dd039a152
Delete shrink_context.h 2019-10-22 12:37:16 +02:00
Emmanuel Marty
9f6ca2c25f
Delete shrink_block_v2.c 2019-10-22 12:37:04 +02:00
Emmanuel Marty
dbaa3fa921
Further increase LZSA2 ratio by ~0.1% on average 2019-10-22 12:36:41 +02:00
Emmanuel Marty
2926ad8436
Remove unused #includes 2019-10-21 12:29:38 +02:00
Emmanuel Marty
d9156d3d2b
Reduce LZSA1 token count by 2.5% on average 2019-10-19 13:10:41 +02:00
Emmanuel Marty
6adf92fc88
Merge pull request #33 from specke/master
-1 byte
2019-10-11 10:18:05 +02:00
Emmanuel Marty
96df02c532
Remove unused code 2019-10-11 09:20:36 +02:00
Emmanuel Marty
89f1664ae6
Remove unused code 2019-10-11 09:14:19 +02:00
Emmanuel Marty
c363ecf527
Remove unused code 2019-10-11 09:11:49 +02:00
Emmanuel Marty
5141ed7c59
Remove unused code 2019-10-11 09:11:41 +02:00
Emmanuel Marty
c77c666568
Remove unused code 2019-10-11 09:10:07 +02:00
Emmanuel Marty
115a81cb71
Remove unused code 2019-10-11 09:09:42 +02:00
Emmanuel Marty
4436f216ce
Bump version 2019-10-11 09:06:50 +02:00
Emmanuel Marty
baa53f6889
Newly compressed LZSA2 files depack 0.7% faster 2019-10-11 09:05:58 +02:00
introspec
495a12216f
-1 byte
Very slightly faster too
2019-10-11 00:23:43 +01:00
Emmanuel Marty
b5117c3dfe
Fixes for -stats 2019-10-11 00:25:46 +02:00
Emmanuel Marty
f5ef6bf868
Merge pull request #32 from specke/master
Slightly faster unlzsa2_fast.asm for Z80
2019-10-11 00:22:12 +02:00
introspec
566e3a94e8
+0.2% speed
also, added an option to unroll LDIR for longer matches (which adds 38 bytes, but can be significantly faster for files with many long matches)
2019-10-10 22:50:23 +01:00
introspec
e3d7ec9c40
Merge pull request #3 from emmanuel-marty/master
Sync with E.Marty's branch
2019-10-10 22:46:53 +01:00
Emmanuel Marty
d209b73a30
Fix small bug 2019-10-10 14:42:08 +02:00
Emmanuel Marty
c1b18fb9fd
Implement -stats 2019-10-09 18:20:22 +02:00
Emmanuel Marty
6ce846ff24
Speed up LZSA2 compression 2019-10-09 16:07:29 +02:00
Emmanuel Marty
b09dadb1c1
Small LZSA2 token count reduction 2019-10-09 13:16:29 +02:00
Emmanuel Marty
03f841d04f
Speed up LZSA2 compression 2019-10-08 20:26:21 +02:00
Emmanuel Marty
44df8f3d2d
Add early-out, speed LZSA2 compression up further 2019-10-08 16:23:33 +02:00
Emmanuel Marty
bfb383befd
Speed up LZSA2 compression 2019-10-08 09:39:18 +02:00
Emmanuel Marty
39e2a90f81
Prevent small matchfinder inefficiency 2019-10-04 11:54:54 +02:00
Emmanuel Marty
33327201f7
Fix small LZSA2 token reduction inefficiency 2019-10-03 16:58:34 +02:00
Emmanuel Marty
29c6f3b2a3
Remove erroneous else statement 2019-09-26 19:13:09 +02:00
Emmanuel Marty
6a62f7d795
Update Z80 depackers changes history 2019-09-26 11:42:52 +02:00
Emmanuel Marty
681f78d1e8
Rename 2019-09-26 07:48:59 +02:00
Emmanuel Marty
8015ab8650
Rename 2019-09-26 07:48:44 +02:00
Emmanuel Marty
2f15298343
Rename 2019-09-26 07:48:33 +02:00
Emmanuel Marty
648a308d87
Rename 2019-09-26 07:48:19 +02:00
Emmanuel Marty
587a92f4ab
Rename Z80 depackers, add version history to LZSA1 2019-09-26 07:47:43 +02:00
Emmanuel Marty
7d9135c548
Update Z80 decompressors 2019-09-25 08:09:18 +02:00
Emmanuel Marty
ac9de3795c
Update Pareto frontier graph from spke 2019-09-25 07:56:47 +02:00
Emmanuel Marty
b4b4d39eff
Fix newly added external link 2019-09-24 18:03:20 +02:00
Emmanuel Marty
cb46987628
Update stats and links 2019-09-24 18:02:24 +02:00
Emmanuel Marty
e55c80a475
Clean up use of MODESWITCH_PENALTY; bump version 2019-09-24 14:43:17 +02:00
Emmanuel Marty
de0ff5d3b0
Reduce memory used for compression 2019-09-24 00:21:17 +02:00
Emmanuel Marty
249b8a4c46
Increase LZSA2 ratio and use forward parser for -m 2019-09-23 20:24:50 +02:00
Emmanuel Marty
74040890fc
Speed up LZSA2 compression (same binary output) 2019-09-23 16:58:03 +02:00
Emmanuel Marty
81e15d10f0
Add extra safety checks to LZSA2 token reducer 2019-09-22 20:41:09 +02:00
Emmanuel Marty
1869d85c1f
Simplify LZSA1 token reducer (same binary output) 2019-09-22 20:34:08 +02:00
Emmanuel Marty
1a4f662360
Bump version 2019-09-20 12:26:16 +02:00
Emmanuel Marty
c12e20b7fb
Improve LZSA2 compression ratio 2019-09-20 12:24:27 +02:00
Emmanuel Marty
51644ad2f9
Speed LZSA2 compression up further; fix typo 2019-09-19 17:18:37 +02:00
Emmanuel Marty
1495b27f69
Speed up LZSA1 compression with forward arrivals 2019-09-19 12:57:39 +02:00
Emmanuel Marty
c052a188f2
Reduce LZSA2 forward arrivals memory use 2019-09-19 11:46:03 +02:00
Emmanuel Marty
e4076e4090
Speed LZSA2 compression up; tiny ratio increase 2019-09-19 00:11:26 +02:00
Emmanuel Marty
8b7d0ab04d
Increase LZSA2 ratio. Decrease token count 2019-09-17 08:10:52 +02:00
Emmanuel Marty
b1da9c1aee
Add extra bound checks in C decompressors 2019-09-12 16:19:14 +02:00
Emmanuel Marty
b92a003338
Merge pull request #29 from francois-berder/master
Various improvements -- thank you!
2019-08-28 13:50:00 +02:00
Francois Berder
4f2d7da136 Fix main return value if compressing
Signed-off-by: Francois Berder <18538310+francois-berder@users.noreply.github.com>
2019-08-28 09:41:54 +01:00
Francois Berder
a318ac2f83 Fix memory leak in comparestream_open
Signed-off-by: Francois Berder <18538310+francois-berder@users.noreply.github.com>
2019-08-28 09:40:49 +01:00
Francois Berder
da67938978 Set dictionary to NULL in lzsa_dictionary_free
Signed-off-by: Francois Berder <18538310+francois-berder@users.noreply.github.com>
2019-08-28 09:39:07 +01:00
Emmanuel Marty
2d213bcff1
Bump version number 2019-08-27 13:18:23 +02:00
Emmanuel Marty
9de7e930e9
Faster LZSA1 z80 decompression 2019-08-27 13:16:20 +02:00
Emmanuel Marty
ef259e6867
Implement forward arrivals optimal parsers 2019-08-27 00:51:34 +02:00
Emmanuel Marty
90b4da64d1
Merge pull request #27 from uniabis/twobytesshorter
2bytes shorter
2019-08-26 23:49:27 +02:00
uniabis
a807344343 2bytes shorter 2019-08-22 12:55:55 +09:00
Emmanuel Marty
27d0fe4e83
Merge pull request #26 from arm-in/patch-1
Update README.md
2019-08-06 20:54:24 +02:00
Armin Müller
f8e445a98a
Update README.md
Now 67 bytes with commit be30cae636
2019-08-06 20:15:59 +02:00
Emmanuel Marty
0e567bde47
Merge pull request #25 from specke/master
-1 byte
2019-08-06 20:03:52 +02:00
introspec
be30cae636
-1 byte
slightly slower, but this is the size-optimized branch
2019-08-06 12:36:27 +01:00
Emmanuel Marty
1b368e71ad
Fix comments, header single inclusion defines 2019-08-04 16:42:30 +02:00
Emmanuel Marty
d98220ff42
Merge pull request #24 from specke/master
New Pareto frontier graph
2019-08-01 16:51:29 +02:00
introspec
d412433df4
New Pareto frontier graph
Shows improved performance of the new Z80 decompressors, esp. due to the improvements by uniabis
2019-08-01 15:26:53 +01:00
Emmanuel Marty
77c1492310
Merge pull request #23 from specke/master
New faster and shorter decompressors
2019-08-01 16:19:24 +02:00
introspec
44bff39de3
New faster and shorter decompressors
This update is mostly about better integration of improvements by uniabis, with spke contributing several smaller size optimizations.
2019-08-01 15:07:14 +01:00
Emmanuel Marty
3c690b04f5
Merge pull request #22 from specke/master
incorporated improvements by uniabis
2019-08-01 01:34:46 +02:00
introspec
e7bb1faece
Merge branch 'master' into master 2019-07-31 23:24:30 +01:00
Emmanuel Marty
e48d2dafde
Merge pull request #21 from uniabis/hd64180 - up to 3% speedup on Z80!
hd64180 support on z80 unpacker
2019-07-31 23:57:23 +02:00
introspec
51ef92cdab
incorporated improvements by uniabis
also, slightly faster decompression for fast packer in backwards mode
2019-07-31 20:42:47 +01:00
uniabis
8d0528fddc hd64180 support
a bit faster, a bit smaller
2019-07-31 01:39:27 +09:00
Emmanuel Marty
b3aae36ecc
Bump version 2019-07-28 00:25:51 +02:00
Emmanuel Marty
8787b1c3d8
Merge pull request #20 from specke/master (should be already fixed now..)
fix a bug in the backward version of unlzsa2_fast_v1.asm
2019-07-27 15:50:38 +02:00
Emmanuel Marty
0a04796b19
Fix for z80 LZSA2 fast backward depacker 2019-07-27 15:39:44 +02:00
introspec
ac3bf78273
fix a bug in the backward version of unlzsa2_fast_v1.asm
an INC HL slipped through
2019-07-27 14:14:54 +01:00
Emmanuel Marty
82edcb8bb5
Fix literal runs that are multiple of 256 bytes 2019-07-27 01:35:46 +02:00
Emmanuel Marty
b613d01565
Test incompressible data with raw blocks 2019-07-26 13:30:41 +02:00
Emmanuel Marty
ae4cc12aed
Use ACME syntax 2019-07-26 12:31:26 +02:00
Emmanuel Marty
316dfdcdce
Fix comments, remove unused vars 2019-07-26 01:12:17 +02:00
Emmanuel Marty
fd70be918c
Merge pull request #19 from specke/master
Support for -b in Z80 decompressors
2019-07-24 20:09:48 +02:00
Emmanuel Marty
4835e4c26c
Support backward decompression 2019-07-24 20:08:23 +02:00
introspec
cca79e3e59
Delete unlzsa_small_v1.asm 2019-07-24 17:31:22 +01:00
introspec
607b26d337
Delete unlzsa_fast_v1.asm 2019-07-24 17:31:14 +01:00
introspec
fd61f403ad
LZSA1 decompressors with added support for -b. 2019-07-24 17:30:37 +01:00
introspec
fcfba056d2
Add files via upload
LZSA2 decompressors with support for -b option.
2019-07-24 17:28:39 +01:00
Emmanuel Marty
0c4dbf2b72
Add files via upload
Show decompression safety distance for raw blocks
2019-07-24 15:43:44 +02:00
Emmanuel Marty
9f313d6ee6
Handle EOD in C depacker; fix #18; fix typos in usage 2019-07-23 23:28:52 +02:00
Emmanuel Marty
04cc67cf42
Add reference to The Hollow 2019-07-16 20:41:36 +02:00
Emmanuel Marty
081a29a3db
Fix copying multiples of 256 bytes 2019-07-14 16:14:55 +02:00
66 changed files with 7101 additions and 2522 deletions

View File

@ -44,7 +44,7 @@ The match offset is decoded according to the XYZ bits in the token
XYZ
00Z 5-bit offset: read a nibble for offset bits 1-4 and use the inverted bit Z of the token as bit 0 of the offset. set bits 5-15 of the offset to 1.
01Z 9-bit offset: read a byte for offset bits 0-7 and use the inverted bit Z for bit 8 of the offset. set bits 9-15 of the offset to 1.
10Z 13-bit offset: read a nibble for offset bits 9-12 and use the inverted bit Z for bit 8 of the offset, then read a byte for offset bits 0-7. set bits 13-15 of the offset to 1.
10Z 13-bit offset: read a nibble for offset bits 9-12 and use the inverted bit Z for bit 8 of the offset, then read a byte for offset bits 0-7. set bits 13-15 of the offset to 1. subtract 512 from the offset to get the final value.
110 16-bit offset: read a byte for offset bits 8-15, then another byte for offset bits 0-7.
111 repeat offset: reuse the offset value of the previous match command.
@ -58,7 +58,7 @@ Note that the match offset is negative: it is added to the current decompressed
If the encoded match length is 7 or more, the 'M' bits in the token form the value 7, and an extra nibble is read:
* 0-14: the value is added to the 3 stored in the token, and then the minmatch of 2 is added, to compose the final match length.
* 0-14: the value is added to the 7 stored in the token, and then the minmatch of 2 is added, to compose the final match length.
* 15: an extra byte follows
If an extra byte follows here, it can have two possible types of value:

View File

@ -1,8 +1,7 @@
CC=clang
CFLAGS=-O3 -fomit-frame-pointer -Isrc/libdivsufsort/include -Isrc
CFLAGS=-O3 -g -fomit-frame-pointer -Isrc/libdivsufsort/include -Isrc
OBJDIR=obj
LDFLAGS=
STRIP=strip
$(OBJDIR)/%.o: src/../%.c
@mkdir -p '$(@D)'
@ -18,7 +17,6 @@ OBJS += $(OBJDIR)/src/expand_context.o
OBJS += $(OBJDIR)/src/expand_inmem.o
OBJS += $(OBJDIR)/src/expand_streaming.o
OBJS += $(OBJDIR)/src/frame.o
OBJS += $(OBJDIR)/src/hashmap.o
OBJS += $(OBJDIR)/src/matchfinder.o
OBJS += $(OBJDIR)/src/shrink_block_v1.o
OBJS += $(OBJDIR)/src/shrink_block_v2.o
@ -34,9 +32,7 @@ OBJS += $(OBJDIR)/src/libdivsufsort/lib/trsort.o
all: $(APP)
$(APP): $(OBJS)
@mkdir -p ../../bin/posix
$(CC) $^ $(LDFLAGS) -o $(APP)
$(STRIP) $(APP)
clean:
@rm -rf $(APP) $(OBJDIR)

View File

@ -3,6 +3,26 @@ LZSA is a collection of byte-aligned compression formats that are specifically e
![Pareto frontier](pareto_graph.png)
<sup>*ZX Spectrum</sup>
Check out [The Hollow](https://www.pouet.net/prod.php?which=81909) by Darklite and Offense, winner of the Solskogen 2019 wild compo, that uses LZSA on Z80.
[Gabba](https://www.pouet.net/prod.php?which=83539) by Stardust ranked 2nd in the ZX Spectrum demo compo at CAFe demoparty 2019 and also used LZSA on Z80.
[Myst Demake](http://www.deater.net/weave/vmwprod/mist/) for the Apple II by Vince Weaver, uses LZSA on 6502.
The 8 bit guy's [Commander X16 ROM](https://github.com/commanderx16/x16-rom) uses LZSA on 6502 as well.
[RomWBW](https://github.com/wwarthen/RomWBW) uses LZSA on Z80 for a variety of hobbyist computers.
The popular [rasm](https://github.com/EdouardBERGE/rasm) assembler for Z80 features LZSA-compressed data sections.
The [desolate](https://github.com/nzeemin/spectrum-desolate) game port to the ZX Spectrum uses LZSA compression on Z80.
[Marsmare: Alienation](https://zxonline.net/game/marsmare-alienation/), the winner of the recent [Yandex Retro Games Battle 2020](https://yandex.ru/museum/yrgb-2020-en), is using LZSA to compress its assets.
The [Lowtech demo](https://github.com/wiz21b/lowtech) for the Apple II+ and IIe, by Wiz/Imphobia, compresses data with LZSA.
The [Druid & Droid](https://leosoft.itch.io/druid-and-droid) game for the Amstrad CPC, also uses LZSA for compression.
The LZSA compression tool uses an aggressive optimal packing strategy to try to find the sequence of commands that gives the smallest packed file that decompresses to the original while maintaining the maximum possible decompression speed.
The compression formats give the user choices that range from decompressing faster than LZ4 on 8-bit systems with better compression, to compressing as well as ZX7 with much better decompression speed. LZSA1 is designed to replace LZ4 and LZSA2 to replace ZX7, in 8-bit scenarios.
@ -10,10 +30,11 @@ The compression formats give the user choices that range from decompressing fast
Compression ratio comparison between LZSA and other optimal packers, for a workload composed of ZX Spectrum and C64 files:
Bytes Ratio Decompression speed vs. LZ4
LZSA2 685610 53,18% <------ 75%
LZSA2 676681 52,49% <------ 75%
MegaLZ 4.89 679041 52,68% Not measured
ZX7 687133 53,30% 47,73%
LZ5 1.4.1 727107 56,40% 75%
LZSA1 736169 57,11% <------ 90%
LZSA1 735785 57,08% <------ 90%
Lizard -29 776122 60,21% Not measured
LZ4_HC -19 -B4 -BD 781049 60,59% 100%
Uncompressed 1289127 100% N/A
@ -21,13 +42,13 @@ Compression ratio comparison between LZSA and other optimal packers, for a workl
Performance over well-known compression corpus files:
Uncompressed LZ4_HC -19 -B4 -BD LZSA1 LZSA2
Canterbury 2810784 935827 (33,29%) 855044 (30,42%) 789075 (28,07%)
Silesia 211938580 77299725 (36,47%) 73707039 (34,78%) 69983184 (33,02%)
Calgary 3251493 1248780 (38,40%) 1196448 (36,80%) 1125462 (34,61%)
Large 11159482 3771025 (33,79%) 3648420 (32,69%) 3528725 (31,62%)
enwik9 1000000000 371841591 (37,18%) 355360717 (35,54%) 337063553 (33,71%)
Canterbury 2810784 935827 (33,29%) 850792 (30,27%) 770877 (27,43%)
Silesia 211938580 77299725 (36,47%) 73706340 (34,78%) 68928564 (32,52%)
Calgary 3251493 1248780 (38,40%) 1192123 (36,67%) 1110290 (34,15%)
Large 11159482 3771025 (33,79%) 3648393 (32,69%) 3519480 (31,54%)
enwik9 1000000000 371841591 (37,18%) 355360043 (35,54%) 334900611 (33,49%)
As an example of LZSA1's simplicity, a size-optimized decompressor on Z80 has been implemented in 69 bytes.
As an example of LZSA1's simplicity, a size-optimized decompressor on Z80 has been implemented in 67 bytes.
The compressor is approximately 2X slower than LZ4_HC but compresses better while maintaining similar decompression speeds and decompressor simplicity.
@ -39,6 +60,7 @@ The main differences between LZSA1 and the LZ4 compression format are:
As for LZSA2:
* 5-bit, 9-bit, 13-bit and 16-bit match offsets, using nibble encoding
* Rep-matches
* Shorter encoding of lengths, also using nibbles
* A minmatch of 2 bytes
* No (slow) bit-packing. LZSA2 uses byte alignment in the hot path, and nibbles.
@ -49,6 +71,8 @@ Inspirations:
* [LZ5/Lizard](https://github.com/inikep/lizard) by Przemyslaw Skibinski and Yann Collet.
* The suffix array intervals in [Wimlib](https://wimlib.net/git/?p=wimlib;a=tree) by Eric Biggers.
* ZX7 by Einar Saukas
* [apc](https://github.com/svendahl/cap) by Sven-Åke Dahl
* [Charles Bloom](http://cbloomrants.blogspot.com/)'s compression blog
License:
@ -57,9 +81,20 @@ License:
8-bit assembly code:
* Z80 decompressors (size- and speed-optimized) written by [introspec](https://github.com/specke)
* Z80 decompressors (size- and speed-optimized) written by [introspec](https://github.com/specke) with optimizations by [uniabis](https://github.com/uniabis)
* 6502 and 8088 size-optimized improvements by [Peter Ferrie](https://github.com/peterferrie)
* 6502 speed-optimized decompressor by [John Brandwood](https://github.com/jbrandwood)
* 8088 speed-optimized decompressor by [Jim Leonard](https://github.com/mobygamer)
* 6809 decompressors (Tandy CoCo, Thomson MO/TO, Dragon 32/64..) optimized by [Doug Masten](https://github.com/dougmasten)
* Hitachi 6309 decompressors (Tandy CoCo 3) also contributed by [Doug Masten](https://github.com/dougmasten)
External links:
* [i8080 and PDP-11 decompressors](https://github.com/ivagorRetrocomp/DeLZSA) by Ivan Gorodetsky
* [MC68000 decompressors](https://github.com/tattlemuss/lz4-m68k/blob/master/src/lzsa.s) by Steven Tattersall
* [Gameboy decompressors](https://github.com/meltycode) by Meltycode, based on the Z80 code by introspec
* [Streamed LZSA2 depacker](https://hg.ulukai.org/ecm/inicomp/file/c1a1f9bd4382/lzsa2.asm) by C. Masloch
* LZSA's page on [Pouet](https://www.pouet.net/prod.php?which=81573)
# Compressed format

View File

@ -17,7 +17,7 @@ The 3-bytes LZSA header contains a signature and a traits byte:
Trait bits:
* V: 3 bit code that indicates which block data encoding is used. 0 is LZSA1 and 2 is LZSA2.
* V: 3 bit code that indicates which block data encoding is used. 0 is LZSA1 and 1 is LZSA2.
* Z: these bits in the traits are set to 0 for LZSA1 and LZSA2.
# Frame format

View File

@ -185,7 +185,6 @@
<ClInclude Include="..\src\format.h" />
<ClInclude Include="..\src\frame.h" />
<ClInclude Include="..\src\expand_inmem.h" />
<ClInclude Include="..\src\hashmap.h" />
<ClInclude Include="..\src\lib.h" />
<ClInclude Include="..\src\libdivsufsort\include\divsufsort_config.h" />
<ClInclude Include="..\src\libdivsufsort\include\divsufsort.h" />
@ -207,7 +206,6 @@
<ClCompile Include="..\src\expand_block_v2.c" />
<ClCompile Include="..\src\frame.c" />
<ClCompile Include="..\src\expand_inmem.c" />
<ClCompile Include="..\src\hashmap.c" />
<ClCompile Include="..\src\libdivsufsort\lib\divsufsort.c" />
<ClCompile Include="..\src\libdivsufsort\lib\sssort.c" />
<ClCompile Include="..\src\libdivsufsort\lib\trsort.c" />

View File

@ -84,9 +84,6 @@
<ClInclude Include="..\src\libdivsufsort\include\divsufsort_config.h">
<Filter>Fichiers sources\libdivsufsort\include</Filter>
</ClInclude>
<ClInclude Include="..\src\hashmap.h">
<Filter>Fichiers sources</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\libdivsufsort\lib\divsufsort.c">
@ -146,8 +143,5 @@
<ClCompile Include="..\src\libdivsufsort\lib\divsufsort_utils.c">
<Filter>Fichiers sources\libdivsufsort\lib</Filter>
</ClCompile>
<ClCompile Include="..\src\hashmap.c">
<Filter>Fichiers sources</Filter>
</ClCompile>
</ItemGroup>
</Project>

View File

@ -26,7 +26,6 @@
0CADC64722AAD8EB003E9821 /* expand_context.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CADC62F22AAD8EB003E9821 /* expand_context.c */; };
0CADC64822AAD8EB003E9821 /* shrink_block_v2.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CADC63022AAD8EB003E9821 /* shrink_block_v2.c */; };
0CADC64A22AB8DAD003E9821 /* divsufsort_utils.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CADC64922AB8DAD003E9821 /* divsufsort_utils.c */; };
0CADC69622C8A420003E9821 /* hashmap.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CADC69522C8A41F003E9821 /* hashmap.c */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
@ -81,8 +80,6 @@
0CADC63022AAD8EB003E9821 /* shrink_block_v2.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = shrink_block_v2.c; path = ../../src/shrink_block_v2.c; sourceTree = "<group>"; };
0CADC64922AB8DAD003E9821 /* divsufsort_utils.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = divsufsort_utils.c; sourceTree = "<group>"; };
0CADC64B22AB8DC3003E9821 /* divsufsort_config.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = divsufsort_config.h; sourceTree = "<group>"; };
0CADC69422C8A41F003E9821 /* hashmap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = hashmap.h; path = ../../src/hashmap.h; sourceTree = "<group>"; };
0CADC69522C8A41F003E9821 /* hashmap.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = hashmap.c; path = ../../src/hashmap.c; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@ -130,8 +127,6 @@
0CADC62422AAD8EB003E9821 /* format.h */,
0CADC5F322AAD8EB003E9821 /* frame.c */,
0CADC62C22AAD8EB003E9821 /* frame.h */,
0CADC69522C8A41F003E9821 /* hashmap.c */,
0CADC69422C8A41F003E9821 /* hashmap.h */,
0CADC5F222AAD8EB003E9821 /* lib.h */,
0CADC5FC22AAD8EB003E9821 /* libdivsufsort */,
0CADC62222AAD8EB003E9821 /* lzsa.c */,
@ -240,7 +235,6 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
0CADC69622C8A420003E9821 /* hashmap.c in Sources */,
0CADC64822AAD8EB003E9821 /* shrink_block_v2.c in Sources */,
0CADC63D22AAD8EB003E9821 /* sssort.c in Sources */,
0CADC64322AAD8EB003E9821 /* expand_block_v2.c in Sources */,

View File

@ -0,0 +1,305 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
;
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty, Peter Ferrie
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
DECOMPRESS_LZSA1_FAST
LDY #$00
DECODE_TOKEN
JSR GETSRC ; read token byte: O|LLL|MMMM
PHA ; preserve token on stack
AND #$70 ; isolate literals count
BEQ NO_LITERALS ; skip if no literals to copy
CMP #$70 ; LITERALS_RUN_LEN?
BNE PREPARE_COPY_LITERALS ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$F9 ; (LITERALS_RUN_LEN)
BCC PREPARE_COPY_LITERALS_DIRECT
BEQ LARGE_VARLEN_LITERALS ; if adding up to zero, go grab 16-bit count
JSR GETSRC ; get single extended byte of variable literals count
INY ; add 256 to literals count
BCS PREPARE_COPY_LITERALS_DIRECT ; (*like JMP PREPARE_COPY_LITERALS_DIRECT but shorter)
LARGE_VARLEN_LITERALS ; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
TXA
BCS PREPARE_COPY_LARGE_LITERALS ; (*like JMP PREPARE_COPY_LITERALS_DIRECT but shorter)
PREPARE_COPY_LITERALS
TAX
LDA SHIFT_TABLE-1,X ; shift literals length into place
; -1 because position 00 is reserved
PREPARE_COPY_LITERALS_DIRECT
TAX
PREPARE_COPY_LARGE_LITERALS
BEQ COPY_LITERALS
INY
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
BMI GET_LONG_OFFSET ; $80: 16 bit offset
JSR GETSRC ; get 8 bit offset from stream in A
TAX ; save for later
LDA #$FF ; high 8 bits
BNE GOT_OFFSET ; go prepare match
; (*like JMP GOT_OFFSET but shorter)
SHORT_VARLEN_MATCHLEN
JSR GETSRC ; get single extended byte of variable match len
INY ; add 256 to match length
PREPARE_COPY_MATCH
TAX
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
JSR PUTDST ; copy to destination
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
LDA COPY_MATCH_LOOP+1
BEQ GETMATCH_ADJ_HI
GETMATCH_DONE
DEC COPY_MATCH_LOOP+1
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1
BEQ GETMATCH_ADJ_HI
GETMATCH_DONE
}
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
BEQ DECODE_TOKEN ; (*like JMP DECODE_TOKEN but shorter)
!ifdef BACKWARD_DECOMPRESS {
GETMATCH_ADJ_HI
DEC COPY_MATCH_LOOP+2
JMP GETMATCH_DONE
} else {
GETMATCH_ADJ_HI
INC COPY_MATCH_LOOP+2
JMP GETMATCH_DONE
}
GET_LONG_OFFSET ; handle 16 bit offset:
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
GOT_OFFSET
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - substract match offset
STA OFFSHI ; store high 8 bits of offset
STX OFFSLO
SEC ; substract dest - match offset
LDA PUTDST+1
OFFSLO = *+1
SBC #$AA ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
LDA PUTDST+2
OFFSHI = *+1
SBC #$AA ; high 8 bits
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
SEC
} else {
; Forward decompression - add match offset
STA OFFSHI ; store high 8 bits of offset
TXA
CLC ; add dest + match offset
ADC PUTDST+1 ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
}
PLA ; retrieve token from stack again
AND #$0F ; isolate match len (MMMM)
ADC #$02 ; plus carry which is always set by the high ADC
CMP #$12 ; MATCH_RUN_LEN?
BCC PREPARE_COPY_MATCH ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
BCC PREPARE_COPY_MATCH
BNE SHORT_VARLEN_MATCHLEN
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
; large match length with zero high byte?
BNE PREPARE_COPY_MATCH_Y ; if not, continue
DECOMPRESSION_DONE
RTS
SHIFT_TABLE
!BYTE $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00
!BYTE $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01
!BYTE $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02
!BYTE $03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03
!BYTE $04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04
!BYTE $05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05
!BYTE $06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06
!BYTE $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
LDA PUTDST+1
BEQ PUTDST_ADJ_HI
DEC PUTDST+1
RTS
PUTDST_ADJ_HI
DEC PUTDST+2
DEC PUTDST+1
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
PHA
LDA GETSRC+1
BEQ GETSRC_ADJ_HI
DEC GETSRC+1
PLA
RTS
GETSRC_ADJ_HI
DEC GETSRC+2
DEC GETSRC+1
PLA
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
INC PUTDST+1
BEQ PUTDST_ADJ_HI
RTS
PUTDST_ADJ_HI
INC PUTDST+2
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
INC GETSRC+1
BEQ GETSRC_ADJ_HI
RTS
GETSRC_ADJ_HI
INC GETSRC+2
RTS
}

View File

@ -0,0 +1,359 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA2 block.
; Create one with lzsa -r -f2 <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
;
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty, Peter Ferrie
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
NIBCOUNT = $FC ; zero-page location for temp offset
DECOMPRESS_LZSA2_FAST
LDY #$00
STY NIBCOUNT
DECODE_TOKEN
JSR GETSRC ; read token byte: XYZ|LL|MMM
PHA ; preserve token on stack
AND #$18 ; isolate literals count (LL)
BEQ NO_LITERALS ; skip if no literals to copy
CMP #$18 ; LITERALS_RUN_LEN_V2?
BCC PREPARE_COPY_LITERALS ; if less, count is directly embedded in token
JSR GETNIBBLE ; get extra literals length nibble
; add nibble to len from token
ADC #$02 ; (LITERALS_RUN_LEN_V2) minus carry
CMP #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
BCC PREPARE_COPY_LITERALS_DIRECT ; if less, literals count is complete
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; overflow?
JMP PREPARE_COPY_LITERALS_DIRECT
PREPARE_COPY_LITERALS_LARGE
; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
BCS PREPARE_COPY_LITERALS_HIGH ; (*same as JMP PREPARE_COPY_LITERALS_HIGH but shorter)
PREPARE_COPY_LITERALS
LSR ; shift literals count into place
LSR
LSR
PREPARE_COPY_LITERALS_DIRECT
TAX
BCS PREPARE_COPY_LITERALS_LARGE ; if so, literals count is large
PREPARE_COPY_LITERALS_HIGH
TXA
BEQ COPY_LITERALS
INY
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
ASL
BCS REPMATCH_OR_LARGE_OFFSET ; 1YZ: rep-match or 13/16 bit offset
ASL ; 0YZ: 5 or 9 bit offset
BCS OFFSET_9_BIT
; 00Z: 5 bit offset
LDX #$FF ; set offset bits 15-8 to 1
JSR GETCOMBINEDBITS ; rotate Z bit into bit 0, read nibble for bits 4-1
ORA #$E0 ; set bits 7-5 to 1
BNE GOT_OFFSET_LO ; go store low byte of match offset and prepare match
OFFSET_9_BIT ; 01Z: 9 bit offset
ROL ; carry: Z bit; A: xxxxxxx1 (carry known set from BCS OFFSET_9_BIT)
ADC #$00 ; if Z bit is set, add 1 to A (bit 0 of A is now 0), otherwise bit 0 is 1
ORA #$FE ; set offset bits 15-9 to 1. reversed Z is already in bit 0
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_LARGE_OFFSET
ASL ; 13 bit offset?
BCS REPMATCH_OR_16_BIT ; handle rep-match or 16-bit offset if not
; 10Z: 13 bit offset
JSR GETCOMBINEDBITS ; rotate Z bit into bit 8, read nibble for bits 12-9
ADC #$DE ; set bits 15-13 to 1 and substract 2 (to substract 512)
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_16_BIT ; rep-match or 16 bit offset
BMI REP_MATCH ; reuse previous offset if so (rep-match)
; 110: handle 16 bit offset
JSR GETSRC ; grab high 8 bits
GOT_OFFSET_HI
TAX
JSR GETSRC ; grab low 8 bits
GOT_OFFSET_LO
STA OFFSLO ; store low byte of match offset
STX OFFSHI ; store high byte of match offset
REP_MATCH
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - substract match offset
SEC ; add dest + match offset
LDA PUTDST+1 ; low 8 bits
OFFSLO = *+1
SBC #$AA
STA COPY_MATCH_LOOP+1 ; store back reference address
LDA PUTDST+2
OFFSHI = *+1
SBC #$AA ; high 8 bits
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
SEC
} else {
; Forward decompression - add match offset
CLC ; add dest + match offset
LDA PUTDST+1 ; low 8 bits
OFFSLO = *+1
ADC #$AA
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
}
PLA ; retrieve token from stack again
AND #$07 ; isolate match len (MMM)
ADC #$01 ; add MIN_MATCH_SIZE_V2 and carry
CMP #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
BCC PREPARE_COPY_MATCH ; if less, length is directly embedded in token
JSR GETNIBBLE ; get extra match length nibble
; add nibble to len from token
ADC #$08 ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
CMP #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
BCC PREPARE_COPY_MATCH ; if less, match length is complete
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$E8 ; overflow?
PREPARE_COPY_MATCH
TAX
BCC PREPARE_COPY_MATCH_Y ; if not, the match length is complete
BEQ DECOMPRESSION_DONE ; if EOD code, bail
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
JSR PUTDST ; copy to destination
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
LDA COPY_MATCH_LOOP+1
BEQ GETMATCH_ADJ_HI
GETMATCH_DONE
DEC COPY_MATCH_LOOP+1
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1
BEQ GETMATCH_ADJ_HI
GETMATCH_DONE
}
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
JMP DECODE_TOKEN
!ifdef BACKWARD_DECOMPRESS {
GETMATCH_ADJ_HI
DEC COPY_MATCH_LOOP+2
JMP GETMATCH_DONE
} else {
GETMATCH_ADJ_HI
INC COPY_MATCH_LOOP+2
JMP GETMATCH_DONE
}
GETCOMBINEDBITS
EOR #$80
ASL
PHP
JSR GETNIBBLE ; get nibble into bits 0-3 (for offset bits 1-4)
PLP ; merge Z bit as the carry bit (for offset bit 0)
ROL ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
DECOMPRESSION_DONE
RTS
GETNIBBLE
NIBBLES = *+1
LDA #$AA
LSR NIBCOUNT
BCC NEED_NIBBLES
AND #$0F ; isolate low 4 bits of nibble
RTS
NEED_NIBBLES
INC NIBCOUNT
JSR GETSRC ; get 2 nibbles
STA NIBBLES
LSR
LSR
LSR
LSR
SEC
RTS
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
LDA PUTDST+1
BEQ PUTDST_ADJ_HI
DEC PUTDST+1
RTS
PUTDST_ADJ_HI
DEC PUTDST+2
DEC PUTDST+1
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
PHA
LDA GETSRC+1
BEQ GETSRC_ADJ_HI
DEC GETSRC+1
PLA
RTS
GETSRC_ADJ_HI
DEC GETSRC+2
DEC GETSRC+1
PLA
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA
INC PUTDST+1
BEQ PUTDST_ADJ_HI
RTS
PUTDST_ADJ_HI
INC PUTDST+2
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA
INC GETSRC+1
BEQ GETSRC_ADJ_HI
RTS
GETSRC_ADJ_HI
INC GETSRC+2
RTS
}

View File

@ -0,0 +1,282 @@
; ***************************************************************************
; ***************************************************************************
;
; lzsa1_6502.s
;
; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA1 format.
;
; This code is written for the ACME assembler.
;
; The code is 165 bytes for the small version, and 191 bytes for the normal.
;
; Copyright John Brandwood 2021.
;
; Distributed under the Boost Software License, Version 1.0.
; (See accompanying file LICENSE_1_0.txt or copy at
; http://www.boost.org/LICENSE_1_0.txt)
;
; ***************************************************************************
; ***************************************************************************
; ***************************************************************************
; ***************************************************************************
;
; Decompression Options & Macros
;
;
; Choose size over decompression speed (within sane limits)?
;
LZSA_SMALL_SIZE = 0
; ***************************************************************************
; ***************************************************************************
;
; Data usage is last 7 bytes of zero-page.
;
lzsa_cmdbuf = $F9 ; 1 byte.
lzsa_winptr = $FA ; 1 word.
lzsa_srcptr = $FC ; 1 word.
lzsa_dstptr = $FE ; 1 word.
lzsa_offset = lzsa_winptr
LZSA_SRC_LO = $FC
LZSA_SRC_HI = $FD
LZSA_DST_LO = $FE
LZSA_DST_HI = $FF
; ***************************************************************************
; ***************************************************************************
;
; lzsa1_unpack - Decompress data stored in Emmanuel Marty's LZSA1 format.
;
; Args: lzsa_srcptr = ptr to compressed data
; Args: lzsa_dstptr = ptr to output buffer
;
DECOMPRESS_LZSA1_FAST:
lzsa1_unpack: ldy #0 ; Initialize source index.
ldx #0 ; Initialize hi-byte of length.
;
; Copy bytes from compressed source data.
;
; N.B. X=0 is expected and guaranteed when we get here.
;
.cp_length: !if LZSA_SMALL_SIZE {
jsr .get_byte
} else {
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
bne .cp_skip0
inc <lzsa_srcptr + 1
}
.cp_skip0: sta <lzsa_cmdbuf ; Preserve this for later.
and #$70 ; Extract literal length.
lsr ; Set CC before ...
beq .lz_offset ; Skip directly to match?
lsr ; Get 3-bit literal length.
lsr
lsr
cmp #$07 ; Extended length?
bcc .cp_got_len
jsr .get_length ; X=0, CS from CMP, returns CC.
stx .cp_npages + 1 ; Hi-byte of length.
.cp_got_len: tax ; Lo-byte of length.
.cp_byte: lda (lzsa_srcptr),y ; CC throughout the execution of
sta (lzsa_dstptr),y ; of this .cp_page loop.
inc <lzsa_srcptr + 0
bne .cp_skip1
inc <lzsa_srcptr + 1
.cp_skip1: inc <lzsa_dstptr + 0
bne .cp_skip2
inc <lzsa_dstptr + 1
.cp_skip2: dex
bne .cp_byte
.cp_npages: lda #0 ; Any full pages left to copy?
beq .lz_offset
dec .cp_npages + 1 ; Unlikely, so can be slow.
bcc .cp_byte ; Always true!
!if LZSA_SMALL_SIZE {
;
; Copy bytes from decompressed window.
;
; Shorter but slower version.
;
; N.B. X=0 is expected and guaranteed when we get here.
;
.lz_offset: jsr .get_byte ; Get offset-lo.
.offset_lo: adc <lzsa_dstptr + 0 ; Always CC from .cp_page loop.
sta <lzsa_winptr + 0
lda #$FF
bit <lzsa_cmdbuf
bpl .offset_hi
jsr .get_byte ; Get offset-hi.
.offset_hi: adc <lzsa_dstptr + 1 ; lzsa_winptr < lzsa_dstptr, so
sta <lzsa_winptr + 1 ; always leaves CS.
.lz_length: lda <lzsa_cmdbuf ; X=0 from previous loop.
and #$0F
adc #$03 - 1 ; CS from previous ADC.
cmp #$12 ; Extended length?
bcc .lz_got_len
jsr .get_length ; CS from CMP, X=0, returns CC.
stx .lz_npages + 1 ; Hi-byte of length.
.lz_got_len: tax ; Lo-byte of length.
.lz_byte: lda (lzsa_winptr),y ; CC throughout the execution of
sta (lzsa_dstptr),y ; of this .lz_page loop.
inc <lzsa_winptr + 0
bne .lz_skip1
inc <lzsa_winptr + 1
.lz_skip1: inc <lzsa_dstptr + 0
bne .lz_skip2
inc <lzsa_dstptr + 1
.lz_skip2: dex
bne .lz_byte
.lz_npages: lda #0 ; Any full pages left to copy?
beq .cp_length
dec .lz_npages + 1 ; Unlikely, so can be slow.
bcc .lz_byte ; Always true!
} else {
;
; Copy bytes from decompressed window.
;
; Longer but faster.
;
; N.B. X=0 is expected and guaranteed when we get here.
;
.lz_offset: lda (lzsa_srcptr),y ; Get offset-lo.
inc <lzsa_srcptr + 0
bne .offset_lo
inc <lzsa_srcptr + 1
.offset_lo: sta <lzsa_offset + 0
lda #$FF ; Get offset-hi.
bit <lzsa_cmdbuf
bpl .offset_hi
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
bne .offset_hi
inc <lzsa_srcptr + 1
.offset_hi: sta <lzsa_offset + 1
.lz_length: lda <lzsa_cmdbuf ; X=0 from previous loop.
and #$0F
adc #$03 ; Always CC from .cp_page loop.
cmp #$12 ; Extended length?
bcc .got_lz_len
jsr .get_length ; X=0, CS from CMP, returns CC.
.got_lz_len: inx ; Hi-byte of length+256.
eor #$FF ; Negate the lo-byte of length
tay
eor #$FF
.get_lz_dst: adc <lzsa_dstptr + 0 ; Calc address of partial page.
sta <lzsa_dstptr + 0 ; Always CC from previous CMP.
iny
bcs .get_lz_win
beq .get_lz_win ; Is lo-byte of length zero?
dec <lzsa_dstptr + 1
.get_lz_win: clc ; Calc address of match.
adc <lzsa_offset + 0 ; N.B. Offset is negative!
sta <lzsa_winptr + 0
lda <lzsa_dstptr + 1
adc <lzsa_offset + 1
sta <lzsa_winptr + 1
.lz_byte: lda (lzsa_winptr),y
sta (lzsa_dstptr),y
iny
bne .lz_byte
inc <lzsa_dstptr + 1
dex ; Any full pages left to copy?
bne .lz_more
jmp .cp_length ; Loop around to the beginning.
.lz_more: inc <lzsa_winptr + 1 ; Unlikely, so can be slow.
bne .lz_byte ; Always true!
}
;
; Get 16-bit length in X:A register pair, return with CC.
;
; N.B. X=0 is expected and guaranteed when we get here.
;
.get_length: clc ; Add on the next byte to get
adc (lzsa_srcptr),y ; the length.
inc <lzsa_srcptr + 0
bne .skip_inc
inc <lzsa_srcptr + 1
.skip_inc: bcc .got_length ; No overflow means done.
clc ; MUST return CC!
tax ; Preserve overflow value.
.extra_byte: jsr .get_byte ; So rare, this can be slow!
pha
txa ; Overflow to 256 or 257?
beq .extra_word
.check_length: pla ; Length-lo.
bne .got_length ; Check for zero.
dex ; Do one less page loop if so.
.got_length: rts
.extra_word: jsr .get_byte ; So rare, this can be slow!
tax
bne .check_length ; Length-hi == 0 at EOF.
.finished: pla ; Length-lo.
pla ; Decompression completed, pop
pla ; return address.
rts
.get_byte: lda (lzsa_srcptr),y ; Subroutine version for when
inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
bne .got_byte
inc <lzsa_srcptr + 1 ; Inc & test for bank overflow.
.got_byte: rts

View File

@ -0,0 +1,308 @@
; ***************************************************************************
; ***************************************************************************
;
; lzsa2_6502.s
;
; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
;
; This code is written for the ACME assembler.
;
; The code is 241 bytes for the small version, and 256 bytes for the normal.
;
; Copyright John Brandwood 2021.
;
; Distributed under the Boost Software License, Version 1.0.
; (See accompanying file LICENSE_1_0.txt or copy at
; http://www.boost.org/LICENSE_1_0.txt)
;
; ***************************************************************************
; ***************************************************************************
; ***************************************************************************
; ***************************************************************************
;
; Decompression Options & Macros
;
;
; Choose size over decompression speed (within sane limits)?
;
LZSA_SMALL_SIZE = 0
; ***************************************************************************
; ***************************************************************************
;
; Data usage is last 11 bytes of zero-page.
;
; Fix: "lzsa_length = lzsa_winptr" was defined twice (once before
; lzsa_winptr itself was defined); duplicate "=" definitions are an
; error in ACME, so only the single definition after lzsa_winptr is kept.
;
lzsa_cmdbuf = $F5 ; 1 byte.
lzsa_nibflg = $F6 ; 1 byte.
lzsa_nibble = $F7 ; 1 byte.
lzsa_offset = $F8 ; 1 word.
lzsa_winptr = $FA ; 1 word.
lzsa_srcptr = $FC ; 1 word.
lzsa_dstptr = $FE ; 1 word.
lzsa_length = lzsa_winptr ; 1 word (alias; NOTE(review): presumably
 ; length and winptr are never live at the
 ; same time -- confirm before reusing).
LZSA_SRC_LO = $FC ; Byte-wise aliases of the src/dst
LZSA_SRC_HI = $FD ; pointers above, for callers.
LZSA_DST_LO = $FE
LZSA_DST_HI = $FF
; ***************************************************************************
; ***************************************************************************
;
; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2 format.
;
; Args: lzsa_srcptr = ptr to compressed data
; Args: lzsa_dstptr = ptr to output buffer
;
DECOMPRESS_LZSA2_FAST:
lzsa2_unpack: ldx #$00 ; Hi-byte of length or offset.
ldy #$00 ; Initialize source index.
sty <lzsa_nibflg ; Initialize nibble buffer.
;
; Copy bytes from compressed source data.
;
; N.B. X=0 is expected and guaranteed when we get here.
;
.cp_length: !if LZSA_SMALL_SIZE {
jsr .get_byte
} else {
lda (lzsa_srcptr),y ; Read token byte, inc src ptr.
inc <lzsa_srcptr + 0
bne .cp_skip0
inc <lzsa_srcptr + 1
}
.cp_skip0: sta <lzsa_cmdbuf ; Preserve this for later.
and #$18 ; Extract literal length.
beq .lz_offset ; Skip directly to match?
lsr ; Get 2-bit literal length.
lsr
lsr
cmp #$03 ; Extended length?
bcc .cp_got_len
jsr .get_length ; X=0 for literals, returns CC.
stx .cp_npages + 1 ; Hi-byte of length (self-mod).
.cp_got_len: tax ; Lo-byte of length.
.cp_byte: lda (lzsa_srcptr),y ; CC throughout the execution of
sta (lzsa_dstptr),y ; of this .cp_page loop.
inc <lzsa_srcptr + 0
bne .cp_skip1
inc <lzsa_srcptr + 1
.cp_skip1: inc <lzsa_dstptr + 0
bne .cp_skip2
inc <lzsa_dstptr + 1
.cp_skip2: dex
bne .cp_byte
.cp_npages: lda #0 ; Any full pages left to copy?
beq .lz_offset
dec .cp_npages + 1 ; Unlikely, so can be slow.
bcc .cp_byte ; Always true!
;
; Copy bytes from decompressed window.
;
; N.B. X=0 is expected and guaranteed when we get here.
;
; xyz
; ===========================
; 00z 5-bit offset
; 01z 9-bit offset
; 10z 13-bit offset
; 110 16-bit offset
; 111 repeat offset
;
.lz_offset: lda <lzsa_cmdbuf ; Reload token to decode offset.
asl ; Shift x (mode) bit into C.
bcs .get_13_16_rep
.get_5_9_bits: dex ; X=$FF for a 5-bit offset.
asl ; Shift y (mode) bit into C.
bcs .get_9_bits ; Fall through if 5-bit.
.get_13_bits: asl ; Both 5-bit and 13-bit read
php ; a nibble.
jsr .get_nibble
plp
rol ; Shift into position, clr C.
eor #$E1
cpx #$00 ; X=$FF for a 5-bit offset.
bne .set_offset
sbc #2 ; 13-bit offset from $FE00.
bne .set_hi_8 ; Always NZ from previous SBC.
.get_9_bits: asl ; X=$FF if CC, X=$FE if CS.
bcc .get_lo_8
dex
bcs .get_lo_8 ; Always CS from previous BCC.
.get_13_16_rep: asl
bcc .get_13_bits ; Shares code with 5-bit path.
.get_16_rep: bmi .lz_length ; Repeat previous offset.
.get_16_bits: jsr .get_byte ; Get hi-byte of offset.
.set_hi_8: tax
.get_lo_8: !if LZSA_SMALL_SIZE {
jsr .get_byte ; Get lo-byte of offset.
} else {
lda (lzsa_srcptr),y ; Get lo-byte of offset.
inc <lzsa_srcptr + 0
bne .set_offset
inc <lzsa_srcptr + 1
}
.set_offset: sta <lzsa_offset + 0 ; Save new offset.
stx <lzsa_offset + 1
.lz_length: ldx #1 ; Hi-byte of length+256.
lda <lzsa_cmdbuf ; Extract match length (MMM).
and #$07
clc
adc #$02 ; Add minimum match length.
cmp #$09 ; Extended length?
bcc .got_lz_len
jsr .get_length ; X=1 for match, returns CC.
inx ; Hi-byte of length+256.
.got_lz_len: eor #$FF ; Negate the lo-byte of length.
tay
eor #$FF
.get_lz_dst: adc <lzsa_dstptr + 0 ; Calc address of partial page.
sta <lzsa_dstptr + 0 ; Always CC from previous CMP.
iny
bcs .get_lz_win
beq .get_lz_win ; Is lo-byte of length zero?
dec <lzsa_dstptr + 1
.get_lz_win: clc ; Calc address of match.
adc <lzsa_offset + 0 ; N.B. Offset is negative!
sta <lzsa_winptr + 0
lda <lzsa_dstptr + 1
adc <lzsa_offset + 1
sta <lzsa_winptr + 1
.lz_byte: lda (lzsa_winptr),y ; Copy match bytes until Y
sta (lzsa_dstptr),y ; wraps around to zero.
iny
bne .lz_byte
inc <lzsa_dstptr + 1
dex ; Any full pages left to copy?
bne .lz_more
jmp .cp_length ; Loop around to the beginning.
.lz_more: inc <lzsa_winptr + 1 ; Unlikely, so can be slow.
bne .lz_byte ; Always true!
;
; Lookup tables to differentiate literal and match lengths.
;
.nibl_len_tbl: !byte 3 ; 0+3 (for literal).
!byte 9 ; 2+7 (for match).
.byte_len_tbl: !byte 18 - 1 ; 0+3+15 - CS (for literal).
!byte 24 - 1 ; 2+7+15 - CS (for match).
;
; Get 16-bit length in X:A register pair, return with CC.
;
; N.B. X is the table index on entry: 0 for literal, 1 for match.
;
.get_length: jsr .get_nibble
cmp #$0F ; Extended length?
bcs .byte_length
adc .nibl_len_tbl,x ; Always CC from previous CMP.
.got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit
rts ; lengths.
.byte_length: jsr .get_byte ; So rare, this can be slow!
adc .byte_len_tbl,x ; Always CS from previous CMP.
bcc .got_length
beq .finished
.word_length: clc ; MUST return CC!
jsr .get_byte ; So rare, this can be slow!
pha ; Save length-lo.
jsr .get_byte ; So rare, this can be slow!
tax ; Length-hi.
pla ; Length-lo.
bne .got_word ; Check for zero lo-byte.
dex ; Do one less page loop if so.
.got_word: rts
.get_byte: lda (lzsa_srcptr),y ; Subroutine version for when
inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
bne .got_byte
inc <lzsa_srcptr + 1
.got_byte: rts
.finished: pla ; Decompression completed, pop
pla ; return address.
rts
;
; Get a nibble value from compressed data in A.
;
.get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble.
bcs .got_nibble
inc <lzsa_nibflg ; Reset the flag.
!if LZSA_SMALL_SIZE {
jsr .get_byte
} else {
lda (lzsa_srcptr),y ; Read 2 nibbles, inc src ptr.
inc <lzsa_srcptr + 0
bne .set_nibble
inc <lzsa_srcptr + 1
}
.set_nibble: sta <lzsa_nibble ; Preserve for next time.
lsr ; Extract the hi-nibble.
lsr
lsr
lsr
.got_nibble: and #$0F
rts

View File

@ -1,170 +1,270 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
; NOTE: self-modifying code -- the $AAAA operands of LDA/STA and the
; OFFSHI immediate are patched at run time via the *+1/*+2 equates.
DECOMPRESS_LZSA1
LDY #$00
DECODE_TOKEN
JSR GETSRC ; read token byte: O|LLL|MMMM
PHA ; preserve token on stack
AND #$70 ; isolate literals count
BEQ NO_LITERALS ; skip if no literals to copy
LSR A ; shift literals count into place
LSR A
LSR A
LSR A
CMP #$07 ; LITERALS_RUN_LEN?
BCC PREPARE_COPY_LITERALS ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$F9 ; (LITERALS_RUN_LEN)
BCC PREPARE_COPY_LITERALS
BEQ LARGE_VARLEN_LITERALS ; if adding up to zero, go grab 16-bit count
JSR GETSRC ; get single extended byte of variable literals count
INY ; add 256 to literals count
BCS PREPARE_COPY_LITERALS ; (*like JMP PREPARE_COPY_LITERALS but shorter)
LARGE_VARLEN_LITERALS ; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
BYTE $A9 ; mask TAX (faster than BCS)
; ($A9 = LDA # opcode: swallows the next
; TAX opcode byte as its operand)
PREPARE_COPY_LITERALS
TAX ; count lo-byte in X
INY ; count hi-byte + 1 in Y for DEY loop
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
BMI GET_LONG_OFFSET ; $80: 16 bit offset
JSR GETSRC ; get 8 bit offset from stream in A
TAX ; save for later
LDA #$0FF ; high 8 bits ($0FF == $FF)
BNE GOT_OFFSET ; go prepare match
; (*like JMP GOT_OFFSET but shorter)
SHORT_VARLEN_MATCHLEN
JSR GETSRC ; get single extended byte of variable match len
INY ; add 256 to match length
PREPARE_COPY_MATCH
TAX
PREPARE_COPY_MATCH_Y
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
INC COPY_MATCH_LOOP+1 ; step the (patched) source address
BNE GETMATCH_DONE
INC COPY_MATCH_LOOP+2
GETMATCH_DONE
JSR PUTDST ; copy to destination
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
BEQ DECODE_TOKEN ; (*like JMP DECODE_TOKEN but shorter)
GET_LONG_OFFSET ; handle 16 bit offset:
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
GOT_OFFSET
STA OFFSHI ; store high 8 bits of offset
TXA
CLC ; add dest + match offset
ADC PUTDST+1 ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
PLA ; retrieve token from stack again
AND #$0F ; isolate match len (MMMM)
ADC #$02 ; plus carry which is always set by the high ADC
CMP #$12 ; MATCH_RUN_LEN?
BCC PREPARE_COPY_MATCH ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
BCC PREPARE_COPY_MATCH
BNE SHORT_VARLEN_MATCHLEN
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
; large match length with zero high byte?
BNE PREPARE_COPY_MATCH_Y ; if not, continue
DECOMPRESSION_DONE
RTS
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA ; write byte (operand patched above)
INC PUTDST+1
BNE PUTDST_DONE
INC PUTDST+2
PUTDST_DONE
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA ; read byte (operand patched above)
INC GETSRC+1
BNE GETSRC_DONE
INC GETSRC+2
GETSRC_DONE
RTS
; -----------------------------------------------------------------------------
; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
;
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
; NOTE: self-modifying code -- the $AAAA operands of LDA/STA and the
; OFFSLO/OFFSHI immediates are patched at run time via the *+1/*+2 equates.
DECOMPRESS_LZSA1
LDY #$00
DECODE_TOKEN
JSR GETSRC ; read token byte: O|LLL|MMMM
PHA ; preserve token on stack
AND #$70 ; isolate literals count
BEQ NO_LITERALS ; skip if no literals to copy
LSR ; shift literals count into place
LSR
LSR
LSR
CMP #$07 ; LITERALS_RUN_LEN?
BCC PREPARE_COPY_LITERALS ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$F9 ; (LITERALS_RUN_LEN)
BCC PREPARE_COPY_LITERALS
BEQ LARGE_VARLEN_LITERALS ; if adding up to zero, go grab 16-bit count
JSR GETSRC ; get single extended byte of variable literals count
INY ; add 256 to literals count
BCS PREPARE_COPY_LITERALS ; (*like JMP PREPARE_COPY_LITERALS but shorter)
LARGE_VARLEN_LITERALS ; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
TXA
PREPARE_COPY_LITERALS
TAX ; count lo-byte in X
BEQ COPY_LITERALS ; lo == 0: first DEX wraps to $FF,
; so skip the extra outer iteration
INY
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
BMI GET_LONG_OFFSET ; $80: 16 bit offset
JSR GETSRC ; get 8 bit offset from stream in A
TAX ; save for later
LDA #$FF ; high 8 bits
BNE GOT_OFFSET ; go prepare match
; (*like JMP GOT_OFFSET but shorter)
SHORT_VARLEN_MATCHLEN
JSR GETSRC ; get single extended byte of variable match len
INY ; add 256 to match length
PREPARE_COPY_MATCH
TAX
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP ; lo == 0: skip the extra outer iteration
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
JSR PUTDST ; copy to destination
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
LDA COPY_MATCH_LOOP+1
BNE GETMATCH_DONE
DEC COPY_MATCH_LOOP+2
GETMATCH_DONE
DEC COPY_MATCH_LOOP+1
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1
BNE GETMATCH_DONE
INC COPY_MATCH_LOOP+2
GETMATCH_DONE
}
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
BEQ DECODE_TOKEN ; (*like JMP DECODE_TOKEN but shorter)
GET_LONG_OFFSET ; handle 16 bit offset:
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
GOT_OFFSET
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - subtract match offset
STA OFFSHI ; store high 8 bits of offset
STX OFFSLO
SEC ; subtract dest - match offset
LDA PUTDST+1
OFFSLO = *+1
SBC #$AA ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
LDA PUTDST+2
OFFSHI = *+1
SBC #$AA ; high 8 bits
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
SEC ; NOTE(review): presumably forces C=1 for the
; ADC #$02 below, mirroring the forward path
} else {
; Forward decompression - add match offset
STA OFFSHI ; store high 8 bits of offset
TXA
CLC ; add dest + match offset
ADC PUTDST+1 ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
}
PLA ; retrieve token from stack again
AND #$0F ; isolate match len (MMMM)
ADC #$02 ; plus carry which is always set by the high ADC
CMP #$12 ; MATCH_RUN_LEN?
BCC PREPARE_COPY_MATCH ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
BCC PREPARE_COPY_MATCH
BNE SHORT_VARLEN_MATCHLEN
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
; large match length with zero high byte?
BNE PREPARE_COPY_MATCH_Y ; if not, continue
DECOMPRESSION_DONE
RTS
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA ; write byte (operand patched above)
LDA PUTDST+1
BNE PUTDST_DONE
DEC PUTDST+2
PUTDST_DONE
DEC PUTDST+1
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA ; read byte (operand patched above)
PHA ; preserve data byte while the
LDA GETSRC+1 ; pointer is decremented
BNE GETSRC_DONE
DEC GETSRC+2
GETSRC_DONE
DEC GETSRC+1
PLA
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA ; write byte (operand patched above)
INC PUTDST+1
BNE PUTDST_DONE
INC PUTDST+2
PUTDST_DONE
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA ; read byte (operand patched above)
INC GETSRC+1
BNE GETSRC_DONE
INC GETSRC+2
GETSRC_DONE
RTS
}

View File

@ -1,239 +1,332 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA2 block.
; Create one with lzsa -r -f2 <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
NIBCOUNT = $FC ; zero-page location for temp offset
; NOTE: self-modifying code -- the $AAAA operands and the OFFSLO/OFFSHI/
; NIBBLES immediates are patched at run time via the *+1/*+2 equates.
DECOMPRESS_LZSA2
LDY #$00
STY NIBCOUNT
DECODE_TOKEN
JSR GETSRC ; read token byte: XYZ|LL|MMM
PHA ; preserve token on stack
AND #$18 ; isolate literals count (LL)
BEQ NO_LITERALS ; skip if no literals to copy
LSR A ; shift literals count into place
LSR A
LSR A
CMP #$03 ; LITERALS_RUN_LEN_V2?
BCC PREPARE_COPY_LITERALS ; if less, count is directly embedded in token
JSR GETNIBBLE ; get extra literals length nibble
; add nibble to len from token
ADC #$02 ; (LITERALS_RUN_LEN_V2) minus carry
CMP #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
BCC PREPARE_COPY_LITERALS ; if less, literals count is complete
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; overflow?
PREPARE_COPY_LITERALS
TAX
BCC PREPARE_COPY_LITERALS_HIGH ; if not, literals count is complete
; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
PREPARE_COPY_LITERALS_HIGH
INY
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
ASL
BCS REPMATCH_OR_LARGE_OFFSET ; 1YZ: rep-match or 13/16 bit offset
ASL ; 0YZ: 5 or 9 bit offset
BCS OFFSET_9_BIT
; 00Z: 5 bit offset
LDX #$0FF ; set offset bits 15-8 to 1 ($0FF == $FF)
JSR GETCOMBINEDBITS ; rotate Z bit into bit 0, read nibble for bits 4-1
ORA #$E0 ; set bits 7-5 to 1
BNE GOT_OFFSET_LO ; go store low byte of match offset and prepare match
OFFSET_9_BIT ; 01Z: 9 bit offset
;;ASL ; shift Z (offset bit 8) in place
ROL
ROL
AND #$01
EOR #$FF ; set offset bits 15-9 to 1
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_LARGE_OFFSET
ASL ; 13 bit offset?
BCS REPMATCH_OR_16_BIT ; handle rep-match or 16-bit offset if not
; 10Z: 13 bit offset
JSR GETCOMBINEDBITS ; rotate Z bit into bit 8, read nibble for bits 12-9
ADC #$DE ; set bits 15-13 to 1 and subtract 2 (to subtract 512)
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_16_BIT ; rep-match or 16 bit offset
;;ASL ; XYZ=111?
BMI REP_MATCH ; reuse previous offset if so (rep-match)
; 110: handle 16 bit offset
JSR GETSRC ; grab high 8 bits
GOT_OFFSET_HI
TAX
JSR GETSRC ; grab low 8 bits
GOT_OFFSET_LO
STA OFFSLO ; store low byte of match offset
STX OFFSHI ; store high byte of match offset
REP_MATCH
CLC ; add dest + match offset
LDA PUTDST+1 ; low 8 bits
OFFSLO = *+1
ADC #$AA ; (patched: offset lo-byte)
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
PLA ; retrieve token from stack again
AND #$07 ; isolate match len (MMM)
ADC #$01 ; add MIN_MATCH_SIZE_V2 and carry
CMP #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
BCC PREPARE_COPY_MATCH ; if less, length is directly embedded in token
JSR GETNIBBLE ; get extra match length nibble
; add nibble to len from token
ADC #$08 ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
CMP #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
BCC PREPARE_COPY_MATCH ; if less, match length is complete
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$E8 ; overflow?
PREPARE_COPY_MATCH
TAX
BCC PREPARE_COPY_MATCH_Y ; if not, the match length is complete
BEQ DECOMPRESSION_DONE ; if EOD code, bail
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
PREPARE_COPY_MATCH_Y
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
INC COPY_MATCH_LOOP+1 ; step the (patched) source address
BNE GETMATCH_DONE
INC COPY_MATCH_LOOP+2
GETMATCH_DONE
JSR PUTDST ; copy to destination
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
JMP DECODE_TOKEN
GETCOMBINEDBITS
EOR #$80
ASL
PHP ; save Z bit for after the nibble read
JSR GETNIBBLE ; get nibble into bits 0-3 (for offset bits 1-4)
PLP ; merge Z bit as the carry bit (for offset bit 0)
COMBINEDBITZ
ROL ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
DECOMPRESSION_DONE
RTS
GETNIBBLE
NIBBLES = *+1
LDA #$AA ; (patched: buffered nibble pair)
LSR NIBCOUNT
BCS HAS_NIBBLES
INC NIBCOUNT
JSR GETSRC ; get 2 nibbles
STA NIBBLES
LSR
LSR
LSR
LSR
SEC
HAS_NIBBLES
AND #$0F ; isolate low 4 bits of nibble
RTS
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA ; write byte (operand patched above)
INC PUTDST+1
BNE PUTDST_DONE
INC PUTDST+2
PUTDST_DONE
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA ; read byte (operand patched above)
INC GETSRC+1
BNE GETSRC_DONE
INC GETSRC+2
GETSRC_DONE
RTS
; -----------------------------------------------------------------------------
; Decompress raw LZSA2 block.
; Create one with lzsa -r -f2 <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
;
; out:
; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI contain the last decompressed byte address, -1
;
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
NIBCOUNT = $FC ; zero-page location for temp offset
; NOTE: self-modifying code -- the $AAAA operands and the OFFSLO/OFFSHI/
; NIBBLES immediates are patched at run time via the *+1/*+2 equates.
DECOMPRESS_LZSA2
LDY #$00
STY NIBCOUNT
DECODE_TOKEN
JSR GETSRC ; read token byte: XYZ|LL|MMM
PHA ; preserve token on stack
AND #$18 ; isolate literals count (LL)
BEQ NO_LITERALS ; skip if no literals to copy
LSR ; shift literals count into place
LSR
LSR
CMP #$03 ; LITERALS_RUN_LEN_V2?
BCC PREPARE_COPY_LITERALS ; if less, count is directly embedded in token
JSR GETNIBBLE ; get extra literals length nibble
; add nibble to len from token
ADC #$02 ; (LITERALS_RUN_LEN_V2) minus carry
CMP #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
BCC PREPARE_COPY_LITERALS ; if less, literals count is complete
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; overflow?
PREPARE_COPY_LITERALS
TAX
BCC PREPARE_COPY_LITERALS_HIGH ; if not, literals count is complete
; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
PREPARE_COPY_LITERALS_HIGH
TXA
BEQ COPY_LITERALS ; lo == 0: first DEX wraps to $FF,
; so skip the extra outer iteration
INY
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
ASL
BCS REPMATCH_OR_LARGE_OFFSET ; 1YZ: rep-match or 13/16 bit offset
ASL ; 0YZ: 5 or 9 bit offset
BCS OFFSET_9_BIT
; 00Z: 5 bit offset
LDX #$FF ; set offset bits 15-8 to 1
JSR GETCOMBINEDBITS ; rotate Z bit into bit 0, read nibble for bits 4-1
ORA #$E0 ; set bits 7-5 to 1
BNE GOT_OFFSET_LO ; go store low byte of match offset and prepare match
OFFSET_9_BIT ; 01Z: 9 bit offset
ROL ; carry: Z bit; A: xxxxxxx1 (carry known set from BCS OFFSET_9_BIT)
ADC #$00 ; if Z bit is set, add 1 to A (bit 0 of A is now 0), otherwise bit 0 is 1
ORA #$FE ; set offset bits 15-9 to 1. reversed Z is already in bit 0
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_LARGE_OFFSET
ASL ; 13 bit offset?
BCS REPMATCH_OR_16_BIT ; handle rep-match or 16-bit offset if not
; 10Z: 13 bit offset
JSR GETCOMBINEDBITS ; rotate Z bit into bit 8, read nibble for bits 12-9
ADC #$DE ; set bits 15-13 to 1 and subtract 2 (to subtract 512)
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_16_BIT ; rep-match or 16 bit offset
BMI REP_MATCH ; reuse previous offset if so (rep-match)
; 110: handle 16 bit offset
JSR GETSRC ; grab high 8 bits
GOT_OFFSET_HI
TAX
JSR GETSRC ; grab low 8 bits
GOT_OFFSET_LO
STA OFFSLO ; store low byte of match offset
STX OFFSHI ; store high byte of match offset
REP_MATCH
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - subtract match offset
SEC ; subtract dest - match offset
LDA PUTDST+1 ; low 8 bits
OFFSLO = *+1
SBC #$AA ; (patched: offset lo-byte)
STA COPY_MATCH_LOOP+1 ; store back reference address
LDA PUTDST+2
OFFSHI = *+1
SBC #$AA ; high 8 bits
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
SEC ; NOTE(review): presumably forces C=1 for the
; ADC #$01 below, mirroring the forward path
} else {
; Forward decompression - add match offset
CLC ; add dest + match offset
LDA PUTDST+1 ; low 8 bits
OFFSLO = *+1
ADC #$AA ; (patched: offset lo-byte)
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
}
PLA ; retrieve token from stack again
AND #$07 ; isolate match len (MMM)
ADC #$01 ; add MIN_MATCH_SIZE_V2 and carry
CMP #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
BCC PREPARE_COPY_MATCH ; if less, length is directly embedded in token
JSR GETNIBBLE ; get extra match length nibble
; add nibble to len from token
ADC #$08 ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
CMP #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
BCC PREPARE_COPY_MATCH ; if less, match length is complete
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$E8 ; overflow?
PREPARE_COPY_MATCH
TAX
BCC PREPARE_COPY_MATCH_Y ; if not, the match length is complete
BEQ DECOMPRESSION_DONE ; if EOD code, bail
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP ; lo == 0: skip the extra outer iteration
INY
COPY_MATCH_LOOP
LDA $AAAA ; get one byte of backreference
JSR PUTDST ; copy to destination
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
LDA COPY_MATCH_LOOP+1
BNE GETMATCH_DONE
DEC COPY_MATCH_LOOP+2
GETMATCH_DONE
DEC COPY_MATCH_LOOP+1
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1
BNE GETMATCH_DONE
INC COPY_MATCH_LOOP+2
GETMATCH_DONE
}
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
JMP DECODE_TOKEN
GETCOMBINEDBITS
EOR #$80
ASL
PHP ; save Z bit for after the nibble read
JSR GETNIBBLE ; get nibble into bits 0-3 (for offset bits 1-4)
PLP ; merge Z bit as the carry bit (for offset bit 0)
ROL ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
DECOMPRESSION_DONE
RTS
GETNIBBLE
NIBBLES = *+1
LDA #$AA ; (patched: buffered nibble pair)
LSR NIBCOUNT
BCS HAS_NIBBLES
INC NIBCOUNT
JSR GETSRC ; get 2 nibbles
STA NIBBLES
LSR
LSR
LSR
LSR
SEC
HAS_NIBBLES
AND #$0F ; isolate low 4 bits of nibble
RTS
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA ; write byte (operand patched above)
LDA PUTDST+1
BNE PUTDST_DONE
DEC PUTDST+2
PUTDST_DONE
DEC PUTDST+1
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA ; read byte (operand patched above)
PHA ; preserve data byte while the
LDA GETSRC+1 ; pointer is decremented
BNE GETSRC_DONE
DEC GETSRC+2
GETSRC_DONE
DEC GETSRC+1
PLA
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT
JSR GETSRC
PUTDST
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
STA $AAAA ; write byte (operand patched above)
INC PUTDST+1
BNE PUTDST_DONE
INC PUTDST+2
PUTDST_DONE
RTS
GETLARGESRC
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LDA $AAAA ; read byte (operand patched above)
INC GETSRC+1
BNE GETSRC_DONE
INC GETSRC+2
GETSRC_DONE
RTS
}

281
asm/65816/decompress_v1.asm Normal file
View File

@ -0,0 +1,281 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI/LZSA_SRC_BANK contain the compressed raw block address
; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the destination buffer address
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI/LZSA_SRC_BANK must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the last decompressed byte address, -1
;
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019-2020 Emmanuel Marty, Peter Ferrie
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
!cpu 65816
DECOMPRESS_LZSA1 ; entry point: decompress one raw LZSA1 block
SEP #$30 ; 8-bit accumulator and index registers
!as
!rs
LDY #$00 ; clear high byte of counts
DECODE_TOKEN
JSR GETSRC ; read token byte: O|LLL|MMMM
PHA ; preserve token on stack
AND #$70 ; isolate literals count
BEQ NO_LITERALS ; skip if no literals to copy
CMP #$70 ; LITERALS_RUN_LEN?
BNE PREPARE_COPY_LITERALS ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$F9 ; (LITERALS_RUN_LEN)
BCC PREPARE_COPY_LITERALS_DIRECT
BEQ LARGE_VARLEN_LITERALS ; if adding up to zero, go grab 16-bit count
JSR GETSRC ; get single extended byte of variable literals count
INY ; add 256 to literals count
BCS PREPARE_COPY_LITERALS_DIRECT ; (*like JMP PREPARE_COPY_LITERALS_DIRECT but shorter)
LARGE_VARLEN_LITERALS ; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
TXA
BCS PREPARE_COPY_LARGE_LITERALS ; (*like JMP PREPARE_COPY_LITERALS_DIRECT but shorter)
PREPARE_COPY_LITERALS
TAX
LDA SHIFT_TABLE-1,X ; shift literals length into place
; -1 because position 00 is reserved
PREPARE_COPY_LITERALS_DIRECT
TAX
PREPARE_COPY_LARGE_LITERALS
BEQ COPY_LITERALS ; if low byte is zero, Y already holds the page count
INY ; otherwise bias page count for the partial page
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
BMI GET_LONG_OFFSET ; $80: 16 bit offset
JSR GETSRC ; get 8 bit offset from stream in A
TAX ; save for later
LDA #$FF ; high 8 bits
BNE GOT_OFFSET ; go prepare match
; (*like JMP GOT_OFFSET but shorter)
SHORT_VARLEN_MATCHLEN
JSR GETSRC ; get single extended byte of variable match len
INY ; add 256 to match length
PREPARE_COPY_MATCH
TAX
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP ; if low byte is zero, Y already holds the page count
INY ; otherwise bias page count for the partial page
COPY_MATCH_LOOP
LDA $AAAAAA ; get one byte of backreference (operand is patched with the match address)
JSR PUTDST ; copy to destination
REP #$20 ; 16-bit accumulator for pointer arithmetic
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
DEC COPY_MATCH_LOOP+1 ; self-modify: step the 16-bit read address down
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1 ; self-modify: step the 16-bit read address up
}
SEP #$20 ; back to 8-bit accumulator
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
BEQ DECODE_TOKEN ; (*like JMP DECODE_TOKEN but shorter)
GET_LONG_OFFSET ; handle 16 bit offset:
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
GOT_OFFSET
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - subtract match offset
STA OFFSHI ; store high 8 bits of offset
STX OFFSLO
SEC ; subtract dest - match offset
REP #$20 ; 16-bit accumulator
!al
LDA PUTDST+1
OFFSLO = *+1
OFFSHI = *+2
SBC #$AAAA ; 16 bits (operand patched with the offset just stored)
STA COPY_MATCH_LOOP+1 ; store back reference address
SEP #$20 ; back to 8-bit accumulator
!as
SEC
} else {
; Forward decompression - add match offset
STA OFFSHI ; store high 8 bits of offset
TXA
CLC ; add dest + match offset
ADC PUTDST+1 ; low 8 bits
STA COPY_MATCH_LOOP+1 ; store back reference address
OFFSHI = *+1
LDA #$AA ; high 8 bits (operand patched with the offset's high byte)
ADC PUTDST+2
STA COPY_MATCH_LOOP+2 ; store high 8 bits of address
}
LDA PUTDST+3 ; bank
STA COPY_MATCH_LOOP+3 ; store back reference address
PLA ; retrieve token from stack again
AND #$0F ; isolate match len (MMMM)
ADC #$02 ; plus carry which is always set by the high ADC
CMP #$12 ; MATCH_RUN_LEN?
BCC PREPARE_COPY_MATCH ; if not, count is directly embedded in token
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; add MATCH_RUN_LEN and MIN_MATCH_SIZE to match length
BCC PREPARE_COPY_MATCH
BNE SHORT_VARLEN_MATCHLEN
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
; large match length with zero high byte?
BNE PREPARE_COPY_MATCH_Y ; if not, continue
DECOMPRESSION_DONE
RTS
SHIFT_TABLE ; lookup table: maps (token AND $70)-1 to literals count, replacing 4 shifts
!BYTE $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00
!BYTE $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01
!BYTE $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02
!BYTE $03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03
!BYTE $04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04,$04
!BYTE $05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05,$05
!BYTE $06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06,$06
!BYTE $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT ; read one byte from the source, then store it at the destination
JSR GETSRC
PUTDST ; store A at the (self-modified) 24-bit destination address, then step it
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
LZSA_DST_BANK = *+3
STA $AAAAAA ; operand is patched: holds the current destination pointer
REP #$20 ; 16-bit accumulator
DEC PUTDST+1 ; decrement the 16-bit destination pointer (bank byte unchanged)
SEP #$20 ; back to 8-bit accumulator
RTS
GETLARGESRC ; read a 16-bit little-endian value: low byte in X, high byte in A
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC ; read one byte from the (self-modified) 24-bit source address into A
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LZSA_SRC_BANK = *+3
LDA $AAAAAA ; operand is patched: holds the current source pointer
REP #$20 ; 16-bit accumulator
DEC GETSRC+1 ; decrement the 16-bit source pointer (bank byte unchanged)
SEP #$20 ; back to 8-bit accumulator
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT ; read one byte from the source, then store it at the destination
JSR GETSRC
PUTDST ; store A at the (self-modified) 24-bit destination address, then step it
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
LZSA_DST_BANK = *+3
STA $AAAAAA ; operand is patched: holds the current destination pointer
REP #$20 ; 16-bit accumulator
INC PUTDST+1 ; increment the 16-bit destination pointer (bank byte unchanged)
SEP #$20 ; back to 8-bit accumulator
RTS
GETLARGESRC ; read a 16-bit little-endian value: low byte in X, high byte in A
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC ; read one byte from the (self-modified) 24-bit source address into A
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LZSA_SRC_BANK = *+3
LDA $AAAAAA ; operand is patched: holds the current source pointer
REP #$20 ; 16-bit accumulator
INC GETSRC+1 ; increment the 16-bit source pointer (bank byte unchanged)
SEP #$20 ; back to 8-bit accumulator
RTS
}

338
asm/65816/decompress_v2.asm Normal file
View File

@ -0,0 +1,338 @@
; -----------------------------------------------------------------------------
; Decompress raw LZSA2 block.
; Create one with lzsa -r -f2 <original_file> <compressed_file>
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI/LZSA_SRC_BANK contain the compressed raw block address
; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the destination buffer address
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the last decompressed byte address, +1
;
; -----------------------------------------------------------------------------
; Backward decompression is also supported, use lzsa -r -b -f2 <original_file> <compressed_file>
; To use it, also define BACKWARD_DECOMPRESS=1 before including this code!
;
; in:
; * LZSA_SRC_LO/LZSA_SRC_HI/LZSA_SRC_BANK must contain the address of the last byte of compressed data
; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK must contain the address of the last byte of the destination buffer
;
; out:
; * LZSA_DST_LO/LZSA_DST_HI/LZSA_DST_BANK contain the last decompressed byte address, -1
;
; -----------------------------------------------------------------------------
;
; Copyright (C) 2019-2020 Emmanuel Marty, Peter Ferrie
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; -----------------------------------------------------------------------------
!cpu 65816
NIBCOUNT = $FC ; zero-page location for temp offset
DECOMPRESS_LZSA2 ; entry point: decompress one raw LZSA2 block
SEP #$30 ; 8-bit accumulator and index registers
!as
!rs
LDY #$00
STY NIBCOUNT ; no nibble buffered yet
DECODE_TOKEN
JSR GETSRC ; read token byte: XYZ|LL|MMM
PHA ; preserve token on stack
AND #$18 ; isolate literals count (LL)
BEQ NO_LITERALS ; skip if no literals to copy
CMP #$18 ; LITERALS_RUN_LEN_V2?
BCC PREPARE_COPY_LITERALS ; if less, count is directly embedded in token
JSR GETNIBBLE ; get extra literals length nibble
; add nibble to len from token
ADC #$02 ; (LITERALS_RUN_LEN_V2) minus carry
CMP #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
BCC PREPARE_COPY_LITERALS_DIRECT ; if less, literals count is complete
JSR GETSRC ; get extra byte of variable literals count
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$EE ; overflow?
BRA PREPARE_COPY_LITERALS_DIRECT
PREPARE_COPY_LITERALS_LARGE
; handle 16 bits literals count
; literals count = directly these 16 bits
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
BCS PREPARE_COPY_LITERALS_HIGH ; (*same as JMP PREPARE_COPY_LITERALS_HIGH but shorter)
PREPARE_COPY_LITERALS
LSR ; shift literals count into place
LSR
LSR
PREPARE_COPY_LITERALS_DIRECT
TAX
BCS PREPARE_COPY_LITERALS_LARGE ; if so, literals count is large
PREPARE_COPY_LITERALS_HIGH
TXA
BEQ COPY_LITERALS ; if low byte is zero, Y already holds the page count
INY ; otherwise bias page count for the partial page
COPY_LITERALS
JSR GETPUT ; copy one byte of literals
DEX
BNE COPY_LITERALS
DEY
BNE COPY_LITERALS
NO_LITERALS
PLA ; retrieve token from stack
PHA ; preserve token again
ASL ; shift token's X bit into the carry
BCS REPMATCH_OR_LARGE_OFFSET ; 1YZ: rep-match or 13/16 bit offset
ASL ; 0YZ: 5 or 9 bit offset
BCS OFFSET_9_BIT
; 00Z: 5 bit offset
LDX #$FF ; set offset bits 15-8 to 1
JSR GETCOMBINEDBITS ; rotate Z bit into bit 0, read nibble for bits 4-1
ORA #$E0 ; set bits 7-5 to 1
BNE GOT_OFFSET_LO ; go store low byte of match offset and prepare match
OFFSET_9_BIT ; 01Z: 9 bit offset
;;ASL ; shift Z (offset bit 8) in place
ROL
ROL
AND #$01
EOR #$FF ; set offset bits 15-9 to 1
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_LARGE_OFFSET
ASL ; 13 bit offset?
BCS REPMATCH_OR_16_BIT ; handle rep-match or 16-bit offset if not
; 10Z: 13 bit offset
JSR GETCOMBINEDBITS ; rotate Z bit into bit 8, read nibble for bits 12-9
ADC #$DE ; set bits 15-13 to 1 and subtract 2 (to subtract 512)
BNE GOT_OFFSET_HI ; go store high byte, read low byte of match offset and prepare match
; (*same as JMP GOT_OFFSET_HI but shorter)
REPMATCH_OR_16_BIT ; rep-match or 16 bit offset
;;ASL ; XYZ=111?
BMI REP_MATCH ; reuse previous offset if so (rep-match)
; 110: handle 16 bit offset
JSR GETSRC ; grab high 8 bits
GOT_OFFSET_HI
TAX
JSR GETSRC ; grab low 8 bits
GOT_OFFSET_LO
STA OFFSLO ; store low byte of match offset
STX OFFSHI ; store high byte of match offset
REP_MATCH ; offset untouched here: rep-match reuses the stored OFFSLO/OFFSHI
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression - subtract match offset
SEC ; subtract dest - match offset
REP #$20 ; 16-bit accumulator
!al
LDA PUTDST+1 ; 16 bits
OFFSLO = *+1
OFFSHI = *+2
SBC #$AAAA ; operand patched with the stored offset
STA COPY_MATCH_LOOP+1 ; store back reference address
SEP #$20 ; back to 8-bit accumulator
!as
SEC
} else {
; Forward decompression - add match offset
CLC ; add dest + match offset
REP #$20 ; 16-bit accumulator
!al
LDA PUTDST+1 ; 16 bits
OFFSLO = *+1
OFFSHI = *+2
ADC #$AAAA ; operand patched with the stored offset
STA COPY_MATCH_LOOP+1 ; store back reference address
SEP #$20 ; back to 8-bit accumulator
!as
}
LDA PUTDST+3 ; bank
STA COPY_MATCH_LOOP+3 ; store back reference address
PLA ; retrieve token from stack again
AND #$07 ; isolate match len (MMM)
ADC #$01 ; add MIN_MATCH_SIZE_V2 and carry
CMP #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
BCC PREPARE_COPY_MATCH ; if less, length is directly embedded in token
JSR GETNIBBLE ; get extra match length nibble
; add nibble to len from token
ADC #$08 ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2) minus carry
CMP #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
BCC PREPARE_COPY_MATCH ; if less, match length is complete
JSR GETSRC ; get extra byte of variable match length
; the carry is always set by the CMP above
; GETSRC doesn't change it
SBC #$E8 ; overflow?
PREPARE_COPY_MATCH
TAX
BCC PREPARE_COPY_MATCH_Y ; if not, the match length is complete
BEQ DECOMPRESSION_DONE ; if EOD code, bail
; Handle 16 bits match length
JSR GETLARGESRC ; grab low 8 bits in X, high 8 bits in A
TAY ; put high 8 bits in Y
PREPARE_COPY_MATCH_Y
TXA
BEQ COPY_MATCH_LOOP ; if low byte is zero, Y already holds the page count
INY ; otherwise bias page count for the partial page
COPY_MATCH_LOOP
LDA $AAAAAA ; get one byte of backreference (operand is patched with the match address)
JSR PUTDST ; copy to destination
REP #$20 ; 16-bit accumulator for pointer arithmetic
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- put backreference bytes backward
DEC COPY_MATCH_LOOP+1 ; self-modify: step the 16-bit read address down
} else {
; Forward decompression -- put backreference bytes forward
INC COPY_MATCH_LOOP+1 ; self-modify: step the 16-bit read address up
}
SEP #$20 ; back to 8-bit accumulator
DEX
BNE COPY_MATCH_LOOP
DEY
BNE COPY_MATCH_LOOP
JMP DECODE_TOKEN ; process next token
GETCOMBINEDBITS ; combine the token's Z bit (bit 7 of A) with a nibble into offset bits 0-4
EOR #$80 ; invert the Z bit
ASL ; shift it out; the Z flag now reflects !Z
PHP ; save that flag state across the nibble read
JSR GETNIBBLE ; get nibble into bits 0-3 (for offset bits 1-4)
PLP ; merge Z bit as the carry bit (for offset bit 0)
COMBINEDBITZ
ROL ; nibble -> bits 1-4; carry(!Z bit) -> bit 0 ; carry cleared
DECOMPRESSION_DONE
RTS
GETNIBBLE ; return the next 4-bit nibble from the stream in A (low 4 bits)
NIBBLES = *+1
LDA #$AA ; self-modified: operand holds the buffered nibble pair
LSR NIBCOUNT ; nibble already buffered? (flag kept in bit 0 of NIBCOUNT)
BCC NEED_NIBBLES
AND #$0F ; isolate low 4 bits of nibble (carry is set on this path)
RTS
NEED_NIBBLES
INC NIBCOUNT ; mark that a nibble will be buffered for the next call
JSR GETSRC ; get 2 nibbles
STA NIBBLES ; buffer both nibbles
LSR ; shift the high nibble down into bits 0-3
LSR
LSR
LSR
SEC ; both exit paths return with the carry set
RTS
!ifdef BACKWARD_DECOMPRESS {
; Backward decompression -- get and put bytes backward
GETPUT ; read one byte from the source, then store it at the destination
JSR GETSRC
PUTDST ; store A at the (self-modified) 24-bit destination address, then step it
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
LZSA_DST_BANK = *+3
STA $AAAAAA ; operand is patched: holds the current destination pointer
REP #$20 ; 16-bit accumulator
DEC PUTDST+1 ; decrement the 16-bit destination pointer (bank byte unchanged)
SEP #$20 ; back to 8-bit accumulator
RTS
GETLARGESRC ; read a 16-bit little-endian value: low byte in X, high byte in A
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC ; read one byte from the (self-modified) 24-bit source address into A
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LZSA_SRC_BANK = *+3
LDA $AAAAAA ; operand is patched: holds the current source pointer
REP #$20 ; 16-bit accumulator
DEC GETSRC+1 ; decrement the 16-bit source pointer (bank byte unchanged)
SEP #$20 ; back to 8-bit accumulator
RTS
} else {
; Forward decompression -- get and put bytes forward
GETPUT ; read one byte from the source, then store it at the destination
JSR GETSRC
PUTDST ; store A at the (self-modified) 24-bit destination address, then step it
LZSA_DST_LO = *+1
LZSA_DST_HI = *+2
LZSA_DST_BANK = *+3
STA $AAAAAA ; operand is patched: holds the current destination pointer
REP #$20 ; 16-bit accumulator
INC PUTDST+1 ; increment the 16-bit destination pointer (bank byte unchanged)
SEP #$20 ; back to 8-bit accumulator
RTS
GETLARGESRC ; read a 16-bit little-endian value: low byte in X, high byte in A
JSR GETSRC ; grab low 8 bits
TAX ; move to X
; fall through grab high 8 bits
GETSRC ; read one byte from the (self-modified) 24-bit source address into A
LZSA_SRC_LO = *+1
LZSA_SRC_HI = *+2
LZSA_SRC_BANK = *+3
LDA $AAAAAA ; operand is patched: holds the current source pointer
REP #$20 ; 16-bit accumulator
INC GETSRC+1 ; increment the 16-bit source pointer (bank byte unchanged)
SEP #$20 ; back to 8-bit accumulator
RTS
}

90
asm/6809/unlzsa1-6309.s Normal file
View File

@ -0,0 +1,90 @@
; unlzsa1-6309.s - Hitachi 6309 decompression routine for raw LZSA1 - 92 bytes
; compress with lzsa -f1 -r <original_file> <compressed_file>
;
; in: x = start of compressed data
; y = start of decompression buffer
; out: y = end of decompression buffer + 1
;
; Copyright (C) 2020 Emmanuel Marty, Doug Masten
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Decompress raw LZSA1, using the 6309's W register and TFM block-copy instruction.
decompress_lzsa1 equ lz1token
lz1bigof lda ,x+ ; O set: load MSB 16-bit (negative, signed) offset
lz1gotof leau d,y ; put backreference start address in U (dst+offset)
ldd #$000f ; clear MSB match length and set mask for MMMM
andb ,s+ ; isolate MMMM (embedded match length) in token
addb #$03 ; add MIN_MATCH_SIZE
cmpb #$12 ; MATCH_RUN_LEN?
bne lz1gotln ; no, we have the full match length, go copy
addb ,x+ ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
bcc lz1gotln ; if no overflow, we have the full length
bne lz1midln
ldb ,x+ ; load 16-bit len in D (low part in B, high in A)
lda ,x+ ; (little endian)
bne lz1gotln ; check if we hit EOD (16-bit length = 0)
tstb
bne lz1gotln ; go copy matched bytes if not
rts ; done, bail
lz1midln tfr b,a ; copy high part of len into A
ldb ,x+ ; grab low 8 bits of len in B
lz1gotln tfr d,w ; set W with match length for TFM instruction
tfm u+,y+ ; copy match bytes
lz1token ldb ,x+ ; load next token into B: O|LLL|MMMM
pshs b ; save it
andb #$70 ; isolate LLL (embedded literals count) in B
beq lz1nolt ; skip if no literals
cmpb #$70 ; LITERALS_RUN_LEN?
bne lz1declt ; if not, we have the complete count, go unshift
ldb ,x+ ; load extra literals count byte
addb #$07 ; add LITERALS_RUN_LEN
bcc lz1gotla ; if no overflow, we got the complete count, copy
bne lz1midlt
ldb ,x+ ; load low 8 bits of little-endian literals count
lda ,x+ ; load high 8 bits of literal count
bra lz1gotlt ; we now have the complete count, go copy
lz1midlt tfr b,a ; copy high part of literals count into A
ldb ,x+ ; load low 8 bits of literals count
bra lz1gotlt ; we now have the complete count, go copy
lz1declt lsrb ; shift literals count into place
lsrb ; (literals count is in bits 4-6 of the token)
lsrb
lsrb
lz1gotla clra ; clear A (high part of literals count)
lz1gotlt tfr d,w ; set W with literals count for TFM instruction
tfm x+,y+ ; copy literal bytes
lz1nolt ldb ,x+ ; load either 8-bit or LSB 16-bit offset (negative, signed)
lda ,s ; get token again, don't pop it from the stack
bmi lz1bigof ; test O bit (small or large offset)
lda #$ff ; set high 8 bits
bra lz1gotof

102
asm/6809/unlzsa1.s Normal file
View File

@ -0,0 +1,102 @@
; unlzsa1.s - 6809 decompression routine for raw LZSA1 - 110 bytes
; compress with lzsa -r <original_file> <compressed_file>
;
; in: x = start of compressed data
; y = start of decompression buffer
; out: y = end of decompression buffer + 1
;
; Copyright (C) 2020 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Decompress raw LZSA1 on a plain 6809 (byte-by-byte copy loops, no TFM).
decompress_lzsa1 equ lz1token
lz1bigof lda ,x+ ; O set: load MSB 16-bit (negative, signed) offset
lz1gotof leau d,y ; put backreference start address in U (dst+offset)
ldd #$000f ; clear MSB match length and set mask for MMMM
andb ,s+ ; isolate MMMM (embedded match length) in token
addb #$03 ; add MIN_MATCH_SIZE
cmpb #$12 ; MATCH_RUN_LEN?
bne lz1gotln ; no, we have the full match length, go copy
addb ,x+ ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
bcc lz1gotln ; if no overflow, we have the full length
bne lz1midln
ldb ,x+ ; load 16-bit len in D (low part in B, high in A)
lda ,x+ ; (little endian)
bne lz1gotln ; check if we hit EOD (16-bit length = 0)
tstb
bne lz1gotln ; go copy matched bytes if not
rts ; done, bail
lz1midln tfr b,a ; copy high part of len into A
ldb ,x+ ; grab low 8 bits of len in B
lz1gotln pshs x ; save source compressed data pointer
tfr d,x ; copy match length to X
lz1cpymt lda ,u+ ; copy matched byte
sta ,y+
leax -1,x ; decrement X
bne lz1cpymt ; loop until all matched bytes are copied
puls x ; restore source compressed data pointer
lz1token ldb ,x+ ; load next token into B: O|LLL|MMMM
pshs b ; save it
andb #$70 ; isolate LLL (embedded literals count) in B
beq lz1nolt ; skip if no literals
cmpb #$70 ; LITERALS_RUN_LEN?
bne lz1declt ; if not, we have the complete count, go unshift
ldb ,x+ ; load extra literals count byte
addb #$07 ; add LITERALS_RUN_LEN
bcc lz1gotla ; if no overflow, we got the complete count, copy
bne lz1midlt
ldb ,x+ ; load low 8 bits of little-endian literals count
lda ,x+ ; load high 8 bits of literal count
bra lz1gotlt ; we now have the complete count, go copy
lz1midlt tfr b,a ; copy high part of literals count into A
ldb ,x+ ; load low 8 bits of literals count
bra lz1gotlt ; we now have the complete count, go copy
lz1declt lsrb ; shift literals count into place
lsrb ; (literals count is in bits 4-6 of the token)
lsrb
lsrb
lz1gotla clra ; clear A (high part of literals count)
lz1gotlt leau ,x ; copy literals source (X) into U for the copy loop
tfr d,x ; transfer 16-bit count into X
lz1cpylt lda ,u+ ; copy literal byte
sta ,y+
leax -1,x ; decrement X and update Z flag
bne lz1cpylt ; loop until all literal bytes are copied
leax ,u ; move the advanced source pointer back into X
lz1nolt ldb ,x+ ; load either 8-bit or LSB 16-bit offset (negative, signed)
lda ,s ; get token again, don't pop it from the stack
bmi lz1bigof ; test O bit (small or large offset)
lda #$ff ; set high 8 bits
bra lz1gotof

92
asm/6809/unlzsa1b-6309.s Normal file
View File

@ -0,0 +1,92 @@
; unlzsa1b-6309.s - H6309 backward decompressor for raw LZSA1 - 97 bytes
; compress with lzsa -f1 -r -b <original_file> <compressed_file>
;
; in: x = last byte of compressed data
; y = last byte of decompression buffer
; out: y = first byte of decompressed data
;
; Copyright (C) 2020 Emmanuel Marty, Doug Masten
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Backward-decompress raw LZSA1 using the 6309's TFM instruction (post-decrement).
decompress_lzsa1
leax 1,x ; pre-bias source so the first ,-x read hits the last byte
bra lz1token
lz1bigof ldd ,--x ; O set: load long 16-bit (negative, signed) offset
lz1gotof negd ; reverse sign of offset in D
leau d,y ; put backreference start address in U (dst+offset)
ldd #$000f ; clear MSB match length and set mask for MMMM
andb ,s+ ; isolate MMMM (embedded match length) in token
addb #$03 ; add MIN_MATCH_SIZE
cmpb #$12 ; MATCH_RUN_LEN?
bne lz1gotln ; no, we have the full match length, go copy
addb ,-x ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
bcc lz1gotln ; if no overflow, we have the full length
bne lz1midln
ldd ,--x ; load 16-bit len in D (low part in B, high in A)
bne lz1gotln ; check if we hit EOD (16-bit length = 0)
leay 1,y ; adjust pointer to first byte of decompressed data
rts ; done, bail
lz1midln tfr b,a ; copy high part of len into A
ldb ,-x ; grab low 8 bits of len in B
lz1gotln tfr d,w ; set W with match length for TFM instruction
tfm u-,y- ; copy match bytes
lz1token ldb ,-x ; load next token into B: O|LLL|MMMM
pshs b ; save it
andb #$70 ; isolate LLL (embedded literals count) in B
beq lz1nolt ; skip if no literals
cmpb #$70 ; LITERALS_RUN_LEN?
bne lz1declt ; if not, we have the complete count, go unshift
ldb ,-x ; load extra literals count byte
addb #$07 ; add LITERALS_RUN_LEN
bcc lz1gotla ; if no overflow, we got the complete count, copy
bne lz1midlt
ldd ,--x ; load 16 bit count in D (low part in B, high in A)
bra lz1gotlt ; we now have the complete count, go copy
lz1midlt tfr b,a ; copy high part of literals count into A
ldb ,-x ; load low 8 bits of literals count
bra lz1gotlt ; we now have the complete count, go copy
lz1declt lsrb ; shift literals count into place
lsrb ; (literals count is in bits 4-6 of the token)
lsrb
lsrb
lz1gotla clra ; clear A (high part of literals count)
lz1gotlt tfr d,w ; set W with literals count for TFM instruction
leax -1,x ; tfm is post-decrement
tfm x-,y- ; copy literal bytes
leax 1,x ; undo the post-decrement bias
lz1nolt ldb ,s ; get token again, don't pop it from the stack
bmi lz1bigof ; test O bit (small or large offset)
ldb ,-x ; load either 8-bit or LSB 16-bit offset (negative, signed)
lda #$ff ; set high 8 bits
bra lz1gotof

105
asm/6809/unlzsa1b.s Normal file
View File

@ -0,0 +1,105 @@
; unlzsa1b.s - 6809 backward decompression routine for raw LZSA1 - 113 bytes
; compress with lzsa -r -b <original_file> <compressed_file>
;
; in: x = last byte of compressed data
; y = last byte of decompression buffer
; out: y = first byte of decompressed data
;
; Copyright (C) 2020 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Backward-decompress raw LZSA1 on a plain 6809 (byte-by-byte copy loops).
decompress_lzsa1
leax 1,x ; pre-bias source so the first ,-x read hits the last byte
leay 1,y ; pre-bias destination likewise
bra lz1token
lz1bigof ldd ,--x ; O set: load long 16 bit (negative, signed) offset
lz1gotof nega ; reverse sign of offset in D
negb
sbca #0 ; (16-bit two's complement: negate B, then fix up A)
leau d,y ; put backreference start address in U (dst+offset)
ldd #$000f ; clear MSB match length and set mask for MMMM
andb ,s+ ; isolate MMMM (embedded match length) in token
addb #$03 ; add MIN_MATCH_SIZE
cmpb #$12 ; MATCH_RUN_LEN?
bne lz1gotln ; no, we have the full match length, go copy
addb ,-x ; add extra match length byte + MIN_MATCH_SIZE + MATCH_RUN_LEN
bcc lz1gotln ; if no overflow, we have the full length
bne lz1midln
ldd ,--x ; load 16-bit len in D (low part in B, high in A)
bne lz1gotln ; check if we hit EOD (16-bit length = 0)
rts ; done, bail
lz1midln tfr b,a ; copy high part of len into A
ldb ,-x ; grab low 8 bits of len in B
lz1gotln pshs x ; save source compressed data pointer
tfr d,x ; copy match length to X
lz1cpymt lda ,-u ; copy matched byte
sta ,-y
leax -1,x ; decrement X
bne lz1cpymt ; loop until all matched bytes are copied
puls x ; restore source compressed data pointer
lz1token ldb ,-x ; load next token into B: O|LLL|MMMM
pshs b ; save it
andb #$70 ; isolate LLL (embedded literals count) in B
beq lz1nolt ; skip if no literals
cmpb #$70 ; LITERALS_RUN_LEN?
bne lz1declt ; if not, we have the complete count, go unshift
ldb ,-x ; load extra literals count byte
addb #$07 ; add LITERALS_RUN_LEN
bcc lz1gotla ; if no overflow, we got the complete count, copy
bne lz1midlt
ldd ,--x ; load 16 bit count in D (low part in B, high in A)
bra lz1gotlt ; we now have the complete count, go copy
lz1midlt tfr b,a ; copy high part of literals count into A
ldb ,-x ; load low 8 bits of literals count
bra lz1gotlt ; we now have the complete count, go copy
lz1declt lsrb ; shift literals count into place
lsrb ; (literals count is in bits 4-6 of the token)
lsrb
lsrb
lz1gotla clra ; clear A (high part of literals count)
lz1gotlt leau ,x ; copy literals source (X) into U for the copy loop
tfr d,x ; transfer 16-bit count into X
lz1cpylt lda ,-u ; copy literal byte
sta ,-y
leax -1,x ; decrement X and update Z flag
bne lz1cpylt ; loop until all literal bytes are copied
leax ,u ; move the rewound source pointer back into X
lz1nolt ldb ,s ; get token again, don't pop it from the stack
bmi lz1bigof ; test O bit (small or large offset)
ldb ,-x ; O clear: load 8 bit (negative, signed) offset
lda #$ff ; set high 8 bits
bra lz1gotof

129
asm/6809/unlzsa2-6309.s Normal file
View File

@ -0,0 +1,129 @@
; unlzsa2-6309.s - Hitachi 6309 decompression routine for raw LZSA2 - 150 bytes
; compress with lzsa -f2 -r <original_file> <compressed_file>
;
; in: x = start of compressed data
; y = start of decompression buffer
; out: y = end of decompression buffer + 1
;
; Copyright (C) 2020 Emmanuel Marty, Doug Masten
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Decompress raw LZSA2 using the 6309's W register and TFM block-copy instruction.
decompress_lzsa2
clr lz2nibct ; reset nibble available flag
bra lz2token
lz2nibct fcb 0 ; nibble ready flag
lz2replg lslb ; push token's Y flag bit into carry
bcs lz2rep16 ; if token's Y bit is set, rep or 16 bit offset
sex ; push token's Z flag bit into reg A
bsr lz2nibl ; get offset nibble in B
lsla ; push token's Z flag bit into carry
rolb ; shift Z flag from carry into bit 0 of B
eorb #$e1 ; set bits 13-15 of offset, reverse bit 8
tfr b,a ; copy bits 8-15 of offset into A
suba #$02 ; subtract 512 from offset
ldb ,x+ ; load low 8 bits of (negative, signed) offset
bra lz2gotof
lz2rep16 bmi lz2repof ; if token's Z flag bit is set, rep match
ldd ,x++ ; load high then low 8 bits of offset
lz2gotof std lz2moff+2 ; store match offset (self-modifies the LEAU operand below)
lz2repof ldd #$0007 ; clear MSB match length and set mask for MMM
andb ,u ; isolate MMM (embedded match length) in token
lz2moff leau $aaaa,y ; put backreference start address in U (dst+offset)
addb #$02 ; add MIN_MATCH_SIZE_V2
cmpb #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
bne lz2gotln ; no, we have the full match length, go copy
bsr lz2nibl ; get offset nibble in B
addb #$09 ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2
cmpb #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
bne lz2gotln ; if not, we have the full match length, go copy
addb ,x+ ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
bcc lz2gotln ; if no overflow, we have the full length
beq lz2done ; detect EOD code
ldb ,x+ ; load 16-bit len in D (low part in B, high in A)
lda ,x+ ; (little endian)
lz2gotln tfr d,w ; set W with match count for TFM instruction
tfm u+,y+ ; copy match bytes
lz2token tfr x,u ; save token address
ldb ,x+ ; load next token into B: XYZ|LL|MMM
andb #$18 ; isolate LL (embedded literals count) in B
beq lz2nolt ; skip if no literals
cmpb #$18 ; LITERALS_RUN_LEN_V2?
bne lz2declt ; if not, we have the complete count, go unshift
bsr lz2nibl ; get extra literals length nibble in B
addb #$03 ; add LITERALS_RUN_LEN_V2
cmpb #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
bne lz2gotla ; if not, we have the full literals count, go copy
addb ,x+ ; add extra literals count byte + LITERALS_RUN_LEN + 15
bcc lz2gotla ; if no overflow, we got the complete count, copy
ldb ,x+ ; load low 8 bits of little-endian literals count
lda ,x+ ; load high 8 bits of literal count
bra lz2gotlt ; we now have the complete count, go copy
lz2declt lsrb ; shift literals count into place
lsrb ; (literals count is in bits 3-4 of the token)
lsrb
lz2gotla clra ; clear A (high part of literals count)
lz2gotlt tfr d,w ; set W with literals count for TFM instruction
tfm x+,y+ ; copy literal bytes
lz2nolt ldb ,u ; get token again
lslb ; push token's X flag bit into carry
bcs lz2replg ; if token's X bit is set, rep or large offset
lslb ; push token's Y flag bit into carry
sex ; push token's Z flag bit into reg A (carry flag is not effected)
bcs lz2offs9 ; if token's Y bit is set, 9 bits offset
bsr lz2nibl ; get offset nibble in B
lsla ; retrieve token's Z flag bit and push into carry
rolb ; shift Z flag from carry into bit 0 of B
eorb #$e1 ; set bits 5-7 of offset, reverse bit 0
sex ; set bits 8-15 of offset to $FF
bra lz2gotof
lz2offs9 deca ; set bits 9-15 of offset, reverse bit 8
ldb ,x+ ; load low 8 bits of (negative, signed) offset
bra lz2gotof
lz2nibl ldb #$aa ; self-modified: operand holds the buffered nibble
com lz2nibct ; nibble ready?
bpl lz2gotnb
ldb ,x+ ; load two nibbles
stb lz2nibl+1 ; store nibble for next time (low 4 bits)
lsrb ; shift 4 high bits of nibble down
lsrb
lsrb
lsrb
lz2gotnb andb #$0f ; only keep low 4 bits
lz2done rts

146
asm/6809/unlzsa2.s Normal file
View File

@ -0,0 +1,146 @@
; unlzsa2.s - 6809 decompression routine for raw LZSA2 - 169 bytes
; compress with lzsa -f2 -r <original_file> <compressed_file>
;
; in: x = start of compressed data
; y = start of decompression buffer
; out: y = end of decompression buffer + 1
;
; Copyright (C) 2020 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Decompress a raw LZSA2 stream, forward direction, plain 6809 (no 6309 extensions).
; in:  x = start of compressed data, y = start of output buffer
; out: y = one past the last decompressed byte
; NOTE: self-modifying (patches lz2nibl+1 and lz2repof+2); not ROM-able, not re-entrant.
decompress_lzsa2
 clr <lz2nibct,pcr ; reset nibble available flag (no buffered nibble yet)
lz2token ldb ,x+ ; load next token into B: XYZ|LL|MMM
 pshs b ; save it (re-read at lz2nolt, popped later by andb ,s+)
 andb #$18 ; isolate LL (embedded literals count) in B
 beq lz2nolt ; skip if no literals
 cmpb #$18 ; LITERALS_RUN_LEN_V2 (3, in the LL field position)?
 bne lz2declt ; if not, we have the complete count, go unshift
 bsr lz2nibl ; get extra literals length nibble in B
 addb #$03 ; add LITERALS_RUN_LEN_V2
 cmpb #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
 bne lz2gotla ; if not, we have the full literals count, go copy
 addb ,x+ ; add extra literals count byte + LITERALS_RUN_LEN + 15
 bcc lz2gotla ; if no overflow, we got the complete count, copy
 ldb ,x+ ; overflow: a 16-bit little-endian count follows; low 8 bits
 lda ,x+ ; load high 8 bits of literal count
 bra lz2gotlt ; we now have the complete count, go copy
lz2declt lsrb ; shift literals count into place (down from bits 3-4)
 lsrb
 lsrb
lz2gotla clra ; clear A (high part of literals count)
lz2gotlt leau ,x ; U = literals source (current compressed-data position)
 tfr d,x ; transfer 16-bit count into X (loop counter)
lz2cpylt lda ,u+ ; copy literal byte
 sta ,y+
 leax -1,x ; decrement X and update Z flag
 bne lz2cpylt ; loop until all literal bytes are copied
 leax ,u ; X = advanced compressed-data pointer
lz2nolt ldb ,s ; get token again, don't pop it from the stack
 lslb ; push token's X flag bit into carry
 bcs lz2replg ; if token's X bit is set, rep or large offset
 lslb ; push token's Y flag bit into carry
 sex ; push token's Z flag bit into reg A (carry flag is not affected)
 bcs lz2offs9 ; if token's Y bit is set, 9 bits offset
 bsr lz2nibl ; get offset nibble in B
 lsla ; retrieve token's Z flag bit and push into carry
 rolb ; shift Z flag from carry into bit 0 of B
 eorb #$e1 ; set bits 5-7 of offset, reverse bit 0
 sex ; set bits 8-15 of offset to $FF (offset is negative)
 bra lz2gotof
lz2offs9 deca ; set bits 9-15 of offset, reverse bit 8
 ldb ,x+ ; load low 8 bits of (negative, signed) offset
 bra lz2gotof
lz2nibct fcb $00 ; nibble ready flag ($00 = empty, $FF = one nibble buffered)
lz2nibl ldb #$aa ; $aa is a placeholder; patched below with the buffered low nibble
 com <lz2nibct,pcr ; toggle nibble ready flag; was a nibble buffered?
 bpl lz2gotnb ; yes, return the buffered one (just loaded into B)
 ldb ,x+ ; load two nibbles
 stb <lz2nibl+1,pcr ; self-modify: store nibble for next time (low 4 bits)
 lsrb ; shift 4 high bits of nibble down
 lsrb
 lsrb
 lsrb
lz2gotnb andb #$0f ; only keep low 4 bits
lz2done rts
lz2replg lslb ; push token's Y flag bit into carry
 bcs lz2rep16 ; if token's Y bit is set, rep or 16 bit offset
 sex ; push token's Z flag bit into reg A
 bsr lz2nibl ; get offset nibble in B
 lsla ; push token's Z flag bit into carry
 rolb ; shift Z flag from carry into bit 0 of B
 eorb #$e1 ; set bits 13-15 of offset, reverse bit 8
 tfr b,a ; copy bits 8-15 of offset into A
 suba #$02 ; subtract 512 from offset
 ldb ,x+ ; load low 8 bits of (negative, signed) offset
 bra lz2gotof
lz2rep16 bmi lz2repof ; if token's Z flag bit is set, rep match (reuse stored offset)
 ldd ,x++ ; load high then low 8 bits of offset
lz2gotof std <lz2repof+2,pcr ; self-modify: store match offset into the leau displacement below
lz2repof leau $aaaa,y ; put backreference start address in U (dst+offset; $aaaa is patched)
 ldd #$0007 ; clear MSB match length and set mask for MMM
 andb ,s+ ; isolate MMM (embedded match length) in token, popping it
 addb #$02 ; add MIN_MATCH_SIZE_V2
 cmpb #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
 bne lz2gotln ; no, we have the full match length, go copy
 bsr lz2nibl ; get extra match length nibble in B
 addb #$09 ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2
 cmpb #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
 bne lz2gotln ; if not, we have the full match length, go copy
 addb ,x+ ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
 bcc lz2gotln ; if no overflow, we have the full length
 beq lz2done ; carry set and result zero: EOD code, exit
 ldb ,x+ ; load 16-bit len in D (low part in B, high in A)
 lda ,x+ ; (little endian)
lz2gotln pshs x ; save source compressed data pointer
 tfr d,x ; copy match length to X
lz2cpymt lda ,u+ ; copy matched byte
 sta ,y+
 leax -1,x ; decrement X
 bne lz2cpymt ; loop until all matched bytes are copied
 puls x ; restore source compressed data pointer
 lbra lz2token ; go decode next token

133
asm/6809/unlzsa2b-6309.s Normal file
View File

@ -0,0 +1,133 @@
; unlzsa2b-6309.s - H6309 backward decompressor for raw LZSA2 - 155 bytes
; compress with lzsa -f2 -r -b <original_file> <compressed_file>
;
; in: x = last byte of compressed data
; y = last byte of decompression buffer
; out: y = first byte of decompressed data
;
; Copyright (C) 2020 Emmanuel Marty, Doug Masten
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Decompress a raw LZSA2 stream backward (H6309: uses the W register, TFM and NEGD).
; in:  x = last byte of compressed data, y = last byte of output buffer
; out: y = first byte of decompressed data
; NOTE: self-modifying (patches lz2moff+2 and lz2gotnb+1); not ROM-able, not re-entrant.
decompress_lzsa2
 clr lz2nibct ; reset nibble available flag (no buffered nibble yet)
 leax 1,x ; adjust compressed data pointer (stream is read with pre-decrement)
 bra lz2token
lz2nibct fcb 0 ; nibble ready flag ($00 = empty, $FF = one nibble buffered)
lz2replg lslb ; push token's Y flag bit into carry
 bcs lz2rep16 ; if token's Y bit is set, rep or 16 bit offset
 sex ; push token's Z flag bit into reg A
 bsr lz2nibl ; get offset nibble in B
 lsla ; push token's Z flag bit into carry
 rolb ; shift Z flag from carry into bit 0 of B
 eorb #$e1 ; set bits 13-15 of offset, reverse bit 8
 tfr b,a ; copy bits 8-15 of offset into A
 suba #$02 ; subtract 512 from offset
 bra lz2lowof
lz2rep16 bmi lz2repof ; if token's Z flag bit is set, rep match (reuse stored offset)
 lda ,-x ; load high 8 bits of (negative, signed) offset
lz2lowof ldb ,-x ; load low 8 bits of offset
lz2gotof negd ; reverse sign of offset in D (6309 NEGD)
 std lz2moff+2 ; self-modify: store match offset into the leau at lz2moff
lz2repof ldd #$0007 ; clear MSB match length and set mask for MMM
 andb ,u ; isolate MMM (embedded match length) in token
lz2moff leau $aaaa,y ; put backreference start address in U (dst+offset; $aaaa is patched)
 addb #$02 ; add MIN_MATCH_SIZE_V2
 cmpb #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
 bne lz2gotln ; no, we have the full match length, go copy
 bsr lz2nibl ; get extra match length nibble in B
 addb #$09 ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2
 cmpb #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
 bne lz2gotln ; if not, we have the full match length, go copy
 addb ,-x ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
 bcc lz2gotln ; if no overflow, we have the full length
 beq lz2done ; carry set and result zero: EOD code, exit
 ldd ,--x ; load 16-bit len in D (low part in B, high in A)
lz2gotln tfr d,w ; set W with match count for TFM instruction
 tfm u-,y- ; copy match bytes backward
lz2token ldb ,-x ; load next token into B: XYZ|LL|MMM
 tfr x,u ; save token address (re-read at lz2nolt and lz2repof)
 andb #$18 ; isolate LL (embedded literals count) in B
 beq lz2nolt ; skip if no literals
 cmpb #$18 ; LITERALS_RUN_LEN_V2?
 bne lz2declt ; if not, we have the complete count, go unshift
 bsr lz2nibl ; get extra literals length nibble in B
 addb #$03 ; add LITERALS_RUN_LEN_V2
 cmpb #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
 bne lz2gotla ; if not, we have the full literals count, go copy
 addb ,-x ; add extra literals count byte + LITERALS_RUN_LEN + 15
 bcc lz2gotla ; if no overflow, we got the complete count, copy
 ldd ,--x ; load 16 bit count in D (low part in B, high in A)
 bra lz2gotlt ; we now have the complete count, go copy
lz2nibl com lz2nibct ; toggle nibble ready flag; was a nibble buffered?
 bpl lz2gotnb ; yes, go return the buffered one
 ldb ,-x ; load two nibbles
 stb lz2gotnb+1 ; self-modify: store nibble for next time (low 4 bits)
 lsrb ; shift 4 high bits of nibble down
 lsrb
 lsrb
 lsrb
 rts ; return high nibble (already cleared of high bits by the shifts)
lz2declt lsrb ; shift literals count into place
 lsrb
 lsrb
lz2gotla clra ; clear A (high part of literals count)
lz2gotlt tfr d,w ; set W with literals count for TFM instruction
 leax -1,x ; tfm is post-decrement
 tfm x-,y- ; copy literal bytes backward
 leax 1,x ; undo the pre-copy pointer adjustment
lz2nolt ldb ,u ; get token again
 lslb ; push token's X flag bit into carry
 bcs lz2replg ; if token's X bit is set, rep or large offset
 lslb ; push token's Y flag bit into carry
 sex ; push token's Z flag bit into reg A (carry flag is not affected)
 bcs lz2offs9 ; if token's Y bit is set, 9 bits offset
 bsr lz2nibl ; get offset nibble in B
 lsla ; retrieve token's Z flag bit and push into carry
 rolb ; shift Z flag from carry into bit 0 of B
 eorb #$e1 ; set bits 5-7 of offset, reverse bit 0
 sex ; set bits 8-15 of offset to $FF (offset is negative)
 bra lz2gotof
lz2offs9 deca ; set bits 9-15 of offset, reverse bit 8
 bra lz2lowof
lz2done leay 1,y ; adjust pointer to first byte of decompressed data and then exit
lz2gotnb ldb #$aa ; load nibble ($aa is a placeholder patched by lz2nibl)
 andb #$0f ; only keep low 4 bits
 rts

152
asm/6809/unlzsa2b.s Normal file
View File

@ -0,0 +1,152 @@
; unlzsa2b.s - 6809 backward decompression routine for raw LZSA2 - 171 bytes
; compress with lzsa -f2 -r -b <original_file> <compressed_file>
;
; in: x = last byte of compressed data
; y = last byte of decompression buffer
; out: y = first byte of decompressed data
;
; Copyright (C) 2020 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Decompress a raw LZSA2 stream backward, plain 6809 (no 6309 extensions).
; in:  x = last byte of compressed data, y = last byte of output buffer
; out: y = first byte of decompressed data
; NOTE: self-modifying (patches lz2nibl+1 and lz2repof+2); not ROM-able, not re-entrant.
decompress_lzsa2
 clr <lz2nibct,pcr ; reset nibble available flag (no buffered nibble yet)
 leax 1,x ; adjust pointers: both streams are accessed with pre-decrement
 leay 1,y
lz2token ldb ,-x ; load next token into B: XYZ|LL|MMM
 pshs b ; save it (re-read at lz2nolt, popped later by andb ,s+)
 andb #$18 ; isolate LL (embedded literals count) in B
 beq lz2nolt ; skip if no literals
 cmpb #$18 ; LITERALS_RUN_LEN_V2?
 bne lz2declt ; if not, we have the complete count, go unshift
 bsr lz2nibl ; get extra literals length nibble in B
 addb #$03 ; add LITERALS_RUN_LEN_V2
 cmpb #$12 ; LITERALS_RUN_LEN_V2 + 15 ?
 bne lz2gotla ; if not, we have the full literals count, go copy
 addb ,-x ; add extra literals count byte + LITERALS_RUN_LEN + 15
 bcc lz2gotla ; if no overflow, we got the complete count, copy
 ldd ,--x ; load 16 bit count in D (low part in B, high in A)
 bra lz2gotlt ; we now have the complete count, go copy
lz2declt lsrb ; shift literals count into place
 lsrb
 lsrb
lz2gotla clra ; clear A (high part of literals count)
lz2gotlt leau ,x ; U = literals source (current compressed-data position)
 tfr d,x ; transfer 16-bit count into X (loop counter)
lz2cpylt lda ,-u ; copy literal byte backward
 sta ,-y
 leax -1,x ; decrement X and update Z flag
 bne lz2cpylt ; loop until all literal bytes are copied
 leax ,u ; X = updated compressed-data pointer
lz2nolt ldb ,s ; get token again, don't pop it from the stack
 lslb ; push token's X flag bit into carry
 bcs lz2replg ; if token's X bit is set, rep or large offset
 lslb ; push token's Y flag bit into carry
 sex ; push token's Z flag bit into reg A (carry flag is not affected)
 bcs lz2offs9 ; if token's Y bit is set, 9 bits offset
 bsr lz2nibl ; get offset nibble in B
 lsla ; retrieve token's Z flag bit and push into carry
 rolb ; shift Z flag from carry into bit 0 of B
 eorb #$e1 ; set bits 5-7 of offset, reverse bit 0
 sex ; set bits 8-15 of offset to $FF (offset is negative)
 bra lz2gotof
lz2offs9 deca ; set bits 9-15 of offset, reverse bit 8
 bra lz2lowof
lz2nibct fcb $00 ; nibble ready flag ($00 = empty, $FF = one nibble buffered)
lz2nibl ldb #$aa ; $aa is a placeholder; patched below with the buffered low nibble
 com <lz2nibct,pcr ; toggle nibble ready flag; was a nibble buffered?
 bpl lz2gotnb ; yes, return the buffered one (just loaded into B)
 ldb ,-x ; load two nibbles
 stb <lz2nibl+1,pcr ; self-modify: store nibble for next time (low 4 bits)
 lsrb ; shift 4 high bits of nibble down
 lsrb
 lsrb
 lsrb
lz2gotnb andb #$0f ; only keep low 4 bits
lz2done rts
lz2replg lslb ; push token's Y flag bit into carry
 bcs lz2rep16 ; if token's Y bit is set, rep or 16 bit offset
 sex ; push token's Z flag bit into reg A
 bsr lz2nibl ; get offset nibble in B
 lsla ; retrieve token's Z flag bit and push into carry
 rolb ; shift Z flag from carry into bit 0 of B
 eorb #$e1 ; set bits 13-15 of offset, reverse bit 8
 tfr b,a ; copy bits 8-15 of offset into A
 suba #$02 ; subtract 512 from offset
 bra lz2lowof
lz2rep16 bmi lz2repof ; if token's Z flag bit is set, rep match (reuse stored offset)
 lda ,-x ; load high 8 bits of (negative, signed) offset
lz2lowof ldb ,-x ; load low 8 bits of offset
lz2gotof nega ; reverse sign of 16-bit offset in D:
 negb ; negate both bytes, then
 sbca #0 ; propagate the borrow from the low-byte negate
 std <lz2repof+2,pcr ; self-modify: store match offset into the leau displacement below
lz2repof leau $aaaa,y ; put backreference start address in U (dst+offset; $aaaa is patched)
 ldd #$0007 ; clear MSB match length and set mask for MMM
 andb ,s+ ; isolate MMM (embedded match length) in token, popping it
 addb #$02 ; add MIN_MATCH_SIZE_V2
 cmpb #$09 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
 bne lz2gotln ; no, we have the full match length, go copy
 bsr lz2nibl ; get extra match length nibble in B
 addb #$09 ; add MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2
 cmpb #$18 ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
 bne lz2gotln ; if not, we have the full match length, go copy
 addb ,-x ; add extra length byte + MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15
 bcc lz2gotln ; if no overflow, we have the full length
 beq lz2done ; carry set and result zero: EOD code, exit
 ldd ,--x ; load 16-bit len in D (low part in B, high in A)
lz2gotln pshs x ; save source compressed data pointer
 tfr d,x ; copy match length to X
lz2cpymt lda ,-u ; copy matched byte backward
 sta ,-y
 leax -1,x ; decrement X
 bne lz2cpymt ; loop until all matched bytes are copied
 puls x ; restore source compressed data pointer
 lbra lz2token ; go decode next token

View File

@ -1,32 +1,135 @@
; lzsa1fta.asm time-efficient decompressor implementation for 8086 CPUs.
; Turbo Assembler IDEAL mode dialect; can also be assembled with NASM.
; lzsa2fta.asm time-efficient decompressor implementation for 808x CPUs.
; Turbo Assembler IDEAL mode dialect.
; (Is supposed to also assemble with NASM's IDEAL mode support, but YMMV.)
;
; Usual DOS assembler SMALL model assumptions apply. This code:
; - Assumes it was invoked via NEAR call (change RET to RETF for FAR calls)
; - Is interrupt-safe
; - Is not re-entrant (do not decompress while already running decompression)
; - Trashes all data and segment registers
; This code assembles to about 3K of lookup tables and unrolled code,
; but the tradeoff for that size is the absolute fastest decompressor
; of LZSA1 block data for 808x CPUs.
; If you need moderately fast code with less size, see LZSA1FTA.ASM.
; If you need the smallest decompression code, see decompress_small_v1.S.
;
; Copyright (C) 2019 Jim Leonard, Emmanuel Marty
; Usual DOS assembler SMALL model assumptions apply. This code:
; - Assumes it was invoked via NEAR call (change RET to RETF for FAR calls)
; - Is interrupt-safe
; - Is not re-entrant (do not decompress while already running decompression)
; - Trashes all data and segment registers
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
; Copyright (C) 2019 Jim Leonard, Emmanuel Marty
; Additional speed optimizations by Pavel Zagrebin
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; ===========================================================================
;
; The key area to concentrate on when optimizing LZSA1 decompression speed is
; reducing time spent handling the shortest matches. This is for two reasons:
; 1. shorter matches are more common
; 2. short matches are least efficient in terms of decomp speed per byte
; You can confirm #1 using the --stats mode of the compressor.
;
; Branches are costly on 8086. To ensure we branch as little as possible, a
; jumptable will be used to branch directly to as many direct decode paths as
; possible. This will burn up 512 bytes of RAM for a jumptable, and a few
; hundred bytes of duplicated program code (rather than JMP/CALL common code
; blocks, we inline them to avoid the branch overhead).
;
; ===========================================================================
;
; === LZSA1 block reference:
;
; Blocks encoded as LZSA1 are composed from consecutive commands.
; Each command follows this format:
;
; token: <O|LLL|MMMM>
; optional extra literal length
; literal values
; match offset low
; optional match offset high
; optional extra encoded match length
;
;
; === LZSA1 Token Reference:
;
; 7 6 5 4 3 2 1 0
; O L L L M M M M
;
; L: 3-bit literals length (0-6, or 7 if extended). If the number of literals for
; this command is 0 to 6, the length is encoded in the token and no extra bytes
; are required. Otherwise, a value of 7 is encoded and extra bytes follow as
; 'optional extra literal length'
;
; M: 4-bit encoded match length (0-14, or 15 if extended). Likewise, if the
; encoded match length for this command is 0 to 14, it is directly stored,
; otherwise 15 is stored and extra bytes follow as 'optional extra encoded match
; length'. Except for the last command in a block, a command always contains a
; match, so the encoded match length is the actual match length, offset by the
; minimum which is 3 bytes. For instance, an actual match length of 10 bytes to
; be copied, is encoded as 7.
;
; O: set for a 2-bytes match offset, clear for a 1-byte match offset
;
;
; === Decoding extended literal length:
;
; If the literals length is 7 or more, then an extra byte follows here, with
; three possible values:
;
; 0-248: the value is added to the 7 stored in the token.
; 250: a second byte follows. The final literals value is 256 + the second byte.
; 249: a little-endian 16-bit value follows, forming the final literals value.
;
;
; === Decoding match offsets:
;
; match offset low: The low 8 bits of the match offset follows.
;
; optional match offset high: If the 'O' bit (bit 7) is set in the token, the
; high 8 bits of the match offset follow, otherwise they are understood to be all
; set to 1. For instance, a short offset of 0x70 is interpreted as 0xff70
;
;
; === Decoding extra encoded match length:
;
; optional extra encoded match length: If the encoded match length is 15 or more,
; the 'M' bits in the token form the value 15, and an extra byte follows here,
; with three possible types of value.
;
; 0-237: the value is added to the 15 stored in the token.
; The final value is 3 + 15 + this byte.
; 239: a second byte follows. The final match length is 256 + the second byte.
; 238: a second and third byte follow, forming a little-endian 16-bit value.
; The final encoded match length is that 16-bit value.
;
; ===========================================================================
IDEAL
P8086
IDEAL ; Use Turbo Assembler IDEAL syntax checking
P8086 ; Restrict code generation to the 808x and later
JUMPS ; Perform fixups for out-of-bound conditional jumps
; This is required for the (L=07 & M=0Fh) decode paths as they
; have the most code, but these are uncommon paths so the
; tiny speed loss in just these paths is not a concern.
;Setting OPTIMIZE_LONG_RLE to 1 speeds up decompressing long runs of the
;same 16-bit word value, but hurts decompression speed of other data
;types slightly. Turn this on if you know your data has very long 16-bit
;word-based runs (reported as RLE2 sequences in the LZSA compressor output
;with an average length of at least 32 bytes), otherwise leave it off.
OPTIMIZE_LONG_RLE EQU 0
SEGMENT CODE para public
@ -34,241 +137,445 @@ ASSUME cs:CODE, ds:CODE
PUBLIC lzsa1_decompress_speed_jumptable
; ---------------------------------------------------------------------------
; Decompress raw LZSA1 block
; inputs:
; * ds:si: raw LZSA1 block
; * es:di: output buffer
; output:
; * ax: decompressed size
; ---------------------------------------------------------------------------
; EQU helper statements (so we can construct a jump table without going crazy)
;Jump table for handling LLL bits in initial LZSA1 tokens.
;Previous code would SHR val,4 to get a count from 0 to 7, then rep movsb.
;We can overload the shift operation into a jump table that jumps directly
;to optimized copying routine for 0-7 bytes. Must declare in code segment.
;Note: If this looks strange for declaring a jump table, that's because it
;is a workaround for the Turbo Pascal harness that tests it. Turbo Pascal
;treats OFFSET (label) as a relocatable item and throws an error, so we fool
;it by building the table with absolute EQU/literals instead.
L0b EQU OFFSET check_offset_size
L1b EQU OFFSET copy1b
L2b EQU OFFSET copy2b
L3b EQU OFFSET copy3b
L4b EQU OFFSET copy4b
L5b EQU OFFSET copy5b
L6b EQU OFFSET copy6b
L7b EQU OFFSET need_length_byte
copytable DW L0b,L0b,L0b,L0b,L0b,L0b,L0b,L0b
DW L1b,L1b,L1b,L1b,L1b,L1b,L1b,L1b
DW L2b,L2b,L2b,L2b,L2b,L2b,L2b,L2b
DW L3b,L3b,L3b,L3b,L3b,L3b,L3b,L3b
DW L4b,L4b,L4b,L4b,L4b,L4b,L4b,L4b
DW L5b,L5b,L5b,L5b,L5b,L5b,L5b,L5b
DW L6b,L6b,L6b,L6b,L6b,L6b,L6b,L6b
DW L7b,L7b,L7b,L7b,L7b,L7b,L7b,L7b
minmatch EQU 3
litrunlen EQU 7
leml1 EQU OFFSET lit_ext_mat_len_1b
leme1 EQU OFFSET lit_ext_mat_ext_1b
leml2 EQU OFFSET lit_ext_mat_len_2b
leme2 EQU OFFSET lit_ext_mat_ext_2b
;short-circuit special cases for 0 through 6 literal copies:
l6ml1 EQU OFFSET lit_len_mat_len_1b_6
l6me1 EQU OFFSET lit_len_mat_ext_1b
l6ml2 EQU OFFSET lit_len_mat_len_2b_6
l6me2 EQU OFFSET lit_len_mat_ext_2b
l5ml1 EQU OFFSET lit_len_mat_len_1b_45
l5me1 EQU OFFSET lit_len_mat_ext_1b + 1
l5ml2 EQU OFFSET lit_len_mat_len_2b_45
l5me2 EQU OFFSET lit_len_mat_ext_2b + 1
l4ml1 EQU OFFSET lit_len_mat_len_1b_45 + 1
l4me1 EQU OFFSET lit_len_mat_ext_1b + 2
l4ml2 EQU OFFSET lit_len_mat_len_2b_45 + 1
l4me2 EQU OFFSET lit_len_mat_ext_2b + 2
l3ml1 EQU OFFSET lit_len_mat_len_1b_23
l3me1 EQU OFFSET lit_len_mat_ext_1b + 3
l3ml2 EQU OFFSET lit_len_mat_len_2b_23
l3me2 EQU OFFSET lit_len_mat_ext_2b + 3
l2ml1 EQU OFFSET lit_len_mat_len_1b_23 + 1
l2me1 EQU OFFSET lit_len_mat_ext_1b + 4
l2ml2 EQU OFFSET lit_len_mat_len_2b_23 + 1
l2me2 EQU OFFSET lit_len_mat_ext_2b + 4
l1ml1 EQU OFFSET lit_len_mat_len_1b_01
l1me1 EQU OFFSET lit_len_mat_ext_1b + 5
l1ml2 EQU OFFSET lit_len_mat_len_2b_01
l1me2 EQU OFFSET lit_len_mat_ext_2b + 5
l0ml1 EQU OFFSET lit_len_mat_len_1b_01 + 1 ; MMMM handling comes after LLL code
l0me1 EQU OFFSET lit_len_mat_ext_1b + 6 ; MMMM handling comes after LLL code
l0ml2 EQU OFFSET lit_len_mat_len_2b_01 + 1 ; MMMM handling comes after LLL code
l0me2 EQU OFFSET lit_len_mat_ext_2b + 6 ; MMMM handling comes after LLL code
; 0 1 2 3 4 5 6 7 8 9 a b c d e f
jtbl DW l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0me1 ;0
DW l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1me1 ;1
DW l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2ml1,l2me1 ;2
DW l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3ml1,l3me1 ;3
DW l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4ml1,l4me1 ;4
DW l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5ml1,l5me1 ;5
DW l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6ml1,l6me1 ;6
DW leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leml1,leme1 ;7
DW l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0ml2,l0me2 ;8
DW l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1ml2,l1me2 ;9
DW l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2ml2,l2me2 ;a
DW l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3ml2,l3me2 ;b
DW l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4ml2,l4me2 ;c
DW l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5ml2,l5me2 ;d
DW l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6ml2,l6me2 ;e
DW leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leml2,leme2 ;f
PROC lzsa1_decompress_speed_jumptable NEAR
; ---------------------------------------------------------------------------
; Decompress raw LZSA1 block
; inputs:
; * ds:si: raw LZSA1 block
; * es:di: output buffer
; output:
; * ax: decompressed size
; ---------------------------------------------------------------------------
MACRO get_byte_match_offset
mov ah,0ffh ;O=0, so set up offset's high byte
lodsb ;load low byte; ax=match offset
xchg bp,ax ;bp=match offset ax=00 + original token
ENDM
MACRO get_word_match_offset
lodsw ;ax=match offset
xchg bp,ax ;bp=match offset ax=00 + original token
ENDM
MACRO do_match_copy_long
LOCAL even0,even1,even2,do_run,do_run_w
; Copies a long match as optimally as possible.
; requirements: cx=length, bp=negative offset, ds:si=compdata, es:di=output
; trashes: ax, bx
; must leave cx=0 at exit
mov bx,ds ;save ds
mov ax,es
mov ds,ax ;ds=es
xchg ax,si ;save si
lea si,[bp+di] ;si = output buffer + negative match offset
cmp bp,-2 ;do we have a byte/word run to optimize?
IF OPTIMIZE_LONG_RLE
jae do_run ;catch offset = -2 or -1
ELSE
ja do_run ;catch offset = -1
ENDIF
;If we're here, we have a long copy and it isn't byte-overlapping (if it
;overlapped, we'd be in @@do_run) So, let's copy faster with REP MOVSW.
;This affects 8088 only slightly, but is a bigger win on 8086 and higher.
shr cx,1
jnc even0
movsb
even0:
rep movsw
xchg si,ax ;restore si
mov ds,bx ;restore ds
jmp decode_token
do_run:
IF OPTIMIZE_LONG_RLE
je do_run_w ;if applicable, handle word-sized value faster
ENDIF
xchg dx,ax ;save si into dx, as ax is getting trashed
lodsb ;load first byte of run into al
mov ah,al
shr cx,1
jnc even1
stosb
even1:
rep stosw ;perform word run
mov si,dx ;restore si
mov ds,bx ;restore ds
jmp decode_token
IF OPTIMIZE_LONG_RLE
do_run_w:
xchg dx,ax ;save si into dx, as ax is getting trashed
lodsw ;load first word of run
shr cx,1
rep stosw ;perform word run
jnc even2
stosb ;should be after rep stosw!
even2:
mov si,dx ;restore si
mov ds,bx ;restore ds
jmp decode_token
ENDIF
ENDM
MACRO do_match_copy
; Copies a shorter match with as little overhead as possible.
; requirements: cx=length, bp=negative offset, ds:si=compdata, es:di=output
; trashes: ax, bx
; must leave cx=0 at exit
mov bx,ds ;save ds
mov ax,es
mov ds,ax ;ds=es
xchg ax,si ;save si
lea si,[bp+di] ;si = output buffer + negative match offset
movsb
movsb
movsb ;Handle MINMATCH (instead of add cx,MINMATCH)
rep movsb
xchg si,ax ;restore si
mov ds,bx ;restore ds
jmp decode_token
ENDM
MACRO do_literal_copy
LOCAL even
; Copies a literal sequence using words.
; Meant for longer lengths; for 128 bytes or less, use REP MOVSB.
; requirements: cx=length, ds:si=compdata, es:di=output
; must leave cx=0 at exit
shr cx,1
jnc even
movsb
even:
rep movsw
ENDM
MACRO copy_small_match_len
and al,0FH ;isolate length in token (MMMM)
xchg cx,ax ;cx=match length
do_match_copy ;copy match with cx=length, bp=offset
ENDM
MACRO copy_large_match_len
LOCAL val239,val238,EOD
; Handle MMMM=Fh
; Assumptions: ah=0 from get_????_match_offset's xchg
lodsb ;grab extra match length byte
add al,0Fh+minmatch ;add MATCH_RUN_LEN + MIN_MATCH_SIZE
; jz val238 ;if zf & cf, 238: get 16-bit match length
jc val239 ;if cf, 239: get extra match length byte
xchg cx,ax ;otherwise, we have our match length
do_match_copy_long ;copy match with cx=length, bp=offset
val239:
jz val238
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
do_match_copy_long ;copy match with cx=length, bp=offset
val238:
lodsw ;grab 16-bit length
xchg cx,ax
jcxz EOD ;is it the EOD marker? Exit if so
do_match_copy_long ;copy match with cx=length, bp=offset
EOD:
jmp done_decompressing
ENDM
lzsa1_start:
push di ;remember decompression offset
cld ;ensure string ops move forward
xor cx,cx
@@decode_token:
xchg cx,ax ;clear ah (cx = 0 from match copy's rep movsb)
decode_token:
xchg cx,ax ;clear ah (cx = 0 from match copy's REP)
lodsb ;read token byte: O|LLL|MMMM
mov dx,ax ;copy our token to dl for later MMMM handling
mov bp,ax ;preserve 0+token in bp for later MMMM handling
mov bx,ax ;prep for table lookup
shl bx,1 ;adjust for offset word size
jmp [cs:jtbl+bx] ;jump directly to relevant decode path
and al,070H ;isolate literals length in token (LLL)
jz check_offset_size ;if LLL=0, we have no literals; goto match
; There are eight basic decode paths for an LZSA1 token. Each of these
; paths perform only the necessary actions to decode the token and then
; fetch the next token. This results in a lot of code duplication, but
; it is the only way to get down to two branches per token (jump to unique
; decode path, then jump back to next token) for the most common cases.
; Jump to short copy routine for LLL=1 though 6, need_length_byte for LLL=7
mov bx,ax ;prep for table lookup (must copy, don't XCHG!)
jmp [cs:copytable+bx]
; Path #1: LLL=0-6, MMMM=0-Eh, O=0 (1-byte match offset)
; Handle LLL=0-6 by jumping directly into # of bytes to copy (6 down to 1)
lit_len_mat_len_1b_01:
movsb
get_byte_match_offset
copy_small_match_len
lit_len_mat_len_1b_23:
movsb
movsw
get_byte_match_offset
copy_small_match_len
lit_len_mat_len_1b_45:
movsb
movsw
movsw
get_byte_match_offset
copy_small_match_len
lit_len_mat_len_1b_6:
movsw
movsw
movsw
get_byte_match_offset
copy_small_match_len
need_length_byte:
lodsb ;grab extra length byte
add al,07H ;add LITERALS_RUN_LEN
jnc @@got_literals_exact ;if no overflow, we have full count
je @@big_literals
; Path #2: LLL=0-6, MMMM=Fh, O=0 (1-byte match offset)
lit_len_mat_ext_1b:
movsb
movsb
movsb
movsb
movsb
movsb
get_byte_match_offset
copy_large_match_len
@@mid_literals:
lodsb ;grab single extra length byte
inc ah ;add 256
xchg cx,ax ;with longer counts, we can save some time
shr cx,1 ;by doing a word copy instead of a byte copy.
rep movsw ;We don't need to account for overlap because
adc cx,0 ;source for literals isn't the output buffer.
rep movsb
jmp check_offset_size
@@big_literals:
lodsw ;grab 16-bit extra length
xchg cx,ax ;with longer counts, we can save some time
shr cx,1 ;by doing a word copy instead of a byte copy.
rep movsw
adc cx,0
rep movsb
jmp check_offset_size
; Used for counts 7-248. In test data, average value around 1Ah. YMMV.
@@got_literals_exact:
; Path #3: LLL=7, MMMM=0-Eh, O=0 (1-byte match offset)
lit_ext_mat_len_1b:
; on entry: ax=0 + token, bp=ax
lodsb ;grab extra literal length byte
add al,litrunlen ;add 7h literal run length
; jz @@val249_3 ;if zf & cf, 249: get 16-bit literal length
jc @@val250_3 ;if cf, 250: get extra literal length byte
xchg cx,ax ;otherwise, we have our literal length
do_literal_copy ;this might be better as rep movsw !!! benchmark
get_byte_match_offset
copy_small_match_len
@@val250_3:
jz @@val249_3
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
rep movsb ;copy cx literals from ds:si to es:di
jmp check_offset_size
;Literal copy sequence for lengths 1-6:
copy6b: movsb
copy5b: movsb
copy4b: movsb
copy3b: movsb
copy2b: movsb
copy1b: movsb
;Literals done; fall through to match offset determination
check_offset_size:
test dl,dl ;check match offset size in token (O bit)
js @@get_long_offset ;load absolute 16-bit match offset
mov ah,0ffh ;set up high byte
lodsb ;load low byte
@@get_match_length:
xchg dx,ax ;dx: match offset ax: original token
and al,0FH ;isolate match length in token (MMMM)
cmp al,0FH ;MATCH_RUN_LEN?
jne @@got_matchlen_short ;no, we have the full match length from the token, go copy
lodsb ;grab extra length byte
add al,012H ;add MIN_MATCH_SIZE + MATCH_RUN_LEN
jnc @@do_long_copy ;if no overflow, we have the entire length
jne @@mid_matchlen
do_literal_copy
get_byte_match_offset
copy_small_match_len
@@val249_3:
lodsw ;grab 16-bit length
xchg cx,ax ;get ready to do a long copy
jcxz @@done_decompressing ;wait, is it the EOD marker? Exit if so
jmp @@copy_len_preset ;otherwise, do the copy
xchg cx,ax
do_literal_copy
get_byte_match_offset
copy_small_match_len
@@got_matchlen_short:
add al,3 ;add MIN_MATCH_SIZE
xchg cx,ax ;copy match length into cx
mov bp,ds ;save ds
mov ax,es
mov ds,ax ;ds=es
xchg ax,si ;save si
mov si,di ;ds:si now points at back reference in output data
add si,dx
rep movsb ;copy match
xchg si,ax ;restore si
mov ds,bp ;restore ds
jmp @@decode_token ;go decode another token
@@done_decompressing:
; Path #4: LLL=7, MMMM=Fh, O=0 (1-byte match offset)
lit_ext_mat_ext_1b:
; on entry: ax=0 + token, bp=ax
lodsb ;grab extra literal length byte
add al,litrunlen ;add 7h literal run length
; jz @@val249_4 ;if zf & cf, 249: get 16-bit literal length
jc @@val250_4 ;if cf, 250: get extra literal length byte
xchg cx,ax ;otherwise, we have our literal length
do_literal_copy ;this might be better as rep movsw !!! benchmark
get_byte_match_offset
copy_large_match_len
@@val250_4:
jz @@val249_4
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
do_literal_copy
get_byte_match_offset
copy_large_match_len
@@val249_4:
lodsw ;grab 16-bit length
xchg cx,ax
do_literal_copy
get_byte_match_offset
copy_large_match_len
; Path #5: LLL=0-6, MMMM=0-Eh, O=1 (2-byte match offset)
; Handle LLL=0-6 by jumping directly into # of bytes to copy (6 down to 1)
lit_len_mat_len_2b_01:
movsb
get_word_match_offset
copy_small_match_len
lit_len_mat_len_2b_23:
movsb
movsw
get_word_match_offset
copy_small_match_len
lit_len_mat_len_2b_45:
movsb
movsw
movsw
get_word_match_offset
copy_small_match_len
lit_len_mat_len_2b_6:
movsw
movsw
movsw
get_word_match_offset
copy_small_match_len
; Path #6: LLL=0-6, MMMM=Fh, O=1 (2-byte match offset)
; Path #6: LLL=0-6, MMMM=Fh, O=1 (2-byte match offset)
lit_len_mat_ext_2b:
movsb
movsb
movsb
movsb
movsb
movsb
get_word_match_offset
copy_large_match_len
; Path #7: LLL=7, MMMM=0-Eh, O=1 (2-byte match offset)
lit_ext_mat_len_2b:
; on entry: ax=0 + token, bp=ax
lodsb ;grab extra literal length byte
add al,litrunlen ;add 7h literal run length
; jz @@val249_7 ;if zf & cf, 249: get 16-bit literal length
jc @@val250_7 ;if cf, 250: get extra literal length byte
xchg cx,ax ;otherwise, we have our literal length
do_literal_copy ;this might be better as rep movsw !!! benchmark
get_word_match_offset
copy_small_match_len
@@val250_7:
jz @@val249_7
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
do_literal_copy
get_word_match_offset
copy_small_match_len
@@val249_7:
lodsw ;grab 16-bit length
xchg cx,ax
do_literal_copy
get_word_match_offset
copy_small_match_len
; Path #8: LLL=7, MMMM=Fh, O=1 (2-byte match offset)
lit_ext_mat_ext_2b:
; on entry: ax=0 + token, bp=ax
lodsb ;grab extra literal length byte
add al,litrunlen ;add 7h literal run length
; jz @@val249_8 ;if zf & cf, 249: get 16-bit literal length
jc @@val250_8 ;if cf, 250: get extra literal length byte
xchg cx,ax ;otherwise, we have our literal length
do_literal_copy ;this might be better as rep movsw !!! benchmark
get_word_match_offset
copy_large_match_len
@@val250_8:
jz @@val249_8
lodsb ;ah=0; grab single extra length byte
inc ah ;ax=256+length byte
xchg cx,ax
do_literal_copy
get_word_match_offset
copy_large_match_len
@@val249_8:
lodsw ;grab 16-bit length
xchg cx,ax
do_literal_copy
get_word_match_offset
copy_large_match_len
done_decompressing:
;return # of decompressed bytes in ax
pop ax ;retrieve the original decompression offset
xchg di,ax ;compute decompressed size
sub ax,di
sub di,ax ;adjust for original offset
xchg di,ax ;return adjusted value in ax
ret ;done decompressing, exit to caller
;These are called less often; moved here to optimize the fall-through case
@@get_long_offset:
lodsw ;Get 2-byte match offset
jmp @@get_match_length
;With a confirmed longer match length, we have an opportunity to optimize for
;the case where a single byte is repeated long enough that we can benefit
;from rep movsw to perform the run (instead of rep movsb).
@@mid_matchlen:
lodsb ;grab single extra length byte
inc ah ;add 256
@@do_long_copy:
xchg cx,ax ;copy match length into cx
@@copy_len_preset:
push ds ;save ds
mov bp,es
mov ds,bp ;ds=es
mov bp,si ;save si
mov si,di ;ds:si now points at back reference in output data
add si,dx
cmp dx,-2 ;do we have a byte/word run to optimize?
jae @@do_run ;perform a run
;You may be tempted to change "jae" to "jge" because DX is a signed number.
;Don't! The total window is 64k, so if you treat this as a signed comparison,
;you will get incorrect results for offsets over 32K.
;If we're here, we have a long copy and it isn't byte-overlapping (if it
;overlapped, we'd be in @@do_run_1) So, let's copy faster with REP MOVSW.
;This won't affect 8088 that much, but it speeds up 8086 and higher.
shr cx,1
rep movsw
adc cx,0
rep movsb
mov si,bp ;restore si
pop ds
jmp @@decode_token ;go decode another token
@@do_run:
je @@do_run_2 ;fall through to byte (common) if not word run
@@do_run_1:
lodsb ;load first byte of run into al
mov ah,al
shr cx,1
rep stosw ;perform word run
adc cx,0
rep stosb ;finish word run
mov si,bp ;restore si
pop ds
jmp @@decode_token ;go decode another token
@@do_run_2:
lodsw ;load first word of run
shr cx,1
rep stosw ;perform word run
adc cx,0 ;despite 2-byte offset, compressor might
rep stosb ;output odd length. better safe than sorry.
mov si,bp ;restore si
pop ds
jmp @@decode_token ;go decode another token
ENDP lzsa1_decompress_speed_jumptable
ENDS CODE
END
;Speed optimization history (decompression times in microseconds @ 4.77 MHz):
; original E. Marty code shuttle 123208 alice 65660 robotron 407338 ***
; table for shr al,4 shuttle 120964 alice 63230 robotron 394733 +++
; push/pop to mov/mov shuttle 118176 alice 61835 robotron 386762 +++
; movsw for literalcpys shuttle 124102 alice 64908 robotron 400220 --- rb
; stosw for byte runs shuttle 118897 alice 65040 robotron 403518 --- rb
; better stosw for runs shuttle 117712 alice 65040 robotron 403343 +--
; disable RLE by default shuttle 116924 alice 60783 robotron 381226 +++
; optimize got_matchlen shuttle 115294 alice 59588 robotron 374330 +++
; fall through to getML shuttle 113258 alice 59572 robotron 372004 +++
; fall through to midLI shuttle 113258 alice 59572 robotron 375060 ..- rb
; fall through midMaLen shuttle 113247 alice 59572 robotron 372004 +.+
; movsw for litlen > 255 shuttle 113247 alice 59572 robotron 371612 ..+
; rep stosw for long runs shuttle 113247 alice 59572 robotron 371612 ...
; rep movsw for long cpys shuttle 113247 alice 59572 robotron 371035 ..+
; xchg/dec ah -> mov ah,val shuttle 112575 alice 59272 robotron 369198 +++
; force >12h len.to longcpy shuttle 101998 alice 59266 robotron 364459 +.+
; more efficient run branch shuttle 102239 alice 59297 robotron 364716 --- rb
; even more eff. run branch shuttle 101998 alice 59266 robotron 364459 ***
; BUGFIX - bad sign compare shuttle 101955 alice 59225 robotron 364117 +++
; reverse 16-bit len compar shuttle 102000 alice 59263 robotron 364460 --- rb
; jcxz for EOD detection no change to speed, but is 1 byte shorter +++
; force movsw for literals shuttle 107183 alice 62555 robotron 379524 --- rb
; defer shr4 until necessry shuttle 102069 alice 60236 robotron 364096 ---
; skip literals if LLL=0 shuttle 98655 alice 57849 robotron 363358 ---
; fall through to mid_liter shuttle 98595 alice 57789 robotron 361998 +++
; == jumptable experiments begin ==
; jumptable for small copys shuttle 101594 alice 61078 robotron 386018 ---
; start:xchg instead of mov shuttle 100948 alice 60467 robotron 381112 +++
; use table for LLL=0 check shuttle 106972 alice 63333 robotron 388304 --- rb
; jmptbl to fallthrough mov shuttle 102532 alice 60760 robotron 383070 ---
; cpy fallthrough check_ofs shuttle 98939 alice 58917 robotron 371019 +**
; single jumptable jump shuttle 97528 alice 57264 robotron 362194 ++*
; conditional check for L=7 shuttle 98610 alice 58521 robotron 368153 --- rb
; defer add MIN_MATCH_SIZE shuttle 97207 alice 57200 robotron 362884 ++*
; jumptable rewrite, no RLE shuttle 97744 alice 46905 robotron 309032 -++
; adc cx,0 -> adc cl,0 shuttle 97744 alice 46893 robotron 309032 .+.!
; jumptable rewrite w/RLE shuttle 88776 alice 50433 robotron 319222 +--
; short match copies movsb shuttle 97298 alice 49769 robotron 326282 ---rb
; long match copy #1 16-bit shuttle 92490 alice 46905 robotron 308722 +*+
; long match copy #2 extraB shuttle 92464 alice 46905 robotron 308371 +.+
; long match copy #3 0f->ed shuttle 86765 alice 46864 robotron 303895 +++!
; baseline new test harness shuttle 83925 alice 37948 robotron 269002 ***
; Pavel optimizations shuttle 82225 alice 36798 robotron 261226 +++
; OPTIMIZE_LONG_RLE 1 shuttle 82242 alice 36787 robotron 261392 **-
;
;------
;
;Pavel's optimization history:
; shuttle alice robotron time in 1.193 MHz timer clocks
;baseline 19109 D9A6 570F6
;adc cl,0->adc cl,cl 19035 D9A6 56FAB
;rep movsb->shr cx,1;jnc 18FD4 D998 56F14
;cmp bp,-2->inc bp;inc bp 18F07 D999 56EA3
;jz;jc->jc 18D81 D973 56B2F
;add al,3->movsb x3 18B1E D777 56197
;more lit_len_mat tables 18A83 D341 54ACC

View File

@ -0,0 +1,120 @@
; decompress_small_v1.asm - space-efficient decompressor implementation for x86
;
; Copyright (C) 2019 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
segment .text
bits 32
; ---------------------------------------------------------------------------
; Decompress raw LZSA1 block
; inputs:
; * esi: raw LZSA1 block
; * edi: output buffer
; output:
; * eax: decompressed size
; ---------------------------------------------------------------------------
%ifndef BIN
global lzsa1_decompress
global _lzsa1_decompress
%endif
; Decompress a raw (frame-less) LZSA1 block from esi to edi.
; All registers are preserved via pushad/popad; the decompressed size is
; returned in eax by patching the saved-eax slot before popad.
; NOTE(review): .done_decompressing measures the size against the stack
; argument at [esp+32+4], so the output buffer pointer is assumed to be
; passed on the stack as well (see the commented-out movs) -- confirm
; against the actual callers.
lzsa1_decompress:
_lzsa1_decompress:
pushad
;mov edi, [esp+32+4] ; edi = outbuf
;mov esi, [esp+32+8] ; esi = inbuf
xor ecx, ecx
.decode_token:
mul ecx ; ecx is always 0 here, so this zeroes eax (and edx) compactly
lodsb ; read token byte: O|LLL|MMMM
mov dl, al ; keep token in dl
and al, 070H ; isolate literals length in token (LLL)
shr al, 4 ; shift literals length into place
cmp al, 07H ; LITERALS_RUN_LEN?
jne .got_literals ; no, we have the full literals count from the token, go copy
lodsb ; grab extra length byte
add al, 07H ; add LITERALS_RUN_LEN
jnc .got_literals ; if no overflow, we have the full literals count, go copy
jne .mid_literals ; CF+NZ (byte was 250..255): count is 256 + next byte
lodsw ; CF+ZF (byte was 249): grab 16-bit extra length
jmp .got_literals
.mid_literals:
lodsb ; grab single extra length byte
inc ah ; add 256
.got_literals:
xchg ecx, eax ; ecx = literal count, eax = 0
rep movsb ; copy cx literals from ds:si to es:di
test dl, dl ; check match offset size in token (O bit)
js .get_long_offset ; O=1: absolute 16-bit match offset follows
dec ecx ; ecx: 0 -> -1 (0FFFFFFFFh)...
xchg eax, ecx ; ...moved into eax (ecx back to 0) so that after
lodsb ; loading the low byte, eax = 0FFFFFFxxH (sign-extended 1-byte offset)
jmp .get_match_length
.get_long_offset:
lodsw ; Get 2-byte match offset
.get_match_length:
xchg eax, edx ; edx: match offset eax: original token
and al, 0FH ; isolate match length in token (MMMM)
add al, 3 ; add MIN_MATCH_SIZE
cmp al, 012H ; MATCH_RUN_LEN?
jne .got_matchlen ; no, we have the full match length from the token, go copy
lodsb ; grab extra length byte
add al,012H ; add MIN_MATCH_SIZE + MATCH_RUN_LEN
jnc .got_matchlen ; if no overflow, we have the entire length
jne .mid_matchlen ; CF+NZ: length is 256 + next byte
lodsw ; grab 16-bit length
test eax, eax ; a 16-bit length of zero is the EOD marker
je .done_decompressing
jmp .got_matchlen
.mid_matchlen:
lodsb ; grab single extra length byte
inc ah ; add 256
.got_matchlen:
xchg ecx, eax ; copy match length into ecx
xchg esi, eax ; stash input pointer in eax
mov esi, edi ; esi now points at back reference in output data
movsx edx, dx ; sign-extend dx to 32-bits.
add esi, edx ; offset is negative: step back into the output buffer
rep movsb ; copy match (byte copy is overlap-safe for RLE-style matches)
xchg esi, eax ; restore esi
jmp .decode_token ; go decode another token
.done_decompressing:
sub edi, [esp+32+4] ; size = edi - outbuf (outbuf taken from stack arg)
mov [esp+28], edi ; patch saved eax so popad returns decompressed size
popad
ret ; done

View File

@ -0,0 +1,181 @@
; decompress_small_v2.asm - space-efficient decompressor implementation for x86
;
; Copyright (C) 2019 Emmanuel Marty
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
segment .text
bits 32
; ---------------------------------------------------------------------------
; Decompress raw LZSA2 block
; inputs:
; * esi: raw LZSA2 block
; * edi: output buffer
; output:
; * eax: decompressed size
; ---------------------------------------------------------------------------
%ifndef BIN
global lzsa2_decompress
global _lzsa2_decompress
%endif
; Decompress a raw (frame-less) LZSA2 block from esi to edi.
; All registers are preserved via pushad/popad; the decompressed size is
; returned in eax by patching the saved-eax slot before popad.
; Register roles: dl = current token, ebx = nibble buffer/state (bh flag,
; bl pair of nibbles), ebp = last match offset (negative, for rep-match).
; NOTE(review): .done_decompressing measures the size against the stack
; argument at [esp+32+4], so the output buffer pointer is assumed to be
; passed on the stack as well (see the commented-out movs).
lzsa2_decompress:
_lzsa2_decompress:
pushad
;mov edi, [esp+32+4] ; edi = outbuf
;mov esi, [esp+32+8] ; esi = inbuf
xor ecx, ecx
xor ebx, ebx ; ebx = 0100H
inc bh ; bh = nibble-state toggle, bl = nibble pair buffer
xor ebp, ebp ; no previous match offset yet
.decode_token:
mul ecx ; ecx is always 0 here, so this zeroes eax (and edx) compactly
lodsb ; read token byte: XYZ|LL|MMMM
mov dl, al ; keep token in dl
and al, 018H ; isolate literals length in token (LL)
shr al, 3 ; shift literals length into place
cmp al, 03H ; LITERALS_RUN_LEN_V2?
jne .got_literals ; no, we have the full literals count from the token, go copy
call .get_nibble ; get extra literals length nibble
add al, cl ; add len from token to nibble
cmp al, 012H ; LITERALS_RUN_LEN_V2 + 15 ?
jne .got_literals ; if not, we have the full literals count, go copy
lodsb ; grab extra length byte
add al,012H ; overflow?
jnc .got_literals ; if not, we have the full literals count, go copy
lodsw ; grab 16-bit extra length
.got_literals:
xchg ecx, eax ; ecx = literal count, eax = 0
rep movsb ; copy ecx literals from esi to edi
test dl, 0C0h ; check match offset mode in token (X bit)
js .rep_match_or_large_offset
;;cmp dl,040H ; check if this is a 5 or 9-bit offset (Y bit)
; Y bit already known from the ZF of the 'test dl,0C0h' above
; (X is clear here, so ZF clear means bit 6 is set)
xchg ecx, eax ; clear ah - cx is zero from the rep movsb above
jne .offset_9_bit
; 5 bit offset
cmp dl, 020H ; CF = NOT bit 5 (Z); folded into the offset below
call .get_nibble_x
jmp .dec_offset_top
.offset_9_bit: ; 9 bit offset
lodsb ; get 8 bit offset from stream in A
dec ah ; set offset bits 15-8 to 1
test dl, 020H ; test bit Z (offset bit 8)
je .get_match_length
.dec_offset_top:
dec ah ; Z set: take another 256 off the (negative) offset
jmp .get_match_length
.rep_match_or_large_offset:
;;cmp dl,0c0H ; check if this is a 13-bit offset or a 16-bit offset/rep match (Y bit)
jpe .rep_match_or_16_bit ; PF from 'test dl,0C0h': bit 7 is set here, so even parity means bit 6 is set too
; 13 bit offset
cmp dl, 0A0H ; test bit 5 (knowing that bit 7 is also set)
xchg ah, al ; build the high offset byte in al (eax = 0 here)
call .get_nibble_x
sub al, 2 ; subtract 512 from the offset's high byte
jmp .get_match_length_1
.rep_match_or_16_bit:
test dl, 020H ; test bit Z (offset bit 8)
jne .repeat_match ; rep-match: reuse previous offset in ebp
; 16 bit offset
lodsb ; Get high byte of the 2-byte match offset
.get_match_length_1:
xchg ah, al ; move high byte into ah
lodsb ; load match offset bits 0-7
.get_match_length:
xchg ebp, eax ; ebp: offset (kept for future rep-matches)
.repeat_match:
xchg eax, edx ; ax: original token
and al, 07H ; isolate match length in token (MMM)
add al, 2 ; add MIN_MATCH_SIZE_V2
cmp al, 09H ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
jne .got_matchlen ; no, we have the full match length from the token, go copy
call .get_nibble ; get extra match length nibble
add al, cl ; add len from token to nibble
cmp al, 018H ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
jne .got_matchlen ; no, we have the full match length from the token, go copy
lodsb ; grab extra length byte
add al,018H ; overflow?
jnc .got_matchlen ; if not, we have the entire length
je .done_decompressing ; detect EOD code (CF+ZF)
lodsw ; grab 16-bit length
.got_matchlen:
xchg ecx, eax ; copy match length into ecx
xchg esi, eax ; stash input pointer in eax
movsx ebp, bp ; sign-extend bp to 32-bits
lea esi,[ebp+edi] ; esi now points at back reference in output data
rep movsb ; copy match (byte copy is overlap-safe)
xchg esi, eax ; restore esi
jmp .decode_token ; go decode another token
.done_decompressing:
sub edi, [esp+32+4] ; size = edi - outbuf (outbuf taken from stack arg)
mov [esp+28], edi ; patch saved eax so popad returns decompressed size
popad
ret ; done
; Build the high byte of a 5/13-bit offset from token bit 5 (in CF,
; inverted by the preceding cmp) plus one extra nibble from the stream.
.get_nibble_x:
cmc ; carry set if bit 4 was set
rcr al, 1 ; al = 0 on entry, so this puts that bit in bit 7
call .get_nibble ; get nibble for offset bits 0-3
or al, cl ; merge nibble
rol al, 1 ; al = nibble in bits 4-1, token bit in bit 0
xor al, 0E1H ; set offset bits 7-5 to 1
ret
; Return the next 4-bit nibble in cl. Nibbles are consumed two at a
; time from one stream byte; bh toggles 01H <-> 0FFH to track whether
; a buffered nibble is still available in bl.
.get_nibble:
neg bh ; nibble ready?
jns .has_nibble ; yes: the other half of bl is still pending
xchg ebx, eax
lodsb ; load two nibbles
xchg ebx, eax ; bl = nibble pair, eax preserved
.has_nibble:
mov cl, 4 ; swap 4 high and low bits of nibble
ror bl, cl
mov cl, 0FH
and cl, bl ; return current nibble in cl
ret

210
asm/z80/unlzsa1_fast.asm Normal file
View File

@ -0,0 +1,210 @@
;
; Speed-optimized LZSA1 decompressor by spke & uniabis (113 bytes)
;
; ver.00 by spke for LZSA 0.5.4 (03-24/04/2019, 134 bytes);
; ver.01 by spke for LZSA 0.5.6 (25/04/2019, 110(-24) bytes, +0.2% speed);
; ver.02 by spke for LZSA 1.0.5 (24/07/2019, added support for backward decompression);
; ver.03 by uniabis (30/07/2019, 109(-1) bytes, +3.5% speed);
; ver.04 by spke (31/07/2019, small re-organization of macros);
; ver.05 by uniabis (22/08/2019, 107(-2) bytes, same speed);
; ver.06 by spke for LZSA 1.0.7 (27/08/2019, 111(+4) bytes, +2.1% speed);
; ver.07 by spke for LZSA 1.1.0 (25/09/2019, added full revision history);
; ver.08 by spke for LZSA 1.1.2 (22/10/2019, re-organized macros and added an option for unrolled copying of long matches);
; ver.09 by spke for LZSA 1.2.1 (02/01/2020, 109(-2) bytes, same speed);
; ver.10 by spke (07/04/2021, 113(+4) bytes, +5% speed)
;
; The data must be compressed using the command line compressor by Emmanuel Marty
; The compression is done as follows:
;
; lzsa.exe -f1 -r <sourcefile> <outfile>
;
; where option -r asks for the generation of raw (frame-less) data.
;
; The decompression is done in the standard way:
;
; ld hl,FirstByteOfCompressedData
; ld de,FirstByteOfMemoryForDecompressedData
; call DecompressLZSA1
;
; Backward compression is also supported; you can compress files backward using:
;
; lzsa.exe -f1 -r -b <sourcefile> <outfile>
;
; and decompress the resulting files using:
;
; ld hl,LastByteOfCompressedData
; ld de,LastByteOfMemoryForDecompressedData
; call DecompressLZSA1
;
; (do not forget to uncomment the BACKWARD_DECOMPRESS option in the decompressor).
;
; Of course, LZSA compression algorithms are (c) 2019 Emmanuel Marty,
; see https://github.com/emmanuel-marty/lzsa for more information
;
; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; DEFINE UNROLL_LONG_MATCHES ; uncomment for faster decompression of very compressible data (+51 byte)
; DEFINE BACKWARD_DECOMPRESS ; uncomment to decompress backward compressed data (-3% speed, +5 bytes)
IFNDEF BACKWARD_DECOMPRESS
MACRO NEXT_HL
inc hl
ENDM
MACRO ADD_OFFSET
; HL = DE+HL
add hl,de
ENDM
MACRO COPY1
ldi
ENDM
MACRO COPYBC
ldir
ENDM
ELSE
MACRO NEXT_HL
dec hl
ENDM
MACRO ADD_OFFSET
; HL = DE-HL
ld a,e : sub l : ld l,a
ld a,d : sbc h : ld h,a ; 6*4 = 24t / 6 bytes
ENDM
MACRO COPY1
ldd
ENDM
MACRO COPYBC
lddr
ENDM
ENDIF
; Speed-optimized LZSA1 decompression loop.
; HL = source (compressed), DE = destination, B kept at 0 so that BC can
; be used as a copy counter by loading only C. The source pointer is
; parked on the stack while HL temporarily holds the (negative) match
; offset; 'exa' (ex af,af') parks the token while literal lengths are
; decoded. NEXT_HL/ADD_OFFSET/COPY1/COPYBC abstract forward vs backward
; decompression (see the macro definitions above).
@DecompressLZSA1:
ld b,0 : jr ReadToken
IFNDEF UNROLL_LONG_MATCHES
; entry for extended match lengths that still fit in one byte
CopyMatch2: ld c,a
.UseC ex (sp),hl : jr CopyMatch.UseC ; swap src back onto stack, HL = offset
ENDIF
; LLL=0: A is 0 from 'and #70', so xor re-reads the token unmodified
NoLiterals: xor (hl) : NEXT_HL : jp m,LongOffset
ShortOffset: push hl : ld l,(hl) : ld h,#FF ; save src; HL = sign-extended 1-byte offset
; short matches have length 0+3..14+3
add 3 : cp 15+3 : jr nc,LongerMatch
; placed here this saves a JP per iteration
CopyMatch: ld c,a ; BC = len, DE = dest, HL = offset, SP -> [src]
.UseC ADD_OFFSET ; BC = len, DE = dest, HL = dest-offset, SP->[src]
COPY1 : COPY1 : COPYBC ; BC = 0, DE = dest
.popSrc pop hl : NEXT_HL ; HL = src
ReadToken: ; first a byte token "O|LLL|MMMM" is read from the stream,
; where LLL is the number of literals and MMMM is
; a length of the match that follows after the literals
ld a,(hl) : and #70 : jr z,NoLiterals
cp #70 : jr z,MoreLiterals ; LLL=7 means 7+ literals...
rrca : rrca : rrca : rrca : ld c,a ; LLL<7 means 0..6 literals...
ld a,(hl) : NEXT_HL ; re-read token, advance past it
COPYBC ; copy the C literals from the stream
; the top bit of token is set if the offset contains two bytes
and #8F : jp p,ShortOffset ; A = O|0000|MMMM
LongOffset: ; read second byte of the offset
ld c,(hl) : NEXT_HL : push hl : ld h,(hl) : ld l,c ; save src; HL = 16-bit offset
add -128+3 : cp 15+3 : jp c,CopyMatch ; strip the O bit (A=#80+MMMM) and add MIN_MATCH_SIZE
IFNDEF UNROLL_LONG_MATCHES
; MMMM=15 indicates a multi-byte number of literals
LongerMatch: ex (sp),hl : NEXT_HL : add (hl) : jr nc,CopyMatch2 ; HL = src again; add the extra length byte
; the codes are designed to overflow;
; the overflow value 1 means read 1 extra byte
; and overflow value 0 means read 2 extra bytes
.code1 ld b,a : NEXT_HL : ld c,(hl) : jr nz,CopyMatch2.UseC
.code0 NEXT_HL : ld b,(hl)
; the two-byte match length equal to zero
; designates the end-of-data marker
ld a,b : or c : jr nz,CopyMatch2.UseC
pop bc : ret ; drop the saved offset and return
ELSE
; MMMM=15 indicates a multi-byte number of literals
LongerMatch: ex (sp),hl : NEXT_HL : add (hl) : jr c,VeryLongMatch
ld c,a
.UseC ex (sp),hl
ADD_OFFSET
COPY1 : COPY1
; this is an unrolled equivalent of LDIR
xor a : sub c ; compute the entry point into the COPY1 run
and 16-1 : add a ; (self-modifying jr below)
ld (.jrOffset),a : jr nz,$+2
.jrOffset EQU $-1
.fastLDIR DUP 16
COPY1
EDUP
jp pe,.fastLDIR ; P/V stays set while BC != 0
jr CopyMatch.popSrc
VeryLongMatch: ; the codes are designed to overflow;
; the overflow value 1 means read 1 extra byte
; and overflow value 0 means read 2 extra bytes
.code1 ld b,a : NEXT_HL : ld c,(hl) : jr nz,LongerMatch.UseC
.code0 NEXT_HL : ld b,(hl)
; the two-byte match length equal to zero
; designates the end-of-data marker
ld a,b : or c : jr nz,LongerMatch.UseC
pop bc : ret ; drop the saved offset and return
ENDIF
MoreLiterals: ; there are three possible situations here
; A = #70, so xor clears LLL: A' = O|000|MMMM saved for after the literals
xor (hl) : NEXT_HL : exa
ld a,7 : add (hl) : jr c,ManyLiterals ; literal count = 7 + extra byte, carry = 249+
CopyLiterals: ld c,a
.UseC NEXT_HL : COPYBC ; skip the length byte(s), copy the literals
exa : jp p,ShortOffset : jr LongOffset ; restore token, dispatch on the O bit
ManyLiterals:
.code1 ld b,a : NEXT_HL : ld c,(hl) : jr nz,CopyLiterals.UseC ; NZ: 16-bit count in BC (1 extra byte)
.code0 NEXT_HL : ld b,(hl) : jr CopyLiterals.UseC ; Z: read a second extra byte into B

View File

@ -1,20 +1,39 @@
;
; Size-optimized LZSA decompressor by spke (v.1 23/04/2019, 69 bytes)
; Size-optimized LZSA1 decompressor by spke & uniabis (67 bytes)
;
; ver.00 by spke for LZSA 0.5.4 (23/04/2019, 69 bytes);
; ver.01 by spke for LZSA 1.0.5 (24/07/2019, added support for backward decompression);
; ver.02 by uniabis (30/07/2019, 68(-1) bytes, +3.2% speed);
; ver.03 by spke for LZSA 1.0.7 (31/07/2019, small re-organization of macros);
; ver.04 by spke (06/08/2019, 67(-1) bytes, -1.2% speed);
; ver.05 by spke for LZSA 1.1.0 (25/09/2019, added full revision history)
;
; The data must be compressed using the command line compressor by Emmanuel Marty
; The compression is done as follows:
;
; lzsa.exe -r <sourcefile> <outfile>
; lzsa.exe -f1 -r <sourcefile> <outfile>
;
; where option -r asks for the generation of raw (frame-less) data.
;
; The decompression is done in the standard way:
;
; ld hl,CompressedData
; ld de,WhereToDecompress
; call DecompressLZSA
; ld hl,FirstByteOfCompressedData
; ld de,FirstByteOfMemoryForDecompressedData
; call DecompressLZSA1
;
; Of course, LZSA compression algorithm is (c) 2019 Emmanuel Marty,
; Backward compression is also supported; you can compress files backward using:
;
; lzsa.exe -f1 -r -b <sourcefile> <outfile>
;
; and decompress the resulting files using:
;
; ld hl,LastByteOfCompressedData
; ld de,LastByteOfMemoryForDecompressedData
; call DecompressLZSA1
;
; (do not forget to uncomment the BACKWARD_DECOMPRESS option in the decompressor).
;
; Of course, LZSA compression algorithms are (c) 2019 Emmanuel Marty,
; see https://github.com/emmanuel-marty/lzsa for more information
;
; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
@ -34,49 +53,80 @@
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
@DecompressLZSA:
; DEFINE BACKWARD_DECOMPRESS
IFNDEF BACKWARD_DECOMPRESS
MACRO NEXT_HL
inc hl
ENDM
MACRO ADD_OFFSET
ex de,hl : add hl,de
ENDM
MACRO BLOCKCOPY
ldir
ENDM
ELSE
MACRO NEXT_HL
dec hl
ENDM
MACRO ADD_OFFSET
push hl : or a : sbc hl,de : pop de ; 11+4+15+10 = 40t / 5 bytes
ENDM
MACRO BLOCKCOPY
lddr
ENDM
ENDIF
@DecompressLZSA1:
ld b,0
; first a byte token "O|LLL|MMMM" is read from the stream,
; where LLL is the number of literals and MMMM is
; a length of the match that follows after the literals
ReadToken: ld a,(hl) : exa : ld a,(hl) : inc hl
ReadToken: ld a,(hl) : NEXT_HL : push af
and #70 : jr z,NoLiterals
rrca : rrca : rrca : rrca ; LLL<7 means 0..6 literals...
cp #07 : call z,ReadLongBA ; LLL=7 means 7+ literals...
ld c,a : ldir
ld c,a : BLOCKCOPY
; next we read the low byte of the -offset
NoLiterals: push de : ld e,(hl) : inc hl : ld d,#FF
NoLiterals: pop af : push de : ld e,(hl) : NEXT_HL : ld d,#FF
; the top bit of token is set if
; the offset contains the high byte as well
exa : or a : jp p,ShortOffset
or a : jp p,ShortOffset
LongOffset: ld d,(hl) : inc hl
LongOffset: ld d,(hl) : NEXT_HL
; last but not least, the match length is read
ShortOffset: and #0F : add 3 ; MMMM<15 means match lengths 0+3..14+3
cp 15+3 : call z,ReadLongBA ; MMMM=15 means lengths 14+3+
ld c,a
ex (sp),hl : push hl ; BC = len, DE = -offset, HL = dest, SP ->[dest,src]
add hl,de : pop de ; BC = len, DE = dest, HL = dest+(-offset), SP->[src]
ldir : pop hl ; BC = 0, DE = dest, HL = src
jr ReadToken
ex (sp),hl ; BC = len, DE = -offset, HL = dest, SP -> [src]
ADD_OFFSET ; BC = len, DE = dest, HL = dest+(-offset), SP -> [src]
BLOCKCOPY ; BC = 0, DE = dest
pop hl : jr ReadToken ; HL = src
; a standard routine to read extended codes
; into registers B (higher byte) and A (lower byte).
ReadLongBA: add (hl) : inc hl : ret nc
ReadLongBA: add (hl) : NEXT_HL : ret nc
; the codes are designed to overflow;
; the overflow value 1 means read 1 extra byte
; and overflow value 0 means read 2 extra bytes
.code1: ld b,a : ld a,(hl) : inc hl : ret nz
.code0: ld c,a : ld b,(hl) : inc hl
.code1: ld b,a : ld a,(hl) : NEXT_HL : ret nz
.code0: ld c,a : ld b,(hl) : NEXT_HL
; the two-byte match length equal to zero
; designates the end-of-data marker

265
asm/z80/unlzsa2_fast.asm Normal file
View File

@ -0,0 +1,265 @@
;
; Speed-optimized LZSA2 decompressor by spke & uniabis (210 bytes)
;
; ver.00 by spke for LZSA 1.0.0 (02-07/06/2019, 218 bytes);
; ver.01 by spke for LZSA 1.0.5 (24/07/2019, added support for backward decompression);
; ver.02 by spke for LZSA 1.0.6 (27/07/2019, fixed a bug in the backward decompressor);
; ver.03 by uniabis (30/07/2019, 213(-5) bytes, +3.8% speed and support for Hitachi HD64180);
; ver.04 by spke for LZSA 1.0.7 (01/08/2019, 214(+1) bytes, +0.2% speed and small re-organization of macros);
; ver.05 by spke (27/08/2019, 216(+2) bytes, +1.1% speed);
; ver.06 by spke for LZSA 1.1.0 (26/09/2019, added full revision history);
; ver.07 by spke for LZSA 1.1.1 (10/10/2019, +0.2% speed and an option for unrolled copying of long matches);
; ver.08 by spke (07-08/04/2022, 210(-6) bytes, +1.7% speed, using self-modifying code by default)
;
; The data must be compressed using the command line compressor by Emmanuel Marty
; The compression is done as follows:
;
; lzsa.exe -f2 -r <sourcefile> <outfile>
;
; where option -r asks for the generation of raw (frame-less) data.
;
; The decompression is done in the standard way:
;
; ld hl,FirstByteOfCompressedData
; ld de,FirstByteOfMemoryForDecompressedData
; call DecompressLZSA2
;
; Backward compression is also supported; you can compress files backward using:
;
; lzsa.exe -f2 -r -b <sourcefile> <outfile>
;
; and decompress the resulting files using:
;
; ld hl,LastByteOfCompressedData
; ld de,LastByteOfMemoryForDecompressedData
; call DecompressLZSA2
;
; (do not forget to uncomment the BACKWARD_DECOMPRESS option in the decompressor).
;
; Of course, LZSA2 compression algorithms are (c) 2019 Emmanuel Marty,
; see https://github.com/emmanuel-marty/lzsa for more information
;
; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; DEFINE UNROLL_LONG_MATCHES ; uncomment for faster decompression of very compressible data (+38 bytes)
; DEFINE BACKWARD_DECOMPRESS ; uncomment for data compressed with option -b (+5 bytes, -3.2% speed)
IFNDEF BACKWARD_DECOMPRESS
MACRO NEXT_HL
inc hl
ENDM
MACRO ADD_OFFSET
add hl,de
ENDM
MACRO COPY1
ldi
ENDM
MACRO COPYBC
ldir
ENDM
ELSE
MACRO NEXT_HL
dec hl
ENDM
MACRO ADD_OFFSET
; HL = DE - HL
ld a,e : sub l : ld l,a
ld a,d : sbc h : ld h,a ; 6*4 = 24t / 6 bytes
ENDM
MACRO COPY1
ldd
ENDM
MACRO COPYBC
lddr
ENDM
ENDIF
; Speed-optimized LZSA2 decompressor entry point.
; In:  HL = compressed data, DE = destination buffer.
; Register conventions used throughout:
;   A' holds the spare nibble (as %1111....) of the nibble store;
;   the carry flag saved with it (via scf/exa) marks the store as empty.
@DecompressLZSA2:
; A' stores next nibble as %1111.... or assumed to contain trash
; B is assumed to be 0 in many places
ld b,0 : scf : exa : jr ReadToken
; literal run of 18+ bytes: add the extra length byte; on overflow,
; a full 16-bit literal count follows in the stream
ManyLiterals: ld a,18 : add (hl) : NEXT_HL : jr nc,CopyMoreLiterals
ld c,(hl) : NEXT_HL
ld a,b : ld b,(hl)
jr ReadToken.NEXTHLuseBC
; literal run needs a nibble-encoded extension (token LL field was 3)
MoreLiterals: ld b,(hl) : NEXT_HL
scf : exa : jr nc,.noUpdate
; nibbles are read left-to-right; spare nibbles are kept in AF'
; and flag NC indicates that a nibble is available
ld a,(hl) : or a : exa
ld a,(hl) : NEXT_HL
rrca : rrca : rrca : rrca
.noUpdate or #F0
;sub #F0-3 : cp 15+3 : jr z,ManyLiterals
inc a : jr z,ManyLiterals : sub #F0-3+1
; copy C literals; the first two are copied unrolled for speed
CopyMoreLiterals: ld c,a : ld a,b : ld b,0
COPY1
COPY1
COPYBC
or a : jp p,CASE0xx
cp %11000000 : jr c,CASE10x
; "111": repeated offset
CASE11x cp %11100000 : jr nc,MatchLen
; "110": 16-bit offset
CASE110: ld b,(hl) : NEXT_HL : jr ReadOffsetC
Literals0011: jr nz,MoreLiterals
; if "LL" of the byte token is equal to 0,
; there are no literals to copy
NoLiterals: or (hl) : NEXT_HL
jp m,CASE1xx
; short (5 or 9 bit long) offsets
CASE0xx cp %01000000 : jr c,CASE00x
; "01x": the case of the 9-bit offset
CASE01x: dec b : cp %01100000 : rl b
; low offset byte comes from the stream; offset is then remembered
; by patching the ld hl,nn operand at CopyMatch.PrevOffset
ReadOffsetC ld c,(hl) : NEXT_HL
SaveOffset ld (CopyMatch.PrevOffset),bc : ld b,0
; decode the 3-bit match length field; MMM=7 means an extended length follows
MatchLen inc a : and %00000111 : jr z,LongerMatch : inc a
CopyMatch: ld c,a
.useC push hl
.PrevOffset EQU $+1 : ld hl,0 ; self-modified: holds the last offset
ADD_OFFSET
COPY1
COPYBC
.popSrc pop hl
; compressed data stream contains records
; each record begins with the byte token "XYZ|LL|MMM"
ReadToken: ld a,(hl) : and %00011000 : jp pe,Literals0011 ; process the cases 00 and 11 separately
rrca : rrca : rrca
ld c,a : ld a,(hl) ; token is re-read for further processing
.NEXTHLuseBC NEXT_HL
COPYBC
; the token and literals are followed by the offset
or a : jp p,CASE0xx
CASE1xx cp %11000000 : jr nc,CASE11x
; "10x": the case of the 13-bit offset
CASE10x: ld c,a : exa : jr nc,.noUpdate
ld a,(hl) : or a : exa
ld a,(hl) : NEXT_HL
rrca : rrca : rrca : rrca
.noUpdate or #F0 : ld b,a : ld a,c
cp %10100000 : dec b : rl b : jr ReadOffsetC
; "00x": the case of the 5-bit offset
CASE00x: ld b,a : exa : jr nc,.noUpdate
ld a,(hl) : or a : exa
ld a,(hl) : NEXT_HL
rrca : rrca : rrca : rrca
.noUpdate or #F0 : ld c,a : ld a,b
cp %00100000 : rl c
ld b,#FF : jr SaveOffset
; extended match length: read a nibble; value 15 means more bytes follow
LongerMatch: scf : exa : jr nc,.noUpdate
ld a,(hl) : or a : exa
ld a,(hl) : NEXT_HL
rrca : rrca : rrca : rrca
.noUpdate or #F0 : sub #F0-9 : cp 15+9 : jr c,CopyMatch
IFNDEF UNROLL_LONG_MATCHES
; long match: add the extra byte; overflow means a 16-bit length follows,
; and a 16-bit length of zero is the end-of-data marker (plain RET)
LongMatch: add (hl) : NEXT_HL : jr nc,CopyMatch
ld c,(hl) : NEXT_HL
ld b,(hl) : NEXT_HL : jr nz,CopyMatch.useC
ret
ELSE
LongMatch: add (hl) : NEXT_HL : jr c,VeryLongMatch
ld c,a
.useC push hl
ld hl,(CopyMatch.PrevOffset)
ADD_OFFSET
; this is an unrolled equivalent of LDIR
; (the jr below is patched to enter the unrolled run mid-way
; so that len mod 8 bytes are copied on the first pass)
xor a : sub c
and 8-1 : add a
ld (.jrOffset),a : jr nz,$+2
.jrOffset EQU $-1
.fastLDIR DUP 8
COPY1
EDUP
jp pe,.fastLDIR
jp CopyMatch.popSrc
VeryLongMatch: ld c,(hl) : NEXT_HL
ld b,(hl) : NEXT_HL : jr nz,LongMatch.useC
ret
ENDIF

View File

@ -1,167 +0,0 @@
;
; Speed-optimized LZSA2 decompressor by spke (v.1 02-07/06/2019, 218 bytes)
;
; The data must be compressed using the command line compressor by Emmanuel Marty
; The compression is done as follows:
;
; lzsa.exe -f2 -r <sourcefile> <outfile>
;
; where option -r asks for the generation of raw (frame-less) data.
;
; The decompression is done in the standard way:
;
; ld hl,CompressedData
; ld de,WhereToDecompress
; call DecompressLZSA2
;
; Of course, LZSA2 compression algorithm is (c) 2019 Emmanuel Marty,
; see https://github.com/emmanuel-marty/lzsa for more information
;
; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
;
; Superseded speed-optimized LZSA2 decompressor (v.1, removed by this commit).
; In:  HL = compressed data, DE = destination buffer.
; Differences from its replacement: repeat offset is kept in IX instead of
; self-modifying code, and the nibble-store-empty state is signalled by the
; sign flag of A' (jp m) rather than the carry flag.
@DecompressLZSA2:
; A' stores next nibble as %1111.... or assumed to contain trash
; B is assumed to be 0
xor a : ld b,a : exa : jr ReadToken
; extended match length: read a nibble and rebase; 15+9 means more bytes follow
LongerMatch: exa : jp m,.noUpdate
ld a,(hl) : or #F0 : exa
ld a,(hl) : inc hl : or #0F
rrca : rrca : rrca : rrca
.noUpdate sub #F0-9 : cp 15+9 : jr c,CopyMatch
;inc a : jr z,LongMatch : sub #F0-9+1 : jp CopyMatch
LongMatch: ;ld a,24 :
add (hl) : inc hl : jr nc,CopyMatch
ld c,(hl) : inc hl
ld b,(hl) : inc hl
jr nz,CopyMatch.useC
; 16-bit match length of zero is the end-of-data marker
pop de : ret
; literal run of 18+ bytes: extra byte, then possibly a 16-bit count
ManyLiterals: ld a,18 :
add (hl) : inc hl : jr nc,CopyLiterals
ld c,(hl) : inc hl
ld a,b : ld b,(hl) : inc hl
jr CopyLiterals.useBC
; literal run needs a nibble-encoded extension (token LL field was 3)
MoreLiterals: ld b,(hl) : inc hl
exa : jp m,.noUpdate
ld a,(hl) : or #F0 : exa
ld a,(hl) : inc hl : or #0F
rrca : rrca : rrca : rrca
.noUpdate ;sub #F0-3 : cp 15+3 : jr z,ManyLiterals
inc a : jr z,ManyLiterals : sub #F0-3+1
CopyLiterals: ld c,a
.useC ld a,b : ld b,0
.useBC ldir
push de : or a : jp p,CASE0xx : jr CASE1xx
; if "LL" of the byte token is equal to 0,
; there are no literals to copy
NoLiterals: xor (hl) : inc hl
push de : jp m,CASE1xx
; short (5 or 9 bit long) offsets
CASE0xx ld d,#FF : cp %01000000 : jr c,CASE00x
; "01x": the case of the 9-bit offset
CASE01x: cp %01100000 : rl d
ReadOffsetE: ld e,(hl) : inc hl
; remember the offset in IX for the "111" repeat-offset case
SaveOffset: ld ixl,e : ld ixh,d
MatchLen: inc a : and %00000111 : jr z,LongerMatch : inc a
CopyMatch: ld c,a
.useC ex (sp),hl : push hl ; BC = len, DE = offset, HL = dest, SP ->[dest,src]
add hl,de : pop de ; BC = len, DE = dest, HL = dest-offset, SP->[src]
ldir : pop hl
; compressed data stream contains records
; each record begins with the byte token "XYZ|LL|MMM"
ReadToken: ld a,(hl) : and %00011000 : jr z,NoLiterals
jp pe,MoreLiterals ; 00 has already been processed; this identifies the case of 11
rrca : rrca : rrca
ld c,a : ld a,(hl) : inc hl ; token is re-read for further processing
ldir
; the token and literals are followed by the offset
push de : or a : jp p,CASE0xx
CASE1xx cp %11000000 : jr nc,CASE11x
; "10x": the case of the 13-bit offset
CASE10x: ld c,a : xor a
exa : jp m,.noUpdate
ld a,(hl) : or #F0 : exa
ld a,(hl) : inc hl : or #0F
rrca : rrca : rrca : rrca
.noUpdate ld d,a : ld a,c
cp %10100000 : rl d
dec d : dec d : jr ReadOffsetE ; dec d twice = subtract 512 from the offset
; "00x": the case of the 5-bit offset
CASE00x: ld c,a : xor a
exa : jp m,.noUpdate
ld a,(hl) : or #F0 : exa
ld a,(hl) : inc hl : or #0F
rrca : rrca : rrca : rrca
.noUpdate ld e,a : ld a,c
cp %00100000 : rl e : jp SaveOffset
; two remaining cases
CASE11x cp %11100000 : jr c,CASE110
; "111": repeated offset
CASE111: ld e,ixl : ld d,ixh : jr MatchLen
; "110": 16-bit offset
CASE110: ld d,(hl) : inc hl : jr ReadOffsetE
;ReadNibble: ; 17 bytes, 44 t-state per nibble
; exa : ret m ; 4+11 = 15t
;UpdateNibble:
; ld a,(hl) : or #F0 : exa
; ld a,(hl) : inc hl : or #0F
; rrca : rrca : rrca : rrca : ret ; 4+5 + 7+7+4+7+6+7+4+4+4+4+10 = 73t

197
asm/z80/unlzsa2_small.asm Normal file
View File

@ -0,0 +1,197 @@
;
; Size-optimized LZSA2 decompressor by spke & uniabis (134 bytes)
;
; ver.00 by spke for LZSA 1.0.0 (02-09/06/2019, 145 bytes);
; ver.01 by spke for LZSA 1.0.5 (24/07/2019, added support for backward decompression);
; ver.02 by uniabis (30/07/2019, 144(-1) bytes, +3.3% speed and support for Hitachi HD64180);
; ver.03 by spke for LZSA 1.0.7 (01/08/2019, 140(-4) bytes, -1.4% speed and small re-organization of macros);
; ver.04 by spke for LZSA 1.1.0 (26/09/2019, removed usage of IY, added full revision history)
; ver.05 by spke for LZSA 1.1.1 (11/10/2019, 139(-1) bytes, +0.1% speed)
; ver.06 by spke (11-12/04/2021, added some comments)
; ver.07 by spke (04-05/04/2022, 134(-5) bytes, +1% speed, using self-modifying code by default)
;
; The data must be compressed using the command line compressor by Emmanuel Marty
; The compression is done as follows:
;
; lzsa.exe -f2 -r <sourcefile> <outfile>
;
; where option -r asks for the generation of raw (frame-less) data.
;
; The decompression is done in the standard way:
;
; ld hl,FirstByteOfCompressedData
; ld de,FirstByteOfMemoryForDecompressedData
; call DecompressLZSA2
;
; Backward compression is also supported; you can compress files backward using:
;
; lzsa.exe -f2 -r -b <sourcefile> <outfile>
;
; and decompress the resulting files using:
;
; ld hl,LastByteOfCompressedData
; ld de,LastByteOfMemoryForDecompressedData
; call DecompressLZSA2
;
; (do not forget to uncomment the BACKWARD_DECOMPRESS option in the decompressor).
;
; Of course, LZSA2 compression algorithms are (c) 2019 Emmanuel Marty,
; see https://github.com/emmanuel-marty/lzsa for more information
;
; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; DEFINE BACKWARD_DECOMPRESS ; uncomment for data compressed with option -b (+5 bytes, -3% speed)
; DEFINE AVOID_SELFMODIFYING_CODE ; uncomment to disallow self-modifying code (-1 byte, -4% speed)
; Direction-abstraction macros for the size-optimized decompressor:
; one body assembles for forward or backward decompression depending on
; the BACKWARD_DECOMPRESS define.
IFNDEF BACKWARD_DECOMPRESS
; advance the stream pointer to the next byte (forward direction)
MACRO NEXT_HL
inc hl
ENDM
; HL = match source: HL holds the stored offset, DE the destination
MACRO ADD_OFFSET
add hl,de
ENDM
; copy BC bytes (HL)->(DE), forward
MACRO BLOCKCOPY
ldir
ENDM
ELSE
; step the stream pointer to the previous byte (backward direction)
MACRO NEXT_HL
dec hl
ENDM
MACRO ADD_OFFSET
;push hl : or a : sbc hl,de : pop de ; 11+4+15+10 = 40t / 5 bytes
; HL = DE - HL
ld a,e : sub l : ld l,a
ld a,d : sbc h : ld h,a ; 6*4 = 24t / 6 bytes
ENDM
; copy BC bytes (HL)->(DE), moving backwards
MACRO BLOCKCOPY
lddr
ENDM
ENDIF
; Size-optimized LZSA2 decompressor entry point.
; In:  HL = compressed data, DE = destination buffer.
; The previous match offset is kept either in the self-modified operand of
; "ld hl,0" at .PrevOffset or (with AVOID_SELFMODIFYING_CODE) in IX.
@DecompressLZSA2:
; in many places we assume that B = 0
; flag P in A' signals the need to re-load the nibble store
xor a : ld b,a : exa : jr .ReadToken
.CASE00x: ; token "00Z" stands for 5-bit offsets
; (read a nibble for offset bits 1-4 and use the inverted bit Z
; of the token as bit 0 of the offset; set bits 5-15 of the offset to 1)
push af
call ReadNibble.skipLDCA : ld c,a
pop af
cp %00100000 : rl c : jr .SaveOffset
.CASE0xx dec b : cp %01000000 : jr c,.CASE00x
.CASE01x: ; token "01Z" stands for 9-bit offsets
; (read a byte for offset bits 0-7 and use the inverted bit Z
; for bit 8 of the offset; set bits 9-15 of the offset to 1)
cp %01100000
.doRLB rl b
.OffsetReadC: ld c,(hl) : NEXT_HL
IFNDEF AVOID_SELFMODIFYING_CODE
.SaveOffset: ld (.PrevOffset),bc : ld b,0
ELSE
.SaveOffset: push bc : pop ix : ld b,0
ENDIF
; match length = MMM + 2; value 9 means an extended length follows
.MatchLen: and %00000111 : add 2 : cp 9
call z,ExtendedCode
.CopyMatch: ld c,a
push hl ; BC = len, DE = dest, HL = -offset, SP -> [src]
IFNDEF AVOID_SELFMODIFYING_CODE
.PrevOffset EQU $+1 : ld hl,0 ; self-modified: holds the last offset
ELSE
push ix : pop hl
ENDIF
ADD_OFFSET
BLOCKCOPY ; BC = 0, DE = dest
pop hl ; HL = src
; each record begins with the byte token "XYZ|LL|MMM"
.ReadToken: ld a,(hl) : NEXT_HL : push af
and %00011000 : jr z,.NoLiterals
rrca : rrca : rrca
call pe,ExtendedCode ; LL=3 means an extended literal count follows
ld c,a
BLOCKCOPY
.NoLiterals: pop af : or a : jp p,.CASE0xx
.CASE1xx cp %11000000 : jr c,.CASE10x
; token "111" stands for repeat offsets
; (reuse the offset value of the previous match command)
cp %11100000 : jr nc,.MatchLen
.CASE110: ; token "110" stands for 16-bit offset
; (read a byte for offset bits 8-15, then another byte for offset bits 0-7)
ld b,(hl) : NEXT_HL : jr .OffsetReadC
.CASE10x: ; token "10Z" stands for 13-bit offsets
; (read a nibble for offset bits 9-12 and use the inverted bit Z
; for bit 8 of the offset, then read a byte for offset bits 0-7.
; set bits 13-15 of the offset to 1. subtract 512 from the offset to get the final value)
call ReadNibble : ld b,a
ld a,c : cp %10100000
dec b : jr .doRLB
; shared extended-length decoder for literal and match lengths:
; nibble value 15 means a further byte (and possibly a 16-bit value) follows
ExtendedCode: call ReadNibble : inc a : jr z,ExtraByte
sub #F0+1 : add c : ret
ExtraByte ld a,15 : add c : add (hl) : NEXT_HL : ret nc
ld a,(hl) : NEXT_HL
ld b,(hl) : NEXT_HL : ret nz
; a 16-bit length of zero marks end of data: drop the return address
pop bc ; RET is not needed, because RET from ReadNibble is sufficient
ReadNibble: ld c,a
.skipLDCA xor a : exa : ret m
ld a,(hl) : or #F0 : exa
ld a,(hl) : NEXT_HL : or #0F
rrca : rrca : rrca : rrca : ret

View File

@ -1,96 +0,0 @@
;
; Size-optimized LZSA2 decompressor by spke (v.1 02-09/06/2019, 145 bytes)
;
; The data must be compressed using the command line compressor by Emmanuel Marty
; The compression is done as follows:
;
; lzsa.exe -f2 -r <sourcefile> <outfile>
;
; where option -r asks for the generation of raw (frame-less) data.
;
; The decompression is done in the standard way:
;
; ld hl,CompressedData
; ld de,WhereToDecompress
; call DecompressLZSA2
;
; Of course, LZSA2 compression algorithm is (c) 2019 Emmanuel Marty,
; see https://github.com/emmanuel-marty/lzsa for more information
;
; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
;
; Superseded size-optimized LZSA2 decompressor (v.1, removed by this commit).
; In:  HL = compressed data, DE = destination buffer.
; Uses IY for the repeat offset and IXL to hold the current token, instead
; of the self-modifying-code scheme of its replacement.
@DecompressLZSA2:
xor a : ld b,a : exa : jr ReadToken
; short (5 or 9 bit) offsets; D pre-set to #FF (high bits of offset are 1)
CASE0xx ld d,#FF : cp %01000000 : jr c,CASE00x
CASE01x: cp %01100000 : rl d
OffsetReadE: ld e,(hl) : inc hl
; remember the offset in IY for the "111" repeat-offset case
SaveOffset: ld iyl,e : ld iyh,d
; match length = MMM + 2; value 9 means an extended length follows
MatchLen: and %00000111 : add 2 : cp 9 : call z,ExtendedCode
CopyMatch: ld c,a
ex (sp),hl : push hl ; BC = len, DE = offset, HL = dest, SP ->[dest,src]
add hl,de : pop de ; BC = len, DE = dest, HL = dest-offset, SP->[src]
ldir : pop hl
; each record begins with the byte token "XYZ|LL|MMM"; token kept in IXL
ReadToken: ld a,(hl) : ld ixl,a : inc hl
and %00011000 : jr z,NoLiterals
rrca : rrca : rrca
call pe,ExtendedCode ; LL=3 means an extended literal count follows
ld c,a
ldir
NoLiterals: push de : ld a,ixl
or a : jp p,CASE0xx
CASE1xx cp %11000000 : jr nc,CASE11x
; "10x": 13-bit offset (nibble + byte; dec d twice subtracts 512)
CASE10x: call ReadNibble
ld d,a : ld a,c
cp %10100000 : rl d
dec d : dec d : jr OffsetReadE
CASE00x: call ReadNibble
ld e,a : ld a,c
cp %00100000 : rl e : jr SaveOffset
CASE11x cp %11100000 : jr c,CASE110
; "111": repeated offset
CASE111: ld e,iyl : ld d,iyh : jr MatchLen
; "110": 16-bit offset
CASE110: ld d,(hl) : inc hl : jr OffsetReadE
; shared extended-length decoder; nibble value 15 means more bytes follow
ExtendedCode: call ReadNibble : inc a : jr z,ExtraByte
sub #F0+1 : add c : ret
ExtraByte ld a,15 : add c : add (hl) : inc hl : ret nc
ld a,(hl) : inc hl
ld b,(hl) : inc hl : ret nz
; a 16-bit length of zero marks end of data: unwind and return
pop de : pop de : ret
ReadNibble: ld c,a : xor a : exa : ret m
UpdateNibble ld a,(hl) : or #F0 : exa
ld a,(hl) : inc hl : or #0F
rrca : rrca : rrca : rrca : ret

View File

@ -1,101 +0,0 @@
;
; Speed-optimized LZSA decompressor by spke (v.1 03-25/04/2019, 110 bytes)
;
; The data must be compressed using the command line compressor by Emmanuel Marty
; The compression is done as follows:
;
; lzsa.exe -r <sourcefile> <outfile>
;
; where option -r asks for the generation of raw (frame-less) data.
;
; The decompression is done in the standard way:
;
; ld hl,CompressedData
; ld de,WhereToDecompress
; call DecompressLZSA
;
; Of course, LZSA compression algorithm is (c) 2019 Emmanuel Marty,
; see https://github.com/emmanuel-marty/lzsa for more information
;
; Drop me an email if you have any comments/ideas/suggestions: zxintrospec@gmail.com
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; Speed-optimized LZSA1 decompressor (removed by this commit).
; In:  HL = compressed data, DE = destination buffer.
; LZSA1 records use the byte token "O|LLL|MMMM": O selects 1- or 2-byte
; offsets, LLL is the literal count, MMMM the match length.
@DecompressLZSA:
ld b,0 : jr ReadToken
NoLiterals: xor (hl) : inc hl
push de : ld e,(hl) : inc hl : jp m,LongOffset
; short matches have length 0+3..14+3
ShortOffset: ld d,#FF : add 3 : cp 15+3 : jr nc,LongerMatch
; placed here this saves a JP per iteration
CopyMatch: ld c,a
.UseC ex (sp),hl : push hl ; BC = len, DE = offset, HL = dest, SP ->[dest,src]
add hl,de : pop de ; BC = len, DE = dest, HL = dest-offset, SP->[src]
ldir : pop hl ; BC = 0, DE = dest, HL = src
ReadToken: ; first a byte token "O|LLL|MMMM" is read from the stream,
; where LLL is the number of literals and MMMM is
; a length of the match that follows after the literals
ld a,(hl) : and #70 : jr z,NoLiterals
cp #70 : jr z,MoreLiterals ; LLL=7 means 7+ literals...
rrca : rrca : rrca : rrca ; LLL<7 means 0..6 literals...
ld c,a : ld a,(hl) : inc hl
ldir
; next we read the first byte of the offset
push de : ld e,(hl) : inc hl
; the top bit of token is set if the offset contains two bytes
and #8F : jp p,ShortOffset
LongOffset: ; read second byte of the offset
ld d,(hl) : inc hl
add -128+3 : cp 15+3 : jp c,CopyMatch
; MMMM=15 indicates a multi-byte number of literals
LongerMatch: add (hl) : inc hl : jr nc,CopyMatch
; the codes are designed to overflow;
; the overflow value 1 means read 1 extra byte
; and overflow value 0 means read 2 extra bytes
.code1 ld b,a : ld c,(hl) : inc hl : jr nz,CopyMatch.UseC
.code0 ld b,(hl) : inc hl
; the two-byte match length equal to zero
; designates the end-of-data marker
ld a,b : or c : jr nz,CopyMatch.UseC
pop de : ret
MoreLiterals: ; there are three possible situations here
xor (hl) : inc hl : exa
ld a,7 : add (hl) : inc hl : jr c,ManyLiterals
CopyLiterals: ld c,a
.UseC ldir
push de : ld e,(hl) : inc hl
exa : jp p,ShortOffset : jr LongOffset
ManyLiterals:
.code1 ld b,a : ld c,(hl) : inc hl : jr nz,CopyLiterals.UseC
.code0 ld b,(hl) : inc hl : jr CopyLiterals.UseC

Binary file not shown.

Before

Width:  |  Height:  |  Size: 34 KiB

After

Width:  |  Height:  |  Size: 37 KiB

View File

@ -30,8 +30,10 @@
*
*/
#define _POSIX_C_SOURCE 200808
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include "format.h"
#include "lib.h"
@ -96,6 +98,6 @@ int lzsa_dictionary_load(const char *pszDictionaryFilename, void **ppDictionaryD
void lzsa_dictionary_free(void **ppDictionaryData) {
if (*ppDictionaryData) {
free(*ppDictionaryData);
ppDictionaryData = NULL;
*ppDictionaryData = NULL;
}
}

View File

@ -1,5 +1,5 @@
/*
* expand_v1.c - LZSA1 block decompressor implementation
* expand_block_v1.c - LZSA1 block decompressor implementation
*
* Copyright (C) 2019 Emmanuel Marty
*
@ -120,7 +120,7 @@ static inline FORCE_INLINE int lzsa_build_match_len_v1(const unsigned char **ppI
*
* @return size of decompressed data in bytes, or -1 for error
*/
int lzsa_decompressor_expand_block_v1(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize) {
int lzsa_decompressor_expand_block_v1(const unsigned char *pInBlock, const int nBlockSize, unsigned char *pOutData, const int nOutDataOffset, const int nBlockMaxSize) {
const unsigned char *pInBlockEnd = pInBlock + nBlockSize;
unsigned char *pCurOutData = pOutData + nOutDataOffset;
const unsigned char *pOutDataEnd = pCurOutData + nBlockMaxSize;
@ -166,7 +166,7 @@ int lzsa_decompressor_expand_block_v1(const unsigned char *pInBlock, int nBlockS
const unsigned char *pSrc = pCurOutData - nMatchOffset;
if (pSrc >= pOutData) {
unsigned int nMatchLen = (unsigned int)(token & 0x0f);
if (nMatchLen != MATCH_RUN_LEN_V1 && nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd) {
if (nMatchLen != MATCH_RUN_LEN_V1 && nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd && (pSrc + 18) <= pOutDataEnd) {
memcpy(pCurOutData, pSrc, 8);
memcpy(pCurOutData + 8, pSrc + 8, 8);
memcpy(pCurOutData + 16, pSrc + 16, 2);
@ -177,29 +177,36 @@ int lzsa_decompressor_expand_block_v1(const unsigned char *pInBlock, int nBlockS
if (nMatchLen == (MATCH_RUN_LEN_V1 + MIN_MATCH_SIZE_V1)) {
if (lzsa_build_match_len_v1(&pInBlock, pInBlockEnd, &nMatchLen))
return -1;
if (nMatchLen == 0)
break;
}
if ((pCurOutData + nMatchLen) <= pOutDataEnd) {
/* Do a deterministic, left to right byte copy instead of memcpy() so as to handle overlaps */
if ((pSrc + nMatchLen) <= pOutDataEnd) {
if ((pCurOutData + nMatchLen) <= pOutDataEnd) {
/* Do a deterministic, left to right byte copy instead of memcpy() so as to handle overlaps */
if (nMatchOffset >= 16 && (pCurOutData + nMatchLen) < (pOutDataFastEnd - 15)) {
const unsigned char *pCopySrc = pSrc;
unsigned char *pCopyDst = pCurOutData;
const unsigned char *pCopyEndDst = pCurOutData + nMatchLen;
if (nMatchOffset >= 16 && (pCurOutData + nMatchLen) < (pOutDataFastEnd - 15)) {
const unsigned char *pCopySrc = pSrc;
unsigned char *pCopyDst = pCurOutData;
const unsigned char *pCopyEndDst = pCurOutData + nMatchLen;
do {
memcpy(pCopyDst, pCopySrc, 16);
pCopySrc += 16;
pCopyDst += 16;
} while (pCopyDst < pCopyEndDst);
do {
memcpy(pCopyDst, pCopySrc, 16);
pCopySrc += 16;
pCopyDst += 16;
} while (pCopyDst < pCopyEndDst);
pCurOutData += nMatchLen;
pCurOutData += nMatchLen;
}
else {
while (nMatchLen) {
*pCurOutData++ = *pSrc++;
nMatchLen--;
}
}
}
else {
while (nMatchLen) {
*pCurOutData++ = *pSrc++;
nMatchLen--;
}
return -1;
}
}
else {

View File

@ -1,5 +1,5 @@
/*
* expand_v1.h - LZSA1 block decompressor definitions
* expand_block_v1.h - LZSA1 block decompressor definitions
*
* Copyright (C) 2019 Emmanuel Marty
*
@ -30,8 +30,8 @@
*
*/
#ifndef _EXPAND_V1_H
#define _EXPAND_V1_H
#ifndef _EXPAND_BLOCK_V1_H
#define _EXPAND_BLOCK_V1_H
/**
* Decompress one LZSA1 data block
@ -44,6 +44,6 @@
*
* @return size of decompressed data in bytes, or -1 for error
*/
int lzsa_decompressor_expand_block_v1(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize);
int lzsa_decompressor_expand_block_v1(const unsigned char *pInBlock, const int nBlockSize, unsigned char *pOutData, const int nOutDataOffset, const int nBlockMaxSize);
#endif /* _EXPAND_V1_H */
#endif /* _EXPAND_BLOCK_V1_H */

View File

@ -1,5 +1,5 @@
/*
* expand_v2.c - LZSA2 block decompressor implementation
* expand_block_v2.c - LZSA2 block decompressor implementation
*
* Copyright (C) 2019 Emmanuel Marty
*
@ -80,6 +80,9 @@ static inline FORCE_INLINE int lzsa_build_len_v2(const unsigned char **ppInBlock
return -1;
}
}
else if ((*nLength) == 256) {
(*nLength) = 0;
}
}
else {
return -1;
@ -106,7 +109,7 @@ static inline FORCE_INLINE int lzsa_build_len_v2(const unsigned char **ppInBlock
*
* @return size of decompressed data in bytes, or -1 for error
*/
int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize) {
int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, const int nBlockSize, unsigned char *pOutData, const int nOutDataOffset, const int nBlockMaxSize) {
const unsigned char *pInBlockEnd = pInBlock + nBlockSize;
unsigned char *pCurOutData = pOutData + nOutDataOffset;
const unsigned char *pOutDataEnd = pCurOutData + nBlockMaxSize;
@ -143,8 +146,8 @@ int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockS
}
}
if ((pInBlock + 1) < pInBlockEnd) { /* The last token in the block does not include match information */
unsigned char nOffsetMode = token & 0xc0;
if (pInBlock < pInBlockEnd) { /* The last token in the block does not include match information */
const unsigned char nOffsetMode = token & 0xc0;
unsigned int nValue;
switch (nOffsetMode) {
@ -182,6 +185,7 @@ int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockS
if ((token & 0x20) == 0) {
/* 16 bit offset */
nMatchOffset = (((unsigned int)(*pInBlock++)) << 8);
if (pInBlock >= pInBlockEnd) return -1;
nMatchOffset |= (unsigned int)(*pInBlock++);
nMatchOffset ^= 0xffff;
nMatchOffset++;
@ -192,7 +196,7 @@ int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockS
const unsigned char *pSrc = pCurOutData - nMatchOffset;
if (pSrc >= pOutData) {
unsigned int nMatchLen = (unsigned int)(token & 0x07);
if (nMatchLen != MATCH_RUN_LEN_V2 && nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd) {
if (nMatchLen != MATCH_RUN_LEN_V2 && nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd && (pSrc + 10) <= pOutDataEnd) {
memcpy(pCurOutData, pSrc, 8);
memcpy(pCurOutData + 8, pSrc + 8, 2);
pCurOutData += (MIN_MATCH_SIZE_V2 + nMatchLen);
@ -202,29 +206,36 @@ int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockS
if (nMatchLen == (MATCH_RUN_LEN_V2 + MIN_MATCH_SIZE_V2)) {
if (lzsa_build_len_v2(&pInBlock, pInBlockEnd, &nCurNibbles, &nibbles, &nMatchLen))
return -1;
if (nMatchLen == 0)
break;
}
if ((pCurOutData + nMatchLen) <= pOutDataEnd) {
/* Do a deterministic, left to right byte copy instead of memcpy() so as to handle overlaps */
if ((pSrc + nMatchLen) <= pOutDataEnd) {
if ((pCurOutData + nMatchLen) <= pOutDataEnd) {
/* Do a deterministic, left to right byte copy instead of memcpy() so as to handle overlaps */
if (nMatchOffset >= 16 && (pCurOutData + nMatchLen) < (pOutDataFastEnd - 15)) {
const unsigned char *pCopySrc = pSrc;
unsigned char *pCopyDst = pCurOutData;
const unsigned char *pCopyEndDst = pCurOutData + nMatchLen;
if (nMatchOffset >= 16 && (pCurOutData + nMatchLen) < (pOutDataFastEnd - 15)) {
const unsigned char *pCopySrc = pSrc;
unsigned char *pCopyDst = pCurOutData;
const unsigned char *pCopyEndDst = pCurOutData + nMatchLen;
do {
memcpy(pCopyDst, pCopySrc, 16);
pCopySrc += 16;
pCopyDst += 16;
} while (pCopyDst < pCopyEndDst);
do {
memcpy(pCopyDst, pCopySrc, 16);
pCopySrc += 16;
pCopyDst += 16;
} while (pCopyDst < pCopyEndDst);
pCurOutData += nMatchLen;
pCurOutData += nMatchLen;
}
else {
while (nMatchLen) {
*pCurOutData++ = *pSrc++;
nMatchLen--;
}
}
}
else {
while (nMatchLen) {
*pCurOutData++ = *pSrc++;
nMatchLen--;
}
return -1;
}
}
else {

View File

@ -1,5 +1,5 @@
/*
* expand_v2.h - LZSA2 block decompressor definitions
* expand_block_v2.h - LZSA2 block decompressor definitions
*
* Copyright (C) 2019 Emmanuel Marty
*
@ -30,8 +30,8 @@
*
*/
#ifndef _EXPAND_V2_H
#define _EXPAND_V2_H
#ifndef _EXPAND_BLOCK_V2_H
#define _EXPAND_BLOCK_V2_H
/**
* Decompress one LZSA2 data block
@ -44,6 +44,6 @@
*
* @return size of decompressed data in bytes, or -1 for error
*/
int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize);
int lzsa_decompressor_expand_block_v2(const unsigned char *pInBlock, const int nBlockSize, unsigned char *pOutData, const int nOutDataOffset, const int nBlockMaxSize);
#endif /* _EXPAND_V2_H */
#endif /* _EXPAND_BLOCK_V2_H */

View File

@ -35,6 +35,7 @@
#include "expand_context.h"
#include "expand_block_v1.h"
#include "expand_block_v2.h"
#include "lib.h"
/**
* Decompress one data block
@ -45,14 +46,31 @@
* @param nOutDataOffset starting index of where to store decompressed bytes in output buffer (and size of previously decompressed bytes)
* @param nBlockMaxSize total size of output decompression buffer, in bytes
* @param nFormatVersion version of format to use (1-2)
* @param nFlags compression flags (LZSA_FLAG_xxx)
*
* @return size of decompressed data in bytes, or -1 for error
*/
int lzsa_decompressor_expand_block(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize, const int nFormatVersion) {
int lzsa_decompressor_expand_block(unsigned char *pInBlock, const int nBlockSize, unsigned char *pOutData, const int nOutDataOffset, const int nBlockMaxSize, const int nFormatVersion, const int nFlags) {
int nDecompressedSize;
if (nFlags & LZSA_FLAG_RAW_BACKWARD) {
lzsa_reverse_buffer(pInBlock, nBlockSize);
}
if (nFormatVersion == 1)
return lzsa_decompressor_expand_block_v1(pInBlock, nBlockSize, pOutData, nOutDataOffset, nBlockMaxSize);
nDecompressedSize = lzsa_decompressor_expand_block_v1(pInBlock, nBlockSize, pOutData, nOutDataOffset, nBlockMaxSize);
else if (nFormatVersion == 2)
return lzsa_decompressor_expand_block_v2(pInBlock, nBlockSize, pOutData, nOutDataOffset, nBlockMaxSize);
nDecompressedSize = lzsa_decompressor_expand_block_v2(pInBlock, nBlockSize, pOutData, nOutDataOffset, nBlockMaxSize);
else
return -1;
nDecompressedSize = -1;
if (nDecompressedSize != -1 && (nFlags & LZSA_FLAG_RAW_BACKWARD)) {
lzsa_reverse_buffer(pOutData + nOutDataOffset, nDecompressedSize);
}
if (nFlags & LZSA_FLAG_RAW_BACKWARD) {
lzsa_reverse_buffer(pInBlock, nBlockSize);
}
return nDecompressedSize;
}

View File

@ -48,10 +48,11 @@ extern "C" {
* @param nOutDataOffset starting index of where to store decompressed bytes in output buffer (and size of previously decompressed bytes)
* @param nBlockMaxSize total size of output decompression buffer, in bytes
* @param nFormatVersion version of format to use (1-2)
* @param nFlags compression flags (LZSA_FLAG_xxx)
*
* @return size of decompressed data in bytes, or -1 for error
*/
int lzsa_decompressor_expand_block(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize, const int nFormatVersion);
int lzsa_decompressor_expand_block(unsigned char *pInBlock, const int nBlockSize, unsigned char *pOutData, const int nOutDataOffset, const int nBlockMaxSize, const int nFormatVersion, const int nFlags);
#ifdef __cplusplus
}

View File

@ -98,8 +98,8 @@ size_t lzsa_get_max_decompressed_size_inmem(const unsigned char *pFileData, size
*
* @return actual decompressed size, or -1 for error
*/
size_t lzsa_decompress_inmem(const unsigned char *pFileData, unsigned char *pOutBuffer, size_t nFileSize, size_t nMaxOutBufferSize, const unsigned int nFlags, int *pFormatVersion) {
const unsigned char *pCurFileData = pFileData;
size_t lzsa_decompress_inmem(unsigned char *pFileData, unsigned char *pOutBuffer, size_t nFileSize, size_t nMaxOutBufferSize, const unsigned int nFlags, int *pFormatVersion) {
unsigned char *pCurFileData = pFileData;
const unsigned char *pEndFileData = pCurFileData + nFileSize;
unsigned char *pCurOutBuffer = pOutBuffer;
const unsigned char *pEndOutBuffer = pCurOutBuffer + nMaxOutBufferSize;
@ -107,8 +107,7 @@ size_t lzsa_decompress_inmem(const unsigned char *pFileData, unsigned char *pOut
const int nHeaderSize = lzsa_get_header_size();
if (nFlags & LZSA_FLAG_RAW_BLOCK) {
int nEODBytes = (*pFormatVersion == 2) ? 2 : 4;
return (size_t)lzsa_decompressor_expand_block(pFileData, (int)nFileSize - nEODBytes /* EOD marker */, pOutBuffer, 0, (int)nMaxOutBufferSize, *pFormatVersion);
return (size_t)lzsa_decompressor_expand_block(pFileData, (int)nFileSize, pOutBuffer, 0, (int)nMaxOutBufferSize, *pFormatVersion, nFlags);
}
/* Check header */
@ -140,7 +139,7 @@ size_t lzsa_decompress_inmem(const unsigned char *pFileData, unsigned char *pOut
if ((pCurFileData + nBlockDataSize) > pEndFileData)
return -1;
nDecompressedSize = lzsa_decompressor_expand_block(pCurFileData, nBlockDataSize, pCurOutBuffer - nPreviousBlockSize, nPreviousBlockSize, (int)(pEndOutBuffer - pCurOutBuffer + nPreviousBlockSize), *pFormatVersion);
nDecompressedSize = lzsa_decompressor_expand_block(pCurFileData, nBlockDataSize, pCurOutBuffer - nPreviousBlockSize, nPreviousBlockSize, (int)(pEndOutBuffer - pCurOutBuffer + nPreviousBlockSize), *pFormatVersion, nFlags);
if (nDecompressedSize < 0)
return -1;

View File

@ -61,7 +61,7 @@ size_t lzsa_get_max_decompressed_size_inmem(const unsigned char *pFileData, size
*
* @return actual decompressed size, or -1 for error
*/
size_t lzsa_decompress_inmem(const unsigned char *pFileData, unsigned char *pOutBuffer, size_t nFileSize, size_t nMaxOutBufferSize, const unsigned int nFlags, int *pFormatVersion);
size_t lzsa_decompress_inmem(unsigned char *pFileData, unsigned char *pOutBuffer, size_t nFileSize, size_t nMaxOutBufferSize, const unsigned int nFlags, int *pFormatVersion);
#ifdef __cplusplus
}

View File

@ -185,11 +185,6 @@ lzsa_status_t lzsa_decompress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pO
}
size_t nReadBytes = pInStream->read(pInStream, pInBlock, nBlockSize);
if (nFlags & LZSA_FLAG_RAW_BLOCK) {
size_t nEODBytes = (nFormatVersion == 2) ? 2 : 4;
if (nReadBytes > nEODBytes)
nReadBytes -= nEODBytes;
else
nReadBytes = 0;
nBlockSize = (unsigned int)nReadBytes;
}
@ -201,7 +196,7 @@ lzsa_status_t lzsa_decompress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pO
nDecompressedSize = nBlockSize;
}
else {
nDecompressedSize = lzsa_decompressor_expand_block(pInBlock, nBlockSize, pOutData, BLOCK_SIZE, BLOCK_SIZE, nFormatVersion);
nDecompressedSize = lzsa_decompressor_expand_block(pInBlock, nBlockSize, pOutData, BLOCK_SIZE, BLOCK_SIZE, nFormatVersion, nFlags);
if (nDecompressedSize < 0) {
nDecompressionError = LZSA_ERROR_DECOMPRESSION;
break;

View File

@ -39,9 +39,6 @@
extern "C" {
#endif
/* Forward declaration */
typedef enum _lzsa_status_t lzsa_status_t;
/*-------------- File API -------------- */
/**

View File

@ -60,10 +60,11 @@ int lzsa_get_frame_size(void) {
*
* @param pFrameData encoding buffer
* @param nMaxFrameDataSize max encoding buffer size, in bytes
* @param nFormatVersion version of format to use (1-2)
*
* @return number of encoded bytes, or -1 for failure
*/
int lzsa_encode_header(unsigned char *pFrameData, const int nMaxFrameDataSize, int nFormatVersion) {
int lzsa_encode_header(unsigned char *pFrameData, const int nMaxFrameDataSize, const int nFormatVersion) {
if (nMaxFrameDataSize >= 3 && (nFormatVersion == 1 || nFormatVersion == 2)) {
pFrameData[0] = LZSA_ID_0; /* Magic number */
pFrameData[1] = LZSA_ID_1;
@ -146,6 +147,7 @@ int lzsa_encode_footer_frame(unsigned char *pFrameData, const int nMaxFrameDataS
*
* @param pFrameData data bytes
* @param nFrameDataSize number of bytes to decode
* @param nFormatVersion pointer to returned format version, if successful
*
* @return 0 for success, or -1 for failure
*/

View File

@ -56,10 +56,11 @@ int lzsa_get_frame_size(void);
*
* @param pFrameData encoding buffer
* @param nMaxFrameDataSize max encoding buffer size, in bytes
* @param nFormatVersion version of format to use (1-2)
*
* @return number of encoded bytes, or -1 for failure
*/
int lzsa_encode_header(unsigned char *pFrameData, const int nMaxFrameDataSize, int nFormatVersion);
int lzsa_encode_header(unsigned char *pFrameData, const int nMaxFrameDataSize, const int nFormatVersion);
/**
* Encode compressed block frame header
@ -98,6 +99,7 @@ int lzsa_encode_footer_frame(unsigned char *pFrameData, const int nMaxFrameDataS
*
* @param pFrameData data bytes
* @param nFrameDataSize number of bytes to decode
* @param nFormatVersion pointer to returned format version, if successful
*
* @return 0 for success, or -1 for failure
*/

View File

@ -1,138 +0,0 @@
/*
* hashmap.c - integer hashmap implementation
*
* Copyright (C) 2019 Emmanuel Marty
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/*
* Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
*
* Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
* With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
* With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
* Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
*
*/
#include <stdlib.h>
#include <string.h>
#include "hashmap.h"
/**
 * Generate key hash by mixing
 *
 * Spreads the entropy of a 64-bit key through a sequence of shift/add/xor
 * rounds, then masks the result down to a bucket index.
 *
 * @param key key to get hash for
 *
 * @return hash (bucket index in [0, LZSA_HASH_NBUCKETS - 1])
 */
static unsigned int lzsa_hashmap_get_hash(unsigned long long key) {
   unsigned long long h = key;

   h = (~h) + (h << 21);
   h ^= h >> 24;
   h = (h + (h << 3)) + (h << 8);
   h ^= h >> 14;
   h = (h + (h << 2)) + (h << 4);
   h ^= h >> 28;
   h += h << 31;

   /* LZSA_HASH_NBUCKETS is a power of two, so the mask keeps only the
    * low-order bucket bits */
   return (unsigned int)(h & (LZSA_HASH_NBUCKETS - 1));
}
/**
 * Initialize hashmap
 *
 * Leaves the map empty: no entry buffers allocated, every bucket chain NULL.
 *
 * @param pHashMap hashmap
 */
void lzsa_hashmap_init(lzsa_hashmap_t *pHashMap) {
   int i;

   pHashMap->pBuffer = NULL;
   for (i = 0; i < LZSA_HASH_NBUCKETS; i++)
      pHashMap->pBucket[i] = NULL;
}
/**
 * Set value for key
 *
 * Overwrites the value if the key is already present in its bucket chain,
 * otherwise appends a new entry taken from a pool buffer (a new pool of 255
 * entries is allocated whenever the current one is exhausted).
 *
 * NOTE(review): if the pool allocation fails, the insertion is silently
 * dropped — the caller receives no error indication.
 *
 * @param pHashMap hashmap
 * @param key key to set value for
 * @param value new value
 */
void lzsa_hashmap_insert(lzsa_hashmap_t *pHashMap, unsigned long long key, unsigned int value) {
   unsigned int hash = lzsa_hashmap_get_hash(key);
   lzsa_hashvalue_t **pBucket = &pHashMap->pBucket[hash];

   /* Walk the bucket chain; if the key already exists, just replace its value */
   while (*pBucket) {
      if ((*pBucket)->key == key) {
         (*pBucket)->value = value;
         return;
      }

      pBucket = &((*pBucket)->pNext);
   }

   /* Key not found. Make sure a pool buffer with a free slot exists */
   if (!pHashMap->pBuffer || pHashMap->pBuffer->nFreeEntryIdx >= 255) {
      lzsa_hashbuffer_t *pNewBuffer = (lzsa_hashbuffer_t *)malloc(sizeof(lzsa_hashbuffer_t));
      if (!pNewBuffer) return;   /* out of memory: insertion silently dropped */

      /* Push the fresh pool onto the buffer list */
      pNewBuffer->pNext = pHashMap->pBuffer;
      pNewBuffer->nFreeEntryIdx = 0;
      pHashMap->pBuffer = pNewBuffer;
   }

   /* pBucket points at the chain's terminating NULL link here, so this
    * appends the new entry at the end of the bucket chain */
   *pBucket = &pHashMap->pBuffer->value[pHashMap->pBuffer->nFreeEntryIdx++];
   (*pBucket)->pNext = NULL;
   (*pBucket)->key = key;
   (*pBucket)->value = value;
}
/**
 * Get value for key
 *
 * Looks the key up in its bucket chain and, when present, stores the
 * associated value through pValue.
 *
 * @param pHashMap hashmap
 * @param key key to get value for
 * @param pValue pointer to where to store value if found
 *
 * @return 0 if found, nonzero if not found
 */
int lzsa_hashmap_find(lzsa_hashmap_t *pHashMap, unsigned long long key, unsigned int *pValue) {
   const lzsa_hashvalue_t *pEntry;

   /* Only a read-traversal is needed, so a plain entry pointer suffices */
   for (pEntry = pHashMap->pBucket[lzsa_hashmap_get_hash(key)]; pEntry; pEntry = pEntry->pNext) {
      if (pEntry->key == key) {
         *pValue = pEntry->value;
         return 0;
      }
   }

   return -1;
}
/**
 * Clear hashmap
 *
 * Frees every entry pool buffer owned by the map and resets all bucket
 * heads to NULL, returning the map to its freshly-initialized state.
 *
 * @param pHashMap hashmap
 */
void lzsa_hashmap_clear(lzsa_hashmap_t *pHashMap) {
   while (pHashMap->pBuffer) {
      lzsa_hashbuffer_t *pCurBuffer = pHashMap->pBuffer;

      /* Unlink the buffer before freeing it so the list head never dangles.
       * (The previous 'pCurBuffer = NULL;' after free() was a dead store on a
       * loop-local and has been removed.) */
      pHashMap->pBuffer = pCurBuffer->pNext;
      free(pCurBuffer);
   }

   memset(pHashMap->pBucket, 0, sizeof(lzsa_hashvalue_t *) * LZSA_HASH_NBUCKETS);
}

View File

@ -1,99 +0,0 @@
/*
* hashmap.h - integer hashmap definitions
*
* Copyright (C) 2019 Emmanuel Marty
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/*
* Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
*
* Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
* With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
* With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
* Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
*
*/
#ifndef _HASHMAP_H
#define _HASHMAP_H
#include <stdlib.h>
/** Number of hashmap buckets */
#define LZSA_HASH_NBUCKETS 256
/* Forward definitions */
typedef struct _lzsa_hashvalue_t lzsa_hashvalue_t;
typedef struct _lzsa_hashbuffer_t lzsa_hashbuffer_t;

/** One hashmap bucket entry */
typedef struct _lzsa_hashvalue_t {
   lzsa_hashvalue_t *pNext;      /* next entry chained in the same bucket, or NULL */
   unsigned long long key;       /* key stored in this entry */
   unsigned int value;           /* value associated with the key */
} lzsa_hashvalue_t;

/** One buffer storing hashmap bucket entries */
typedef struct _lzsa_hashbuffer_t {
   lzsa_hashbuffer_t *pNext;     /* next allocated pool buffer, or NULL */
   int nFreeEntryIdx;            /* index of the next unused slot in value[] */
   lzsa_hashvalue_t value[255];  /* pool of entries handed out to bucket chains */
} lzsa_hashbuffer_t;

/** Hashmap */
typedef struct {
   lzsa_hashbuffer_t *pBuffer;                    /* list of entry pool buffers owned by the map */
   lzsa_hashvalue_t *pBucket[LZSA_HASH_NBUCKETS]; /* bucket chain heads */
} lzsa_hashmap_t;
/**
* Initialize hashmap
*
* @param pHashMap hashmap
*/
void lzsa_hashmap_init(lzsa_hashmap_t *pHashMap);
/**
* Set value for key
*
* @param pHashMap hashmap
* @param key key to set value for
* @param value new value
*/
void lzsa_hashmap_insert(lzsa_hashmap_t *pHashMap, unsigned long long key, unsigned int value);
/**
* Get value for key
*
* @param pHashMap hashmap
* @param key key to get value for
* @param pValue pointer to where to store value if found
*
* @return 0 if found, nonzero if not found
*/
int lzsa_hashmap_find(lzsa_hashmap_t *pHashMap, unsigned long long key, unsigned int *pValue);
/**
* Clear hashmap
*
* @param pHashMap hashmap
*/
void lzsa_hashmap_clear(lzsa_hashmap_t *pHashMap);
#endif /* _HASHMAP_H */

View File

@ -48,27 +48,27 @@
extern "C" {
#endif
/** High level status for compression and decompression */
typedef enum _lzsa_status_t {
   LZSA_OK = 0,                    /**< Success */
   LZSA_ERROR_SRC,                 /**< Error reading input */
   LZSA_ERROR_DST,                 /**< Error writing output */
   LZSA_ERROR_DICTIONARY,          /**< Error reading dictionary */
   LZSA_ERROR_MEMORY,              /**< Out of memory */

   /* Compression-specific status codes */
   LZSA_ERROR_COMPRESSION,         /**< Internal compression error */
   LZSA_ERROR_RAW_TOOLARGE,        /**< Input is too large to be compressed to a raw block */
   LZSA_ERROR_RAW_UNCOMPRESSED,    /**< Input is incompressible and raw blocks don't support uncompressed data */

   /* Decompression-specific status codes */
   LZSA_ERROR_FORMAT,              /**< Invalid input format or magic number when decompressing */
   LZSA_ERROR_DECOMPRESSION,       /**< Internal decompression error */
} lzsa_status_t;
/* Compression flags */
#define LZSA_FLAG_FAVOR_RATIO (1<<0) /**< 1 to compress with the best ratio, 0 to trade some compression ratio for extra decompression speed */
#define LZSA_FLAG_RAW_BLOCK (1<<1) /**< 1 to emit raw block */
#define LZSA_FLAG_RAW_BACKWARD (1<<2) /**< 1 to compress or decompress raw block backward */
/**
 * Reverse bytes in the specified buffer
 *
 * Swaps bytes pairwise from both ends toward the middle; buffers of size
 * 0 or 1 are left untouched.
 *
 * @param pBuffer pointer to buffer whose contents are to be reversed
 * @param nBufferSize size of buffer in bytes
 */
static inline void lzsa_reverse_buffer(unsigned char *pBuffer, const int nBufferSize) {
   int nFront, nBack;

   for (nFront = 0, nBack = nBufferSize - 1; nFront < nBack; nFront++, nBack--) {
      const unsigned char tmp = pBuffer[nFront];
      pBuffer[nFront] = pBuffer[nBack];
      pBuffer[nBack] = tmp;
   }
}
#ifdef __cplusplus
}

View File

@ -75,8 +75,8 @@ void divsufsort_destroy(divsufsort_ctx_t *ctx);
/**
* Constructs the suffix array of a given string.
* @param ctx suffix array context
* @param T[0..n-1] The input string.
* @param SA[0..n-1] The output array of suffixes.
* @param T The input string.
* @param SA The output array of suffixes.
* @param n The length of the given string.
* @return 0 if no error occurred, -1 or -2 otherwise.
*/

View File

@ -31,7 +31,6 @@
*/
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
@ -42,11 +41,13 @@
#endif
#include "lib.h"
#define OPT_VERBOSE 1
#define OPT_RAW 2
#define OPT_FAVOR_RATIO 4
#define OPT_VERBOSE 1
#define OPT_RAW 2
#define OPT_FAVOR_RATIO 4
#define OPT_RAW_BACKWARD 8
#define OPT_STATS 16
#define TOOL_VERSION "1.0.4"
#define TOOL_VERSION "1.4.1"
/*---------------------------------------------------------------------------*/
@ -100,23 +101,26 @@ static void compression_progress(long long nOriginalSize, long long nCompressedS
static int do_compress(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, const int nMinMatchSize, const int nFormatVersion) {
long long nStartTime = 0LL, nEndTime = 0LL;
long long nOriginalSize = 0LL, nCompressedSize = 0LL;
int nCommandCount = 0;
int nCommandCount = 0, nSafeDist = 0;
int nFlags;
lzsa_status_t nStatus;
lzsa_stats stats;
nFlags = 0;
if (nOptions & OPT_FAVOR_RATIO)
nFlags |= LZSA_FLAG_FAVOR_RATIO;
if (nOptions & OPT_RAW)
nFlags |= LZSA_FLAG_RAW_BLOCK;
if (nOptions & OPT_RAW_BACKWARD)
nFlags |= LZSA_FLAG_RAW_BACKWARD;
if (nOptions & OPT_VERBOSE) {
nStartTime = do_get_time();
}
nStatus = lzsa_compress_file(pszInFilename, pszOutFilename, pszDictionaryFilename, nFlags, nMinMatchSize, nFormatVersion, compression_progress, &nOriginalSize, &nCompressedSize, &nCommandCount);
nStatus = lzsa_compress_file(pszInFilename, pszOutFilename, pszDictionaryFilename, nFlags, nMinMatchSize, nFormatVersion, compression_progress, &nOriginalSize, &nCompressedSize, &nCommandCount, &nSafeDist, &stats);
if ((nOptions & OPT_VERBOSE)) {
if (nOptions & OPT_VERBOSE) {
nEndTime = do_get_time();
}
@ -135,14 +139,43 @@ static int do_compress(const char *pszInFilename, const char *pszOutFilename, co
if (nStatus)
return 100;
if ((nOptions & OPT_VERBOSE)) {
if (nOptions & OPT_VERBOSE) {
double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
fprintf(stdout, "\rCompressed '%s' in %g seconds, %.02g Mb/s, %d tokens (%g bytes/token), %lld into %lld bytes ==> %g %%\n",
pszInFilename, fDelta, fSpeed, nCommandCount, (double)nOriginalSize / (double)nCommandCount,
nOriginalSize, nCompressedSize, (double)(nCompressedSize * 100.0 / nOriginalSize));
if (nOptions & OPT_RAW) {
fprintf(stdout, "Safe distance: %d (0x%X)\n", nSafeDist, nSafeDist);
}
}
if (nOptions & OPT_STATS) {
if (stats.literals_divisor > 0)
fprintf(stdout, "Literals: min: %d avg: %d max: %d count: %d\n", stats.min_literals, stats.total_literals / stats.literals_divisor, stats.max_literals, stats.literals_divisor);
else
fprintf(stdout, "Literals: none\n");
if (stats.match_divisor > 0) {
fprintf(stdout, "Offsets: min: %d avg: %d max: %d reps: %d count: %d\n", stats.min_offset, stats.total_offsets / stats.match_divisor, stats.max_offset, stats.num_rep_offsets, stats.match_divisor);
fprintf(stdout, "Match lens: min: %d avg: %d max: %d count: %d\n", stats.min_match_len, stats.total_match_lens / stats.match_divisor, stats.max_match_len, stats.match_divisor);
}
else {
fprintf(stdout, "Offsets: none\n");
fprintf(stdout, "Match lens: none\n");
}
if (stats.rle1_divisor > 0) {
fprintf(stdout, "RLE1 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle1_len, stats.total_rle1_lens / stats.rle1_divisor, stats.max_rle1_len, stats.rle1_divisor);
}
else {
fprintf(stdout, "RLE1 lens: none\n");
}
if (stats.rle2_divisor > 0) {
fprintf(stdout, "RLE2 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle2_len, stats.total_rle2_lens / stats.rle2_divisor, stats.max_rle2_len, stats.rle2_divisor);
}
else {
fprintf(stdout, "RLE2 lens: none\n");
}
}
return 0;
}
@ -157,6 +190,8 @@ static int do_decompress(const char *pszInFilename, const char *pszOutFilename,
nFlags = 0;
if (nOptions & OPT_RAW)
nFlags |= LZSA_FLAG_RAW_BLOCK;
if (nOptions & OPT_RAW_BACKWARD)
nFlags |= LZSA_FLAG_RAW_BACKWARD;
if (nOptions & OPT_VERBOSE) {
nStartTime = do_get_time();
@ -259,7 +294,7 @@ int comparestream_open(lzsa_stream_t *stream, const char *pszCompareFilename, co
pCompareStream->pCompareDataBuf = NULL;
pCompareStream->nCompareDataSize = 0;
pCompareStream->f = (void*)fopen(pszCompareFilename, pszMode);
pCompareStream->f = (FILE*)fopen(pszCompareFilename, pszMode);
if (pCompareStream->f) {
stream->obj = pCompareStream;
@ -269,8 +304,10 @@ int comparestream_open(lzsa_stream_t *stream, const char *pszCompareFilename, co
stream->close = comparestream_close;
return 0;
}
else
else {
free(pCompareStream);
return -1;
}
}
static int do_compare(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, int nFormatVersion) {
@ -305,6 +342,8 @@ static int do_compare(const char *pszInFilename, const char *pszOutFilename, con
nFlags = 0;
if (nOptions & OPT_RAW)
nFlags |= LZSA_FLAG_RAW_BLOCK;
if (nOptions & OPT_RAW_BACKWARD)
nFlags |= LZSA_FLAG_RAW_BACKWARD;
if (nOptions & OPT_VERBOSE) {
nStartTime = do_get_time();
@ -344,13 +383,13 @@ static int do_compare(const char *pszInFilename, const char *pszOutFilename, con
/*---------------------------------------------------------------------------*/
static void generate_compressible_data(unsigned char *pBuffer, size_t nBufferSize, int nMinMatchSize, unsigned int nSeed, int nNumLiteralValues, float fMatchProbability) {
static void generate_compressible_data(unsigned char *pBuffer, const size_t nBufferSize, const int nMinMatchSize, const unsigned int nSeed, const int nNumLiteralValues, const float fMatchProbability) {
size_t nIndex = 0;
int nMatchProbability = (int)(fMatchProbability * 1023.0f);
const int nMatchProbability = (int)(fMatchProbability * 1023.0f);
srand(nSeed);
if (nIndex >= nBufferSize) return;
if (nBufferSize == 0) return;
pBuffer[nIndex++] = rand() % nNumLiteralValues;
while (nIndex < nBufferSize) {
@ -384,14 +423,12 @@ static void generate_compressible_data(unsigned char *pBuffer, size_t nBufferSiz
}
}
static void xor_data(unsigned char *pBuffer, size_t nBufferSize, unsigned int nSeed, float fXorProbability) {
static void xor_data(unsigned char *pBuffer, const size_t nBufferSize, const unsigned int nSeed, const float fXorProbability) {
size_t nIndex = 0;
int nXorProbability = (int)(fXorProbability * 1023.0f);
const int nXorProbability = (const int)(fXorProbability * 1023.0f);
srand(nSeed);
if (nIndex >= nBufferSize) return;
while (nIndex < nBufferSize) {
if ((rand() & 1023) < nXorProbability) {
pBuffer[nIndex] ^= 0xff;
@ -400,7 +437,7 @@ static void xor_data(unsigned char *pBuffer, size_t nBufferSize, unsigned int nS
}
}
static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, int nFormatVersion) {
static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, const int nFormatVersion) {
unsigned char *pGeneratedData;
unsigned char *pCompressedData;
unsigned char *pTmpCompressedData;
@ -416,6 +453,8 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
nFlags |= LZSA_FLAG_FAVOR_RATIO;
if (nOptions & OPT_RAW)
nFlags |= LZSA_FLAG_RAW_BLOCK;
if (nOptions & OPT_RAW_BACKWARD)
nFlags |= LZSA_FLAG_RAW_BACKWARD;
pGeneratedData = (unsigned char*)malloc(4 * BLOCK_SIZE);
if (!pGeneratedData) {
@ -429,7 +468,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
free(pGeneratedData);
pGeneratedData = NULL;
fprintf(stderr, "out of memory, %zd bytes needed\n", nMaxCompressedDataSize);
fprintf(stderr, "out of memory, %zu bytes needed\n", nMaxCompressedDataSize);
return 100;
}
@ -440,7 +479,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
free(pGeneratedData);
pGeneratedData = NULL;
fprintf(stderr, "out of memory, %zd bytes needed\n", nMaxCompressedDataSize);
fprintf(stderr, "out of memory, %zu bytes needed\n", nMaxCompressedDataSize);
return 100;
}
@ -470,11 +509,11 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
size_t nDataSizeStep = 128;
float fProbabilitySizeStep = 0.0005f;
for (nGeneratedDataSize = 1024; nGeneratedDataSize <= ((nOptions & OPT_RAW) ? BLOCK_SIZE : (4 * BLOCK_SIZE)); nGeneratedDataSize += nDataSizeStep) {
for (nGeneratedDataSize = 1024; nGeneratedDataSize <= ((size_t)((nOptions & OPT_RAW) ? BLOCK_SIZE : (4 * BLOCK_SIZE))); nGeneratedDataSize += nDataSizeStep) {
float fMatchProbability;
fprintf(stdout, "size %zd", nGeneratedDataSize);
for (fMatchProbability = ((nOptions & OPT_RAW) ? 0.5f : 0); fMatchProbability <= 0.995f; fMatchProbability += fProbabilitySizeStep) {
fprintf(stdout, "size %zu", nGeneratedDataSize);
for (fMatchProbability = 0; fMatchProbability <= 0.995f; fMatchProbability += fProbabilitySizeStep) {
int nNumLiteralValues[12] = { 1, 2, 3, 15, 30, 56, 96, 137, 178, 191, 255, 256 };
float fXorProbability;
@ -488,7 +527,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
/* Try to compress it, expected to succeed */
size_t nActualCompressedSize = lzsa_compress_inmem(pGeneratedData, pCompressedData, nGeneratedDataSize, lzsa_get_max_compressed_size_inmem(nGeneratedDataSize),
nFlags, nMinMatchSize, nFormatVersion);
if (nActualCompressedSize == -1 || nActualCompressedSize < (lzsa_get_header_size() + lzsa_get_frame_size() + lzsa_get_frame_size() /* footer */)) {
if (nActualCompressedSize == (size_t)-1 || (int)nActualCompressedSize < (lzsa_get_header_size() + lzsa_get_frame_size() + lzsa_get_frame_size() /* footer */)) {
free(pTmpDecompressedData);
pTmpDecompressedData = NULL;
free(pTmpCompressedData);
@ -498,7 +537,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
free(pGeneratedData);
pGeneratedData = NULL;
fprintf(stderr, "\nself-test: error compressing size %zd, seed %d, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
fprintf(stderr, "\nself-test: error compressing size %zu, seed %u, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
return 100;
}
@ -506,7 +545,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
size_t nActualDecompressedSize;
int nDecFormatVersion = nFormatVersion;
nActualDecompressedSize = lzsa_decompress_inmem(pCompressedData, pTmpDecompressedData, nActualCompressedSize, nGeneratedDataSize, nFlags, &nDecFormatVersion);
if (nActualDecompressedSize == -1) {
if (nActualDecompressedSize == (size_t)-1) {
free(pTmpDecompressedData);
pTmpDecompressedData = NULL;
free(pTmpCompressedData);
@ -516,7 +555,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
free(pGeneratedData);
pGeneratedData = NULL;
fprintf(stderr, "\nself-test: error decompressing size %zd, seed %d, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
fprintf(stderr, "\nself-test: error decompressing size %zu, seed %u, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
return 100;
}
@ -530,7 +569,7 @@ static int do_self_test(const unsigned int nOptions, const int nMinMatchSize, in
free(pGeneratedData);
pGeneratedData = NULL;
fprintf(stderr, "\nself-test: error comparing decompressed and original data, size %zd, seed %d, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
fprintf(stderr, "\nself-test: error comparing decompressed and original data, size %zu, seed %u, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
return 100;
}
@ -587,6 +626,8 @@ static int do_compr_benchmark(const char *pszInFilename, const char *pszOutFilen
nFlags |= LZSA_FLAG_FAVOR_RATIO;
if (nOptions & OPT_RAW)
nFlags |= LZSA_FLAG_RAW_BLOCK;
if (nOptions & OPT_RAW_BACKWARD)
nFlags |= LZSA_FLAG_RAW_BACKWARD;
if (pszDictionaryFilename) {
fprintf(stderr, "in-memory benchmarking does not support dictionaries\n");
@ -608,7 +649,7 @@ static int do_compr_benchmark(const char *pszInFilename, const char *pszOutFilen
pFileData = (unsigned char*)malloc(nFileSize);
if (!pFileData) {
fclose(f_in);
fprintf(stderr, "out of memory for reading '%s', %zd bytes needed\n", pszInFilename, nFileSize);
fprintf(stderr, "out of memory for reading '%s', %zu bytes needed\n", pszInFilename, nFileSize);
return 100;
}
@ -628,7 +669,7 @@ static int do_compr_benchmark(const char *pszInFilename, const char *pszOutFilen
pCompressedData = (unsigned char*)malloc(nMaxCompressedSize + 2048);
if (!pCompressedData) {
free(pFileData);
fprintf(stderr, "out of memory for compressing '%s', %zd bytes needed\n", pszInFilename, nMaxCompressedSize);
fprintf(stderr, "out of memory for compressing '%s', %zu bytes needed\n", pszInFilename, nMaxCompressedSize);
return 100;
}
@ -650,7 +691,7 @@ static int do_compr_benchmark(const char *pszInFilename, const char *pszOutFilen
long long t0 = do_get_time();
nActualCompressedSize = lzsa_compress_inmem(pFileData, pCompressedData + 1024, nFileSize, nRightGuardPos, nFlags, nMinMatchSize, nFormatVersion);
long long t1 = do_get_time();
if (nActualCompressedSize == -1) {
if (nActualCompressedSize == (size_t)-1) {
free(pCompressedData);
free(pFileData);
fprintf(stderr, "compression error\n");
@ -658,7 +699,7 @@ static int do_compr_benchmark(const char *pszInFilename, const char *pszOutFilen
}
long long nCurDecTime = t1 - t0;
if (nBestCompTime == -1 || nBestCompTime > nCurDecTime)
if (nBestCompTime == (size_t)-1 || nBestCompTime > nCurDecTime)
nBestCompTime = nCurDecTime;
/* Check guard bytes before the output buffer */
@ -699,7 +740,7 @@ static int do_compr_benchmark(const char *pszInFilename, const char *pszOutFilen
free(pCompressedData);
free(pFileData);
fprintf(stdout, "compressed size: %zd bytes\n", nActualCompressedSize);
fprintf(stdout, "compressed size: %zu bytes\n", nActualCompressedSize);
fprintf(stdout, "compression time: %lld microseconds (%g Mb/s)\n", nBestCompTime, ((double)nActualCompressedSize / 1024.0) / ((double)nBestCompTime / 1000.0));
return 0;
@ -717,6 +758,8 @@ static int do_dec_benchmark(const char *pszInFilename, const char *pszOutFilenam
nFlags = 0;
if (nOptions & OPT_RAW)
nFlags |= LZSA_FLAG_RAW_BLOCK;
if (nOptions & OPT_RAW_BACKWARD)
nFlags |= LZSA_FLAG_RAW_BACKWARD;
if (pszDictionaryFilename) {
fprintf(stderr, "in-memory benchmarking does not support dictionaries\n");
@ -738,7 +781,7 @@ static int do_dec_benchmark(const char *pszInFilename, const char *pszOutFilenam
pFileData = (unsigned char*)malloc(nFileSize);
if (!pFileData) {
fclose(f_in);
fprintf(stderr, "out of memory for reading '%s', %zd bytes needed\n", pszInFilename, nFileSize);
fprintf(stderr, "out of memory for reading '%s', %zu bytes needed\n", pszInFilename, nFileSize);
return 100;
}
@ -757,7 +800,7 @@ static int do_dec_benchmark(const char *pszInFilename, const char *pszOutFilenam
nMaxDecompressedSize = 65536;
else
nMaxDecompressedSize = lzsa_get_max_decompressed_size_inmem(pFileData, nFileSize);
if (nMaxDecompressedSize == -1) {
if (nMaxDecompressedSize == (size_t)-1) {
free(pFileData);
fprintf(stderr, "invalid compressed format for file '%s'\n", pszInFilename);
return 100;
@ -766,7 +809,7 @@ static int do_dec_benchmark(const char *pszInFilename, const char *pszOutFilenam
pDecompressedData = (unsigned char*)malloc(nMaxDecompressedSize);
if (!pDecompressedData) {
free(pFileData);
fprintf(stderr, "out of memory for decompressing '%s', %zd bytes needed\n", pszInFilename, nMaxDecompressedSize);
fprintf(stderr, "out of memory for decompressing '%s', %zu bytes needed\n", pszInFilename, nMaxDecompressedSize);
return 100;
}
@ -779,7 +822,7 @@ static int do_dec_benchmark(const char *pszInFilename, const char *pszOutFilenam
long long t0 = do_get_time();
nActualDecompressedSize = lzsa_decompress_inmem(pFileData, pDecompressedData, nFileSize, nMaxDecompressedSize, nFlags, &nFormatVersion);
long long t1 = do_get_time();
if (nActualDecompressedSize == -1) {
if (nActualDecompressedSize == (size_t)-1) {
free(pDecompressedData);
free(pFileData);
fprintf(stderr, "decompression error\n");
@ -787,7 +830,7 @@ static int do_dec_benchmark(const char *pszInFilename, const char *pszOutFilenam
}
long long nCurDecTime = t1 - t0;
if (nBestDecTime == -1 || nBestDecTime > nCurDecTime)
if (nBestDecTime == (size_t)-1 || nBestDecTime > nCurDecTime)
nBestDecTime = nCurDecTime;
}
@ -807,7 +850,7 @@ static int do_dec_benchmark(const char *pszInFilename, const char *pszOutFilenam
free(pFileData);
fprintf(stdout, "format: LZSA%d\n", nFormatVersion);
fprintf(stdout, "decompressed size: %zd bytes\n", nActualDecompressedSize);
fprintf(stdout, "decompressed size: %zu bytes\n", nActualDecompressedSize);
fprintf(stdout, "decompression time: %lld microseconds (%g Mb/s)\n", nBestDecTime, ((double)nActualDecompressedSize / 1024.0) / ((double)nBestDecTime / 1000.0));
return 0;
@ -820,11 +863,11 @@ int main(int argc, char **argv) {
const char *pszInFilename = NULL;
const char *pszOutFilename = NULL;
const char *pszDictionaryFilename = NULL;
bool bArgsError = false;
bool bCommandDefined = false;
bool bVerifyCompression = false;
bool bMinMatchDefined = false;
bool bFormatVersionDefined = false;
int nArgsError = 0;
int nCommandDefined = 0;
int nVerifyCompression = 0;
int nMinMatchDefined = 0;
int nFormatVersionDefined = 0;
char cCommand = 'z';
int nMinMatchSize = 0;
unsigned int nOptions = OPT_FAVOR_RATIO;
@ -832,51 +875,51 @@ int main(int argc, char **argv) {
for (i = 1; i < argc; i++) {
if (!strcmp(argv[i], "-d")) {
if (!bCommandDefined) {
bCommandDefined = true;
if (!nCommandDefined) {
nCommandDefined = 1;
cCommand = 'd';
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-z")) {
if (!bCommandDefined) {
bCommandDefined = true;
if (!nCommandDefined) {
nCommandDefined = 1;
cCommand = 'z';
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-c")) {
if (!bVerifyCompression) {
bVerifyCompression = true;
if (!nVerifyCompression) {
nVerifyCompression = 1;
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-cbench")) {
if (!bCommandDefined) {
bCommandDefined = true;
if (!nCommandDefined) {
nCommandDefined = 1;
cCommand = 'B';
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-dbench")) {
if (!bCommandDefined) {
bCommandDefined = true;
if (!nCommandDefined) {
nCommandDefined = 1;
cCommand = 'b';
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-test")) {
if (!bCommandDefined) {
bCommandDefined = true;
if (!nCommandDefined) {
nCommandDefined = 1;
cCommand = 't';
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-D")) {
if (!pszDictionaryFilename && (i + 1) < argc) {
@ -884,105 +927,119 @@ int main(int argc, char **argv) {
i++;
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strncmp(argv[i], "-D", 2)) {
if (!pszDictionaryFilename) {
pszDictionaryFilename = argv[i] + 2;
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-m")) {
if (!bMinMatchDefined && (i + 1) < argc) {
if (!nMinMatchDefined && (i + 1) < argc) {
char *pEnd = NULL;
nMinMatchSize = (int)strtol(argv[i + 1], &pEnd, 10);
if (pEnd && pEnd != argv[i + 1] && (nMinMatchSize >= 2 && nMinMatchSize <= 5)) {
i++;
bMinMatchDefined = true;
nMinMatchDefined = 1;
nOptions &= (~OPT_FAVOR_RATIO);
}
else {
bArgsError = true;
nArgsError = 1;
}
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strncmp(argv[i], "-m", 2)) {
if (!bMinMatchDefined) {
if (!nMinMatchDefined) {
char *pEnd = NULL;
nMinMatchSize = (int)strtol(argv[i] + 2, &pEnd, 10);
if (pEnd && pEnd != (argv[i]+2) && (nMinMatchSize >= 2 && nMinMatchSize <= 5)) {
bMinMatchDefined = true;
nMinMatchDefined = 1;
nOptions &= (~OPT_FAVOR_RATIO);
}
else {
bArgsError = true;
nArgsError = 1;
}
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "--prefer-ratio")) {
if (!bMinMatchDefined) {
if (!nMinMatchDefined) {
nMinMatchSize = 0;
bMinMatchDefined = true;
nMinMatchDefined = 1;
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "--prefer-speed")) {
if (!bMinMatchDefined) {
if (!nMinMatchDefined) {
nMinMatchSize = 3;
nOptions &= (~OPT_FAVOR_RATIO);
bMinMatchDefined = true;
nMinMatchDefined = 1;
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-f")) {
if (!bFormatVersionDefined && (i + 1) < argc) {
if (!nFormatVersionDefined && (i + 1) < argc) {
char *pEnd = NULL;
nFormatVersion = (int)strtol(argv[i + 1], &pEnd, 10);
if (pEnd && pEnd != argv[i + 1] && (nFormatVersion >= 1 && nFormatVersion <= 2)) {
i++;
bFormatVersionDefined = true;
nFormatVersionDefined = 1;
}
else {
bArgsError = true;
nArgsError = 1;
}
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strncmp(argv[i], "-f", 2)) {
if (!bFormatVersionDefined) {
if (!nFormatVersionDefined) {
char *pEnd = NULL;
nFormatVersion = (int)strtol(argv[i] + 2, &pEnd, 10);
if (pEnd && pEnd != (argv[i] + 2) && (nFormatVersion >= 1 && nFormatVersion <= 2)) {
bFormatVersionDefined = true;
nFormatVersionDefined = 1;
}
else {
bArgsError = true;
nArgsError = 1;
}
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-v")) {
if ((nOptions & OPT_VERBOSE) == 0) {
nOptions |= OPT_VERBOSE;
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-r")) {
if ((nOptions & OPT_RAW) == 0) {
nOptions |= OPT_RAW;
}
else
bArgsError = true;
nArgsError = 1;
}
else if (!strcmp(argv[i], "-b")) {
if ((nOptions & OPT_RAW_BACKWARD) == 0) {
nOptions |= OPT_RAW_BACKWARD;
}
else
nArgsError = 1;
}
else if (!strcmp(argv[i], "-stats")) {
if ((nOptions & OPT_STATS) == 0) {
nOptions |= OPT_STATS;
}
else
nArgsError = 1;
}
else {
if (!pszInFilename)
@ -991,26 +1048,33 @@ int main(int argc, char **argv) {
if (!pszOutFilename)
pszOutFilename = argv[i];
else
bArgsError = true;
nArgsError = 1;
}
}
}
if (!bArgsError && cCommand == 't') {
if (!nArgsError && (nOptions & OPT_RAW_BACKWARD) && !(nOptions & OPT_RAW)) {
fprintf(stderr, "error: -b (compress backwards) requires -r (raw block format)\n");
return 100;
}
if (!nArgsError && cCommand == 't') {
return do_self_test(nOptions, nMinMatchSize, nFormatVersion);
}
if (bArgsError || !pszInFilename || !pszOutFilename) {
if (nArgsError || !pszInFilename || !pszOutFilename) {
fprintf(stderr, "lzsa command-line tool v" TOOL_VERSION " by Emmanuel Marty and spke\n");
fprintf(stderr, "usage: %s [-c] [-d] [-v] [-r] <infile> <outfile>\n", argv[0]);
fprintf(stderr, " -c: check resulting stream after compressing\n");
fprintf(stderr, " -d: decompress (default: compress)\n");
fprintf(stderr, " -cbench: benchmary in-memory compression\n");
fprintf(stderr, " -dbench: benchmary in-memory decompression\n");
fprintf(stderr, " -cbench: benchmark in-memory compression\n");
fprintf(stderr, " -dbench: benchmark in-memory decompression\n");
fprintf(stderr, " -test: run automated self-tests\n");
fprintf(stderr, " -stats: show compressed data stats\n");
fprintf(stderr, " -v: be verbose\n");
fprintf(stderr, " -f <value>: LZSA compression format (1-2)\n");
fprintf(stderr, " -r: raw block format (max. 64 Kb files)\n");
fprintf(stderr, " -b: compress backward (requires -r and a backward decompressor)\n");
fprintf(stderr, " -D <filename>: use dictionary file\n");
fprintf(stderr, " -m <value>: minimum match size (3-5) (default: 3)\n");
fprintf(stderr, " --prefer-ratio: favor compression ratio (default)\n");
@ -1022,8 +1086,10 @@ int main(int argc, char **argv) {
if (cCommand == 'z') {
int nResult = do_compress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions, nMinMatchSize, nFormatVersion);
if (nResult == 0 && bVerifyCompression) {
nResult = do_compare(pszOutFilename, pszInFilename, pszDictionaryFilename, nOptions, nFormatVersion);
if (nResult == 0 && nVerifyCompression) {
return do_compare(pszOutFilename, pszInFilename, pszDictionaryFilename, nOptions, nFormatVersion);
} else {
return nResult;
}
}
else if (cCommand == 'd') {

View File

@ -33,7 +33,17 @@
#include <string.h>
#include "matchfinder.h"
#include "format.h"
#include "lib.h"
/**
 * Hash an index value down to a TAG_BITS-wide tag.
 *
 * Uses Knuth's multiplicative (Fibonacci) hashing: the multiplier is
 * 2^64 divided by the golden ratio, and the top TAG_BITS bits of the
 * 64-bit product are kept as the tag.
 *
 * @param nIndex index value to hash
 *
 * @return TAG_BITS-bit hash of the index
 */
static inline int lzsa_get_index_tag(unsigned int nIndex) {
   const unsigned long long nProduct = (unsigned long long)nIndex * 11400714819323198485ULL;
   return (int)(nProduct >> (64ULL - TAG_BITS));
}
/**
* Parse input data, build suffix array and overlaid data structures to speed up match finding
@ -55,7 +65,7 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
int *PLCP = (int*)pCompressor->pos_data; /* Use temporarily */
int *Phi = PLCP;
int nCurLen = 0;
int i;
int i, r;
/* Compute the permuted LCP first (Kärkkäinen method) */
Phi[intervals[0]] = -1;
@ -66,7 +76,7 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
PLCP[i] = 0;
continue;
}
int nMaxLen = (i > Phi[i]) ? (nInWindowSize - i) : (nInWindowSize - Phi[i]);
const int nMaxLen = (i > Phi[i]) ? (nInWindowSize - i) : (nInWindowSize - Phi[i]);
while (nCurLen < nMaxLen && pInWindow[i + nCurLen] == pInWindow[Phi[i] + nCurLen]) nCurLen++;
PLCP[i] = nCurLen;
if (nCurLen > 0)
@ -77,18 +87,33 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
* saves us from having to build the inverse suffix array index, as the LCP is calculated without it using this method,
* and the interval builder below doesn't need it either. */
intervals[0] &= POS_MASK;
int nMinMatchSize = pCompressor->min_match_size;
for (i = 1; i < nInWindowSize - 1; i++) {
int nIndex = (int)(intervals[i] & POS_MASK);
int nLen = PLCP[nIndex];
if (nLen < nMinMatchSize)
nLen = 0;
if (nLen > LCP_MAX)
nLen = LCP_MAX;
intervals[i] = ((unsigned int)nIndex) | (((unsigned int)nLen) << LCP_SHIFT);
const int nMinMatchSize = pCompressor->min_match_size;
if (pCompressor->format_version >= 2) {
for (i = 1; i < nInWindowSize; i++) {
const int nIndex = (int)(intervals[i] & POS_MASK);
int nLen = PLCP[nIndex];
if (nLen < nMinMatchSize)
nLen = 0;
if (nLen > LCP_MAX)
nLen = LCP_MAX;
int nTaggedLen = 0;
if (nLen)
nTaggedLen = (nLen << TAG_BITS) | (lzsa_get_index_tag((unsigned int)nIndex) & ((1 << TAG_BITS) - 1));
intervals[i] = ((unsigned int)nIndex) | (((unsigned int)nTaggedLen) << LCP_SHIFT);
}
}
else {
for (i = 1; i < nInWindowSize; i++) {
const int nIndex = (int)(intervals[i] & POS_MASK);
int nLen = PLCP[nIndex];
if (nLen < nMinMatchSize)
nLen = 0;
if (nLen > LCP_AND_TAG_MAX)
nLen = LCP_AND_TAG_MAX;
intervals[i] = ((unsigned int)nIndex) | (((unsigned int)nLen) << LCP_SHIFT);
}
}
if (i < nInWindowSize)
intervals[i] &= POS_MASK;
/**
* Build intervals for finding matches
@ -106,7 +131,7 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
intervals[0] = 0;
next_interval_idx = 1;
for (int r = 1; r < nInWindowSize; r++) {
for (r = 1; r < nInWindowSize; r++) {
const unsigned int next_pos = SA_and_LCP[r] & POS_MASK;
const unsigned int next_lcp = SA_and_LCP[r] & LCP_MASK;
const unsigned int top_lcp = *top & LCP_MASK;
@ -166,16 +191,19 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
* @param nOffset offset to find matches at, in the input window
* @param pMatches pointer to returned matches
* @param nMaxMatches maximum number of matches to return (0 for none)
* @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
*
* @return number of matches
*/
int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches) {
static int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches, const int nInWindowSize) {
unsigned int *intervals = pCompressor->intervals;
unsigned int *pos_data = pCompressor->pos_data;
unsigned int ref;
unsigned int super_ref;
unsigned int match_pos;
lzsa_match *matchptr;
unsigned int nPrevOffset = 0;
unsigned char nV1OffsetFound[2] = { 0, 0 };
/**
* Find matches using intervals
@ -209,19 +237,70 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
/* Ascend indirectly via pos_data[] links. */
match_pos = super_ref & EXCL_VISITED_MASK;
matchptr = pMatches;
if (pCompressor->format_version >= 2 && nInWindowSize < 65536) {
if ((matchptr - pMatches) < nMaxMatches) {
const unsigned int nMatchOffset = (const unsigned int)(nOffset - match_pos);
if (nMatchOffset <= MAX_OFFSET) {
matchptr->length = (const unsigned short)(ref >> (LCP_SHIFT + TAG_BITS));
matchptr->offset = (const unsigned short)nMatchOffset;
matchptr++;
nPrevOffset = nMatchOffset;
}
}
}
for (;;) {
if ((super_ref = pos_data[match_pos]) > ref) {
match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
if (pCompressor->format_version >= 2 && nInWindowSize < 65536) {
if ((matchptr - pMatches) < nMaxMatches) {
const unsigned int nMatchOffset = (const unsigned int)(nOffset - match_pos);
if (nMatchOffset <= MAX_OFFSET) {
matchptr->length = ((const unsigned short)(ref >> (LCP_SHIFT + TAG_BITS))) | 0x8000;
matchptr->offset = (const unsigned short)nMatchOffset;
matchptr++;
nPrevOffset = nMatchOffset;
}
}
}
}
while ((super_ref = pos_data[match_pos]) > ref)
match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
intervals[ref & POS_MASK] = nOffset | VISITED_FLAG;
pos_data[match_pos] = ref;
if ((matchptr - pMatches) < nMaxMatches) {
int nMatchOffset = (int)(nOffset - match_pos);
const unsigned int nMatchOffset = (const unsigned int)(nOffset - match_pos);
if (nMatchOffset <= MAX_OFFSET) {
matchptr->length = (unsigned short)(ref >> LCP_SHIFT);
matchptr->offset = (unsigned short)nMatchOffset;
matchptr++;
if (nMatchOffset <= MAX_OFFSET && nMatchOffset != nPrevOffset) {
if (pCompressor->format_version >= 2) {
matchptr->length = (const unsigned short)(ref >> (LCP_SHIFT + TAG_BITS));
matchptr->offset = (const unsigned short)nMatchOffset;
matchptr++;
nPrevOffset = nMatchOffset;
}
else {
unsigned int nV1OffsetType = (nMatchOffset <= 256) ? 0 : 1;
if (!nV1OffsetFound[nV1OffsetType]) {
matchptr->length = (const unsigned short)(ref >> LCP_SHIFT);
matchptr->offset = (const unsigned short)nMatchOffset;
if (matchptr->length < 256)
nV1OffsetFound[nV1OffsetType] = 1;
matchptr++;
nPrevOffset = nMatchOffset;
}
}
}
}
@ -229,6 +308,24 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
break;
ref = super_ref;
match_pos = intervals[ref & POS_MASK] & EXCL_VISITED_MASK;
if (pCompressor->format_version >= 2 && nInWindowSize < 65536) {
if ((matchptr - pMatches) < nMaxMatches) {
const unsigned int nMatchOffset = (const unsigned int)(nOffset - match_pos);
if (nMatchOffset <= MAX_OFFSET) {
const unsigned short nMatchLen = ((const unsigned short)(ref >> (LCP_SHIFT + TAG_BITS)));
if (nMatchLen > 2) {
matchptr->length = nMatchLen | 0x8000;
matchptr->offset = (const unsigned short)nMatchOffset;
matchptr++;
nPrevOffset = nMatchOffset;
}
}
}
}
}
return (int)(matchptr - pMatches);
@ -248,40 +345,29 @@ void lzsa_skip_matches(lzsa_compressor *pCompressor, const int nStartOffset, con
/* Skipping still requires scanning for matches, as this also performs a lazy update of the intervals. However,
* we don't store the matches. */
for (i = nStartOffset; i < nEndOffset; i++) {
lzsa_find_matches_at(pCompressor, i, &match, 0);
lzsa_find_matches_at(pCompressor, i, &match, 0, 0);
}
}
/**
* Find all matches for the data to be compressed. Up to NMATCHES_PER_OFFSET matches are stored for each offset, for
* the optimizer to look at.
* Find all matches for the data to be compressed
*
* @param pCompressor compression context
* @param nMatchesPerOffset maximum number of matches to store for each offset
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
*/
void lzsa_find_all_matches(lzsa_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
lzsa_match *pMatch = pCompressor->match + (nStartOffset << MATCHES_PER_OFFSET_SHIFT);
void lzsa_find_all_matches(lzsa_compressor *pCompressor, const int nMatchesPerOffset, const int nStartOffset, const int nEndOffset) {
lzsa_match *pMatch = pCompressor->match;
int i;
for (i = nStartOffset; i < nEndOffset; i++) {
int nMatches = lzsa_find_matches_at(pCompressor, i, pMatch, NMATCHES_PER_OFFSET);
int m;
const int nMatches = lzsa_find_matches_at(pCompressor, i, pMatch, nMatchesPerOffset, nEndOffset - nStartOffset);
for (m = 0; m < NMATCHES_PER_OFFSET; m++) {
if (nMatches <= m || i > (nEndOffset - LAST_MATCH_OFFSET)) {
pMatch->length = 0;
pMatch->offset = 0;
}
else {
int nMaxLen = (nEndOffset - LAST_LITERALS) - i;
if (nMaxLen < 0)
nMaxLen = 0;
if (pMatch->length > nMaxLen)
pMatch->length = (unsigned short)nMaxLen;
}
pMatch++;
if (nMatches < nMatchesPerOffset) {
memset(pMatch + nMatches, 0, (nMatchesPerOffset - nMatches) * sizeof(lzsa_match));
}
pMatch += nMatchesPerOffset;
}
}

View File

@ -33,14 +33,12 @@
#ifndef _MATCHFINDER_H
#define _MATCHFINDER_H
#include "shrink_context.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Forward declarations */
typedef struct _lzsa_match lzsa_match;
typedef struct _lzsa_compressor lzsa_compressor;
/**
* Parse input data, build suffix array and overlaid data structures to speed up match finding
*
@ -52,18 +50,6 @@ typedef struct _lzsa_compressor lzsa_compressor;
*/
int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nInWindowSize);
/**
* Find matches at the specified offset in the input window
*
* @param pCompressor compression context
* @param nOffset offset to find matches at, in the input window
* @param pMatches pointer to returned matches
* @param nMaxMatches maximum number of matches to return (0 for none)
*
* @return number of matches
*/
int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches);
/**
* Skip previously compressed bytes
*
@ -74,14 +60,14 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
void lzsa_skip_matches(lzsa_compressor *pCompressor, const int nStartOffset, const int nEndOffset);
/**
* Find all matches for the data to be compressed. Up to NMATCHES_PER_OFFSET matches are stored for each offset, for
* the optimizer to look at.
* Find all matches for the data to be compressed
*
* @param pCompressor compression context
* @param nMatchesPerOffset maximum number of matches to store for each offset
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
*/
void lzsa_find_all_matches(lzsa_compressor *pCompressor, const int nStartOffset, const int nEndOffset);
void lzsa_find_all_matches(lzsa_compressor *pCompressor, const int nMatchesPerOffset, const int nStartOffset, const int nEndOffset);
#ifdef __cplusplus
}

View File

@ -1,5 +1,5 @@
/*
* shrink_v1.c - LZSA1 block compressor implementation
* shrink_block_v1.c - LZSA1 block compressor implementation
*
* Copyright (C) 2019 Emmanuel Marty
*
@ -67,7 +67,7 @@ static inline int lzsa_get_literals_varlen_size_v1(const int nLength) {
* @param nOutOffset current write index into output buffer
* @param nLength literals length
*/
static inline int lzsa_write_literals_varlen_v1(unsigned char *pOutData, int nOutOffset, int nLength) {
static inline int lzsa_write_literals_varlen_v1(unsigned char *pOutData, int nOutOffset, const int nLength) {
if (nLength >= LITERALS_RUN_LEN_V1) {
if (nLength < 256)
pOutData[nOutOffset++] = nLength - LITERALS_RUN_LEN_V1;
@ -118,7 +118,7 @@ static inline int lzsa_get_match_varlen_size_v1(const int nLength) {
* @param nOutOffset current write index into output buffer
* @param nLength encoded match length (actual match length - MIN_MATCH_SIZE_V1)
*/
static inline int lzsa_write_match_varlen_v1(unsigned char *pOutData, int nOutOffset, int nLength) {
static inline int lzsa_write_match_varlen_v1(unsigned char *pOutData, int nOutOffset, const int nLength) {
if (nLength >= MATCH_RUN_LEN_V1) {
if ((nLength + MIN_MATCH_SIZE_V1) < 256)
pOutData[nOutOffset++] = nLength - MATCH_RUN_LEN_V1;
@ -139,110 +139,148 @@ static inline int lzsa_write_match_varlen_v1(unsigned char *pOutData, int nOutOf
}
/**
 * Get offset encoding cost in bits
 *
 * LZSA1 stores a match offset as either one byte (offsets up to 256) or
 * two bytes (larger offsets), selected by a flag bit in the token.
 *
 * @param _nMatchOffset offset to get cost of
 *
 * @return cost in bits (8 for offsets <= 256, 16 otherwise)
 */
#define lzsa_get_offset_cost_v1(_nMatchOffset) (((_nMatchOffset) <= 256) ? 8 : 16)
/**
* Attempt to pick optimal matches using a forward arrivals parser, so as to produce the smallest possible output that decompresses to the same input
*
* @param pCompressor compression context
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
* @param nReduce non-zero to reduce the number of tokens when the path costs are equal, zero not to
*/
static void lzsa_optimize_matches_v1(lzsa_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
int *cost = (int*)pCompressor->pos_data; /* Reuse */
int nLastLiteralsOffset;
int nMinMatchSize = pCompressor->min_match_size;
static void lzsa_optimize_forward_v1(lzsa_compressor *pCompressor, const int nStartOffset, const int nEndOffset, const int nReduce) {
lzsa_arrival *arrival = pCompressor->arrival - (nStartOffset << ARRIVALS_PER_POSITION_SHIFT_V1);
const int nMinMatchSize = pCompressor->min_match_size;
const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0;
const int nModeSwitchPenalty = nFavorRatio ? 0 : MODESWITCH_PENALTY;
const int nDisableScore = nReduce ? 0 : (2 * BLOCK_SIZE);
int i;
cost[nEndOffset - 1] = 8;
nLastLiteralsOffset = nEndOffset;
if ((nEndOffset - nStartOffset) > BLOCK_SIZE) return;
for (i = nEndOffset - 2; i != (nStartOffset - 1); i--) {
int nBestCost, nBestMatchLen, nBestMatchOffset;
for (i = (nStartOffset << ARRIVALS_PER_POSITION_SHIFT_V1); i != ((nEndOffset + 1) << ARRIVALS_PER_POSITION_SHIFT_V1); i += NARRIVALS_PER_POSITION_V1) {
lzsa_arrival* cur_arrival = &arrival[i];
int j;
int nLiteralsLen = nLastLiteralsOffset - i;
nBestCost = 8 + cost[i + 1];
if (nLiteralsLen == LITERALS_RUN_LEN_V1 || nLiteralsLen == 256 || nLiteralsLen == 512) {
/* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
* The cost automatically accumulates down the chain. */
nBestCost += 8;
}
if (pCompressor->match[(i + 1) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1)
nBestCost += MODESWITCH_PENALTY;
nBestMatchLen = 0;
nBestMatchOffset = 0;
memset(cur_arrival, 0, sizeof(lzsa_arrival) * NARRIVALS_PER_POSITION_V1);
lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
int m;
for (j = 0; j < NARRIVALS_PER_POSITION_V1; j++)
cur_arrival[j].cost = 0x40000000;
}
for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) {
int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 8 : 16;
arrival[nStartOffset << ARRIVALS_PER_POSITION_SHIFT_V1].cost = 0;
arrival[nStartOffset << ARRIVALS_PER_POSITION_SHIFT_V1].from_slot = -1;
if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) {
int nCurCost;
int nMatchLen = pMatch[m].length;
for (i = nStartOffset; i != nEndOffset; i++) {
lzsa_arrival* cur_arrival = &arrival[i << ARRIVALS_PER_POSITION_SHIFT_V1];
lzsa_arrival* pDestLiteralSlots = &cur_arrival[1 << ARRIVALS_PER_POSITION_SHIFT_V1];
int j, m;
if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
nMatchLen = nEndOffset - LAST_LITERALS - i;
for (j = 0; j < NARRIVALS_PER_POSITION_V1 && cur_arrival[j].from_slot; j++) {
const int nPrevCost = cur_arrival[j].cost;
int nCodingChoiceCost = nPrevCost + 8 /* literal */;
const int nScore = cur_arrival[j].score + 1;
const int nNumLiterals = cur_arrival[j].num_literals + 1;
int n;
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size_v1(nMatchLen - MIN_MATCH_SIZE_V1);
nCurCost += cost[i + nMatchLen];
if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1)
nCurCost += MODESWITCH_PENALTY;
if (nBestCost > (nCurCost - nFavorRatio)) {
nBestCost = nCurCost;
nBestMatchLen = nMatchLen;
nBestMatchOffset = pMatch[m].offset;
}
if (nNumLiterals == 1)
nCodingChoiceCost += nModeSwitchPenalty;
else if (nNumLiterals == LITERALS_RUN_LEN_V1 || nNumLiterals == 256 || nNumLiterals == 512) {
nCodingChoiceCost += 8;
}
else {
int nMatchLen = pMatch[m].length;
int k, nMatchRunLen;
if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
nMatchLen = nEndOffset - LAST_LITERALS - i;
for (n = 0; n < NARRIVALS_PER_POSITION_V1 /* we only need the literals + short match cost + long match cost cases */; n++) {
if (nCodingChoiceCost < pDestLiteralSlots[n].cost ||
(nCodingChoiceCost == pDestLiteralSlots[n].cost && nScore < (pDestLiteralSlots[n].score + nDisableScore))) {
memmove(&pDestLiteralSlots[n + 1],
&pDestLiteralSlots[n],
sizeof(lzsa_arrival) * (NARRIVALS_PER_POSITION_V1 - n - 1));
nMatchRunLen = nMatchLen;
if (nMatchRunLen > MATCH_RUN_LEN_V1)
nMatchRunLen = MATCH_RUN_LEN_V1;
for (k = nMinMatchSize; k < nMatchRunLen; k++) {
int nCurCost;
nCurCost = 8 + nMatchOffsetSize /* no extra match len bytes */;
nCurCost += cost[i + k];
if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1)
nCurCost += MODESWITCH_PENALTY;
if (nBestCost > (nCurCost - nFavorRatio)) {
nBestCost = nCurCost;
nBestMatchLen = k;
nBestMatchOffset = pMatch[m].offset;
}
}
for (; k <= nMatchLen; k++) {
int nCurCost;
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size_v1(k - MIN_MATCH_SIZE_V1);
nCurCost += cost[i + k];
if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1)
nCurCost += MODESWITCH_PENALTY;
if (nBestCost > (nCurCost - nFavorRatio)) {
nBestCost = nCurCost;
nBestMatchLen = k;
nBestMatchOffset = pMatch[m].offset;
}
lzsa_arrival* pDestArrival = &pDestLiteralSlots[n];
pDestArrival->cost = nCodingChoiceCost;
pDestArrival->rep_offset = cur_arrival[j].rep_offset;
pDestArrival->from_slot = j + 1;
pDestArrival->from_pos = i - nStartOffset;
pDestArrival->match_len = 0;
pDestArrival->num_literals = nNumLiterals;
pDestArrival->score = nScore;
break;
}
}
}
if (nBestMatchLen >= MIN_MATCH_SIZE_V1)
nLastLiteralsOffset = i;
const lzsa_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT_V1);
const int nNumArrivalsForThisPos = j;
cost[i] = nBestCost;
pMatch->length = nBestMatchLen;
pMatch->offset = nBestMatchOffset;
if (nNumArrivalsForThisPos != 0) {
for (m = 0; m < NMATCHES_PER_INDEX_V1 && match[m].length; m++) {
int nMatchLen = match[m].length;
const int nMatchOffsetCost = lzsa_get_offset_cost_v1(match[m].offset);
int nStartingMatchLen, k;
if ((i + nMatchLen) > nEndOffset)
nMatchLen = nEndOffset - i;
if (nMatchLen >= LEAVE_ALONE_MATCH_SIZE)
nStartingMatchLen = nMatchLen;
else
nStartingMatchLen = nMinMatchSize;
for (k = nStartingMatchLen; k <= nMatchLen; k++) {
const int nMatchLenCost = lzsa_get_match_varlen_size_v1(k - MIN_MATCH_SIZE_V1);
lzsa_arrival* pDestSlots = &cur_arrival[k << ARRIVALS_PER_POSITION_SHIFT_V1];
int nCodingChoiceCost = cur_arrival[0].cost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchOffsetCost + nMatchLenCost;
int exists = 0, n;
if (!cur_arrival[0].num_literals)
nCodingChoiceCost += nModeSwitchPenalty;
for (n = 0;
n < NARRIVALS_PER_POSITION_V1 && pDestSlots[n].from_slot && pDestSlots[n].cost <= nCodingChoiceCost;
n++) {
if (lzsa_get_offset_cost_v1(pDestSlots[n].rep_offset) == nMatchOffsetCost) {
exists = 1;
break;
}
}
if (!exists) {
const int nScore = cur_arrival[0].score + 5;
if (nCodingChoiceCost < pDestSlots[0].cost ||
(nCodingChoiceCost == pDestSlots[0].cost && nScore < (pDestSlots[0].score + nDisableScore))) {
memmove(&pDestSlots[1],
&pDestSlots[0],
sizeof(lzsa_arrival) * (NARRIVALS_PER_POSITION_V1 - 1));
pDestSlots->cost = nCodingChoiceCost;
pDestSlots->rep_offset = match[m].offset;
pDestSlots->from_slot = 1;
pDestSlots->from_pos = i - nStartOffset;
pDestSlots->match_len = k;
pDestSlots->num_literals = 0;
pDestSlots->score = nScore;
}
}
}
}
}
}
const lzsa_arrival *end_arrival = &arrival[i << ARRIVALS_PER_POSITION_SHIFT_V1];
lzsa_match *pBestMatch = pCompressor->best_match - nStartOffset;
while (end_arrival->from_slot > 0 && (end_arrival->from_pos + nStartOffset) < nEndOffset) {
pBestMatch[end_arrival->from_pos + nStartOffset].length = end_arrival->match_len;
pBestMatch[end_arrival->from_pos + nStartOffset].offset = (end_arrival->match_len) ? end_arrival->rep_offset: 0;
end_arrival = &arrival[((end_arrival->from_pos + nStartOffset) << ARRIVALS_PER_POSITION_SHIFT_V1) + (end_arrival->from_slot - 1)];
}
}
@ -251,80 +289,102 @@ static void lzsa_optimize_matches_v1(lzsa_compressor *pCompressor, const int nSt
* impacting the compression ratio
*
* @param pCompressor compression context
* @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
*
* @return non-zero if the number of tokens was reduced, 0 if it wasn't
*/
static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset) {
lzsa_match *pBestMatch = pCompressor->best_match - nStartOffset;
int i;
int nNumLiterals = 0;
int nDidReduce = 0;
for (i = nStartOffset; i < nEndOffset; ) {
lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
lzsa_match *pMatch = pBestMatch + i;
if (pMatch->length == 0 &&
(i + 1) < nEndOffset &&
pBestMatch[i + 1].length >= MIN_MATCH_SIZE_V1 &&
pBestMatch[i + 1].length < MAX_VARLEN &&
pBestMatch[i + 1].offset &&
i >= pBestMatch[i + 1].offset &&
(i + pBestMatch[i + 1].length + 1) <= nEndOffset &&
!memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) {
const int nCurLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length - MIN_MATCH_SIZE_V1);
const int nReducedLenSize = lzsa_get_match_varlen_size_v1(pBestMatch[i + 1].length + 1 - MIN_MATCH_SIZE_V1);
if ((nReducedLenSize - nCurLenSize) <= 8) {
/* Merge */
pBestMatch[i].length = pBestMatch[i + 1].length + 1;
pBestMatch[i].offset = pBestMatch[i + 1].offset;
pBestMatch[i + 1].length = 0;
pBestMatch[i + 1].offset = 0;
nDidReduce = 1;
continue;
}
}
if (pMatch->length >= MIN_MATCH_SIZE_V1) {
int nMatchLen = pMatch->length;
int nReduce = 0;
if (pMatch->length <= 9 /* Don't waste time considering large matches, they will always win over literals */ &&
(i + pMatch->length) < nEndOffset /* Don't consider the last token in the block, we can only reduce a match inbetween other tokens */) {
int nNextIndex = i + pMatch->length;
int nNextLiterals = 0;
if (nMatchLen <= 9 && (i + nMatchLen) < nEndOffset) /* max reducable command size: <token> <EE> <ll> <ll> <offset> <offset> <EE> <mm> <mm> */ {
int nMatchOffset = pMatch->offset;
int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE_V1;
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + ((nMatchOffset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size_v1(nEncodedMatchLen);
while (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length < MIN_MATCH_SIZE_V1) {
nNextLiterals++;
nNextIndex++;
}
if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1) {
if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v1(nNumLiterals + nMatchLen))) {
/* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
* make the next command eat the cost of encoding the current number of literals, + nMatchLen extra literals. The size of the current match command is
* at least as much as the number of literal bytes + the extra cost of encoding them in the next match command, so we can safely replace the current
* match command by literals, the output size will not increase and it will remove one command. */
nReduce = 1;
/* This command is a match, is followed by 'nNextLiterals' literals and then by another match, or the end of the input. Calculate this command's current cost (excluding 'nNumLiterals' bytes) */
if ((8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + ((pMatch->offset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size_v1(pMatch->length - MIN_MATCH_SIZE_V1) +
8 /* token */ + lzsa_get_literals_varlen_size_v1(nNextLiterals)) >=
(8 /* token */ + (pMatch->length << 3) + lzsa_get_literals_varlen_size_v1(nNumLiterals + pMatch->length + nNextLiterals))) {
/* Reduce */
const int nMatchLen = pMatch->length;
int j;
for (j = 0; j < nMatchLen; j++) {
pBestMatch[i + j].length = 0;
}
}
else {
int nCurIndex = i + nMatchLen;
int nNextNumLiterals = 0;
do {
nCurIndex++;
nNextNumLiterals++;
} while (nCurIndex < nEndOffset && pCompressor->match[nCurIndex << MATCHES_PER_OFFSET_SHIFT].length < MIN_MATCH_SIZE_V1);
if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v1(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size_v1(nNextNumLiterals))) {
/* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
* more room than the match, and doesn't grow the next match command's literals encoding, go ahead and remove the command. */
nReduce = 1;
}
}
}
if (nReduce) {
int j;
for (j = 0; j < nMatchLen; j++) {
pCompressor->match[(i + j) << MATCHES_PER_OFFSET_SHIFT].length = 0;
}
nNumLiterals += nMatchLen;
i += nMatchLen;
nDidReduce = 1;
}
else {
if ((i + nMatchLen) < nEndOffset && nMatchLen >= LCP_MAX &&
pMatch->offset && pMatch->offset <= 32 && pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].offset == pMatch->offset && (nMatchLen % pMatch->offset) == 0 &&
(nMatchLen + pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length) <= MAX_VARLEN) {
/* Join */
pMatch->length += pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length;
pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].offset = 0;
pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length = -1;
nDidReduce = 1;
continue;
}
nNumLiterals = 0;
i += nMatchLen;
}
if ((i + pMatch->length) < nEndOffset && pMatch->offset && pMatch->length >= MIN_MATCH_SIZE_V1 &&
pBestMatch[i + pMatch->length].offset &&
pBestMatch[i + pMatch->length].length >= MIN_MATCH_SIZE_V1 &&
(pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN &&
(i + pMatch->length) >= pMatch->offset &&
(i + pMatch->length) >= pBestMatch[i + pMatch->length].offset &&
(i + pMatch->length + pBestMatch[i + pMatch->length].length) <= nEndOffset &&
!memcmp(pInWindow + i - pMatch->offset + pMatch->length,
pInWindow + i + pMatch->length - pBestMatch[i + pMatch->length].offset,
pBestMatch[i + pMatch->length].length)) {
int nCurPartialSize = lzsa_get_match_varlen_size_v1(pMatch->length - MIN_MATCH_SIZE_V1);
nCurPartialSize += 8 /* token */ + /* lzsa_get_literals_varlen_size_v1(0) + */ ((pBestMatch[i + pMatch->length].offset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size_v1(pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V1);
const int nReducedPartialSize = lzsa_get_match_varlen_size_v1(pMatch->length + pBestMatch[i + pMatch->length].length - MIN_MATCH_SIZE_V1);
if (nCurPartialSize >= nReducedPartialSize) {
const int nMatchLen = pMatch->length;
/* Join */
pMatch->length += pBestMatch[i + nMatchLen].length;
pBestMatch[i + nMatchLen].length = 0;
pBestMatch[i + nMatchLen].offset = 0;
nDidReduce = 1;
continue;
}
}
i += pMatch->length;
nNumLiterals = 0;
}
else {
nNumLiterals++;
@ -348,22 +408,23 @@ static int lzsa_optimize_command_count_v1(lzsa_compressor *pCompressor, const in
* @return size of compressed data in output buffer, or -1 if the data is uncompressible
*/
static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, const int nMaxOutDataSize) {
const lzsa_match *pBestMatch = pCompressor->best_match - nStartOffset;
int i;
int nNumLiterals = 0;
int nInFirstLiteralOffset = 0;
int nOutOffset = 0;
for (i = nStartOffset; i < nEndOffset; ) {
lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
const lzsa_match *pMatch = pBestMatch + i;
if (pMatch->length >= MIN_MATCH_SIZE_V1) {
int nMatchOffset = pMatch->offset;
int nMatchLen = pMatch->length;
int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE_V1;
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals;
int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN_V1) ? MATCH_RUN_LEN_V1 : nEncodedMatchLen;
int nTokenLongOffset = (nMatchOffset <= 256) ? 0x00 : 0x80;
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3) + (nTokenLongOffset ? 16 : 8) /* match offset */ + lzsa_get_match_varlen_size_v1(nEncodedMatchLen);
const int nMatchOffset = pMatch->offset;
const int nMatchLen = pMatch->length;
const int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE_V1;
const int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals;
const int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN_V1) ? MATCH_RUN_LEN_V1 : nEncodedMatchLen;
const int nTokenLongOffset = (nMatchOffset <= 256) ? 0x00 : 0x80;
const int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3) + (nTokenLongOffset ? 16 : 8) /* match offset */ + lzsa_get_match_varlen_size_v1(nEncodedMatchLen);
if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
return -1;
@ -373,6 +434,13 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char
pOutData[nOutOffset++] = nTokenLongOffset | (nTokenLiteralsLen << 4) | nTokenMatchLen;
nOutOffset = lzsa_write_literals_varlen_v1(pOutData, nOutOffset, nNumLiterals);
if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1)
pCompressor->stats.min_literals = nNumLiterals;
if (nNumLiterals > pCompressor->stats.max_literals)
pCompressor->stats.max_literals = nNumLiterals;
pCompressor->stats.total_literals += nNumLiterals;
pCompressor->stats.literals_divisor++;
if (nNumLiterals != 0) {
memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
nOutOffset += nNumLiterals;
@ -384,8 +452,45 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char
pOutData[nOutOffset++] = (-nMatchOffset) >> 8;
}
nOutOffset = lzsa_write_match_varlen_v1(pOutData, nOutOffset, nEncodedMatchLen);
if (nMatchOffset < pCompressor->stats.min_offset || pCompressor->stats.min_offset == -1)
pCompressor->stats.min_offset = nMatchOffset;
if (nMatchOffset > pCompressor->stats.max_offset)
pCompressor->stats.max_offset = nMatchOffset;
pCompressor->stats.total_offsets += nMatchOffset;
if (nMatchLen < pCompressor->stats.min_match_len || pCompressor->stats.min_match_len == -1)
pCompressor->stats.min_match_len = nMatchLen;
if (nMatchLen > pCompressor->stats.max_match_len)
pCompressor->stats.max_match_len = nMatchLen;
pCompressor->stats.total_match_lens += nMatchLen;
pCompressor->stats.match_divisor++;
if (nMatchOffset == 1) {
if (nMatchLen < pCompressor->stats.min_rle1_len || pCompressor->stats.min_rle1_len == -1)
pCompressor->stats.min_rle1_len = nMatchLen;
if (nMatchLen > pCompressor->stats.max_rle1_len)
pCompressor->stats.max_rle1_len = nMatchLen;
pCompressor->stats.total_rle1_lens += nMatchLen;
pCompressor->stats.rle1_divisor++;
}
else if (nMatchOffset == 2) {
if (nMatchLen < pCompressor->stats.min_rle2_len || pCompressor->stats.min_rle2_len == -1)
pCompressor->stats.min_rle2_len = nMatchLen;
if (nMatchLen > pCompressor->stats.max_rle2_len)
pCompressor->stats.max_rle2_len = nMatchLen;
pCompressor->stats.total_rle2_lens += nMatchLen;
pCompressor->stats.rle2_divisor++;
}
i += nMatchLen;
if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) {
const int nCurSafeDist = (i - nStartOffset) - nOutOffset;
if (nCurSafeDist >= 0 && pCompressor->safe_dist < nCurSafeDist)
pCompressor->safe_dist = nCurSafeDist;
}
pCompressor->num_commands++;
}
else {
@ -397,8 +502,8 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char
}
{
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals;
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3);
const int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals;
const int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3);
if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
return -1;
@ -406,13 +511,25 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char
if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK)
pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x0f;
else
pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x00;
pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) /* | 0x00 */;
nOutOffset = lzsa_write_literals_varlen_v1(pOutData, nOutOffset, nNumLiterals);
if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1)
pCompressor->stats.min_literals = nNumLiterals;
if (nNumLiterals > pCompressor->stats.max_literals)
pCompressor->stats.max_literals = nNumLiterals;
pCompressor->stats.total_literals += nNumLiterals;
pCompressor->stats.literals_divisor++;
if (nNumLiterals != 0) {
memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
nOutOffset += nNumLiterals;
nNumLiterals = 0;
}
if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) {
const int nCurSafeDist = (i - nStartOffset) - nOutOffset;
if (nCurSafeDist >= 0 && pCompressor->safe_dist < nCurSafeDist)
pCompressor->safe_dist = nCurSafeDist;
}
pCompressor->num_commands++;
@ -446,11 +563,11 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char
* @return size of compressed data in output buffer, or -1 if the data is uncompressible
*/
static int lzsa_write_raw_uncompressed_block_v1(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, const int nMaxOutDataSize) {
int nNumLiterals = nEndOffset - nStartOffset;
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals;
const int nNumLiterals = nEndOffset - nStartOffset;
const int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals;
int nOutOffset = 0;
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3) + 4;
const int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3) + 4;
if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
return -1;
@ -462,7 +579,6 @@ static int lzsa_write_raw_uncompressed_block_v1(lzsa_compressor *pCompressor, co
if (nNumLiterals != 0) {
memcpy(pOutData + nOutOffset, pInWindow + nStartOffset, nNumLiterals);
nOutOffset += nNumLiterals;
nNumLiterals = 0;
}
pCompressor->num_commands++;
@ -482,8 +598,8 @@ static int lzsa_write_raw_uncompressed_block_v1(lzsa_compressor *pCompressor, co
*
* @param pCompressor compression context
* @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
* @param nPreviousBlockSize number of previously compressed bytes (or 0 for none)
* @param nInDataSize number of input bytes to compress
* @param pOutData pointer to output buffer
* @param nMaxOutDataSize maximum size of output buffer, in bytes
*
@ -492,17 +608,26 @@ static int lzsa_write_raw_uncompressed_block_v1(lzsa_compressor *pCompressor, co
int lzsa_optimize_and_write_block_v1(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
int nResult;
lzsa_optimize_matches_v1(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
/* Compress optimally without breaking ties in favor of less tokens */
memset(pCompressor->best_match, 0, BLOCK_SIZE * sizeof(lzsa_match));
if (nInDataSize < 65536) {
lzsa_optimize_forward_v1(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 1 /* reduce */);
}
else {
lzsa_optimize_forward_v1(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, 0 /* reduce */);
}
int nDidReduce;
int nPasses = 0;
do {
nDidReduce = lzsa_optimize_command_count_v1(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
nDidReduce = lzsa_optimize_command_count_v1(pCompressor, pInWindow, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
nPasses++;
} while (nDidReduce && nPasses < 20);
nResult = lzsa_write_block_v1(pCompressor, pInWindow, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, pOutData, nMaxOutDataSize);
if (nResult < 0 && pCompressor->flags & LZSA_FLAG_RAW_BLOCK) {
if (nResult < 0 && (pCompressor->flags & LZSA_FLAG_RAW_BLOCK)) {
nResult = lzsa_write_raw_uncompressed_block_v1(pCompressor, pInWindow, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, pOutData, nMaxOutDataSize);
}

View File

@ -1,5 +1,5 @@
/*
* shrink_v1.h - LZSA1 block compressor definitions
* shrink_block_v1.h - LZSA1 block compressor definitions
*
* Copyright (C) 2019 Emmanuel Marty
*
@ -33,21 +33,28 @@
#ifndef _SHRINK_BLOCK_V1_H
#define _SHRINK_BLOCK_V1_H
/* Forward declarations */
typedef struct _lzsa_compressor lzsa_compressor;
#include "shrink_context.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* Select the most optimal matches, reduce the token count if possible, and then emit a block of compressed LZSA1 data
*
* @param pCompressor compression context
* @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
* @param nPreviousBlockSize number of previously compressed bytes (or 0 for none)
* @param nInDataSize number of input bytes to compress
* @param pOutData pointer to output buffer
* @param nMaxOutDataSize maximum size of output buffer, in bytes
*
* @return size of compressed data in output buffer, or -1 if the data is uncompressible
*/
int lzsa_optimize_and_write_block_v1(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, const int nMaxOutDataSize);
int lzsa_optimize_and_write_block_v1(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize);
#ifdef __cplusplus
}
#endif
#endif /* _SHRINK_BLOCK_V1_H */

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
* shrink_v2.h - LZSA2 block compressor definitions
* shrink_block_v2.h - LZSA2 block compressor definitions
*
* Copyright (C) 2019 Emmanuel Marty
*
@ -33,21 +33,28 @@
#ifndef _SHRINK_BLOCK_V2_H
#define _SHRINK_BLOCK_V2_H
/* Forward declarations */
typedef struct _lzsa_compressor lzsa_compressor;
#include "shrink_context.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* Select the most optimal matches, reduce the token count if possible, and then emit a block of compressed LZSA2 data
*
* @param pCompressor compression context
* @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
* @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
* @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
* @param nPreviousBlockSize number of previously compressed bytes (or 0 for none)
* @param nInDataSize number of input bytes to compress
* @param pOutData pointer to output buffer
* @param nMaxOutDataSize maximum size of output buffer, in bytes
*
* @return size of compressed data in output buffer, or -1 if the data is uncompressible
*/
int lzsa_optimize_and_write_block_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, const int nMaxOutDataSize);
int lzsa_optimize_and_write_block_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize);
#ifdef __cplusplus
}
#endif
#endif /* _SHRINK_BLOCK_V2_H */

View File

@ -37,6 +37,7 @@
#include "shrink_block_v2.h"
#include "format.h"
#include "matchfinder.h"
#include "lib.h"
/**
* Initialize compression context
@ -44,34 +45,44 @@
* @param pCompressor compression context to initialize
* @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
* @param nMinMatchSize minimum match size (cannot be less than MIN_MATCH_SIZE)
* @param nFormatVersion version of format to use (1-2)
* @param nFlags compression flags
*
* @return 0 for success, non-zero for failure
*/
int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize, const int nMinMatchSize, const int nFormatVersion, const int nFlags) {
int nResult;
int nMinMatchSizeForFormat = (nFormatVersion == 1) ? MIN_MATCH_SIZE_V1 : MIN_MATCH_SIZE_V2;
int nMaxMinMatchForFormat = (nFormatVersion == 1) ? 5 : 3;
const int nMinMatchSizeForFormat = (nFormatVersion == 1) ? MIN_MATCH_SIZE_V1 : MIN_MATCH_SIZE_V2;
const int nMaxMinMatchForFormat = (nFormatVersion == 1) ? 5 : 3;
nResult = divsufsort_init(&pCompressor->divsufsort_context);
pCompressor->intervals = NULL;
pCompressor->pos_data = NULL;
pCompressor->open_intervals = NULL;
pCompressor->match = NULL;
pCompressor->selected_match = NULL;
pCompressor->best_match = NULL;
pCompressor->improved_match = NULL;
pCompressor->slot_cost = NULL;
pCompressor->repmatch_opt = NULL;
pCompressor->arrival = NULL;
pCompressor->rep_slot_handled_mask = NULL;
pCompressor->rep_len_handled_mask = NULL;
pCompressor->first_offset_for_byte = NULL;
pCompressor->next_offset_for_pos = NULL;
pCompressor->offset_cache = NULL;
pCompressor->min_match_size = nMinMatchSize;
if (pCompressor->min_match_size < nMinMatchSizeForFormat)
pCompressor->min_match_size = nMinMatchSizeForFormat;
else if (pCompressor->min_match_size > nMaxMinMatchForFormat)
pCompressor->min_match_size = nMaxMinMatchForFormat;
pCompressor->max_forward_depth = 0;
pCompressor->format_version = nFormatVersion;
pCompressor->flags = nFlags;
pCompressor->safe_dist = 0;
pCompressor->num_commands = 0;
memset(&pCompressor->stats, 0, sizeof(pCompressor->stats));
pCompressor->stats.min_literals = -1;
pCompressor->stats.min_match_len = -1;
pCompressor->stats.min_offset = -1;
pCompressor->stats.min_rle1_len = -1;
pCompressor->stats.min_rle2_len = -1;
if (!nResult) {
pCompressor->intervals = (unsigned int *)malloc(nMaxWindowSize * sizeof(unsigned int));
@ -80,37 +91,43 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
pCompressor->pos_data = (unsigned int *)malloc(nMaxWindowSize * sizeof(unsigned int));
if (pCompressor->pos_data) {
pCompressor->open_intervals = (unsigned int *)malloc((LCP_MAX + 1) * sizeof(unsigned int));
pCompressor->open_intervals = (unsigned int *)malloc((LCP_AND_TAG_MAX + 1) * sizeof(unsigned int));
if (pCompressor->open_intervals) {
pCompressor->match = (lzsa_match *)malloc(nMaxWindowSize * NMATCHES_PER_OFFSET * sizeof(lzsa_match));
pCompressor->arrival = (lzsa_arrival *)malloc(((BLOCK_SIZE + 1) << ARRIVALS_PER_POSITION_SHIFT_V2) * sizeof(lzsa_arrival));
if (pCompressor->arrival) {
pCompressor->best_match = (lzsa_match *)malloc(BLOCK_SIZE * sizeof(lzsa_match));
if (pCompressor->match) {
if (pCompressor->format_version == 2) {
pCompressor->selected_match = (lzsa_match *)malloc(nMaxWindowSize * NMATCHES_PER_OFFSET * sizeof(lzsa_match));
if (pCompressor->selected_match) {
pCompressor->best_match = (lzsa_match *)malloc(nMaxWindowSize * sizeof(lzsa_match));
if (pCompressor->best_match) {
pCompressor->improved_match = (lzsa_match *)malloc(nMaxWindowSize * sizeof(lzsa_match));
if (pCompressor->improved_match) {
pCompressor->slot_cost = (int *)malloc(nMaxWindowSize * NMATCHES_PER_OFFSET * sizeof(int));
if (pCompressor->slot_cost) {
pCompressor->repmatch_opt = (lzsa_repmatch_opt *)malloc(nMaxWindowSize * sizeof(lzsa_repmatch_opt));
if (pCompressor->repmatch_opt)
return 0;
if (pCompressor->best_match) {
if (pCompressor->format_version == 2)
pCompressor->match = (lzsa_match*)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V2 * sizeof(lzsa_match));
else
pCompressor->match = (lzsa_match*)malloc(BLOCK_SIZE * NMATCHES_PER_INDEX_V1 * sizeof(lzsa_match));
if (pCompressor->match) {
if (pCompressor->format_version == 2) {
pCompressor->rep_slot_handled_mask = (unsigned char*)malloc(NARRIVALS_PER_POSITION_V2_BIG * ((LCP_MAX + 1) / 8) * sizeof(unsigned char));
if (pCompressor->rep_slot_handled_mask) {
pCompressor->rep_len_handled_mask = (unsigned char*)malloc(((LCP_MAX + 1) / 8) * sizeof(unsigned char));
if (pCompressor->rep_len_handled_mask) {
pCompressor->first_offset_for_byte = (int*)malloc(65536 * sizeof(int));
if (pCompressor->first_offset_for_byte) {
pCompressor->next_offset_for_pos = (int*)malloc(BLOCK_SIZE * sizeof(int));
if (pCompressor->next_offset_for_pos) {
pCompressor->offset_cache = (int*)malloc(2048 * sizeof(int));
if (pCompressor->offset_cache) {
return 0;
}
}
}
}
}
}
else {
return 0;
}
}
}
else {
return 0;
}
}
}
}
@ -129,29 +146,29 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
void lzsa_compressor_destroy(lzsa_compressor *pCompressor) {
divsufsort_destroy(&pCompressor->divsufsort_context);
if (pCompressor->repmatch_opt) {
free(pCompressor->repmatch_opt);
pCompressor->repmatch_opt = NULL;
if (pCompressor->offset_cache) {
free(pCompressor->offset_cache);
pCompressor->offset_cache = NULL;
}
if (pCompressor->slot_cost) {
free(pCompressor->slot_cost);
pCompressor->slot_cost = NULL;
if (pCompressor->next_offset_for_pos) {
free(pCompressor->next_offset_for_pos);
pCompressor->next_offset_for_pos = NULL;
}
if (pCompressor->improved_match) {
free(pCompressor->improved_match);
pCompressor->improved_match = NULL;
if (pCompressor->first_offset_for_byte) {
free(pCompressor->first_offset_for_byte);
pCompressor->first_offset_for_byte = NULL;
}
if (pCompressor->best_match) {
free(pCompressor->best_match);
pCompressor->best_match = NULL;
if (pCompressor->rep_len_handled_mask) {
free(pCompressor->rep_len_handled_mask);
pCompressor->rep_len_handled_mask = NULL;
}
if (pCompressor->selected_match) {
free(pCompressor->selected_match);
pCompressor->selected_match = NULL;
if (pCompressor->rep_slot_handled_mask) {
free(pCompressor->rep_slot_handled_mask);
pCompressor->rep_slot_handled_mask = NULL;
}
if (pCompressor->match) {
@ -159,6 +176,16 @@ void lzsa_compressor_destroy(lzsa_compressor *pCompressor) {
pCompressor->match = NULL;
}
if (pCompressor->arrival) {
free(pCompressor->arrival);
pCompressor->arrival = NULL;
}
if (pCompressor->best_match) {
free(pCompressor->best_match);
pCompressor->best_match = NULL;
}
if (pCompressor->open_intervals) {
free(pCompressor->open_intervals);
pCompressor->open_intervals = NULL;
@ -187,23 +214,43 @@ void lzsa_compressor_destroy(lzsa_compressor *pCompressor) {
*
* @return size of compressed data in output buffer, or -1 if the data is uncompressible
*/
int lzsa_compressor_shrink_block(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
if (lzsa_build_suffix_array(pCompressor, pInWindow, nPreviousBlockSize + nInDataSize))
return -1;
if (nPreviousBlockSize) {
lzsa_skip_matches(pCompressor, 0, nPreviousBlockSize);
}
lzsa_find_all_matches(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
int lzsa_compressor_shrink_block(lzsa_compressor *pCompressor, unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
int nCompressedSize;
if (pCompressor->format_version == 1) {
return lzsa_optimize_and_write_block_v1(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize);
}
else if (pCompressor->format_version == 2) {
return lzsa_optimize_and_write_block_v2(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize);
if (pCompressor->flags & LZSA_FLAG_RAW_BACKWARD) {
lzsa_reverse_buffer(pInWindow + nPreviousBlockSize, nInDataSize);
}
if (lzsa_build_suffix_array(pCompressor, pInWindow, nPreviousBlockSize + nInDataSize))
nCompressedSize = -1;
else {
return -1;
if (nPreviousBlockSize) {
lzsa_skip_matches(pCompressor, 0, nPreviousBlockSize);
}
lzsa_find_all_matches(pCompressor, (pCompressor->format_version == 2) ? NMATCHES_PER_INDEX_V2 : NMATCHES_PER_INDEX_V1, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
if (pCompressor->format_version == 1) {
nCompressedSize = lzsa_optimize_and_write_block_v1(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize);
if (nCompressedSize != -1 && (pCompressor->flags & LZSA_FLAG_RAW_BACKWARD)) {
lzsa_reverse_buffer(pOutData, nCompressedSize);
}
}
else if (pCompressor->format_version == 2) {
nCompressedSize = lzsa_optimize_and_write_block_v2(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize);
if (nCompressedSize != -1 && (pCompressor->flags & LZSA_FLAG_RAW_BACKWARD)) {
lzsa_reverse_buffer(pOutData, nCompressedSize);
}
}
else {
nCompressedSize = -1;
}
}
if (pCompressor->flags & LZSA_FLAG_RAW_BACKWARD) {
lzsa_reverse_buffer(pInWindow + nPreviousBlockSize, nInDataSize);
}
return nCompressedSize;
}
/**

View File

@ -34,29 +34,38 @@
#define _SHRINK_CONTEXT_H
#include "divsufsort.h"
#include "hashmap.h"
#ifdef __cplusplus
extern "C" {
#endif
#define LCP_BITS 14
#define LCP_MAX (1U<<(LCP_BITS - 1))
#define TAG_BITS 4
#define LCP_MAX ((1U<<(LCP_BITS - TAG_BITS)) - 1)
#define LCP_AND_TAG_MAX (1U<<(LCP_BITS - 1))
#define LCP_SHIFT (31-LCP_BITS)
#define LCP_MASK (((1U<<LCP_BITS) - 1) << LCP_SHIFT)
#define POS_MASK ((1U<<LCP_SHIFT) - 1)
#define VISITED_FLAG 0x80000000
#define EXCL_VISITED_MASK 0x7fffffff
#define NMATCHES_PER_OFFSET 8
#define MATCHES_PER_OFFSET_SHIFT 3
#define NARRIVALS_PER_POSITION_V1 8
#define NARRIVALS_PER_POSITION_V2_SMALL 9
#define NARRIVALS_PER_POSITION_V2_BIG 32
#define NARRIVALS_PER_POSITION_V2_MAX 64
#define ARRIVALS_PER_POSITION_SHIFT_V1 3
#define ARRIVALS_PER_POSITION_SHIFT_V2 6
#define LEAVE_ALONE_MATCH_SIZE 1000
#define NMATCHES_PER_INDEX_V1 16
#define MATCHES_PER_INDEX_SHIFT_V1 4
#define LAST_MATCH_OFFSET 4
#define LAST_LITERALS 1
#define NMATCHES_PER_INDEX_V2 64
#define MATCHES_PER_INDEX_SHIFT_V2 6
#define MODESWITCH_PENALTY 1
#define LEAVE_ALONE_MATCH_SIZE 300
#define LEAVE_ALONE_MATCH_SIZE_SMALL 1000
#define MODESWITCH_PENALTY 3
/** One match */
typedef struct _lzsa_match {
@ -64,12 +73,48 @@ typedef struct _lzsa_match {
unsigned short offset;
} lzsa_match;
/** One rep-match slot (for LZSA2) */
typedef struct _lzsa_repmatch_opt {
int incoming_offset;
short best_slot_for_incoming;
short expected_repmatch;
} lzsa_repmatch_opt;
/** Forward arrival slot */
typedef struct _lzsa_arrival {
int cost;
unsigned short rep_offset;
short from_slot;
unsigned short from_pos;
unsigned short rep_len;
unsigned short match_len;
unsigned short num_literals;
int rep_pos;
int score;
} lzsa_arrival;
/** Compression statistics */
typedef struct _lzsa_stats {
int min_literals;
int max_literals;
int total_literals;
int min_offset;
int max_offset;
int num_rep_offsets;
int total_offsets;
int min_match_len;
int max_match_len;
int total_match_lens;
int min_rle1_len;
int max_rle1_len;
int total_rle1_lens;
int min_rle2_len;
int max_rle2_len;
int total_rle2_lens;
int literals_divisor;
int match_divisor;
int rle1_divisor;
int rle2_divisor;
} lzsa_stats;
/** Compression context */
typedef struct _lzsa_compressor {
@ -78,17 +123,19 @@ typedef struct _lzsa_compressor {
unsigned int *pos_data;
unsigned int *open_intervals;
lzsa_match *match;
lzsa_match *selected_match;
lzsa_match *best_match;
lzsa_match *improved_match;
int *slot_cost;
lzsa_repmatch_opt *repmatch_opt;
lzsa_arrival *arrival;
unsigned char *rep_slot_handled_mask;
unsigned char *rep_len_handled_mask;
int *first_offset_for_byte;
int *next_offset_for_pos;
int *offset_cache;
int min_match_size;
int max_forward_depth;
int format_version;
int flags;
int safe_dist;
int num_commands;
lzsa_hashmap_t cost_map;
lzsa_stats stats;
} lzsa_compressor;
/**
@ -97,6 +144,7 @@ typedef struct _lzsa_compressor {
* @param pCompressor compression context to initialize
* @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
* @param nMinMatchSize minimum match size (cannot be less than MIN_MATCH_SIZE)
* @param nFormatVersion version of format to use (1-2)
* @param nFlags compression flags
*
* @return 0 for success, non-zero for failure
@ -122,7 +170,7 @@ void lzsa_compressor_destroy(lzsa_compressor *pCompressor);
*
* @return size of compressed data in output buffer, or -1 if the data is uncompressible
*/
int lzsa_compressor_shrink_block(lzsa_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize);
int lzsa_compressor_shrink_block(lzsa_compressor *pCompressor, unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize);
/**
* Get the number of compression commands issued in compressed data blocks

View File

@ -62,7 +62,7 @@ size_t lzsa_get_max_compressed_size_inmem(size_t nInputSize) {
*
* @return actual compressed size, or -1 for error
*/
size_t lzsa_compress_inmem(const unsigned char *pInputData, unsigned char *pOutBuffer, size_t nInputSize, size_t nMaxOutBufferSize,
size_t lzsa_compress_inmem(unsigned char *pInputData, unsigned char *pOutBuffer, size_t nInputSize, size_t nMaxOutBufferSize,
const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion) {
lzsa_compressor compressor;
size_t nOriginalSize = 0;
@ -84,21 +84,6 @@ size_t lzsa_compress_inmem(const unsigned char *pInputData, unsigned char *pOutB
}
}
if ((compressor.flags & LZSA_FLAG_FAVOR_RATIO)) {
if (nInputSize < 16384)
compressor.max_forward_depth = 25;
else {
if (nInputSize < 32768)
compressor.max_forward_depth = 15;
else {
if (nInputSize < BLOCK_SIZE)
compressor.max_forward_depth = 10;
else
compressor.max_forward_depth = 0;
}
}
}
int nPreviousBlockSize = 0;
int nNumBlocks = 0;
@ -157,7 +142,7 @@ size_t lzsa_compress_inmem(const unsigned char *pInputData, unsigned char *pOutB
if (nBlockheaderSize < 0)
nError = LZSA_ERROR_COMPRESSION;
else {
if (nInDataSize > (nMaxOutBufferSize - (nCompressedSize + nBlockheaderSize)))
if ((size_t)nInDataSize > (nMaxOutBufferSize - (nCompressedSize + nBlockheaderSize)))
nError = LZSA_ERROR_DST;
else {
memcpy(pOutBuffer + nBlockheaderSize + nCompressedSize, pInputData + nOriginalSize, nInDataSize);

View File

@ -61,7 +61,7 @@ size_t lzsa_get_max_compressed_size_inmem(size_t nInputSize);
*
* @return actual compressed size, or -1 for error
*/
size_t lzsa_compress_inmem(const unsigned char *pInputData, unsigned char *pOutBuffer, size_t nInputSize, size_t nMaxOutBufferSize,
size_t lzsa_compress_inmem(unsigned char *pInputData, unsigned char *pOutBuffer, size_t nInputSize, size_t nMaxOutBufferSize,
const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion);
#ifdef __cplusplus

View File

@ -70,11 +70,13 @@ static void lzsa_delete_file(const char *pszInFilename) {
* @param pOriginalSize pointer to returned input(source) size, updated when this function is successful
* @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful
* @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful
* @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful
* @param pStats pointer to compression stats that are filled if this function is successful, or NULL
*
* @return LZSA_OK for success, or an error value from lzsa_status_t
*/
lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion,
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount) {
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats) {
lzsa_stream_t inStream, outStream;
void *pDictionaryData = NULL;
int nDictionaryDataSize = 0;
@ -98,7 +100,7 @@ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFi
return nStatus;
}
nStatus = lzsa_compress_stream(&inStream, &outStream, pDictionaryData, nDictionaryDataSize, nFlags, nMinMatchSize, nFormatVersion, progress, pOriginalSize, pCompressedSize, pCommandCount);
nStatus = lzsa_compress_stream(&inStream, &outStream, pDictionaryData, nDictionaryDataSize, nFlags, nMinMatchSize, nFormatVersion, progress, pOriginalSize, pCompressedSize, pCommandCount, pSafeDist, pStats);
lzsa_dictionary_free(&pDictionaryData);
outStream.close(&outStream);
@ -127,12 +129,14 @@ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFi
* @param pOriginalSize pointer to returned input(source) size, updated when this function is successful
* @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful
* @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful
* @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful
* @param pStats pointer to compression stats that are filled if this function is successful, or NULL
*
* @return LZSA_OK for success, or an error value from lzsa_status_t
*/
lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOutStream, const void *pDictionaryData, int nDictionaryDataSize,
const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion,
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount) {
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats) {
unsigned char *pInData, *pOutData;
lzsa_compressor compressor;
long long nOriginalSize = 0LL, nCompressedSize = 0LL;
@ -200,21 +204,6 @@ lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOut
}
nDictionaryDataSize = 0;
if (nNumBlocks == 0 && (compressor.flags & LZSA_FLAG_FAVOR_RATIO)) {
if (nInDataSize < 16384)
compressor.max_forward_depth = 25;
else {
if (nInDataSize < 32768)
compressor.max_forward_depth = 15;
else {
if (nInDataSize < BLOCK_SIZE)
compressor.max_forward_depth = 10;
else
compressor.max_forward_depth = 0;
}
}
}
int nOutDataSize;
nOutDataSize = lzsa_compressor_shrink_block(&compressor, pInData + BLOCK_SIZE - nPreviousBlockSize, nPreviousBlockSize, nInDataSize, pOutData, ((nInDataSize + nRawPadding) >= BLOCK_SIZE) ? BLOCK_SIZE : (nInDataSize + nRawPadding));
@ -301,6 +290,11 @@ lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOut
progress(nOriginalSize, nCompressedSize);
int nCommandCount = lzsa_compressor_get_command_count(&compressor);
int nSafeDist = compressor.safe_dist;
if (pStats)
*pStats = compressor.stats;
lzsa_compressor_destroy(&compressor);
free(pOutData);
@ -319,6 +313,8 @@ lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOut
*pCompressedSize = nCompressedSize;
if (pCommandCount)
*pCommandCount = nCommandCount;
if (pSafeDist)
*pSafeDist = nSafeDist;
return LZSA_OK;
}
}

View File

@ -33,15 +33,13 @@
#ifndef _SHRINK_STREAMING_H
#define _SHRINK_STREAMING_H
#include "shrink_context.h"
#include "stream.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Forward declaration */
typedef enum _lzsa_status_t lzsa_status_t;
/*-------------- File API -------------- */
/**
@ -57,12 +55,14 @@ typedef enum _lzsa_status_t lzsa_status_t;
* @param pOriginalSize pointer to returned input(source) size, updated when this function is successful
* @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful
* @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful
* @param pSafeDist pointer to the returned safe distance for raw blocks, updated when this function is successful
* @param pStats pointer to compression stats that are filled if this function is successful, or NULL
*
* @return LZSA_OK for success, or an error value from lzsa_status_t
*/
lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename,
const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion,
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount);
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats);
/*-------------- Streaming API -------------- */
@ -80,12 +80,14 @@ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFi
* @param pOriginalSize pointer to returned input(source) size, updated when this function is successful
* @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful
* @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful
* @param pSafeDist pointer to the returned safe distance for raw blocks, updated when this function is successful
* @param pStats pointer to compression stats that are filled if this function is successful, or NULL
*
* @return LZSA_OK for success, or an error value from lzsa_status_t
*/
lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOutStream, const void *pDictionaryData, int nDictionaryDataSize,
const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion,
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount);
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats);
#ifdef __cplusplus
}

View File

@ -34,13 +34,17 @@
#include <stdlib.h>
#include <string.h>
#include "stream.h"
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif
/**
* Close file stream
*
* @param stream stream
*/
static void lzsa_filestream_close(lzsa_stream_t *stream) {
static void lzsa_filestream_close(struct _lzsa_stream_t *stream) {
if (stream->obj) {
fclose((FILE*)stream->obj);
stream->obj = NULL;
@ -60,7 +64,7 @@ static void lzsa_filestream_close(lzsa_stream_t *stream) {
*
* @return number of bytes read
*/
static size_t lzsa_filestream_read(lzsa_stream_t *stream, void *ptr, size_t size) {
static size_t lzsa_filestream_read(struct _lzsa_stream_t *stream, void *ptr, size_t size) {
return fread(ptr, 1, size, (FILE*)stream->obj);
}
@ -73,7 +77,7 @@ static size_t lzsa_filestream_read(lzsa_stream_t *stream, void *ptr, size_t size
*
* @return number of bytes written
*/
static size_t lzsa_filestream_write(lzsa_stream_t *stream, void *ptr, size_t size) {
static size_t lzsa_filestream_write(struct _lzsa_stream_t *stream, void *ptr, size_t size) {
return fwrite(ptr, 1, size, (FILE*)stream->obj);
}
@ -84,7 +88,7 @@ static size_t lzsa_filestream_write(lzsa_stream_t *stream, void *ptr, size_t siz
*
* @return nonzero if the end of the data has been reached, 0 if there is more data
*/
static int lzsa_filestream_eof(lzsa_stream_t *stream) {
static int lzsa_filestream_eof(struct _lzsa_stream_t *stream) {
return feof((FILE*)stream->obj);
}
@ -98,7 +102,32 @@ static int lzsa_filestream_eof(lzsa_stream_t *stream) {
* @return 0 for success, nonzero for failure
*/
int lzsa_filestream_open(lzsa_stream_t *stream, const char *pszInFilename, const char *pszMode) {
stream->obj = (void*)fopen(pszInFilename, pszMode);
const char* stdInOutFile = "-";
const char* stdInMode = "rb";
const char* stdOutMode = "wb";
#ifdef _WIN32
int result;
#endif
if (!strncmp(pszInFilename, stdInOutFile, 1)) {
if (!strncmp(pszMode, stdInMode, 2)) {
#ifdef _WIN32
result = _setmode(_fileno(stdin), _O_BINARY);
#endif
stream->obj = stdin;
} else if (!strncmp(pszMode, stdOutMode, 2)) {
#ifdef _WIN32
result = _setmode(_fileno(stdout), _O_BINARY);
#endif
stream->obj = stdout;
} else {
return -1;
}
} else {
stream->obj = (void*)fopen(pszInFilename, pszMode);
}
if (stream->obj) {
stream->read = lzsa_filestream_read;
stream->write = lzsa_filestream_write;

View File

@ -37,8 +37,23 @@
extern "C" {
#endif
/* Forward declaration */
typedef struct _lzsa_stream_t lzsa_stream_t;
/** High level status for compression and decompression */
typedef enum _lzsa_status_t {
   LZSA_OK = 0,                     /**< Success */
   LZSA_ERROR_SRC,                  /**< Error reading input */
   LZSA_ERROR_DST,                  /**< Error writing output */
   LZSA_ERROR_DICTIONARY,           /**< Error reading dictionary */
   LZSA_ERROR_MEMORY,               /**< Out of memory */
   /* Compression-specific status codes */
   LZSA_ERROR_COMPRESSION,          /**< Internal compression error */
   LZSA_ERROR_RAW_TOOLARGE,         /**< Input is too large to be compressed to a raw block */
   LZSA_ERROR_RAW_UNCOMPRESSED,     /**< Input is incompressible and raw blocks don't support uncompressed data */
   /* Decompression-specific status codes */
   LZSA_ERROR_FORMAT,               /**< Invalid input format or magic number when decompressing */
   LZSA_ERROR_DECOMPRESSION         /**< Internal decompression error */
   /* NOTE(review): this enum appears in the public header, so the numeric
    * values are presumably API-stable — append new codes, don't reorder. */
} lzsa_status_t;
/* I/O stream */
typedef struct _lzsa_stream_t {
@ -54,7 +69,7 @@ typedef struct _lzsa_stream_t {
*
* @return number of bytes read
*/
size_t(*read)(lzsa_stream_t *stream, void *ptr, size_t size);
size_t(*read)(struct _lzsa_stream_t *stream, void *ptr, size_t size);
/**
* Write to stream
@ -65,7 +80,7 @@ typedef struct _lzsa_stream_t {
*
* @return number of bytes written
*/
size_t(*write)(lzsa_stream_t *stream, void *ptr, size_t size);
size_t(*write)(struct _lzsa_stream_t *stream, void *ptr, size_t size);
/**
@ -75,14 +90,14 @@ typedef struct _lzsa_stream_t {
*
* @return nonzero if the end of the data has been reached, 0 if there is more data
*/
int(*eof)(lzsa_stream_t *stream);
int(*eof)(struct _lzsa_stream_t *stream);
/**
* Close stream
*
* @param stream stream
*/
void(*close)(lzsa_stream_t *stream);
void(*close)(struct _lzsa_stream_t *stream);
} lzsa_stream_t;
/**