Check in LZSA2 implementation (ratio competitive with ZX7, faster decompression)

2024-12-27 02:32:44 +00:00 · 2019-05-09 16:51:29 +02:00 · 2019-05-09 16:51:29 +02:00 · 8b7b4a2b4f
commit 8b7b4a2b4f
parent 49b0739050
26 changed files with 2982 additions and 936 deletions
--- a/2
+++ b/2
@ -1,3 +1,3 @@
-The LZSA code is available under the Zlib license, except for src/shrink.c which is placed under the Creative Commons CC0 license.
+The LZSA code is available under the Zlib license, except for src/matchfinder.c which is placed under the Creative Commons CC0 license.

 Please consult LICENSE.zlib.md and LICENSE.CC0.md for more information.
--- a/10
+++ b/10
@ -10,10 +10,14 @@ $(OBJDIR)/%.o: src/../%.c

 APP := lzsa

-OBJS := $(OBJDIR)/src/main.o
+OBJS := $(OBJDIR)/src/lzsa.o
 OBJS += $(OBJDIR)/src/frame.o
-OBJS += $(OBJDIR)/src/shrink.o
-OBJS += $(OBJDIR)/src/expand.o
+OBJS += $(OBJDIR)/src/lib.o
+OBJS += $(OBJDIR)/src/matchfinder.o
+OBJS += $(OBJDIR)/src/shrink_v1.o
+OBJS += $(OBJDIR)/src/shrink_v2.o
+OBJS += $(OBJDIR)/src/expand_v1.o
+OBJS += $(OBJDIR)/src/expand_v2.o
 OBJS += $(OBJDIR)/src/libdivsufsort/lib/divsufsort.o
 OBJS += $(OBJDIR)/src/libdivsufsort/lib/sssort.o
 OBJS += $(OBJDIR)/src/libdivsufsort/lib/trsort.o
--- a/README.md
+++ b/README.md
@ -40,7 +40,7 @@ Inspirations:
 License:

 * The LZSA code is available under the Zlib license.
-* The compressor (shrink.c) is available under the CC0 license due to using portions of code from Eric Bigger's Wimlib in the suffix array-based matchfinder.
+* The match finder (matchfinder.c) is available under the CC0 license due to using portions of code from Eric Biggers' Wimlib in the suffix array-based matchfinder.

 # Stream format

--- a/VS2017/lzsa.vcxproj
+++ b/VS2017/lzsa.vcxproj
@ -177,24 +177,32 @@
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
-    <ClInclude Include="..\src\expand.h" />
+    <ClInclude Include="..\src\expand_v1.h" />
+    <ClInclude Include="..\src\expand_v2.h" />
    <ClInclude Include="..\src\format.h" />
    <ClInclude Include="..\src\frame.h" />
+    <ClInclude Include="..\src\lib.h" />
    <ClInclude Include="..\src\libdivsufsort\include\config.h" />
    <ClInclude Include="..\src\libdivsufsort\include\divsufsort.h" />
    <ClInclude Include="..\src\libdivsufsort\include\divsufsort_private.h" />
-    <ClInclude Include="..\src\shrink.h" />
+    <ClInclude Include="..\src\matchfinder.h" />
+    <ClInclude Include="..\src\shrink_v1.h" />
+    <ClInclude Include="..\src\shrink_v2.h" />
    <ClInclude Include="pch.h" />
  </ItemGroup>
  <ItemGroup>
-    <ClCompile Include="..\src\expand.c" />
+    <ClCompile Include="..\src\expand_v1.c" />
+    <ClCompile Include="..\src\expand_v2.c" />
    <ClCompile Include="..\src\frame.c" />
+    <ClCompile Include="..\src\lib.c" />
    <ClCompile Include="..\src\libdivsufsort\lib\divsufsort.c" />
    <ClCompile Include="..\src\libdivsufsort\lib\sssort.c" />
    <ClCompile Include="..\src\libdivsufsort\lib\trsort.c" />
    <ClCompile Include="..\src\libdivsufsort\lib\utils.c" />
-    <ClCompile Include="..\src\main.c" />
-    <ClCompile Include="..\src\shrink.c" />
+    <ClCompile Include="..\src\lzsa.c" />
+    <ClCompile Include="..\src\matchfinder.c" />
+    <ClCompile Include="..\src\shrink_v1.c" />
+    <ClCompile Include="..\src\shrink_v2.c" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/VS2017/lzsa.vcxproj.filters
+++ b/VS2017/lzsa.vcxproj.filters
@ -27,15 +27,9 @@
    <ClInclude Include="pch.h">
      <Filter>Fichiers d%27en-tête</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\shrink.h">
-      <Filter>Fichiers sources</Filter>
-    </ClInclude>
    <ClInclude Include="..\src\format.h">
      <Filter>Fichiers sources</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\expand.h">
-      <Filter>Fichiers sources</Filter>
-    </ClInclude>
    <ClInclude Include="..\src\libdivsufsort\include\config.h">
      <Filter>Fichiers sources\libdivsufsort\include</Filter>
    </ClInclude>
@ -48,17 +42,26 @@
    <ClInclude Include="..\src\frame.h">
      <Filter>Fichiers sources</Filter>
    </ClInclude>
+    <ClInclude Include="..\src\expand_v2.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\shrink_v2.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\expand_v1.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\shrink_v1.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\matchfinder.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\lib.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
-    <ClCompile Include="..\src\main.c">
-      <Filter>Fichiers sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\src\shrink.c">
-      <Filter>Fichiers sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\src\expand.c">
-      <Filter>Fichiers sources</Filter>
-    </ClCompile>
    <ClCompile Include="..\src\libdivsufsort\lib\utils.c">
      <Filter>Fichiers sources\libdivsufsort\lib</Filter>
    </ClCompile>
@ -74,5 +77,26 @@
    <ClCompile Include="..\src\frame.c">
      <Filter>Fichiers sources</Filter>
    </ClCompile>
+    <ClCompile Include="..\src\expand_v2.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\shrink_v2.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\expand_v1.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\shrink_v1.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\matchfinder.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\lib.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\lzsa.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
  </ItemGroup>
 </Project>
--- a/asm/6502/decompress_v1.asm
+++ b/asm/6502/decompress_v1.asm
@ -1,5 +1,5 @@
 ; -----------------------------------------------------------------------------
-; Decompress raw LZSA block. Create one with lzsa -r <original_file> <compressed_file>
+; Decompress raw LZSA1 block. Create one with lzsa -r <original_file> <compressed_file>
 ;
 ; in:
 ; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
@ -31,7 +31,7 @@
 OFFSLO = $43                            ; zero-page location for temp offset
 OFFSHI = $44

-DECOMPRESS_LZSA
+DECOMPRESS_LZSA1
   LDY #$00

 DECODE_TOKEN
--- a/asm/6502/decompress_v2.asm
+++ b/asm/6502/decompress_v2.asm
@ -0,0 +1,245 @@
+; -----------------------------------------------------------------------------
+; Decompress raw LZSA2 block.
+; Create one with lzsa -r -f2 <original_file> <compressed_file>
+;
+; in:
+; * LZSA_SRC_LO and LZSA_SRC_HI contain the compressed raw block address
+; * LZSA_DST_LO and LZSA_DST_HI contain the destination buffer address
+;
+; out:
+; * LZSA_DST_LO and LZSA_DST_HI contain the last decompressed byte address, +1
+; -----------------------------------------------------------------------------
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+; -----------------------------------------------------------------------------
+
+OFFSLO = $43                            ; zero-page location for temp offset
+OFFSHI = $44
+FIXUP = $4B
+NIBBLES = $FB
+NIBCOUNT = $FC
+
+DECOMPRESS_LZSA2
+   LDY #$00
+   STY NIBBLES
+   STY NIBCOUNT
+
+DECODE_TOKEN
+   JSR GETSRC                           ; read token byte: XYZ|LL|MMM
+   PHA                                  ; preserve token on stack
+
+   AND #$18                             ; isolate literals count (LL)
+   BEQ NO_LITERALS                      ; skip if no literals to copy
+   CMP #$18                             ; LITERALS_RUN_LEN_V2 << 3?
+   BNE EMBEDDED_LITERALS                ; if less, count is directly embedded in token
+
+   JSR GETNIBBLE                        ; get extra literals length nibble
+   CLC                                  ; add nibble to len from token
+   ADC #$03                             ; (LITERALS_RUN_LEN_V2)
+   CMP #$12                             ; LITERALS_RUN_LEN_V2 + 15 ?
+   BNE PREPARE_COPY_LITERALS            ; if less, literals count is complete
+
+   JSR GETSRC                           ; get extra byte of variable literals count
+   TAX                                  ; non-zero?
+   BNE PREPARE_COPY_LITERALS_HIGH       ; if so, literals count is complete
+
+                                        ; handle 16 bits literals count
+                                        ; literals count = directly these 16 bits
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+   JMP PREPARE_COPY_LITERALS_HIGH
+
+EMBEDDED_LITERALS
+   LSR A                                ; shift literals count into place
+   LSR A
+   LSR A
+
+PREPARE_COPY_LITERALS
+   TAX
+PREPARE_COPY_LITERALS_HIGH
+   INY
+
+COPY_LITERALS
+   JSR GETPUT                           ; copy one byte of literals
+   DEX
+   BNE COPY_LITERALS
+   DEY
+   BNE COPY_LITERALS
+   
+NO_LITERALS
+   PLA                                  ; retrieve token from stack
+   PHA                                  ; preserve token again
+   BMI REPMATCH_OR_LARGE_OFFSET         ; 1YZ: rep-match or 13/16 bit offset
+
+   ASL                                  ; 0YZ: 5 or 9 bit offset
+   BMI OFFSET_9_BIT         
+    
+                                        ; 00Z: 5 bit offset
+   LSR A                                ; Shift Z (offset bit 4) in place
+   LSR A
+   AND #$10
+   STA FIXUP
+
+   JSR GETNIBBLE                        ; get nibble for offset bits 0-3
+   ORA FIXUP                            ; merge offset bit 4
+   ORA #$E0                             ; set offset bits 7-5 to 1
+   TAX                                  ; store low byte of match offset
+   LDA #$0FF                            ; set offset bits 15-8 to 1
+   BNE GOT_OFFSET                       ; (*same as JMP GOT_OFFSET but shorter)
+   
+OFFSET_9_BIT                            ; 01Z: 9 bit offset
+   ASL                                  ; shift Z (offset bit 8) in place
+   ROL
+   ROL
+   ORA #$FE                             ; set offset bits 15-9 to 1
+   STA OFFSHI
+
+   JSR GETSRC                           ; get offset bits 0-7 from stream in A
+   TAX                                  ; store low byte of match offset
+   JMP GOT_OFFSET_LO                    ; go prepare match
+
+REPMATCH_OR_LARGE_OFFSET
+   ASL                                  ; 13 bit offset?
+   BMI REPMATCH_OR_16_BIT               ; handle rep-match or 16-bit offset if not
+
+                                        ; 10Z: 13 bit offset
+
+   LSR A                                ; shift Z (offset bit 12) in place
+   LSR A
+   AND #$10
+   STA FIXUP
+
+   JSR GETSRC                           ; get offset bits 0-7 from stream in A
+   TAX                                  ; store low byte of match offset
+
+   JSR GETNIBBLE                        ; get nibble for offset bits 8-11
+   ORA FIXUP                            ; merge offset bit 12
+   CLC
+   ADC #$DE                             ; set bits 13-15 to 1 and subtract 2 (to subtract 512)
+   BNE GOT_OFFSET                       ; go prepare match (*same as JMP GOT_OFFSET but shorter)
+
+REPMATCH_OR_16_BIT                      ; rep-match or 16 bit offset
+   ASL                                  ; XYZ=111?
+   BMI REP_MATCH                        ; reuse previous offset if so (rep-match)
+   
+                                        ; 110: handle 16 bit offset
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+
+GOT_OFFSET
+   STA OFFSHI                           ; store final match offset
+GOT_OFFSET_LO
+   STX OFFSLO
+
+REP_MATCH
+   CLC                                  ; add dest + match offset
+   LDA PUTDST+1                         ; low 8 bits
+   ADC OFFSLO
+   STA COPY_MATCH_LOOP+1                ; store back reference address
+   LDA OFFSHI                           ; high 8 bits
+   ADC PUTDST+2
+   STA COPY_MATCH_LOOP+2                ; store high 8 bits of address
+   
+   PLA                                  ; retrieve token from stack again
+   AND #$07                             ; isolate match len (MMM)
+   CLC
+   ADC #$02                             ; add MIN_MATCH_SIZE_V2
+   CMP #$09                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+   BNE PREPARE_COPY_MATCH               ; if less, length is directly embedded in token
+
+   JSR GETNIBBLE                        ; get extra match length nibble
+   CLC                                  ; add nibble to len from token
+   ADC #$09                             ; (MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2)
+   CMP #$18                             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+   BNE PREPARE_COPY_MATCH               ; if less, match length is complete
+
+   JSR GETSRC                           ; get extra byte of variable match length
+   TAX                                  ; non-zero?
+   BNE PREPARE_COPY_MATCH_Y             ; if so, the match length is complete
+
+                                        ; Handle 16 bits match length
+   JSR GETLARGESRC                      ; grab low 8 bits in X, high 8 bits in A
+   TAY                                  ; put high 8 bits in Y
+                                        ; large match length with zero high byte?
+   BEQ DECOMPRESSION_DONE               ; if so, this is the EOD code, bail
+   TXA
+
+PREPARE_COPY_MATCH
+   TAX
+PREPARE_COPY_MATCH_Y
+   INY
+
+COPY_MATCH_LOOP
+   LDA $AAAA                            ; get one byte of backreference
+   INC COPY_MATCH_LOOP+1
+   BNE GETMATCH_DONE
+   INC COPY_MATCH_LOOP+2
+GETMATCH_DONE
+   JSR PUTDST                           ; copy to destination
+   DEX
+   BNE COPY_MATCH_LOOP
+   DEY
+   BNE COPY_MATCH_LOOP
+   JMP DECODE_TOKEN
+
+GETNIBBLE
+   DEC NIBCOUNT
+   BPL HAS_NIBBLES
+
+   LDA #$01
+   STA NIBCOUNT
+   JSR GETSRC                           ; get 2 nibbles
+   STA NIBBLES
+   LSR A
+   LSR A
+   LSR A
+   LSR A
+   RTS
+
+HAS_NIBBLES
+   LDA NIBBLES
+   AND #$0F                             ; isolate low 4 bits of nibble
+   RTS
+
+GETPUT
+   JSR GETSRC
+PUTDST
+LZSA_DST_LO = *+1
+LZSA_DST_HI = *+2
+   STA $AAAA
+   INC PUTDST+1
+   BNE PUTDST_DONE
+   INC PUTDST+2
+PUTDST_DONE
+DECOMPRESSION_DONE
+   RTS
+
+GETLARGESRC
+   JSR GETSRC                           ; grab low 8 bits
+   TAX                                  ; move to X
+                                        ; fall through grab high 8 bits
+
+GETSRC
+LZSA_SRC_LO = *+1
+LZSA_SRC_HI = *+2
+   LDA $AAAA
+   INC GETSRC+1
+   BNE GETSRC_DONE
+   INC GETSRC+2
+GETSRC_DONE
+   RTS
--- a/asm/8088/decompress_small_v1.S
+++ b/asm/8088/decompress_small_v1.S
@ -22,15 +22,15 @@
   bits 16

 ;  ---------------------------------------------------------------------------
-;  Decompress raw LZSA block
+;  Decompress raw LZSA1 block
 ;  inputs:
-;  * ds:si: raw LZSA block
+;  * ds:si: raw LZSA1 block
 ;  * es:di: output buffer
 ;  output:
 ;  * ax:    decompressed size
 ;  ---------------------------------------------------------------------------

-lzsa_decompress:
+lzsa1_decompress:
   push di                 ; remember decompression offset
   cld                     ; make string operations (lods, movs, stos..) move forward

--- a/asm/8088/decompress_small_v2.S
+++ b/asm/8088/decompress_small_v2.S
@ -0,0 +1,174 @@
+;  decompress_small_v2.S - space-efficient LZSA2 decompressor implementation for 8088
+;
+;  Copyright (C) 2019 Emmanuel Marty
+;
+;  This software is provided 'as-is', without any express or implied
+;  warranty.  In no event will the authors be held liable for any damages
+;  arising from the use of this software.
+;
+;  Permission is granted to anyone to use this software for any purpose,
+;  including commercial applications, and to alter it and redistribute it
+;  freely, subject to the following restrictions:
+;
+;  1. The origin of this software must not be misrepresented; you must not
+;     claim that you wrote the original software. If you use this software
+;     in a product, an acknowledgment in the product documentation would be
+;     appreciated but is not required.
+;  2. Altered source versions must be plainly marked as such, and must not be
+;     misrepresented as being the original software.
+;  3. This notice may not be removed or altered from any source distribution.
+
+   segment .text
+   bits 16
+
+;  ---------------------------------------------------------------------------
+;  Decompress raw LZSA2 block
+;  inputs:
+;  * ds:si: raw LZSA2 block
+;  * es:di: output buffer
+;  output:
+;  * ax:    decompressed size
+;  ---------------------------------------------------------------------------
+
+lzsa2_decompress:
+   push di                 ; remember decompression offset
+   cld                     ; make string operations (lods, movs, stos..) move forward
+
+   xor cx,cx
+   xor bx,bx
+   xor bp,bp
+
+.decode_token:
+   mov ax,cx               ; clear ah - cx is zero from above or from after rep movsb in .copy_match
+   lodsb                   ; read token byte: XYZ|LL|MMM
+   mov dx,ax               ; keep token in dl
+   
+   and al,018H             ; isolate literals length in token (LL)
+   mov cl,3
+   shr al,cl               ; shift literals length into place
+
+   cmp al,03H              ; LITERALS_RUN_LEN_V2?
+   jne .got_literals       ; no, we have the full literals count from the token, go copy
+
+   call .get_nibble        ; get extra literals length nibble
+   add al,cl               ; add len from token to nibble 
+   cmp al,012H             ; LITERALS_RUN_LEN_V2 + 15 ?
+   jne .got_literals       ; if not, we have the full literals count, go copy
+
+   lodsb                   ; grab extra length byte
+   test al,al              ; zero?
+   jne .got_literals       ; if not, we have the full literals count, go copy
+
+   lodsw                   ; grab 16-bit extra length
+
+.got_literals:
+   xchg cx,ax
+   rep movsb               ; copy cx literals from ds:si to es:di
+
+   test dl,dl              ; check match offset mode in token (X bit)
+   js .rep_match_or_large_offset
+
+   cmp dl,040H             ; check if this is a 5 or 9-bit offset (Y bit)
+   jnb .offset_9_bit
+
+                           ; 5 bit offset
+   xchg ax,cx              ; clear ah - cx is zero from the rep movsb above
+   mov al,020H             ; shift Z (offset bit 4) in place
+   and al,dl
+   shr al,1
+   call .get_nibble        ; get nibble for offset bits 0-3
+   or al,cl                ; merge nibble
+   or al,0E0H              ; set offset bits 7-5 to 1
+   dec ah                  ; set offset bits 15-8 to 1
+   jmp short .get_match_length
+
+.offset_9_bit:             ; 9 bit offset
+   xchg ax,cx              ; clear ah - cx is zero from the rep movsb above
+   lodsb                   ; get 8 bit offset from stream in A
+   dec ah                  ; set offset bits 15-8 to 1
+   test dl,020H            ; test bit Z (offset bit 8)
+   jne .get_match_length
+   dec ah                  ; clear bit 8 if Z bit is clear
+   jmp short .get_match_length
+
+.rep_match_or_large_offset:
+   cmp dl,0c0H             ; check if this is a 13-bit offset or a 16-bit offset/rep match (Y bit)
+   jnb .rep_match_or_16_bit
+
+                           ; 13 bit offset
+   lodsb                   ; load match offset bits 0-7
+
+   mov ah,020H             ; shift Z (offset bit 12) in place
+   and ah,dl
+   shr ah,1
+   call .get_nibble        ; get nibble for offset bits 8-11
+   or ah,cl                ; merge nibble
+   or ah,0E0H              ; set offset bits 15-13 to 1
+   sub ah,2                ; subtract 512
+   jmp short .get_match_length
+
+.rep_match_or_16_bit:
+   test dl,020H            ; test bit Z (set: rep-match, clear: 16 bit offset)
+   jne .repeat_match       ; rep-match
+
+                           ; 16 bit offset
+   lodsw                   ; Get 2-byte match offset
+
+.get_match_length:
+   mov bp,ax               ; bp: offset
+.repeat_match:
+   mov ax,dx               ; ax: original token
+   and al,07H              ; isolate match length in token (MMM)
+   add al,2                ; add MIN_MATCH_SIZE_V2
+
+   cmp al,09H              ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2?
+   jne .got_matchlen       ; no, we have the full match length from the token, go copy
+
+   call .get_nibble        ; get extra match length nibble
+   add al,cl               ; add len from token to nibble 
+   cmp al,018H             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
+   jne .got_matchlen       ; no, we have the full match length from the token, go copy
+
+   lodsb                   ; grab extra length byte
+   test al,al              ; zero?
+   jne .got_matchlen       ; if not, we have the entire length
+
+   lodsw                   ; grab 16-bit length
+   test ax,ax              ; bail if we hit EOD
+   je short .done_decompressing 
+
+.got_matchlen:
+   xchg cx,ax              ; copy match length into cx
+   push ds                 ; save ds:si (current pointer to compressed data)
+   xchg si,ax          
+   push es
+   pop ds
+   mov si,di               ; ds:si now points at back reference in output data
+   add si,bp
+   rep movsb               ; copy match
+   xchg si,ax              ; restore ds:si
+   pop ds
+   jmp .decode_token       ; go decode another token
+
+.done_decompressing:
+   pop ax                  ; retrieve the original decompression offset
+   xchg ax,di              ; compute decompressed size
+   sub ax,di
+   ret                     ; done
+
+.get_nibble:
+   dec bh                  ; nibble ready?
+   jns .has_nibble
+   
+   mov cx,ax
+   lodsb                   ; load two nibbles
+   mov bl,al
+   mov bh,1
+   mov ax,cx
+
+.has_nibble:
+   mov cl,4                ; swap 4 high and low bits of nibble
+   ror bl,cl
+   mov cl,0FH
+   and cl,bl
+   ret
--- a/src/expand_v1.c
+++ b/src/expand_v1.c
@ -1,5 +1,5 @@
 /*
- * expand.c - block decompressor implementation
+ * expand_v1.c - LZSA1 block decompressor implementation
 *
 * Copyright (C) 2019 Emmanuel Marty
 *
@ -20,11 +20,21 @@
 * 3. This notice may not be removed or altered from any source distribution.
 */

+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "format.h"
-#include "expand.h"
+#include "expand_v1.h"

 #ifdef _MSC_VER
 #define FORCE_INLINE __forceinline
@ -32,11 +42,11 @@
 #define FORCE_INLINE __attribute__((always_inline))
 #endif /* _MSC_VER */

-static inline FORCE_INLINE int lzsa_expand_literals_slow(const unsigned char **ppInBlock, const unsigned char *pInBlockEnd, unsigned int nLiterals, unsigned char **ppCurOutData, const unsigned char *pOutDataEnd) {
+static inline FORCE_INLINE int lzsa_expand_literals_slow_v1(const unsigned char **ppInBlock, const unsigned char *pInBlockEnd, unsigned int nLiterals, unsigned char **ppCurOutData, const unsigned char *pOutDataEnd) {
   const unsigned char *pInBlock = *ppInBlock;
   unsigned char *pCurOutData = *ppCurOutData;

-   if (nLiterals == LITERALS_RUN_LEN) {
+   if (nLiterals == LITERALS_RUN_LEN_V1) {
      unsigned char nByte;

      if (pInBlock < pInBlockEnd) {
@ -83,12 +93,12 @@ static inline FORCE_INLINE int lzsa_expand_literals_slow(const unsigned char **p
   return 0;
 }

-static inline FORCE_INLINE int lzsa_expand_match_slow(const unsigned char **ppInBlock, const unsigned char *pInBlockEnd, const unsigned char *pSrc, unsigned int nMatchLen, unsigned char **ppCurOutData, const unsigned char *pOutDataEnd, const unsigned char *pOutDataFastEnd) {
+static inline FORCE_INLINE int lzsa_expand_match_slow_v1(const unsigned char **ppInBlock, const unsigned char *pInBlockEnd, const unsigned char *pSrc, unsigned int nMatchLen, unsigned char **ppCurOutData, const unsigned char *pOutDataEnd, const unsigned char *pOutDataFastEnd) {
   const unsigned char *pInBlock = *ppInBlock;
   unsigned char *pCurOutData = *ppCurOutData;

-   nMatchLen += MIN_MATCH_SIZE;
-   if (nMatchLen == (MATCH_RUN_LEN + MIN_MATCH_SIZE)) {
+   nMatchLen += MIN_MATCH_SIZE_V1;
+   if (nMatchLen == (MATCH_RUN_LEN_V1 + MIN_MATCH_SIZE_V1)) {
      unsigned char nByte;

      if (pInBlock < pInBlockEnd) {
@ -159,7 +169,7 @@ static inline FORCE_INLINE int lzsa_expand_match_slow(const unsigned char **ppIn
 }

 /**
- * Decompress one data block
+ * Decompress one LZSA1 data block
 *
 * @param pInBlock pointer to compressed data
 * @param nInBlockSize size of compressed data, in bytes
@ -169,7 +179,7 @@ static inline FORCE_INLINE int lzsa_expand_match_slow(const unsigned char **ppIn
 *
 * @return size of decompressed data in bytes, or -1 for error
 */
-int lzsa_expand_block(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize) {
+int lzsa_expand_block_v1(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize) {
   const unsigned char *pInBlockEnd = pInBlock + nBlockSize;
   const unsigned char *pInBlockFastEnd = pInBlock + nBlockSize - 8;
   unsigned char *pCurOutData = pOutData + nOutDataOffset;
@ -182,36 +192,35 @@ int lzsa_expand_block(const unsigned char *pInBlock, int nBlockSize, unsigned ch
      const unsigned char token = *pInBlock++;
      unsigned int nLiterals = (unsigned int)((token & 0x70) >> 4);

-      if (nLiterals < LITERALS_RUN_LEN) {
+      if (nLiterals < LITERALS_RUN_LEN_V1) {
         memcpy(pCurOutData, pInBlock, 8);
         pInBlock += nLiterals;
         pCurOutData += nLiterals;
      }
      else {
-         if (lzsa_expand_literals_slow(&pInBlock, pInBlockEnd, nLiterals, &pCurOutData, pOutDataEnd))
+         if (lzsa_expand_literals_slow_v1(&pInBlock, pInBlockEnd, nLiterals, &pCurOutData, pOutDataEnd))
            return -1;
      }

      if ((pInBlock + 1) < pInBlockEnd) { /* The last token in the block does not include match information */
         int nMatchOffset;

-         nMatchOffset = ((unsigned int)(*pInBlock++ ^ 0xff));
+         nMatchOffset = ((unsigned int)(*pInBlock++)) | 0xffffff00;
         if (token & 0x80) {
-            nMatchOffset |= (((unsigned int)(*pInBlock++ ^ 0xff)) << 8);
+            nMatchOffset = (nMatchOffset & 0xffff00ff) | (((unsigned int)(*pInBlock++)) << 8);
         }
-         nMatchOffset++;

-         const unsigned char *pSrc = pCurOutData - nMatchOffset;
+         const unsigned char *pSrc = pCurOutData + nMatchOffset;
         if (pSrc >= pOutData) {
            unsigned int nMatchLen = (unsigned int)(token & 0x0f);
-            if (nMatchLen < MATCH_RUN_LEN && nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd) {
+            if (nMatchLen < MATCH_RUN_LEN_V1 && nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd) {
               memcpy(pCurOutData, pSrc, 8);
               memcpy(pCurOutData + 8, pSrc + 8, 8);
               memcpy(pCurOutData + 16, pSrc + 16, 4);
-               pCurOutData += (MIN_MATCH_SIZE + nMatchLen);
+               pCurOutData += (MIN_MATCH_SIZE_V1 + nMatchLen);
            }
            else {
-               if (lzsa_expand_match_slow(&pInBlock, pInBlockEnd, pSrc, nMatchLen, &pCurOutData, pOutDataEnd, pOutDataFastEnd))
+               if (lzsa_expand_match_slow_v1(&pInBlock, pInBlockEnd, pSrc, nMatchLen, &pCurOutData, pOutDataEnd, pOutDataFastEnd))
                  return -1;
            }
         }
@ -227,22 +236,21 @@ int lzsa_expand_block(const unsigned char *pInBlock, int nBlockSize, unsigned ch
      const unsigned char token = *pInBlock++;
      unsigned int nLiterals = (unsigned int)((token & 0x70) >> 4);

-      if (lzsa_expand_literals_slow(&pInBlock, pInBlockEnd, nLiterals, &pCurOutData, pOutDataEnd))
+      if (lzsa_expand_literals_slow_v1(&pInBlock, pInBlockEnd, nLiterals, &pCurOutData, pOutDataEnd))
         return -1;

      if ((pInBlock + 1) < pInBlockEnd) { /* The last token in the block does not include match information */
         int nMatchOffset;

-         nMatchOffset = ((unsigned int)(*pInBlock++ ^ 0xff));
+         nMatchOffset = ((unsigned int)(*pInBlock++)) | 0xffffff00;
         if (token & 0x80) {
-            nMatchOffset |= (((unsigned int)(*pInBlock++ ^ 0xff)) << 8);
+            nMatchOffset = (nMatchOffset & 0xffff00ff) | (((unsigned int)(*pInBlock++)) << 8);
         }
-         nMatchOffset++;

-         const unsigned char *pSrc = pCurOutData - nMatchOffset;
+         const unsigned char *pSrc = pCurOutData + nMatchOffset;
         if (pSrc >= pOutData) {
            unsigned int nMatchLen = (unsigned int)(token & 0x0f);
-            if (lzsa_expand_match_slow(&pInBlock, pInBlockEnd, pSrc, nMatchLen, &pCurOutData, pOutDataEnd, pOutDataFastEnd))
+            if (lzsa_expand_match_slow_v1(&pInBlock, pInBlockEnd, pSrc, nMatchLen, &pCurOutData, pOutDataEnd, pOutDataFastEnd))
               return -1;
         }
         else {
--- a/src/expand_v1.h
+++ b/src/expand_v1.h
@ -1,5 +1,5 @@
 /*
- * expand.h - block decompressor definitions
+ * expand_v1.h - LZSA1 block decompressor definitions
 *
 * Copyright (C) 2019 Emmanuel Marty
 *
@ -20,11 +20,21 @@
 * 3. This notice may not be removed or altered from any source distribution.
 */

-#ifndef _EXPAND_H
-#define _EXPAND_H
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#ifndef _EXPAND_V1_H
+#define _EXPAND_V1_H

 /**
- * Decompress one data block
+ * Decompress one LZSA1 data block
 *
 * @param pInBlock pointer to compressed data
 * @param nInBlockSize size of compressed data, in bytes
@ -34,6 +44,6 @@
 *
 * @return size of decompressed data in bytes, or -1 for error
 */
-int lzsa_expand_block(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize);
+int lzsa_expand_block_v1(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize);

-#endif /* _EXPAND_H */
+#endif /* _EXPAND_V1_H */
--- a/src/expand_v2.c
+++ b/src/expand_v2.c
@ -0,0 +1,330 @@
+/*
+ * expand_v2.c - LZSA2 block decompressor implementation
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "format.h"
+#include "expand_v2.h"
+
+#ifdef _MSC_VER
+#define FORCE_INLINE __forceinline
+#else /* _MSC_VER */
+#define FORCE_INLINE __attribute__((always_inline))
+#endif /* _MSC_VER */
+
+/**
+ * Fetch the next 4-bit value from the nibble stream interleaved with the block's bytes
+ *
+ * @param ppInBlock pointer to current input position, advanced by one byte whenever a fresh pair of nibbles is loaded
+ * @param pInBlockEnd end of the compressed data
+ * @param nCurNibbles nibble phase flag, toggled on every call; a new byte is loaded when it flips to 1
+ * @param nibbles holding byte for the current pair; its upper half is the next nibble to hand out
+ *
+ * @return nibble value (0..15), or (unsigned int)-1 if a new byte was needed but the input is exhausted
+ */
+static inline FORCE_INLINE unsigned int lzsa_get_nibble_v2(const unsigned char **ppInBlock, const unsigned char *pInBlockEnd, int *nCurNibbles, unsigned char *nibbles) {
+   *nCurNibbles ^= 1;
+   if (*nCurNibbles) {
+      /* No unread nibble is pending: pull in the next input byte */
+      if (*ppInBlock >= pInBlockEnd)
+         return -1;
+      *nibbles = *(*ppInBlock)++;
+   }
+
+   /* Hand out the upper half, then move the lower half up for the following call */
+   const unsigned int nHighNibble = ((unsigned int)*nibbles) >> 4;
+   *nibbles = (unsigned char)(*nibbles << 4);
+   return nHighNibble;
+}
+
+/**
+ * Decode a literals count (including its optional extension) and copy the literal bytes to the output
+ *
+ * @param ppInBlock pointer to current input position, updated on success only
+ * @param pInBlockEnd end of the compressed data
+ * @param nLiterals literals count taken from the token (0..LITERALS_RUN_LEN_V2)
+ * @param nCurNibbles nibble phase flag shared with lzsa_get_nibble_v2()
+ * @param nibbles nibble holding byte shared with lzsa_get_nibble_v2()
+ * @param ppCurOutData pointer to current output position, updated on success only
+ * @param pOutDataEnd end of the output buffer
+ *
+ * @return 0 for success, -1 if the input is truncated or the literals do not fit in the output
+ */
+static inline FORCE_INLINE int lzsa_expand_literals_slow_v2(const unsigned char **ppInBlock, const unsigned char *pInBlockEnd, unsigned int nLiterals, int *nCurNibbles, unsigned char *nibbles,
+      unsigned char **ppCurOutData, const unsigned char *pOutDataEnd) {
+   const unsigned char *pIn = *ppInBlock;
+   unsigned char *pOut = *ppCurOutData;
+
+   if (nLiterals == LITERALS_RUN_LEN_V2) {
+      /* Token field is saturated: an extra nibble extends the count. A failed nibble read yields
+       * (unsigned int)-1, which wraps the count around to 2; the input bounds check below then rejects it */
+      nLiterals += lzsa_get_nibble_v2(&pIn, pInBlockEnd, nCurNibbles, nibbles);
+
+      if (nLiterals == (LITERALS_RUN_LEN_V2 + 15)) {
+         /* Nibble is saturated as well: the count is stored in a full byte */
+         if (pIn >= pInBlockEnd)
+            return -1;
+         nLiterals = (unsigned int)*pIn++;
+
+         if (nLiterals == 0) {
+            /* A zero byte announces a 16-bit little-endian count */
+            if ((pIn + 1) >= pInBlockEnd)
+               return -1;
+            nLiterals = (unsigned int)pIn[0];
+            nLiterals |= ((unsigned int)pIn[1]) << 8;
+            pIn += 2;
+         }
+      }
+   }
+
+   if (nLiterals != 0) {
+      if ((pIn + nLiterals) > pInBlockEnd || (pOut + nLiterals) > pOutDataEnd)
+         return -1;
+
+      memcpy(pOut, pIn, nLiterals);
+      pIn += nLiterals;
+      pOut += nLiterals;
+   }
+
+   *ppInBlock = pIn;
+   *ppCurOutData = pOut;
+   return 0;
+}
+
+/**
+ * Decode a match length (including its optional extension) and copy the match within the output buffer
+ *
+ * @param ppInBlock pointer to current input position, updated on success only
+ * @param pInBlockEnd end of the compressed data
+ * @param pSrc start of the match source inside the output buffer (validated by the caller)
+ * @param nMatchLen match length field taken from the token (0..MATCH_RUN_LEN_V2)
+ * @param nCurNibbles nibble phase flag shared with lzsa_get_nibble_v2()
+ * @param nibbles nibble holding byte shared with lzsa_get_nibble_v2()
+ * @param ppCurOutData pointer to current output position, updated on success only
+ * @param pOutDataEnd end of the output buffer
+ * @param pOutDataFastEnd output position past which the 16-bytes-at-a-time copy is not used
+ *
+ * @return 0 for success, -1 if the input is truncated or the match does not fit in the output
+ */
+static inline FORCE_INLINE int lzsa_expand_match_slow_v2(const unsigned char **ppInBlock, const unsigned char *pInBlockEnd, const unsigned char *pSrc, unsigned int nMatchLen, int *nCurNibbles, unsigned char *nibbles,
+      unsigned char **ppCurOutData, const unsigned char *pOutDataEnd, const unsigned char *pOutDataFastEnd) {
+   const unsigned char *pInBlock = *ppInBlock;
+   unsigned char *pCurOutData = *ppCurOutData;
+
+   nMatchLen += MIN_MATCH_SIZE_V2;
+   if (nMatchLen == (MATCH_RUN_LEN_V2 + MIN_MATCH_SIZE_V2)) {
+      /* Token field is saturated: an extra nibble extends the length. lzsa_get_nibble_v2() reports
+       * exhausted input as (unsigned int)-1; adding that blindly would wrap the length around to a
+       * plausible small value and accept a truncated block, so reject anything that is not a nibble */
+      const unsigned int nExtraLen = lzsa_get_nibble_v2(&pInBlock, pInBlockEnd, nCurNibbles, nibbles);
+      if (nExtraLen > 15)
+         return -1;
+      nMatchLen += nExtraLen;
+
+      if (nMatchLen == (MATCH_RUN_LEN_V2 + MIN_MATCH_SIZE_V2 + 15)) {
+         /* Nibble is saturated as well: the length is stored in a full byte */
+         if (pInBlock < pInBlockEnd) {
+            nMatchLen = ((unsigned int)*pInBlock++);
+
+            if (nMatchLen == 0) {
+               /* A zero byte announces a 16-bit little-endian length */
+               if ((pInBlock + 1) < pInBlockEnd) {
+                  nMatchLen = ((unsigned int)*pInBlock++);
+                  nMatchLen |= (((unsigned int)*pInBlock++) << 8);
+               }
+               else {
+                  return -1;
+               }
+            }
+         }
+         else {
+            return -1;
+         }
+      }
+   }
+
+   if ((pCurOutData + nMatchLen) <= pOutDataEnd) {
+      /* Do a deterministic, left to right byte copy instead of memcpy() so as to handle overlaps */
+
+      if ((pCurOutData - pSrc) >= 8 && (pCurOutData + nMatchLen) < (pOutDataFastEnd - 15)) {
+         /* Source and destination are at least 8 bytes apart, so each 8-byte memcpy() is overlap-free.
+          * The loop may write up to 15 bytes past the match; the fast-end margin keeps that in bounds */
+         const unsigned char *pCopySrc = pSrc;
+         unsigned char *pCopyDst = pCurOutData;
+         const unsigned char *pCopyEndDst = pCurOutData + nMatchLen;
+
+         do {
+            memcpy(pCopyDst, pCopySrc, 8);
+            memcpy(pCopyDst + 8, pCopySrc + 8, 8);
+            pCopySrc += 16;
+            pCopyDst += 16;
+         } while (pCopyDst < pCopyEndDst);
+
+         pCurOutData += nMatchLen;
+      }
+      else {
+         while (nMatchLen >= 4) {
+            *pCurOutData++ = *pSrc++;
+            *pCurOutData++ = *pSrc++;
+            *pCurOutData++ = *pSrc++;
+            *pCurOutData++ = *pSrc++;
+            nMatchLen -= 4;
+         }
+         while (nMatchLen) {
+            *pCurOutData++ = *pSrc++;
+            nMatchLen--;
+         }
+      }
+   }
+   else {
+      return -1;
+   }
+
+   *ppInBlock = pInBlock;
+   *ppCurOutData = pCurOutData;
+   return 0;
+}
+
+/**
+ * Decompress one LZSA2 data block
+ *
+ * @param pInBlock pointer to compressed data
+ * @param nBlockSize size of compressed data, in bytes
+ * @param pOutData pointer to output decompression buffer (previously decompressed bytes + room for decompressing this block)
+ * @param nOutDataOffset starting index of where to store decompressed bytes in output buffer (and size of previously decompressed bytes)
+ * @param nBlockMaxSize maximum number of bytes that may be written, counted from nOutDataOffset
+ *
+ * @return size of decompressed data in bytes, or -1 for error
+ */
+int lzsa_expand_block_v2(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize) {
+   const unsigned char *pInBlockEnd = pInBlock + nBlockSize;
+   const unsigned char *pInBlockFastEnd = pInBlock + nBlockSize - 8;
+   unsigned char *pCurOutData = pOutData + nOutDataOffset;
+   const unsigned char *pOutDataEnd = pCurOutData + nBlockMaxSize;
+   const unsigned char *pOutDataFastEnd = pOutDataEnd - 20;
+   int nCurNibbles = 0;
+   unsigned char nibbles = 0;
+   int nMatchOffset = 0;   /* negative distance from the write position; kept across tokens so a rep-match can reuse it */
+
+   /* Fast loop */
+
+   while (pInBlock < pInBlockFastEnd && pCurOutData < pOutDataFastEnd) {
+      const unsigned char token = *pInBlock++;
+      unsigned int nLiterals = (unsigned int)((token & 0x18) >> 3);
+
+      if (nLiterals < LITERALS_RUN_LEN_V2) {
+         /* Short run: copy a fixed 8 bytes (the fast-end margins keep this in bounds) and advance by the real count */
+         memcpy(pCurOutData, pInBlock, 8);
+         pInBlock += nLiterals;
+         pCurOutData += nLiterals;
+      }
+      else {
+         if (lzsa_expand_literals_slow_v2(&pInBlock, pInBlockEnd, nLiterals, &nCurNibbles, &nibbles, &pCurOutData, pOutDataEnd))
+            return -1;
+      }
+
+      if ((pInBlock + 1) < pInBlockEnd) { /* The last token in the block does not include match information */
+         unsigned char nOffsetMode = token & 0xc0;
+
+         /* Every mode ORs in the high bits, so the decoded offset is a negative number */
+         switch (nOffsetMode) {
+         case 0x00:
+            /* 5 bit offset */
+            nMatchOffset = (unsigned int)lzsa_get_nibble_v2(&pInBlock, pInBlockEnd, &nCurNibbles, &nibbles);
+            nMatchOffset |= ((token & 0x20) >> 1);
+            nMatchOffset |= 0xffffffe0;
+            break;
+
+         case 0x40:
+            /* 9 bit offset */
+            nMatchOffset = (unsigned int)(*pInBlock++);
+            nMatchOffset |= (((unsigned int)(token & 0x20)) << 3);
+            nMatchOffset |= 0xfffffe00;
+            break;
+
+         case 0x80:
+            /* 13 bit offset */
+            nMatchOffset = (unsigned int)(*pInBlock++);
+            nMatchOffset |= (lzsa_get_nibble_v2(&pInBlock, pInBlockEnd, &nCurNibbles, &nibbles) << 8);
+            nMatchOffset |= (((unsigned int)(token & 0x20)) << 7);
+            nMatchOffset |= 0xffffe000;
+            nMatchOffset -= 512;
+            break;
+
+         default:
+            /* Check if this is a 16 bit offset or a rep-match */
+            if ((token & 0x20) == 0) {
+               /* 16 bit offset */
+               nMatchOffset = (unsigned int)(*pInBlock++);
+               nMatchOffset |= (((unsigned int)(*pInBlock++)) << 8);
+               nMatchOffset |= 0xffff0000;
+            }
+            break;
+         }
+
+         const unsigned char *pSrc = pCurOutData + nMatchOffset;
+         if (pSrc >= pOutData) {
+            unsigned int nMatchLen = (unsigned int)(token & 0x07);
+            /* Inline copy for short matches whose source lies at least 8 bytes back. The offset is negative,
+             * so the distance test is "<= -8"; a ">= 8" comparison can never hold and would disable this path */
+            if (nMatchLen < MATCH_RUN_LEN_V2 && nMatchOffset <= -8 && pCurOutData < pOutDataFastEnd) {
+               memcpy(pCurOutData, pSrc, 8);
+               memcpy(pCurOutData + 8, pSrc + 8, 4);
+               pCurOutData += (MIN_MATCH_SIZE_V2 + nMatchLen);
+            }
+            else {
+               if (lzsa_expand_match_slow_v2(&pInBlock, pInBlockEnd, pSrc, nMatchLen, &nCurNibbles, &nibbles, &pCurOutData, pOutDataEnd, pOutDataFastEnd))
+                  return -1;
+            }
+         }
+         else {
+            /* Match source starts before the beginning of the output buffer */
+            return -1;
+         }
+      }
+   }
+
+   /* Slow loop for the remainder of the buffer */
+
+   while (pInBlock < pInBlockEnd) {
+      const unsigned char token = *pInBlock++;
+      unsigned int nLiterals = (unsigned int)((token & 0x18) >> 3);
+
+      if (lzsa_expand_literals_slow_v2(&pInBlock, pInBlockEnd, nLiterals, &nCurNibbles, &nibbles, &pCurOutData, pOutDataEnd))
+         return -1;
+
+      if ((pInBlock + 1) < pInBlockEnd) { /* The last token in the block does not include match information */
+         unsigned char nOffsetMode = token & 0xc0;
+
+         switch (nOffsetMode) {
+         case 0x00:
+            /* 5 bit offset */
+            nMatchOffset = (unsigned int)lzsa_get_nibble_v2(&pInBlock, pInBlockEnd, &nCurNibbles, &nibbles);
+            nMatchOffset |= ((token & 0x20) >> 1);
+            nMatchOffset |= 0xffffffe0;
+            break;
+
+         case 0x40:
+            /* 9 bit offset */
+            nMatchOffset = (unsigned int)(*pInBlock++);
+            nMatchOffset |= (((unsigned int)(token & 0x20)) << 3);
+            nMatchOffset |= 0xfffffe00;
+            break;
+
+         case 0x80:
+            /* 13 bit offset */
+            nMatchOffset = (unsigned int)(*pInBlock++);
+            nMatchOffset |= (lzsa_get_nibble_v2(&pInBlock, pInBlockEnd, &nCurNibbles, &nibbles) << 8);
+            nMatchOffset |= (((unsigned int)(token & 0x20)) << 7);
+            nMatchOffset |= 0xffffe000;
+            nMatchOffset -= 512;
+            break;
+
+         default:
+            /* Check if this is a 16 bit offset or a rep-match */
+            if ((token & 0x20) == 0) {
+               /* 16 bit offset */
+               nMatchOffset = (unsigned int)(*pInBlock++);
+               nMatchOffset |= (((unsigned int)(*pInBlock++)) << 8);
+               nMatchOffset |= 0xffff0000;
+            }
+            break;
+         }
+
+         const unsigned char *pSrc = pCurOutData + nMatchOffset;
+         if (pSrc >= pOutData) {
+            unsigned int nMatchLen = (unsigned int)(token & 0x07);
+            if (lzsa_expand_match_slow_v2(&pInBlock, pInBlockEnd, pSrc, nMatchLen, &nCurNibbles, &nibbles, &pCurOutData, pOutDataEnd, pOutDataFastEnd))
+               return -1;
+         }
+         else {
+            /* Match source starts before the beginning of the output buffer */
+            return -1;
+         }
+      }
+   }
+
+   return (int)(pCurOutData - (pOutData + nOutDataOffset));
+}
--- a/src/expand_v2.h
+++ b/src/expand_v2.h
@ -0,0 +1,49 @@
+/*
+ * expand_v2.h - LZSA2 block decompressor definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#ifndef _EXPAND_V2_H
+#define _EXPAND_V2_H
+
+/**
+ * Decompress one LZSA2 data block
+ *
+ * @param pInBlock pointer to compressed data
+ * @param nBlockSize size of compressed data, in bytes
+ * @param pOutData pointer to output decompression buffer (previously decompressed bytes + room for decompressing this block)
+ * @param nOutDataOffset starting index of where to store decompressed bytes in output buffer (and size of previously decompressed bytes)
+ * @param nBlockMaxSize total size of output decompression buffer, in bytes
+ *
+ * @return size of decompressed data in bytes, or -1 for error
+ */
+int lzsa_expand_block_v2(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize);
+
+#endif /* _EXPAND_V2_H */
--- a/src/format.h
+++ b/src/format.h
@ -20,13 +20,28 @@
 * 3. This notice may not be removed or altered from any source distribution.
 */

+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
 #ifndef _FORMAT_H
 #define _FORMAT_H

-#define MIN_MATCH_SIZE  3
 #define MIN_OFFSET 1
 #define MAX_OFFSET 0xffff
-#define LITERALS_RUN_LEN 7
-#define MATCH_RUN_LEN 15
+
+#define MIN_MATCH_SIZE_V1 3
+#define LITERALS_RUN_LEN_V1 7
+#define MATCH_RUN_LEN_V1 15
+
+#define MIN_MATCH_SIZE_V2 2      /* added to the match length field of an LZSA2 token */
+#define LITERALS_RUN_LEN_V2 3    /* value of the token's 2-bit literals field that signals an extended literals count */
+#define MATCH_RUN_LEN_V2 7       /* value of the token's 3-bit match length field that signals an extended match length */

 #endif /* _FORMAT_H */
--- a/src/frame.c
+++ b/src/frame.c
@ -20,9 +20,18 @@
 * 3. This notice may not be removed or altered from any source distribution.
 */

+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
 #include <stdlib.h>
 #include "frame.h"
-#include "shrink.h"

 #define LZSA_ID_0   0x7b
 #define LZSA_ID_1   0x9e
@ -53,11 +62,11 @@ int lzsa_get_frame_size(void) {
 *
 * @return number of encoded bytes, or -1 for failure
 */
-int lzsa_encode_header(unsigned char *pFrameData, const int nMaxFrameDataSize) {
-   if (nMaxFrameDataSize >= 3) {
+int lzsa_encode_header(unsigned char *pFrameData, const int nMaxFrameDataSize, int nFormatVersion) {
+   if (nMaxFrameDataSize >= 3 && (nFormatVersion == 1 || nFormatVersion == 2)) {
      pFrameData[0] = LZSA_ID_0;                         /* Magic number */
      pFrameData[1] = LZSA_ID_1;
-      pFrameData[2] = 0;                                 /* Format version 1 */
+      pFrameData[2] = (nFormatVersion == 2) ? 0x20 : 0;  /* Format version 1 */

      return 3;
   }
@ -139,14 +148,16 @@ int lzsa_encode_footer_frame(unsigned char *pFrameData, const int nMaxFrameDataS
 *
 * @return 0 for success, or -1 for failure
 */
-int lzsa_decode_header(const unsigned char *pFrameData, const int nFrameDataSize) {
+int lzsa_decode_header(const unsigned char *pFrameData, const int nFrameDataSize, int *nFormatVersion) {
   if (nFrameDataSize != 3 ||
      pFrameData[0] != LZSA_ID_0 ||
      pFrameData[1] != LZSA_ID_1 ||
-      pFrameData[2] != 0) {
+      (pFrameData[2] & 0x1f) != 0 ||   /* the low 5 bits of the third byte must be clear */
+      ((pFrameData[2] & 0xe0) != 0x00 && (pFrameData[2] & 0xe0) != 0x20)) {   /* top 3 bits: only 0x00 and 0x20 are accepted */
      return -1;
   }
   else {
+      *nFormatVersion = (pFrameData[2] & 0xe0) ? 2 : 1;   /* 0x20 selects version 2, 0x00 version 1; written on success only */
      return 0;
   }
 }
--- a/src/frame.h
+++ b/src/frame.h
@ -20,6 +20,16 @@
 * 3. This notice may not be removed or altered from any source distribution.
 */

+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
 #ifndef _FRAME_H
 #define _FRAME_H

@ -45,7 +55,7 @@ int lzsa_get_frame_size(void);
 *
 * @return number of encoded bytes, or -1 for failure
 */
-int lzsa_encode_header(unsigned char *pFrameData, const int nMaxFrameDataSize);
+int lzsa_encode_header(unsigned char *pFrameData, const int nMaxFrameDataSize, int nFormatVersion);

 /**
 * Encode compressed block frame header
@ -87,7 +97,7 @@ int lzsa_encode_footer_frame(unsigned char *pFrameData, const int nMaxFrameDataS
 *
 * @return 0 for success, or -1 for failure
 */
-int lzsa_decode_header(const unsigned char *pFrameData, const int nFrameDataSize);
+int lzsa_decode_header(const unsigned char *pFrameData, const int nFrameDataSize, int *nFormatVersion);

 /**
 * Decode frame header
--- a/src/lib.c
+++ b/src/lib.c
@ -0,0 +1,217 @@
+/*
+ * lib.c - LZSA library implementation
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "lib.h"
+#include "matchfinder.h"
+#include "shrink_v1.h"
+#include "shrink_v2.h"
+#include "expand_v1.h"
+#include "expand_v2.h"
+#include "format.h"
+
+/**
+ * Initialize compression context
+ *
+ * @param pCompressor compression context to initialize
+ * @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
+ * @param nMinMatchSize minimum match size; clamped to the range from the format's minimum (MIN_MATCH_SIZE_V1 or MIN_MATCH_SIZE_V2) up to 5
+ * @param nFormatVersion LZSA format version to compress for: 1 or 2 (any other value is rejected)
+ * @param nFlags compression flags
+ *
+ * @return 0 for success, non-zero for failure
+ */
+int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize, const int nMinMatchSize, const int nFormatVersion, const int nFlags) {
+   const int nMinMatchSizeForFormat = (nFormatVersion == 1) ? MIN_MATCH_SIZE_V1 : MIN_MATCH_SIZE_V2;
+   const size_t nWindowSize = (size_t)nMaxWindowSize;   /* do the size arithmetic in size_t rather than int */
+   int nResult;
+
+   nResult = divsufsort_init(&pCompressor->divsufsort_context);
+
+   /* Clear every buffer pointer up front so that lzsa_compressor_destroy() is safe on any failure path */
+   pCompressor->intervals = NULL;
+   pCompressor->pos_data = NULL;
+   pCompressor->open_intervals = NULL;
+   pCompressor->match = NULL;
+   pCompressor->best_match = NULL;
+   pCompressor->slot_cost = NULL;
+   pCompressor->repmatch_opt = NULL;
+
+   pCompressor->min_match_size = nMinMatchSize;
+   if (pCompressor->min_match_size < nMinMatchSizeForFormat)
+      pCompressor->min_match_size = nMinMatchSizeForFormat;
+   else if (pCompressor->min_match_size > 5)
+      pCompressor->min_match_size = 5;
+   pCompressor->format_version = nFormatVersion;
+   pCompressor->flags = nFlags;
+   pCompressor->num_commands = 0;
+
+   /* Refuse unsupported versions here: lzsa_shrink_block() would otherwise return -1 for every block,
+    * which callers cannot tell apart from uncompressible data */
+   if (nFormatVersion != 1 && nFormatVersion != 2)
+      nResult = 100;
+
+   if (!nResult) {
+      int bAllocated;
+
+      /* Buffers needed by both format versions */
+      pCompressor->intervals = (unsigned int *)malloc(nWindowSize * sizeof(unsigned int));
+      pCompressor->pos_data = (unsigned int *)malloc(nWindowSize * sizeof(unsigned int));
+      pCompressor->open_intervals = (unsigned int *)malloc((LCP_MAX + 1) * sizeof(unsigned int));
+      pCompressor->match = (lzsa_match *)malloc(nWindowSize * NMATCHES_PER_OFFSET * sizeof(lzsa_match));
+      bAllocated = pCompressor->intervals && pCompressor->pos_data && pCompressor->open_intervals && pCompressor->match;
+
+      if (bAllocated && nFormatVersion == 2) {
+         /* Extra buffers that only the LZSA2 path allocates */
+         pCompressor->best_match = (lzsa_match *)malloc(nWindowSize * sizeof(lzsa_match));
+         pCompressor->slot_cost = (int *)malloc(nWindowSize * NMATCHES_PER_OFFSET * sizeof(int));
+         pCompressor->repmatch_opt = (lzsa_repmatch_opt *)malloc(nWindowSize * sizeof(lzsa_repmatch_opt));
+         bAllocated = pCompressor->best_match && pCompressor->slot_cost && pCompressor->repmatch_opt;
+      }
+
+      if (bAllocated)
+         return 0;
+   }
+
+   /* Release whatever was set up before the failure */
+   lzsa_compressor_destroy(pCompressor);
+   return 100;
+}
+
+/**
+ * Clean up compression context and free up any associated resources
+ *
+ * @param pCompressor compression context to clean up
+ */
+void lzsa_compressor_destroy(lsza_compressor *pCompressor) {
+   divsufsort_destroy(&pCompressor->divsufsort_context);
+
+   /* free(NULL) is a no-op, so buffers that were never allocated need no guard. Each pointer is
+    * reset afterwards so that the context can be destroyed again without freeing anything twice */
+   free(pCompressor->repmatch_opt);
+   pCompressor->repmatch_opt = NULL;
+
+   free(pCompressor->slot_cost);
+   pCompressor->slot_cost = NULL;
+
+   free(pCompressor->best_match);
+   pCompressor->best_match = NULL;
+
+   free(pCompressor->match);
+   pCompressor->match = NULL;
+
+   free(pCompressor->open_intervals);
+   pCompressor->open_intervals = NULL;
+
+   free(pCompressor->pos_data);
+   pCompressor->pos_data = NULL;
+
+   free(pCompressor->intervals);
+   pCompressor->intervals = NULL;
+}
+
+/**
+ * Compress one block of data
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nPreviousBlockSize number of previously compressed bytes (or 0 for none)
+ * @param nInDataSize number of input bytes to compress
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ *
+ * @return size of compressed data in output buffer, or -1 if the data is uncompressible
+ */
+int lzsa_shrink_block(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
+   const int nEndOffset = nPreviousBlockSize + nInDataSize;
+
+   /* The suffix array covers the whole window: previous block followed by the new data */
+   if (lzsa_build_suffix_array(pCompressor, pInWindow, nEndOffset))
+      return -1;
+
+   /* Step over the previous block's positions, then collect matches for the new data only */
+   if (nPreviousBlockSize)
+      lzsa_skip_matches(pCompressor, 0, nPreviousBlockSize);
+   lzsa_find_all_matches(pCompressor, nPreviousBlockSize, nEndOffset);
+
+   /* Hand over to the block writer of the selected format */
+   switch (pCompressor->format_version) {
+   case 1:
+      return lzsa_optimize_and_write_block_v1(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize);
+
+   case 2:
+      return lzsa_optimize_and_write_block_v2(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize);
+
+   default:
+      return -1;
+   }
+}
+
+/**
+ * Get the number of compression commands issued in compressed data blocks
+ *
+ * @param pCompressor compression context to query
+ *
+ * @return number of commands
+ */
+int lzsa_compressor_get_command_count(lsza_compressor *pCompressor) {
+   return pCompressor->num_commands;
+}
+
+/**
+ * Decompress one data block, using the decompressor that matches the given format version
+ *
+ * @param nFormatVersion LZSA format version of the block: 1 or 2
+ * @param pInBlock pointer to compressed data
+ * @param nBlockSize size of compressed data, in bytes
+ * @param pOutData pointer to output decompression buffer (previously decompressed bytes + room for decompressing this block)
+ * @param nOutDataOffset starting index of where to store decompressed bytes in output buffer (and size of previously decompressed bytes)
+ * @param nBlockMaxSize total size of output decompression buffer, in bytes
+ *
+ * @return size of decompressed data in bytes, or -1 for error (including an unsupported format version)
+ */
+int lzsa_expand_block(const int nFormatVersion, const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize) {
+   switch (nFormatVersion) {
+   case 1:
+      return lzsa_expand_block_v1(pInBlock, nBlockSize, pOutData, nOutDataOffset, nBlockMaxSize);
+
+   case 2:
+      return lzsa_expand_block_v2(pInBlock, nBlockSize, pOutData, nOutDataOffset, nBlockMaxSize);
+
+   default:
+      return -1;
+   }
+}
--- a/src/shrink.h
+++ b/src/shrink.h
@ -1,5 +1,5 @@
 /*
- * shrink.h - block compressor definitions
+ * lib.h - LZSA library definitions
 *
 * Copyright (C) 2019 Emmanuel Marty
 *
@ -20,8 +20,18 @@
 * 3. This notice may not be removed or altered from any source distribution.
 */

-#ifndef _SHRINK_H
-#define _SHRINK_H
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#ifndef _LIB_H
+#define _LIB_H

 #include "divsufsort.h"

@ -29,17 +39,46 @@
 #define LZSA_FLAG_FAVOR_RATIO    (1<<0)      /**< 1 to compress with the best ratio, 0 to trade some compression ratio for extra decompression speed */
 #define LZSA_FLAG_RAW_BLOCK      (1<<1)      /**< 1 to emit raw block */

-/* Forward declarations */
-typedef struct _lzsa_match lzsa_match;
+/* Split of a 32-bit word into an LCP field (top LCP_BITS bits) and a position field (remaining low bits) */
+#define LCP_BITS 15
+#define LCP_MAX (1<<(LCP_BITS - 1))
+#define LCP_SHIFT (32-LCP_BITS)
+/* Built from an unsigned 1: shifting the signed value ((1<<15)-1) left by 17 exceeds INT_MAX, which is undefined behaviour */
+#define LCP_MASK (((1U<<LCP_BITS) - 1) << LCP_SHIFT)
+#define POS_MASK ((1<<LCP_SHIFT) - 1)
+
+#define NMATCHES_PER_OFFSET 8
+#define MATCHES_PER_OFFSET_SHIFT 3
+
+#define LEAVE_ALONE_MATCH_SIZE 1000
+
+#define LAST_MATCH_OFFSET 4
+#define LAST_LITERALS 1
+
+#define MODESWITCH_PENALTY 1
+
+/** One match (the compressor's match array is allocated with NMATCHES_PER_OFFSET of these per window position) */
+typedef struct _lzsa_match {
+   unsigned short length;   /* match length */
+   unsigned short offset;   /* match offset; MAX_OFFSET (0xffff) fits in an unsigned short */
+} lzsa_match;
+
+/**
+ * Record used by the LZSA2 path only: lzsa_compressor_init() allocates one per window position when the format version is 2.
+ * NOTE(review): the fields are filled in and consumed by the v2 optimizer, which is not visible from this header;
+ * the names suggest rep-match bookkeeping - confirm against shrink_v2.c
+ */
+typedef struct _lzsa_repmatch_opt {
+   int incoming_offset;
+   short best_slot_for_incoming;
+   short expected_repmatch;
+} lzsa_repmatch_opt;

 /** Compression context */
-typedef struct {
+typedef struct _lsza_compressor {
   divsufsort_ctx_t divsufsort_context;
   unsigned int *intervals;
   unsigned int *pos_data;
   unsigned int *open_intervals;
   lzsa_match *match;
+   lzsa_match *best_match;            /* one entry per window position; allocated for format version 2 only, NULL otherwise */
+   int *slot_cost;                    /* NMATCHES_PER_OFFSET entries per window position; allocated for format version 2 only */
+   lzsa_repmatch_opt *repmatch_opt;   /* one entry per window position; allocated for format version 2 only */
   int min_match_size;
+   int format_version;                /* 1 or 2; selects the v1 or v2 block writer in lzsa_shrink_block() */
   int flags;
   int num_commands;
 } lsza_compressor;
@ -54,7 +93,7 @@ typedef struct {
 *
 * @return 0 for success, non-zero for failure
 */
-int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize, const int nMinMatchSize, const int nFlags);
+int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize, const int nMinMatchSize, const int nFormatVersion, const int nFlags);

 /**
 * Clean up compression context and free up any associated resources
@ -84,4 +123,17 @@ int lzsa_shrink_block(lsza_compressor *pCompressor, const unsigned char *pInWind
 */
 int lzsa_compressor_get_command_count(lsza_compressor *pCompressor);

-#endif /* _SHRINK_H */
+/**
+ * Decompress one data block
+ *
+ * @param nFormatVersion LZSA format version of the block (1 or 2)
+ * @param pInBlock pointer to compressed data
+ * @param nBlockSize size of compressed data, in bytes
+ * @param pOutData pointer to output decompression buffer (previously decompressed bytes + room for decompressing this block)
+ * @param nOutDataOffset starting index of where to store decompressed bytes in output buffer (and size of previously decompressed bytes)
+ * @param nBlockMaxSize total size of output decompression buffer, in bytes
+ *
+ * @return size of decompressed data in bytes, or -1 for error
+ */
+int lzsa_expand_block(const int nFormatVersion, const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nOutDataOffset, int nBlockMaxSize);
+
+#endif /* _LIB_H */
--- a/src/lzsa.c
+++ b/src/lzsa.c
@ -1,5 +1,5 @@
 /*
- * main.c - command line compression utility for the LZSA format
+ * lzsa.c - command line compression utility for the LZSA format
 *
 * Copyright (C) 2019 Emmanuel Marty
 *
@ -20,6 +20,16 @@
 * 3. This notice may not be removed or altered from any source distribution.
 */

+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
 #include <stdio.h>
 #include <stdbool.h>
 #include <stdlib.h>
@ -31,17 +41,18 @@
 #endif
 #include "format.h"
 #include "frame.h"
-#include "shrink.h"
-#include "expand.h"
+#include "lib.h"

 #define BLOCK_SIZE 65536
 #define OPT_VERBOSE     1
 #define OPT_RAW         2
 #define OPT_FAVOR_RATIO 4

+#define TOOL_VERSION "0.6.0"
+
 /*---------------------------------------------------------------------------*/

-static long long lzsa_get_time() {
+static long long do_get_time() {
   long long nTime;

 #ifdef _WIN32
@ -60,7 +71,7 @@ static long long lzsa_get_time() {

 /*---------------------------------------------------------------------------*/

-static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, const int nMinMatchSize) {
+static int do_compress(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, const int nMinMatchSize, const int nFormatVersion) {
   FILE *f_in, *f_out;
   unsigned char *pInData, *pOutData;
   lsza_compressor compressor;
@ -146,7 +157,7 @@ static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename,
      nFlags |= LZSA_FLAG_FAVOR_RATIO;
   if (nOptions & OPT_RAW)
      nFlags |= LZSA_FLAG_RAW_BLOCK;
-   nResult = lzsa_compressor_init(&compressor, BLOCK_SIZE * 2, nMinMatchSize, nFlags);
+   nResult = lzsa_compressor_init(&compressor, BLOCK_SIZE * 2, nMinMatchSize, nFormatVersion, nFlags);
   if (nResult != 0) {
      free(pOutData);
      pOutData = NULL;
@ -165,7 +176,7 @@ static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename,
   }

   if ((nOptions & OPT_RAW) == 0) {
-      int nHeaderSize = lzsa_encode_header(cFrameData, 16);
+      int nHeaderSize = lzsa_encode_header(cFrameData, 16, nFormatVersion);
      if (nHeaderSize < 0)
         bError = true;
      else {
@ -175,7 +186,7 @@ static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename,
   }

   if (nOptions & OPT_VERBOSE) {
-      nStartTime = lzsa_get_time();
+      nStartTime = do_get_time();
   }

   int nPreviousBlockSize = 0;
@ -280,7 +291,7 @@ static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename,
   nCompressedSize += (long long)nFooterSize;

   if (!bError && (nOptions & OPT_VERBOSE)) {
-      nEndTime = lzsa_get_time();
+      nEndTime = do_get_time();

      double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
      double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
@ -315,7 +326,7 @@ static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename,

 /*---------------------------------------------------------------------------*/

-static int lzsa_decompress(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions) {
+static int do_decompress(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, int nFormatVersion) {
   long long nStartTime = 0LL, nEndTime = 0LL;
   long long nOriginalSize = 0LL;
   unsigned int nFileSize = 0;
@ -338,7 +349,7 @@ static int lzsa_decompress(const char *pszInFilename, const char *pszOutFilename
         return 100;
      }

-      if (lzsa_decode_header(cFrameData, nHeaderSize) < 0) {
+      if (lzsa_decode_header(cFrameData, nHeaderSize, &nFormatVersion) < 0) {
         fclose(pInFile);
         pInFile = NULL;
         fprintf(stderr, "invalid magic number or format version in input file\n");
@ -423,7 +434,7 @@ static int lzsa_decompress(const char *pszInFilename, const char *pszOutFilename
   }

   if (nOptions & OPT_VERBOSE) {
-      nStartTime = lzsa_get_time();
+      nStartTime = do_get_time();
   }

   int nDecompressionError = 0;
@ -476,7 +487,7 @@ static int lzsa_decompress(const char *pszInFilename, const char *pszOutFilename
            else {
               unsigned int nBlockOffs = 0;

-               nDecompressedSize = lzsa_expand_block(pInBlock, nBlockSize, pOutData, BLOCK_SIZE, BLOCK_SIZE);
+               nDecompressedSize = lzsa_expand_block(nFormatVersion, pInBlock, nBlockSize, pOutData, BLOCK_SIZE, BLOCK_SIZE);
               if (nDecompressedSize < 0) {
                  nDecompressionError = nDecompressedSize;
                  break;
@ -518,7 +529,7 @@ static int lzsa_decompress(const char *pszInFilename, const char *pszOutFilename
   }
   else {
      if (nOptions & OPT_VERBOSE) {
-         nEndTime = lzsa_get_time();
+         nEndTime = do_get_time();
         double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
         double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
         fprintf(stdout, "Decompressed '%s' in %g seconds, %g Mb/s\n",
@ -529,7 +540,7 @@ static int lzsa_decompress(const char *pszInFilename, const char *pszOutFilename
   }
 }

-static int lzsa_compare(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions) {
+static int do_compare(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, int nFormatVersion) {
   long long nStartTime = 0LL, nEndTime = 0LL;
   long long nOriginalSize = 0LL;
   long long nKnownGoodSize = 0LL;
@ -553,7 +564,7 @@ static int lzsa_compare(const char *pszInFilename, const char *pszOutFilename, c
         return 100;
      }

-      if (lzsa_decode_header(cFrameData, nHeaderSize) < 0) {
+      if (lzsa_decode_header(cFrameData, nHeaderSize, &nFormatVersion) < 0) {
         fclose(pInFile);
         pInFile = NULL;
         fprintf(stderr, "invalid magic number or format version in input file\n");
@ -659,7 +670,7 @@ static int lzsa_compare(const char *pszInFilename, const char *pszOutFilename, c
   }

   if (nOptions & OPT_VERBOSE) {
-      nStartTime = lzsa_get_time();
+      nStartTime = do_get_time();
   }

   int nDecompressionError = 0;
@ -715,7 +726,7 @@ static int lzsa_compare(const char *pszInFilename, const char *pszOutFilename, c
            else {
               unsigned int nBlockOffs = 0;

-               nDecompressedSize = lzsa_expand_block(pInBlock, nBlockSize, pOutData, BLOCK_SIZE, BLOCK_SIZE);
+               nDecompressedSize = lzsa_expand_block(nFormatVersion, pInBlock, nBlockSize, pOutData, BLOCK_SIZE, BLOCK_SIZE);
               if (nDecompressedSize < 0) {
                  nDecompressionError = nDecompressedSize;
                  break;
@ -771,7 +782,7 @@ static int lzsa_compare(const char *pszInFilename, const char *pszOutFilename, c
   }
   else {
      if (nOptions & OPT_VERBOSE) {
-         nEndTime = lzsa_get_time();
+         nEndTime = do_get_time();
         double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
         double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
         fprintf(stdout, "Compared '%s' in %g seconds, %g Mb/s\n",
@ -793,9 +804,11 @@ int main(int argc, char **argv) {
   bool bCommandDefined = false;
   bool bVerifyCompression = false;
   bool bMinMatchDefined = false;
+   bool bFormatVersionDefined = false;
   char cCommand = 'z';
-   int nMinMatchSize = MIN_MATCH_SIZE;
+   int nMinMatchSize = 0;
   unsigned int nOptions = OPT_FAVOR_RATIO;
+   int nFormatVersion = 1;

   for (i = 1; i < argc; i++) {
      if (!strcmp(argv[i], "-d")) {
@ -840,7 +853,7 @@ int main(int argc, char **argv) {
         if (!bMinMatchDefined && (i + 1) < argc) {
            char *pEnd = NULL;
            nMinMatchSize = (int)strtol(argv[i + 1], &pEnd, 10);
-            if (pEnd && pEnd != argv[i + 1] && (nMinMatchSize >= MIN_MATCH_SIZE && nMinMatchSize < MATCH_RUN_LEN)) {
+            if (pEnd && pEnd != argv[i + 1] && (nMinMatchSize >= 2 && nMinMatchSize <= 5)) {
               i++;
               bMinMatchDefined = true;
               nOptions &= (~OPT_FAVOR_RATIO);
@ -856,7 +869,7 @@ int main(int argc, char **argv) {
         if (!bMinMatchDefined) {
            char *pEnd = NULL;
            nMinMatchSize = (int)strtol(argv[i] + 2, &pEnd, 10);
-            if (pEnd && pEnd != (argv[i]+2) && (nMinMatchSize >= MIN_MATCH_SIZE && nMinMatchSize < MATCH_RUN_LEN)) {
+            if (pEnd && pEnd != (argv[i]+2) && (nMinMatchSize >= 2 && nMinMatchSize <= 5)) {
               bMinMatchDefined = true;
               nOptions &= (~OPT_FAVOR_RATIO);
            }
@ -869,7 +882,7 @@ int main(int argc, char **argv) {
      }
      else if (!strcmp(argv[i], "--prefer-ratio")) {
         if (!bMinMatchDefined) {
-            nMinMatchSize = MIN_MATCH_SIZE;
+            nMinMatchSize = 0;
            bMinMatchDefined = true;
         }
         else
@ -884,6 +897,35 @@ int main(int argc, char **argv) {
         else
            bArgsError = true;
      }
+      else if (!strcmp(argv[i], "-f")) {
+         if (!bFormatVersionDefined && (i + 1) < argc) {
+            char *pEnd = NULL;
+            nFormatVersion = (int)strtol(argv[i + 1], &pEnd, 10);
+            if (pEnd && pEnd != argv[i + 1] && (nFormatVersion >= 1 && nFormatVersion <= 2)) {
+               i++;
+               bFormatVersionDefined = true;
+            }
+            else {
+               bArgsError = true;
+            }
+         }
+         else
+            bArgsError = true;
+      }
+      else if (!strncmp(argv[i], "-f", 2)) {
+         if (!bFormatVersionDefined) {
+            char *pEnd = NULL;
+            nFormatVersion = (int)strtol(argv[i] + 2, &pEnd, 10);
+            if (pEnd && pEnd != (argv[i] + 2) && (nFormatVersion >= 1 && nFormatVersion <= 2)) {
+               bFormatVersionDefined = true;
+            }
+            else {
+               bArgsError = true;
+            }
+         }
+         else
+            bArgsError = true;
+      }
      else if (!strcmp(argv[i], "-v")) {
         if ((nOptions & OPT_VERBOSE) == 0) {
            nOptions |= OPT_VERBOSE;
@ -911,26 +953,28 @@ int main(int argc, char **argv) {
   }

   if (bArgsError || !pszInFilename || !pszOutFilename) {
+      fprintf(stderr, "lzsa command-line tool v" TOOL_VERSION " by Emmanuel Marty and spke\n");
      fprintf(stderr, "usage: %s [-c] [-d] [-v] [-r] <infile> <outfile>\n", argv[0]);
      fprintf(stderr, "       -c: check resulting stream after compressing\n");
      fprintf(stderr, "       -d: decompress (default: compress)\n");
      fprintf(stderr, "       -v: be verbose\n");
+      fprintf(stderr, "       -f <value>: LZSA compression format (1-2)\n");
      fprintf(stderr, "       -r: raw block format (max. 64 Kb files)\n");
      fprintf(stderr, "       -D <filename>: use dictionary file\n");
-      fprintf(stderr, "       -m <value>: minimum match size (3-14) (default: 3)\n");
+      fprintf(stderr, "       -m <value>: minimum match size (3-5) (default: 3)\n");
      fprintf(stderr, "       --prefer-ratio: favor compression ratio (default)\n");
      fprintf(stderr, "       --prefer-speed: favor decompression speed (same as -m3)\n");
      return 100;
   }

   if (cCommand == 'z') {
-      int nResult = lzsa_compress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions, nMinMatchSize);
+      int nResult = do_compress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions, nMinMatchSize, nFormatVersion);
      if (nResult == 0 && bVerifyCompression) {
-         nResult = lzsa_compare(pszOutFilename, pszInFilename, pszDictionaryFilename, nOptions);
+         nResult = do_compare(pszOutFilename, pszInFilename, pszDictionaryFilename, nOptions, nFormatVersion);
      }
   }
   else if (cCommand == 'd') {
-      return lzsa_decompress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions);
+      return do_decompress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions, nFormatVersion);
   }
   else {
      return 100;
--- a/src/matchfinder.c
+++ b/src/matchfinder.c
@ -0,0 +1,294 @@
+/*
+ * matchfinder.c - LZ match finder implementation
+ *
+ * The following copying information applies to this specific source code file:
+ *
+ * Written in 2019 by Emmanuel Marty <marty.emmanuel@gmail.com>
+ * Portions written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
+ *
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "matchfinder.h"
+#include "format.h"
+#include "lib.h"
+
+/**
+ * Parse input data, build suffix array and overlaid data structures to speed up match finding
+ *
+ * Works in three stages:
+ *  1. divsufsort_build_array() fills pCompressor->intervals with the suffix array of the window.
+ *  2. A permuted LCP is computed in pCompressor->pos_data (borrowed as scratch space) and then folded
+ *     into each intervals[] entry, shifted left by LCP_SHIFT and OR-ed with the suffix position.
+ *  3. The combined array is walked once, using pCompressor->open_intervals as a stack, to fill in the
+ *     interval links in intervals[] and the per-position interval references in pos_data[].
+ *
+ * NOTE(review): intervals[0] is read unconditionally after the suffix sort, so this looks like it
+ * expects nInWindowSize >= 1 -- confirm against callers.
+ *
+ * @param pCompressor compression context; its intervals, pos_data and open_intervals arrays are overwritten
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
+ *
+ * @return 0 for success, non-zero for failure (100 when divsufsort_build_array() reports an error)
+ */
+int lzsa_build_suffix_array(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nInWindowSize) {
+   unsigned int *intervals = pCompressor->intervals;
+
+   /* Build suffix array from input data */
+   if (divsufsort_build_array(&pCompressor->divsufsort_context, pInWindow, (saidx_t*)intervals, nInWindowSize) != 0) {
+      return 100;
+   }
+
+   int *PLCP = (int*)pCompressor->pos_data;  /* Use temporarily */
+   int *Phi = PLCP;  /* Same storage as PLCP: in the loop below Phi[i] is read before PLCP[i] overwrites it */
+   int nCurLen = 0;  /* Running common-prefix length, carried over from one position to the next */
+   int i;
+
+   /* Compute the permuted LCP first (Kärkkäinen method) */
+   Phi[intervals[0]] = -1;  /* The first suffix array entry has no predecessor; -1 marks it */
+   for (i = 1; i < nInWindowSize; i++)
+      Phi[intervals[i]] = intervals[i - 1];  /* Record the suffix that precedes this one in the suffix array */
+   for (i = 0; i < nInWindowSize; i++) {
+      if (Phi[i] == -1) {
+         PLCP[i] = 0;
+         continue;
+      }
+      int nMaxLen = (i > Phi[i]) ? (nInWindowSize - i) : (nInWindowSize - Phi[i]);  /* Bytes left in the shorter of the two suffixes */
+      while (nCurLen < nMaxLen && pInWindow[i + nCurLen] == pInWindow[Phi[i] + nCurLen]) nCurLen++;
+      PLCP[i] = nCurLen;
+      if (nCurLen > 0)
+         nCurLen--;  /* Position i + 1 resumes comparing from this length minus one */
+   }
+
+   /* Rotate permuted LCP into the LCP. This has better cache locality than the direct Kasai LCP method. This also
+    * saves us from having to build the inverse suffix array index, as the LCP is calculated without it using this method,
+    * and the interval builder below doesn't need it either. */
+   intervals[0] &= POS_MASK;  /* First entry keeps only the bits selected by POS_MASK */
+   int nMinMatchSize = pCompressor->min_match_size;
+   for (i = 1; i < nInWindowSize - 1; i++) {
+      int nIndex = (int)(intervals[i] & POS_MASK);
+      int nLen = PLCP[nIndex];
+      if (nLen < nMinMatchSize)
+         nLen = 0;  /* Shorter than the configured minimum match size: stored as zero */
+      if (nLen > LCP_MAX)
+         nLen = LCP_MAX;  /* Clamp to LCP_MAX before packing */
+      intervals[i] = ((unsigned int)nIndex) | (((unsigned int)nLen) << LCP_SHIFT);
+   }
+   if (i < nInWindowSize)
+      intervals[i] &= POS_MASK;  /* The last entry is excluded from the loop above and gets no packed length */
+
+   /**
+    * Build intervals for finding matches
+    *
+    * Methodology and code fragment taken from wimlib (CC0 license):
+    * https://wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
+    *
+    * SA_and_LCP and intervals alias the same array: entry r is read as packed position + LCP, while
+    * entries indexed by interval number are overwritten with interval links.
+    */
+   unsigned int * const SA_and_LCP = intervals;
+   unsigned int *pos_data = pCompressor->pos_data;
+   unsigned int next_interval_idx;
+   unsigned int *top = pCompressor->open_intervals;  /* Stack of open intervals; top points at the deepest one */
+   unsigned int prev_pos = SA_and_LCP[0] & POS_MASK;
+
+   *top = 0;
+   intervals[0] = 0;
+   next_interval_idx = 1;
+
+   for (int r = 1; r < nInWindowSize; r++) {
+      const unsigned int next_pos = SA_and_LCP[r] & POS_MASK;
+      const unsigned int next_lcp = SA_and_LCP[r] & LCP_MASK;  /* Left in place (not shifted down) so it can be OR-ed with an interval index */
+      const unsigned int top_lcp = *top & LCP_MASK;
+
+      if (next_lcp == top_lcp) {
+         /* Continuing the deepest open interval  */
+         pos_data[prev_pos] = *top;
+      }
+      else if (next_lcp > top_lcp) {
+         /* Opening a new interval  */
+         *++top = next_lcp | next_interval_idx++;
+         pos_data[prev_pos] = *top;
+      }
+      else {
+         /* Closing the deepest open interval  */
+         pos_data[prev_pos] = *top;
+         for (;;) {
+            const unsigned int closed_interval_idx = *top-- & POS_MASK;  /* Pop; top now points at the superinterval */
+            const unsigned int superinterval_lcp = *top & LCP_MASK;
+
+            if (next_lcp == superinterval_lcp) {
+               /* Continuing the superinterval */
+               intervals[closed_interval_idx] = *top;
+               break;
+            }
+            else if (next_lcp > superinterval_lcp) {
+               /* Creating a new interval that is a
+                * superinterval of the one being
+                * closed, but still a subinterval of
+                * its superinterval  */
+               *++top = next_lcp | next_interval_idx++;
+               intervals[closed_interval_idx] = *top;
+               break;
+            }
+            else {
+               /* Also closing the superinterval  */
+               intervals[closed_interval_idx] = *top;
+            }
+         }
+      }
+      prev_pos = next_pos;
+   }
+
+   /* Close any still-open intervals.  */
+   pos_data[prev_pos] = *top;
+   for (; top > pCompressor->open_intervals; top--)
+      intervals[*top & POS_MASK] = *(top - 1);  /* Link each remaining interval to the one below it on the stack */
+
+   /* Success */
+   return 0;
+}
+
+/**
+ * Find matches at the specified offset in the input window
+ *
+ * This call is not read-only: it rewrites entries of pCompressor->intervals and pCompressor->pos_data
+ * as it walks them (interval entries are overwritten with nOffset, pos_data entries with interval
+ * references), even when nMaxMatches is 0.
+ *
+ * Each returned match has its length taken from the interval reference (ref >> LCP_SHIFT) and its
+ * offset set to the distance nOffset - match_pos. Candidates whose distance exceeds MAX_OFFSET are
+ * not stored and do not use up one of the nMaxMatches slots. Matches with position 0 are never
+ * reported (see the comment in the body).
+ *
+ * @param pCompressor compression context
+ * @param nOffset offset to find matches at, in the input window
+ * @param pMatches pointer to returned matches; not written to when nMaxMatches is 0
+ * @param nMaxMatches maximum number of matches to return (0 for none)
+ *
+ * @return number of matches stored in pMatches
+ */
+int lzsa_find_matches_at(lsza_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches) {
+   unsigned int *intervals = pCompressor->intervals;
+   unsigned int *pos_data = pCompressor->pos_data;
+   unsigned int ref;
+   unsigned int super_ref;
+   unsigned int match_pos;
+   lzsa_match *matchptr;
+
+   /**
+    * Find matches using intervals
+    *
+    * Taken from wimlib (CC0 license):
+    * https://wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
+    */
+
+    /* Get the deepest lcp-interval containing the current suffix. */
+   ref = pos_data[nOffset];
+
+   pos_data[nOffset] = 0;  /* The reference has been read out; clear this position's slot */
+
+   /* Ascend until we reach a visited interval, the root, or a child of the
+    * root.  Link unvisited intervals to the current suffix as we go.  */
+   while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
+      intervals[ref & POS_MASK] = nOffset;
+      ref = super_ref;
+   }
+
+   if (super_ref == 0) {
+      /* In this case, the current interval may be any of:
+       * (1) the root;
+       * (2) an unvisited child of the root;
+       * (3) an interval last visited by suffix 0
+       *
+       * We could avoid the ambiguity with (3) by using an lcp
+       * placeholder value other than 0 to represent "visited", but
+       * it's fastest to use 0.  So we just don't allow matches with
+       * position 0.  */
+
+      if (ref != 0)  /* Not the root?  */
+         intervals[ref & POS_MASK] = nOffset;
+      return 0;
+   }
+
+   /* Ascend indirectly via pos_data[] links.  */
+   match_pos = super_ref;  /* The loop above stopped on an entry with no LCP bits, i.e. a stored position */
+   matchptr = pMatches;
+   for (;;) {
+      while ((super_ref = pos_data[match_pos]) > ref)
+         match_pos = intervals[super_ref & POS_MASK];
+      intervals[ref & POS_MASK] = nOffset;
+      pos_data[match_pos] = ref;
+
+      if ((matchptr - pMatches) < nMaxMatches) {
+         int nMatchOffset = (int)(nOffset - match_pos);  /* Distance back from the current position to the match */
+
+         if (nMatchOffset <= MAX_OFFSET) {  /* Candidates further back than MAX_OFFSET are silently dropped */
+            matchptr->length = (unsigned short)(ref >> LCP_SHIFT);
+            matchptr->offset = (unsigned short)nMatchOffset;
+            matchptr++;
+         }
+      }
+
+      if (super_ref == 0)
+         break;
+      ref = super_ref;
+      match_pos = intervals[ref & POS_MASK];
+   }
+
+   return (int)(matchptr - pMatches);
+}
+
+/**
+ * Advance the match finder over bytes that were already compressed, without keeping any matches
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically 0)
+ * @param nEndOffset offset to skip to in input window (typically the number of previously compressed bytes)
+ */
+void lzsa_skip_matches(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
+   lzsa_match discarded;
+   int nCurOffset = nStartOffset;
+
+   /* Every skipped offset is still run through the match finder, because looking up matches is also what
+    * lazily updates the intervals. A match limit of 0 means nothing is written to the placeholder. */
+   while (nCurOffset < nEndOffset) {
+      lzsa_find_matches_at(pCompressor, nCurOffset, &discarded, 0);
+      nCurOffset++;
+   }
+}
+
+/**
+ * Find all matches for the data to be compressed. Exactly NMATCHES_PER_OFFSET match slots are filled in for each
+ * offset, for the optimizer to look at; slots without a usable match are zeroed.
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ */
+void lzsa_find_all_matches(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
+   lzsa_match *pSlots = pCompressor->match + (nStartOffset << MATCHES_PER_OFFSET_SHIFT);
+   int nPos;
+
+   for (nPos = nStartOffset; nPos < nEndOffset; nPos++, pSlots += NMATCHES_PER_OFFSET) {
+      const int nFound = lzsa_find_matches_at(pCompressor, nPos, pSlots, NMATCHES_PER_OFFSET);
+      int nSlot;
+
+      for (nSlot = 0; nSlot < NMATCHES_PER_OFFSET; nSlot++) {
+         lzsa_match *pSlot = pSlots + nSlot;
+
+         /* Blank out slots the match finder did not fill, and every slot once the position is
+          * within LAST_MATCH_OFFSET of the end */
+         if (nFound <= nSlot || nPos > (nEndOffset - LAST_MATCH_OFFSET)) {
+            pSlot->length = 0;
+            pSlot->offset = 0;
+            continue;
+         }
+
+         /* Trim the match so it stops LAST_LITERALS bytes short of the end offset */
+         int nLenCap = (nEndOffset - LAST_LITERALS) - nPos;
+         if (nLenCap < 0)
+            nLenCap = 0;
+         if (pSlot->length > nLenCap)
+            pSlot->length = (unsigned short)nLenCap;
+      }
+   }
+}
--- a/src/matchfinder.h
+++ b/src/matchfinder.h
@ -0,0 +1,82 @@
+/*
+ * matchfinder.h - LZ match finder definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#ifndef _MATCHFINDER_H
+#define _MATCHFINDER_H
+
+/* Forward declarations */
+typedef struct _lzsa_match lzsa_match;
+typedef struct _lsza_compressor lsza_compressor;
+
+/**
+ * Parse input data, build suffix array and overlaid data structures to speed up match finding
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
+ *
+ * @return 0 for success, non-zero for failure
+ */
+int lzsa_build_suffix_array(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nInWindowSize);
+
+/**
+ * Find matches at the specified offset in the input window
+ *
+ * @param pCompressor compression context
+ * @param nOffset offset to find matches at, in the input window
+ * @param pMatches pointer to returned matches
+ * @param nMaxMatches maximum number of matches to return (0 for none)
+ *
+ * @return number of matches
+ */
+int lzsa_find_matches_at(lsza_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches);
+
+/**
+ * Skip previously compressed bytes
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically 0)
+ * @param nEndOffset offset to skip to in input window (typically the number of previously compressed bytes)
+ */
+void lzsa_skip_matches(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset);
+
+/**
+ * Find all matches for the data to be compressed. Up to NMATCHES_PER_OFFSET matches are stored for each offset, for
+ * the optimizer to look at.
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ */
+void lzsa_find_all_matches(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset);
+
+#endif /* _MATCHFINDER_H */
--- a/src/shrink.c
+++ b/src/shrink.c
@ -1,830 +0,0 @@
-/*
- * shrink.c - block compressor implementation
- *
- * The following copying information applies to this specific source code file:
- *
- * Written in 2019 by Emmanuel Marty <marty.emmanuel@gmail.com>
- * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
- * Portions written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
- *
- * To the extent possible under law, the author(s) have dedicated all copyright
- * and related and neighboring rights to this software to the public domain
- * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
- * Dedication (the "CC0").
- *
- * This software is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
- *
- * You should have received a copy of the CC0 along with this software; if not
- * see <http://creativecommons.org/publicdomain/zero/1.0/>.
- */
-
-/*
- * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
- *
- * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
- * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
- * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "shrink.h"
-#include "format.h"
-
-#define LCP_BITS 15
-#define LCP_MAX (1<<(LCP_BITS - 1))
-#define LCP_SHIFT (32-LCP_BITS)
-#define LCP_MASK (((1<<LCP_BITS) - 1) << LCP_SHIFT)
-#define POS_MASK ((1<<LCP_SHIFT) - 1)
-
-#define NMATCHES_PER_OFFSET 8
-#define MATCHES_PER_OFFSET_SHIFT 3
-
-#define LEAVE_ALONE_MATCH_SIZE 1000
-
-#define LAST_MATCH_OFFSET 4
-#define LAST_LITERALS 1
-
-#define MODESWITCH_PENALTY 1
-
-/** One match */
-typedef struct _lzsa_match {
-   unsigned short length;
-   unsigned short offset;
-} lzsa_match;
-
-/**
- * Initialize compression context
- *
- * @param pCompressor compression context to initialize
- * @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
- * @param nMinMatchSize minimum match size (cannot be less than MIN_MATCH_SIZE)
- * @param nFlags compression flags
- *
- * @return 0 for success, non-zero for failure
- */
-int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize, const int nMinMatchSize, const int nFlags) {
-   int nResult;
-
-   nResult = divsufsort_init(&pCompressor->divsufsort_context);
-   pCompressor->intervals = NULL;
-   pCompressor->pos_data = NULL;
-   pCompressor->open_intervals = NULL;
-   pCompressor->match = NULL;
-   pCompressor->min_match_size = nMinMatchSize;
-   if (pCompressor->min_match_size < MIN_MATCH_SIZE)
-      pCompressor->min_match_size = MIN_MATCH_SIZE;
-   else if (pCompressor->min_match_size > (MATCH_RUN_LEN - 1))
-      pCompressor->min_match_size = MATCH_RUN_LEN - 1;
-   pCompressor->flags = nFlags;
-   pCompressor->num_commands = 0;
-
-   if (!nResult) {
-      pCompressor->intervals = (unsigned int *)malloc(nMaxWindowSize * sizeof(unsigned int));
-
-      if (pCompressor->intervals) {
-         pCompressor->pos_data = (unsigned int *)malloc(nMaxWindowSize * sizeof(unsigned int));
-
-         if (pCompressor->pos_data) {
-            pCompressor->open_intervals = (unsigned int *)malloc((LCP_MAX + 1) * sizeof(unsigned int));
-
-            if (pCompressor->open_intervals) {
-               pCompressor->match = (lzsa_match *)malloc(nMaxWindowSize * NMATCHES_PER_OFFSET * sizeof(lzsa_match));
-
-               if (pCompressor->match)
-                  return 0;
-            }
-         }
-      }
-   }
-
-   lzsa_compressor_destroy(pCompressor);
-   return 100;
-}
-
-/**
- * Clean up compression context and free up any associated resources
- *
- * @param pCompressor compression context to clean up
- */
-void lzsa_compressor_destroy(lsza_compressor *pCompressor) {
-   divsufsort_destroy(&pCompressor->divsufsort_context);
-
-   if (pCompressor->match) {
-      free(pCompressor->match);
-      pCompressor->match = NULL;
-   }
-
-   if (pCompressor->open_intervals) {
-      free(pCompressor->open_intervals);
-      pCompressor->open_intervals = NULL;
-   }
-
-   if (pCompressor->pos_data) {
-      free(pCompressor->pos_data);
-      pCompressor->pos_data = NULL;
-   }
-
-   if (pCompressor->intervals) {
-      free(pCompressor->intervals);
-      pCompressor->intervals = NULL;
-   }
-}
-
-/**
- * Parse input data, build suffix array and overlaid data structures to speed up match finding
- *
- * @param pCompressor compression context
- * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
- * @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
- *
- * @return 0 for success, non-zero for failure
- */
-static int lzsa_build_suffix_array(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nInWindowSize) {
-   unsigned int *intervals = pCompressor->intervals;
-
-   /* Build suffix array from input data */
-   if (divsufsort_build_array(&pCompressor->divsufsort_context, pInWindow, (saidx_t*)intervals, nInWindowSize) != 0) {
-      return 100;
-   }
-
-   int *PLCP = (int*)pCompressor->pos_data;  /* Use temporarily */
-   int *Phi = PLCP;
-   int nCurLen = 0;
-   int i;
-
-   /* Compute the permuted LCP first (Kärkkäinen method) */
-   Phi[intervals[0]] = -1;
-   for (i = 1; i < nInWindowSize; i++)
-      Phi[intervals[i]] = intervals[i - 1];
-   for (i = 0; i < nInWindowSize; i++) {
-      if (Phi[i] == -1) {
-         PLCP[i] = 0;
-         continue;
-      }
-      int nMaxLen = (i > Phi[i]) ? (nInWindowSize - i) : (nInWindowSize - Phi[i]);
-      while (nCurLen < nMaxLen && pInWindow[i + nCurLen] == pInWindow[Phi[i] + nCurLen]) nCurLen++;
-      PLCP[i] = nCurLen;
-      if (nCurLen > 0)
-         nCurLen--;
-   }
-
-   /* Rotate permuted LCP into the LCP. This has better cache locality than the direct Kasai LCP method. This also
-    * saves us from having to build the inverse suffix array index, as the LCP is calculated without it using this method,
-    * and the interval builder below doesn't need it either. */
-   intervals[0] &= POS_MASK;
-   int nMinMatchSize = pCompressor->min_match_size;
-   for (i = 1; i < nInWindowSize - 1; i++) {
-      int nIndex = (int)(intervals[i] & POS_MASK);
-      int nLen = PLCP[nIndex];
-      if (nLen < nMinMatchSize)
-         nLen = 0;
-      if (nLen > LCP_MAX)
-         nLen = LCP_MAX;
-      intervals[i] = ((unsigned int)nIndex) | (((unsigned int)nLen) << LCP_SHIFT);
-   }
-   if (i < nInWindowSize)
-      intervals[i] &= POS_MASK;
-
-   /**
-    * Build intervals for finding matches
-    *
-    * Methodology and code fragment taken from wimlib (CC0 license):
-    * https://wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
-    */
-   unsigned int * const SA_and_LCP = intervals;
-   unsigned int *pos_data = pCompressor->pos_data;
-   unsigned int next_interval_idx;
-   unsigned int *top = pCompressor->open_intervals;
-   unsigned int prev_pos = SA_and_LCP[0] & POS_MASK;
-
-   *top = 0;
-   intervals[0] = 0;
-   next_interval_idx = 1;
-
-   for (int r = 1; r < nInWindowSize; r++) {
-      const unsigned int next_pos = SA_and_LCP[r] & POS_MASK;
-      const unsigned int next_lcp = SA_and_LCP[r] & LCP_MASK;
-      const unsigned int top_lcp = *top & LCP_MASK;
-
-      if (next_lcp == top_lcp) {
-         /* Continuing the deepest open interval  */
-         pos_data[prev_pos] = *top;
-      }
-      else if (next_lcp > top_lcp) {
-         /* Opening a new interval  */
-         *++top = next_lcp | next_interval_idx++;
-         pos_data[prev_pos] = *top;
-      }
-      else {
-         /* Closing the deepest open interval  */
-         pos_data[prev_pos] = *top;
-         for (;;) {
-            const unsigned int closed_interval_idx = *top-- & POS_MASK;
-            const unsigned int superinterval_lcp = *top & LCP_MASK;
-
-            if (next_lcp == superinterval_lcp) {
-               /* Continuing the superinterval */
-               intervals[closed_interval_idx] = *top;
-               break;
-            }
-            else if (next_lcp > superinterval_lcp) {
-               /* Creating a new interval that is a
-                * superinterval of the one being
-                * closed, but still a subinterval of
-                * its superinterval  */
-               *++top = next_lcp | next_interval_idx++;
-               intervals[closed_interval_idx] = *top;
-               break;
-            }
-            else {
-               /* Also closing the superinterval  */
-               intervals[closed_interval_idx] = *top;
-            }
-         }
-      }
-      prev_pos = next_pos;
-   }
-
-   /* Close any still-open intervals.  */
-   pos_data[prev_pos] = *top;
-   for (; top > pCompressor->open_intervals; top--)
-      intervals[*top & POS_MASK] = *(top - 1);
-
-   /* Success */
-   return 0;
-}
-
-/**
- * Find matches at the specified offset in the input window
- *
- * @param pCompressor compression context
- * @param nOffset offset to find matches at, in the input window
- * @param pMatches pointer to returned matches
- * @param nMaxMatches maximum number of matches to return (0 for none)
- *
- * @return number of matches
- */
-static int lzsa_find_matches_at(lsza_compressor *pCompressor, const int nOffset, lzsa_match *pMatches, const int nMaxMatches) {
-   unsigned int *intervals = pCompressor->intervals;
-   unsigned int *pos_data = pCompressor->pos_data;
-   unsigned int ref;
-   unsigned int super_ref;
-   unsigned int match_pos;
-   lzsa_match *matchptr;
-
-   /**
-    * Find matches using intervals
-    *
-    * Taken from wimlib (CC0 license):
-    * https://wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
-    */
-
-    /* Get the deepest lcp-interval containing the current suffix. */
-   ref = pos_data[nOffset];
-
-   pos_data[nOffset] = 0;
-
-   /* Ascend until we reach a visited interval, the root, or a child of the
-    * root.  Link unvisited intervals to the current suffix as we go.  */
-   while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
-      intervals[ref & POS_MASK] = nOffset;
-      ref = super_ref;
-   }
-
-   if (super_ref == 0) {
-      /* In this case, the current interval may be any of:
-       * (1) the root;
-       * (2) an unvisited child of the root;
-       * (3) an interval last visited by suffix 0
-       *
-       * We could avoid the ambiguity with (3) by using an lcp
-       * placeholder value other than 0 to represent "visited", but
-       * it's fastest to use 0.  So we just don't allow matches with
-       * position 0.  */
-
-      if (ref != 0)  /* Not the root?  */
-         intervals[ref & POS_MASK] = nOffset;
-      return 0;
-   }
-
-   /* Ascend indirectly via pos_data[] links.  */
-   match_pos = super_ref;
-   matchptr = pMatches;
-   for (;;) {
-      while ((super_ref = pos_data[match_pos]) > ref)
-         match_pos = intervals[super_ref & POS_MASK];
-      intervals[ref & POS_MASK] = nOffset;
-      pos_data[match_pos] = ref;
-
-      if ((matchptr - pMatches) < nMaxMatches) {
-         int nMatchOffset = (int)(nOffset - match_pos);
-
-         if (nMatchOffset <= MAX_OFFSET) {
-            matchptr->length = (unsigned short)(ref >> LCP_SHIFT);
-            matchptr->offset = (unsigned short)nMatchOffset;
-            matchptr++;
-         }
-      }
-
-      if (super_ref == 0)
-         break;
-      ref = super_ref;
-      match_pos = intervals[ref & POS_MASK];
-   }
-
-   return (int)(matchptr - pMatches);
-}
-
-/**
- * Skip previously compressed bytes
- *
- * @param pCompressor compression context
- * @param nStartOffset current offset in input window (typically 0)
- * @param nEndOffset offset to skip to in input window (typically the number of previously compressed bytes)
- */
-static void lzsa_skip_matches(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
-   lzsa_match match;
-   int i;
-
-   /* Skipping still requires scanning for matches, as this also performs a lazy update of the intervals. However,
-    * we don't store the matches. */
-   for (i = nStartOffset; i < nEndOffset; i++) {
-      lzsa_find_matches_at(pCompressor, i, &match, 0);
-   }
-}
-
-/**
- * Find all matches for the data to be compressed. Up to NMATCHES_PER_OFFSET matches are stored for each offset, for
- * the optimizer to look at.
- *
- * @param pCompressor compression context
- * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
- * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
- */
-static void lzsa_find_all_matches(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
-   lzsa_match *pMatch = pCompressor->match + (nStartOffset << MATCHES_PER_OFFSET_SHIFT);
-   int i;
-
-   for (i = nStartOffset; i < nEndOffset; i++) {
-      int nMatches = lzsa_find_matches_at(pCompressor, i, pMatch, NMATCHES_PER_OFFSET);
-      int m;
-
-      for (m = 0; m < NMATCHES_PER_OFFSET; m++) {
-         if (nMatches <= m || i > (nEndOffset - LAST_MATCH_OFFSET)) {
-            pMatch->length = 0;
-            pMatch->offset = 0;
-         }
-         else {
-            int nMaxLen = (nEndOffset - LAST_LITERALS) - i;
-            if (nMaxLen < 0)
-               nMaxLen = 0;
-            if (pMatch->length > nMaxLen)
-               pMatch->length = (unsigned short)nMaxLen;
-         }
-
-         pMatch++;
-      }
-   }
-}
-
-/**
- * Get the number of extra bits required to represent a literals length
- *
- * @param nLength literals length
- *
- * @return number of extra bits required
- */
-static inline int lzsa_get_literals_varlen_size(const int nLength) {
-   if (nLength < LITERALS_RUN_LEN) {
-      return 0;
-   }
-   else {
-      if (nLength < 256)
-         return 8;
-      else {
-         if (nLength < 512)
-            return 16;
-         else
-            return 24;
-      }
-   }
-}
-
-/**
- * Write extra literals length bytes to output (compressed) buffer. The caller must first check that there is enough
- * room to write the bytes.
- *
- * @param pOutData pointer to output buffer
- * @param nOutOffset current write index into output buffer
- * @param nLength literals length
- */
-static inline int lzsa_write_literals_varlen(unsigned char *pOutData, int nOutOffset, int nLength) {
-   if (nLength >= LITERALS_RUN_LEN) {
-      if (nLength < 256)
-         pOutData[nOutOffset++] = nLength - LITERALS_RUN_LEN;
-      else {
-         if (nLength < 512) {
-            pOutData[nOutOffset++] = 250;
-            pOutData[nOutOffset++] = nLength - 256;
-         }
-         else {
-            pOutData[nOutOffset++] = 249;
-            pOutData[nOutOffset++] = nLength & 0xff;
-            pOutData[nOutOffset++] = (nLength >> 8) & 0xff;
-         }
-      }
-   }
-
-   return nOutOffset;
-}
-
-/**
- * Get the number of extra bits required to represent an encoded match length
- *
- * @param nLength encoded match length (actual match length - MIN_MATCH_SIZE)
- *
- * @return number of extra bits required
- */
-static inline int lzsa_get_match_varlen_size(const int nLength) {
-   if (nLength < MATCH_RUN_LEN) {
-      return 0;
-   }
-   else {
-      if ((nLength + MIN_MATCH_SIZE) < 256)
-         return 8;
-      else {
-         if ((nLength + MIN_MATCH_SIZE) < 512)
-            return 16;
-         else
-            return 24;
-      }
-   }
-}
-
-/**
- * Write extra encoded match length bytes to output (compressed) buffer. The caller must first check that there is enough
- * room to write the bytes.
- *
- * @param pOutData pointer to output buffer
- * @param nOutOffset current write index into output buffer
- * @param nLength encoded match length (actual match length - MIN_MATCH_SIZE)
- */
-static inline int lzsa_write_match_varlen(unsigned char *pOutData, int nOutOffset, int nLength) {
-   if (nLength >= MATCH_RUN_LEN) {
-      if ((nLength + MIN_MATCH_SIZE) < 256)
-         pOutData[nOutOffset++] = nLength - MATCH_RUN_LEN;
-      else {
-         if ((nLength + MIN_MATCH_SIZE) < 512) {
-            pOutData[nOutOffset++] = 239;
-            pOutData[nOutOffset++] = nLength + MIN_MATCH_SIZE - 256;
-         }
-         else {
-            pOutData[nOutOffset++] = 238;
-            pOutData[nOutOffset++] = (nLength + MIN_MATCH_SIZE) & 0xff;
-            pOutData[nOutOffset++] = ((nLength + MIN_MATCH_SIZE) >> 8) & 0xff;
-         }
-      }
-   }
-
-   return nOutOffset;
-}
-
-/**
- * Attempt to pick optimal matches, so as to produce the smallest possible output that decompresses to the same input
- *
- * @param pCompressor compression context
- * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
- * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
- */
-static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
-   int *cost = (int*)pCompressor->pos_data;  /* Reuse */
-   int nLastLiteralsOffset;
-   int nMinMatchSize = pCompressor->min_match_size;
-   const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0;
-   int i;
-
-   cost[nEndOffset - 1] = 8;
-   nLastLiteralsOffset = nEndOffset;
-
-   for (i = nEndOffset - 2; i != (nStartOffset - 1); i--) {
-      int nBestCost, nBestMatchLen, nBestMatchOffset;
-
-      int nLiteralsLen = nLastLiteralsOffset - i;
-      nBestCost = 8 + cost[i + 1];
-      if (nLiteralsLen == LITERALS_RUN_LEN || nLiteralsLen == 256 || nLiteralsLen == 512) {
-         /* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
-          * The cost automatically accumulates down the chain. */
-         nBestCost += 8;
-      }
-      if (pCompressor->match[(i + 1) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
-         nBestCost += MODESWITCH_PENALTY;
-      nBestMatchLen = 0;
-      nBestMatchOffset = 0;
-
-      lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
-      int m;
-
-      for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) {
-         int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 8 : 16;
-
-         if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) {
-            int nCurCost;
-            int nMatchLen = pMatch[m].length;
-
-            if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
-               nMatchLen = nEndOffset - LAST_LITERALS - i;
-
-            nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size(nMatchLen - MIN_MATCH_SIZE);
-            nCurCost += cost[i + nMatchLen];
-            if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
-               nCurCost += MODESWITCH_PENALTY;
-
-            if (nBestCost > (nCurCost - nFavorRatio)) {
-               nBestCost = nCurCost;
-               nBestMatchLen = nMatchLen;
-               nBestMatchOffset = pMatch[m].offset;
-            }
-         }
-         else {
-            int nMatchLen = pMatch[m].length;
-            int k, nMatchRunLen;
-
-            if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
-               nMatchLen = nEndOffset - LAST_LITERALS - i;
-
-            nMatchRunLen = nMatchLen;
-            if (nMatchRunLen > MATCH_RUN_LEN)
-               nMatchRunLen = MATCH_RUN_LEN;
-
-            for (k = nMinMatchSize; k < nMatchRunLen; k++) {
-               int nCurCost;
-
-               nCurCost = 8 + nMatchOffsetSize /* no extra match len bytes */;
-               nCurCost += cost[i + k];
-               if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
-                  nCurCost += MODESWITCH_PENALTY;
-
-               if (nBestCost > (nCurCost - nFavorRatio)) {
-                  nBestCost = nCurCost;
-                  nBestMatchLen = k;
-                  nBestMatchOffset = pMatch[m].offset;
-               }
-            }
-
-            for (; k <= nMatchLen; k++) {
-               int nCurCost;
-
-               nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size(k - MIN_MATCH_SIZE);
-               nCurCost += cost[i + k];
-               if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
-                  nCurCost += MODESWITCH_PENALTY;
-
-               if (nBestCost > (nCurCost - nFavorRatio)) {
-                  nBestCost = nCurCost;
-                  nBestMatchLen = k;
-                  nBestMatchOffset = pMatch[m].offset;
-               }
-            }
-         }
-      }
-
-      if (nBestMatchLen >= MIN_MATCH_SIZE)
-         nLastLiteralsOffset = i;
-
-      cost[i] = nBestCost;
-      pMatch->length = nBestMatchLen;
-      pMatch->offset = nBestMatchOffset;
-   }
-}
-
-/**
- * Attempt to minimize the number of commands issued in the compressed data block, in order to speed up decompression without
- * impacting the compression ratio
- *
- * @param pCompressor compression context
- * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
- * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
- *
- * @return non-zero if the number of tokens was reduced, 0 if it wasn't
- */
-static int lzsa_optimize_command_count(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
-   int i;
-   int nNumLiterals = 0;
-   int nDidReduce = 0;
-
-   for (i = nStartOffset; i < nEndOffset; ) {
-      lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
-
-      if (pMatch->length >= MIN_MATCH_SIZE) {
-         int nMatchLen = pMatch->length;
-         int nReduce = 0;
-
-         if (nMatchLen <= 9 && (i + nMatchLen) < nEndOffset) /* max reducable command size: <token> <EE> <ll> <ll> <offset> <offset> <EE> <mm> <mm> */ {
-            int nMatchOffset = pMatch->offset;
-            int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE;
-            int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + ((nMatchOffset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
-
-            if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE) {
-               if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size(nNumLiterals + nMatchLen))) {
-                  /* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
-                   * make the next command eat the cost of encoding the current number of literals, + nMatchLen extra literals. The size of the current match command is
-                   * at least as much as the number of literal bytes + the extra cost of encoding them in the next match command, so we can safely replace the current
-                   * match command by literals, the output size will not increase and it will remove one command. */
-                  nReduce = 1;
-               }
-            }
-            else {
-               int nCurIndex = i + nMatchLen;
-               int nNextNumLiterals = 0;
-
-               do {
-                  nCurIndex++;
-                  nNextNumLiterals++;
-               } while (nCurIndex < nEndOffset && pCompressor->match[nCurIndex << MATCHES_PER_OFFSET_SHIFT].length < MIN_MATCH_SIZE);
-
-               if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size(nNextNumLiterals))) {
-                  /* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
-                   * more room than the match, and doesn't grow the next match command's literals encoding, go ahead and remove the command. */
-                  nReduce = 1;
-               }
-            }
-         }
-
-         if (nReduce) {
-            int j;
-
-            for (j = 0; j < nMatchLen; j++) {
-               pCompressor->match[(i + j) << MATCHES_PER_OFFSET_SHIFT].length = 0;
-            }
-            nNumLiterals += nMatchLen;
-            i += nMatchLen;
-
-            nDidReduce = 1;
-         }
-         else {
-            if ((i + nMatchLen) < nEndOffset && nMatchLen >= LCP_MAX &&
-               pMatch->offset && pMatch->offset <= 32 && pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].offset == pMatch->offset && (nMatchLen % pMatch->offset) == 0 &&
-               (nMatchLen + pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length) <= MAX_OFFSET) {
-               /* Join */
-
-               pMatch->length += pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length;
-               pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].offset = 0;
-               pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length = -1;
-               continue;
-            }
-
-            nNumLiterals = 0;
-            i += nMatchLen;
-         }
-      }
-      else {
-         nNumLiterals++;
-         i++;
-      }
-   }
-
-   return nDidReduce;
-}
-
-/**
- * Emit block of compressed data
- *
- * @param pCompressor compression context
- * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
- * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
- * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
- * @param pOutData pointer to output buffer
- * @param nMaxOutDataSize maximum size of output buffer, in bytes
- *
- * @return size of compressed data in output buffer, or -1 if the data is uncompressible
- */
-static int lzsa_write_block(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, const int nMaxOutDataSize) {
-   int i;
-   int nNumLiterals = 0;
-   int nInFirstLiteralOffset = 0;
-   int nOutOffset = 0;
-
-   for (i = nStartOffset; i < nEndOffset; ) {
-      lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
-
-      if (pMatch->length >= MIN_MATCH_SIZE) {
-         int nMatchOffset = pMatch->offset;
-         int nMatchLen = pMatch->length;
-         int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE;
-         int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
-         int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN) ? MATCH_RUN_LEN : nEncodedMatchLen;
-         int nTokenLongOffset = (nMatchOffset <= 256) ? 0x00 : 0x80;
-         int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + (nNumLiterals << 3) + (nTokenLongOffset ? 16 : 8) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
-
-         if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
-            return -1;
-         if (nMatchOffset < MIN_OFFSET || nMatchOffset > MAX_OFFSET)
-            return -1;
-
-         pOutData[nOutOffset++] = nTokenLongOffset | (nTokenLiteralsLen << 4) | nTokenMatchLen;
-         nOutOffset = lzsa_write_literals_varlen(pOutData, nOutOffset, nNumLiterals);
-
-         if (nNumLiterals != 0) {
-            memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
-            nOutOffset += nNumLiterals;
-            nNumLiterals = 0;
-         }
-
-         pOutData[nOutOffset++] = (-nMatchOffset) & 0xff;
-         if (nTokenLongOffset) {
-            pOutData[nOutOffset++] = (-nMatchOffset) >> 8;
-         }
-         nOutOffset = lzsa_write_match_varlen(pOutData, nOutOffset, nEncodedMatchLen);
-         i += nMatchLen;
-
-         pCompressor->num_commands++;
-      }
-      else {
-         if (nNumLiterals == 0)
-            nInFirstLiteralOffset = i;
-         nNumLiterals++;
-         i++;
-      }
-   }
-
-   {
-      int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
-      int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + (nNumLiterals << 3);
-
-      if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
-         return -1;
-
-      if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK)
-         pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x0f;
-      else
-         pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x00;
-      nOutOffset = lzsa_write_literals_varlen(pOutData, nOutOffset, nNumLiterals);
-
-      if (nNumLiterals != 0) {
-         memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
-         nOutOffset += nNumLiterals;
-         nNumLiterals = 0;
-      }
-
-      pCompressor->num_commands++;
-   }
-
-   if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) {
-      /* Emit EOD marker for raw block */
-
-      if ((nOutOffset + 4) > nMaxOutDataSize)
-         return -1;
-
-      pOutData[nOutOffset++] = 0;
-      pOutData[nOutOffset++] = 238;
-      pOutData[nOutOffset++] = 0;
-      pOutData[nOutOffset++] = 0;
-   }
-
-   return nOutOffset;
-}
-
-/**
- * Compress one block of data
- *
- * @param pCompressor compression context
- * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
- * @param nPreviousBlockSize number of previously compressed bytes (or 0 for none)
- * @param nInDataSize number of input bytes to compress
- * @param pOutData pointer to output buffer
- * @param nMaxOutDataSize maximum size of output buffer, in bytes
- *
- * @return size of compressed data in output buffer, or -1 if the data is uncompressible
- */
-int lzsa_shrink_block(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
-   if (lzsa_build_suffix_array(pCompressor, pInWindow, nPreviousBlockSize + nInDataSize))
-      return -1;
-   if (nPreviousBlockSize) {
-      lzsa_skip_matches(pCompressor, 0, nPreviousBlockSize);
-   }
-   lzsa_find_all_matches(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
-   lzsa_optimize_matches(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
-
-   int nDidReduce;
-   int nPasses = 0;
-   do {
-      nDidReduce = lzsa_optimize_command_count(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
-      nPasses++;
-   } while (nDidReduce && nPasses < 20);
-
-   return lzsa_write_block(pCompressor, pInWindow, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, pOutData, nMaxOutDataSize);
-}
-
-/**
- * Get the number of compression commands issued in compressed data blocks
- *
- * @return number of commands
- */
-int lzsa_compressor_get_command_count(lsza_compressor *pCompressor) {
-   return pCompressor->num_commands;
-}
--- a/src/shrink_v1.c
+++ b/src/shrink_v1.c
@ -0,0 +1,460 @@
+/*
+ * shrink_v1.c - LZSA1 block compressor implementation
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "lib.h"
+#include "shrink_v1.h"
+#include "format.h"
+
+/**
+ * Compute the size, in bits, of the extra length bytes needed to represent a literals length.
+ *
+ * The thresholds mirror the encoding emitted by lzsa_write_literals_varlen_v1(): nothing below
+ * LITERALS_RUN_LEN_V1, then one, two or three extra bytes.
+ *
+ * @param nLength literals length
+ *
+ * @return number of extra bits required (0, 8, 16 or 24)
+ */
+static inline int lzsa_get_literals_varlen_size_v1(const int nLength) {
+   /* Short run: no extra bytes */
+   if (nLength < LITERALS_RUN_LEN_V1)
+      return 0;
+
+   /* One extra byte */
+   if (nLength < 256)
+      return 8;
+
+   /* Two extra bytes below 512, three otherwise */
+   return (nLength < 512) ? 16 : 24;
+}
+
+/**
+ * Write extra literals length bytes to the output (compressed) buffer; the caller must first check there is enough room.
+ *
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset current write index into output buffer
+ * @param nLength literals length
+ *
+ * @return write index into the output buffer, advanced past any bytes written (unchanged for short lengths)
+ */
+static inline int lzsa_write_literals_varlen_v1(unsigned char *pOutData, int nOutOffset, int nLength) {
+   if (nLength < LITERALS_RUN_LEN_V1)
+      return nOutOffset;   /* nothing extra to write */
+
+   if (nLength < 256)
+      pOutData[nOutOffset++] = nLength - LITERALS_RUN_LEN_V1;   /* single byte, biased by the run length threshold */
+   else if (nLength < 512) {
+      pOutData[nOutOffset++] = 250;   /* marker byte, followed by length - 256 */
+      pOutData[nOutOffset++] = nLength - 256;
+   }
+   else {
+      pOutData[nOutOffset++] = 249;   /* marker byte, followed by the length's low byte then high byte */
+      pOutData[nOutOffset++] = nLength & 0xff;
+      pOutData[nOutOffset++] = (nLength >> 8) & 0xff;
+   }
+
+   return nOutOffset;
+}
+
+/**
+ * Compute the size, in bits, of the extra length bytes needed to represent an encoded match length.
+ *
+ * @param nLength encoded match length (actual match length - MIN_MATCH_SIZE_V1)
+ *
+ * @return number of extra bits required (0, 8, 16 or 24)
+ */
+static inline int lzsa_get_match_varlen_size_v1(const int nLength) {
+   /* The 256 and 512 thresholds are compared against the actual match length, not the encoded one */
+   const int nActualLen = nLength + MIN_MATCH_SIZE_V1;
+
+   if (nLength < MATCH_RUN_LEN_V1)
+      return 0;
+
+   if (nActualLen < 256)
+      return 8;
+
+   if (nActualLen < 512)
+      return 16;
+
+   return 24;
+}
+
+/**
+ * Write extra encoded match length bytes to the output (compressed) buffer; the caller must first check there is enough room.
+ *
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset current write index into output buffer
+ * @param nLength encoded match length (actual match length - MIN_MATCH_SIZE_V1)
+ *
+ * @return write index into the output buffer, advanced past any bytes written (unchanged for short lengths)
+ */
+static inline int lzsa_write_match_varlen_v1(unsigned char *pOutData, int nOutOffset, int nLength) {
+   const int nActualLen = nLength + MIN_MATCH_SIZE_V1;   /* thresholds below use the actual match length */
+   if (nLength < MATCH_RUN_LEN_V1)
+      return nOutOffset;   /* nothing extra to write */
+   if (nActualLen < 256)
+      pOutData[nOutOffset++] = nLength - MATCH_RUN_LEN_V1;   /* single byte, biased by the run length threshold */
+   else if (nActualLen < 512) {
+      pOutData[nOutOffset++] = 239;   /* marker byte, followed by actual length - 256 */
+      pOutData[nOutOffset++] = nActualLen - 256;
+   }
+   else {
+      pOutData[nOutOffset++] = 238;   /* marker byte, followed by the actual length's low byte then high byte */
+      pOutData[nOutOffset++] = nActualLen & 0xff;
+      pOutData[nOutOffset++] = (nActualLen >> 8) & 0xff;
+   }
+
+   return nOutOffset;
+}
+
+/**
+ * Attempt to pick optimal matches, so as to produce the smallest possible output that decompresses to the same input
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ */
+static void lzsa_optimize_matches_v1(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
+   int *cost = (int*)pCompressor->pos_data;  /* Reuse the pos_data buffer as the per-position cost table */
+   int nLastLiteralsOffset;
+   int nMinMatchSize = pCompressor->min_match_size;
+   const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0; /* when 1, a candidate match wins a cost tie against the current best */
+   int i;
+
+   cost[nEndOffset - 1] = 8; /* the last position is costed as a single literal */
+   nLastLiteralsOffset = nEndOffset;
+
+   for (i = nEndOffset - 2; i != (nStartOffset - 1); i--) { /* walk backwards so every cost[] entry after i is already final */
+      int nBestCost, nBestMatchLen, nBestMatchOffset;
+
+      int nLiteralsLen = nLastLiteralsOffset - i; /* distance from i to the nearest later position where a match was chosen */
+      nBestCost = 8 + cost[i + 1]; /* baseline: encode the byte at i as a literal */
+      if (nLiteralsLen == LITERALS_RUN_LEN_V1 || nLiteralsLen == 256 || nLiteralsLen == 512) {
+         /* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
+          * The cost automatically accumulates down the chain. */
+         nBestCost += 8;
+      }
+      if (pCompressor->match[(i + 1) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1) /* a match starts right after this literal */
+         nBestCost += MODESWITCH_PENALTY;
+      nBestMatchLen = 0;
+      nBestMatchOffset = 0;
+
+      lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT); /* first candidate slot for position i */
+      int m;
+
+      for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) { /* stop at the first slot shorter than the minimum match size */
+         int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 8 : 16;
+
+         if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) { /* long candidate: only its full (clamped) length is evaluated */
+            int nCurCost;
+            int nMatchLen = pMatch[m].length;
+
+            if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS)) /* clamp so the match ends no later than nEndOffset - LAST_LITERALS */
+               nMatchLen = nEndOffset - LAST_LITERALS - i;
+
+            nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size_v1(nMatchLen - MIN_MATCH_SIZE_V1);
+            nCurCost += cost[i + nMatchLen];
+            if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1)
+               nCurCost += MODESWITCH_PENALTY;
+
+            if (nBestCost > (nCurCost - nFavorRatio)) {
+               nBestCost = nCurCost;
+               nBestMatchLen = nMatchLen;
+               nBestMatchOffset = pMatch[m].offset;
+            }
+         }
+         else { /* shorter candidate: try every length from nMinMatchSize up to the (clamped) match length */
+            int nMatchLen = pMatch[m].length;
+            int k, nMatchRunLen;
+
+            if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
+               nMatchLen = nEndOffset - LAST_LITERALS - i;
+
+            nMatchRunLen = nMatchLen;
+            if (nMatchRunLen > MATCH_RUN_LEN_V1)
+               nMatchRunLen = MATCH_RUN_LEN_V1;
+
+            for (k = nMinMatchSize; k < nMatchRunLen; k++) { /* lengths costed with no extra match length bytes */
+               int nCurCost;
+
+               nCurCost = 8 + nMatchOffsetSize /* no extra match len bytes */;
+               nCurCost += cost[i + k];
+               if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1)
+                  nCurCost += MODESWITCH_PENALTY;
+
+               if (nBestCost > (nCurCost - nFavorRatio)) {
+                  nBestCost = nCurCost;
+                  nBestMatchLen = k;
+                  nBestMatchOffset = pMatch[m].offset;
+               }
+            }
+
+            for (; k <= nMatchLen; k++) { /* remaining lengths, costed through lzsa_get_match_varlen_size_v1() */
+               int nCurCost;
+
+               nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size_v1(k - MIN_MATCH_SIZE_V1);
+               nCurCost += cost[i + k];
+               if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1)
+                  nCurCost += MODESWITCH_PENALTY;
+
+               if (nBestCost > (nCurCost - nFavorRatio)) {
+                  nBestCost = nCurCost;
+                  nBestMatchLen = k;
+                  nBestMatchOffset = pMatch[m].offset;
+               }
+            }
+         }
+      }
+
+      if (nBestMatchLen >= MIN_MATCH_SIZE_V1) /* a match was chosen at i, so the literal run length restarts from here */
+         nLastLiteralsOffset = i;
+
+      cost[i] = nBestCost;
+      pMatch->length = nBestMatchLen; /* slot 0 of position i now holds the chosen match; length stays 0 when no match was chosen */
+      pMatch->offset = nBestMatchOffset;
+   }
+}
+
+/**
+ * Attempt to minimize the number of commands issued in the compressed data block, in order to speed up decompression without
+ * impacting the compression ratio
+ *
+ * Walks the chosen matches (slot 0 of each position) from nStartOffset to nEndOffset. A match of at most 9 bytes is turned
+ * back into literals when the estimated size of doing so does not exceed the size of the match command. Otherwise, a match
+ * may be joined with the match that directly follows it when both use the same offset (see the conditions in the code).
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ *
+ * @return non-zero if at least one match was replaced by literals, 0 otherwise (joining matches does not set the result)
+ */
+static int lzsa_optimize_command_count_v1(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
+   int i;
+   int nNumLiterals = 0; /* literals accumulated since the last match that was kept */
+   int nDidReduce = 0;
+
+   for (i = nStartOffset; i < nEndOffset; ) {
+      lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
+
+      if (pMatch->length >= MIN_MATCH_SIZE_V1) {
+         int nMatchLen = pMatch->length;
+         int nReduce = 0;
+
+         if (nMatchLen <= 9 && (i + nMatchLen) < nEndOffset) /* max reducible command size: <token> <EE> <ll> <ll> <offset> <offset> <EE> <mm> <mm> */ {
+            int nMatchOffset = pMatch->offset;
+            int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE_V1;
+            int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + ((nMatchOffset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size_v1(nEncodedMatchLen);
+
+            if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE_V1) {
+               if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v1(nNumLiterals + nMatchLen))) {
+                  /* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
+                   * make the next command eat the cost of encoding the current number of literals, + nMatchLen extra literals. The size of the current match command is
+                   * at least as much as the number of literal bytes + the extra cost of encoding them in the next match command, so we can safely replace the current
+                   * match command by literals, the output size will not increase and it will remove one command. */
+                  nReduce = 1;
+               }
+            }
+            else {
+               int nCurIndex = i + nMatchLen;
+               int nNextNumLiterals = 0;
+
+               do { /* count the literals between the end of this match and the next match (or nEndOffset) */
+                  nCurIndex++;
+                  nNextNumLiterals++;
+               } while (nCurIndex < nEndOffset && pCompressor->match[nCurIndex << MATCHES_PER_OFFSET_SHIFT].length < MIN_MATCH_SIZE_V1);
+
+               if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v1(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size_v1(nNextNumLiterals))) {
+                  /* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
+                   * more room than the match, and doesn't grow the next match command's literals encoding, go ahead and remove the command. */
+                  nReduce = 1;
+               }
+            }
+         }
+
+         if (nReduce) {
+            int j;
+
+            for (j = 0; j < nMatchLen; j++) { /* clear slot 0 for every position the match covered, turning them into literals */
+               pCompressor->match[(i + j) << MATCHES_PER_OFFSET_SHIFT].length = 0;
+            }
+            nNumLiterals += nMatchLen;
+            i += nMatchLen;
+
+            nDidReduce = 1;
+         }
+         else {
+            if ((i + nMatchLen) < nEndOffset && nMatchLen >= LCP_MAX &&
+               pMatch->offset && pMatch->offset <= 32 && pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].offset == pMatch->offset && (nMatchLen % pMatch->offset) == 0 &&
+               (nMatchLen + pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length) <= MAX_OFFSET) {
+               /* Join: add the length of the entry that directly follows this match (same offset) to this match, mark that
+                * entry as consumed (offset 0, length -1), then re-examine position i with the new length */
+
+               pMatch->length += pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length;
+               pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].offset = 0;
+               pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length = -1;
+               continue;
+            }
+
+            nNumLiterals = 0; /* the match is kept, so the literal run ends here */
+            i += nMatchLen;
+         }
+      }
+      else {
+         nNumLiterals++;
+         i++;
+      }
+   }
+
+   return nDidReduce;
+}
+
+/**
+ * Emit block of compressed data
+ *
+ * Walks slot 0 of the match table from nStartOffset to nEndOffset. Each match becomes one command (token, extra literals
+ * length, pending literals, match offset, extra match length); bytes without a match are accumulated as pending literals.
+ * The literals left over at the end are written as a final command, and pCompressor->num_commands is incremented once
+ * per command written. When LZSA_FLAG_RAW_BLOCK is set, a 4-byte end-of-data marker is appended.
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ *
+ * @return size of compressed data in output buffer, or -1 if the output does not fit in nMaxOutDataSize bytes (the data
+ *         is treated as uncompressible) or a match offset lies outside MIN_OFFSET..MAX_OFFSET
+ */
+static int lzsa_write_block_v1(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, const int nMaxOutDataSize) {
+   int i;
+   int nNumLiterals = 0; /* literals pending since the last command */
+   int nInFirstLiteralOffset = 0; /* input offset of the first pending literal */
+   int nOutOffset = 0;
+
+   for (i = nStartOffset; i < nEndOffset; ) {
+      lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
+
+      if (pMatch->length >= MIN_MATCH_SIZE_V1) {
+         int nMatchOffset = pMatch->offset;
+         int nMatchLen = pMatch->length;
+         int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE_V1;
+         int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals; /* token field saturates; the rest goes to the extra length bytes */
+         int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN_V1) ? MATCH_RUN_LEN_V1 : nEncodedMatchLen;
+         int nTokenLongOffset = (nMatchOffset <= 256) ? 0x00 : 0x80; /* flag bit: set when the offset needs two bytes */
+         int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3) + (nTokenLongOffset ? 16 : 8) /* match offset */ + lzsa_get_match_varlen_size_v1(nEncodedMatchLen);
+
+         if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize) /* nCommandSize is in bits; compare in bytes */
+            return -1;
+         if (nMatchOffset < MIN_OFFSET || nMatchOffset > MAX_OFFSET)
+            return -1;
+
+         pOutData[nOutOffset++] = nTokenLongOffset | (nTokenLiteralsLen << 4) | nTokenMatchLen;
+         nOutOffset = lzsa_write_literals_varlen_v1(pOutData, nOutOffset, nNumLiterals);
+
+         if (nNumLiterals != 0) {
+            memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
+            nOutOffset += nNumLiterals;
+            nNumLiterals = 0;
+         }
+
+         pOutData[nOutOffset++] = (-nMatchOffset) & 0xff; /* the offset is stored negated, low byte first */
+         if (nTokenLongOffset) {
+            pOutData[nOutOffset++] = (-nMatchOffset) >> 8;
+         }
+         nOutOffset = lzsa_write_match_varlen_v1(pOutData, nOutOffset, nEncodedMatchLen);
+         i += nMatchLen;
+
+         pCompressor->num_commands++;
+      }
+      else {
+         if (nNumLiterals == 0)
+            nInFirstLiteralOffset = i; /* remember where this run of literals starts */
+         nNumLiterals++;
+         i++;
+      }
+   }
+
+   { /* final command: the remaining literals, with no match offset */
+      int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V1) ? LITERALS_RUN_LEN_V1 : nNumLiterals;
+      int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v1(nNumLiterals) + (nNumLiterals << 3);
+
+      if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
+         return -1;
+
+      if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK)
+         pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x0f; /* raw block: the match length field is 0x0f, completed by the marker bytes written below */
+      else
+         pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x00;
+      nOutOffset = lzsa_write_literals_varlen_v1(pOutData, nOutOffset, nNumLiterals);
+
+      if (nNumLiterals != 0) {
+         memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
+         nOutOffset += nNumLiterals;
+         nNumLiterals = 0;
+      }
+
+      pCompressor->num_commands++;
+   }
+
+   if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) {
+      /* Emit EOD marker for raw block */
+
+      if ((nOutOffset + 4) > nMaxOutDataSize)
+         return -1;
+
+      pOutData[nOutOffset++] = 0;
+      pOutData[nOutOffset++] = 238;
+      pOutData[nOutOffset++] = 0;
+      pOutData[nOutOffset++] = 0;
+   }
+
+   return nOutOffset;
+}
+
+/**
+ * Select the most optimal matches, reduce the token count if possible, and then emit a block of compressed LZSA1 data
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nPreviousBlockSize number of previously compressed bytes at the start of the input window; used as the start offset
+ * @param nInDataSize number of bytes to compress; the end offset is nPreviousBlockSize + nInDataSize
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ *
+ * @return size of compressed data in output buffer, or -1 if the data is uncompressible
+ */
+int lzsa_optimize_and_write_block_v1(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
+   lzsa_optimize_matches_v1(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+
+   int nDidReduce;
+   int nPasses = 0;
+   do { /* repeat the command count reduction while a pass still removes commands, for at most 20 passes */
+      nDidReduce = lzsa_optimize_command_count_v1(pCompressor, nPreviousBlockSize, nPreviousBlockSize + nInDataSize);
+      nPasses++;
+   } while (nDidReduce && nPasses < 20);
+
+   return lzsa_write_block_v1(pCompressor, pInWindow, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, pOutData, nMaxOutDataSize);
+}
--- a/src/shrink_v1.h
+++ b/src/shrink_v1.h
@ -0,0 +1,53 @@
+/*
+ * shrink_v1.h - LZSA1 block compressor definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#ifndef _SHRINK_V1_H
+#define _SHRINK_V1_H
+
+/* Forward declarations */
+typedef struct _lsza_compressor lsza_compressor;
+
+/**
+ * Select the most optimal matches, reduce the token count if possible, and then emit a block of compressed LZSA1 data
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nPreviousBlockSize number of previously compressed bytes at the start of the input window
+ * @param nInDataSize number of bytes to compress, located after the previously compressed bytes in the input window
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ *
+ * @return size of compressed data in output buffer, or -1 if the data is uncompressible
+ */
+int lzsa_optimize_and_write_block_v1(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize);
+
+#endif /* _SHRINK_V1_H */
--- a/src/shrink_v2.c
+++ b/src/shrink_v2.c
@ -0,0 +1,733 @@
+/*
+ * shrink_v2.c - LZSA2 block compressor implementation
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "lib.h"
+#include "shrink_v2.h"
+#include "format.h"
+
+/**
+ * Write 4-bit nibble to output (compressed) buffer
+ *
+ * Nibbles are packed two per byte. When no byte is currently being filled, a new zeroed byte is reserved at nOutOffset.
+ * Each write shifts the byte left by 4 and places the new value in the low 4 bits, so the first nibble written ends up
+ * in the high 4 bits once the second one is written.
+ *
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset current write index into output buffer; a negative value is passed through as failure
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurNibbleOffset write index into output buffer, of current byte being filled with nibbles (-1 if there is none)
+ * @param nCurFreeNibbles current number of free nibbles in byte
+ * @param nNibbleValue value to write (0..15)
+ *
+ * @return updated write index into output buffer, or -1 if nOutOffset is negative or there is no room to reserve a new byte
+ */
+static int lzsa_write_nibble_v2(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurNibbleOffset, int *nCurFreeNibbles, int nNibbleValue) {
+   if (nOutOffset < 0) return -1; /* propagate a failure from an earlier write */
+
+   if ((*nCurNibbleOffset) == -1) { /* no partially filled byte: reserve a new one */
+      if (nOutOffset >= nMaxOutDataSize) return -1;
+      (*nCurNibbleOffset) = nOutOffset;
+      (*nCurFreeNibbles) = 2;
+      pOutData[nOutOffset++] = 0;
+   }
+
+   pOutData[*nCurNibbleOffset] = (pOutData[*nCurNibbleOffset] << 4) | (nNibbleValue & 0x0f);
+   (*nCurFreeNibbles)--;
+   if ((*nCurFreeNibbles) == 0) { /* byte is full: the next nibble will reserve a new byte */
+      (*nCurNibbleOffset) = -1;
+   }
+
+   return nOutOffset;
+}
+
+/**
+ * Get the number of extra bits required to represent a literals length
+ *
+ * @param nLength literals length
+ *
+ * @return number of extra bits required: 0 below LITERALS_RUN_LEN_V2, 4 (one nibble) below LITERALS_RUN_LEN_V2 + 15,
+ *         4+8 (nibble and one byte) below 256, and 4+24 (nibble and three bytes) otherwise
+ */
+static inline int lzsa_get_literals_varlen_size_v2(const int nLength) {
+   if (nLength < LITERALS_RUN_LEN_V2) {
+      return 0;
+   }
+   else {
+      if (nLength < (LITERALS_RUN_LEN_V2 + 15)) {
+         return 4;
+      }
+      else {
+         if (nLength < 256)
+            return 4+8;
+         else {
+            return 4+24;
+         }
+      }
+   }
+}
+
+/**
+ * Write extra literals length bytes to output (compressed) buffer. The caller must first check that there is enough
+ * room to write the bytes: only the nibble write is checked against nMaxOutDataSize here, the byte writes are not.
+ *
+ * Nothing is written when nLength is below LITERALS_RUN_LEN_V2. Below LITERALS_RUN_LEN_V2 + 15, one nibble holding
+ * nLength - LITERALS_RUN_LEN_V2 is written. Otherwise a nibble of 15 is written, followed by either nLength as one byte
+ * (below 256) or a 0 byte and nLength as two bytes, low byte first.
+ *
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset current write index into output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurNibbleOffset write index into output buffer, of current byte being filled with nibbles
+ * @param nCurFreeNibbles current number of free nibbles in byte
+ * @param nLength literals length
+ *
+ * @return updated write index into output buffer, or -1 if the nibble could not be written
+ */
+static inline int lzsa_write_literals_varlen_v2(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurNibbleOffset, int *nCurFreeNibbles, int nLength) {
+   if (nLength >= LITERALS_RUN_LEN_V2) {
+      if (nLength < (LITERALS_RUN_LEN_V2 + 15)) {
+         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nCurFreeNibbles, nLength - LITERALS_RUN_LEN_V2);
+      }
+      else {
+         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nCurFreeNibbles, 15);
+         if (nOutOffset < 0) return -1;
+
+         if (nLength < 256)
+            pOutData[nOutOffset++] = nLength;
+         else {
+            pOutData[nOutOffset++] = 0; /* a 0 byte announces that a 16-bit length follows */
+            pOutData[nOutOffset++] = nLength & 0xff;
+            pOutData[nOutOffset++] = (nLength >> 8) & 0xff;
+         }
+      }
+   }
+
+   return nOutOffset;
+}
+
+/**
+ * Get the number of extra bits required to represent an encoded match length
+ *
+ * @param nLength encoded match length (actual match length - MIN_MATCH_SIZE_V2)
+ *
+ * @return number of extra bits required: 0 below MATCH_RUN_LEN_V2, 4 (one nibble) below MATCH_RUN_LEN_V2 + 15,
+ *         4+8 (nibble and one byte) while the actual match length is below 256, and 4+24 (nibble and three bytes) otherwise
+ */
+static inline int lzsa_get_match_varlen_size_v2(const int nLength) {
+   if (nLength < MATCH_RUN_LEN_V2) {
+      return 0;
+   }
+   else {
+      if (nLength < (MATCH_RUN_LEN_V2 + 15))
+         return 4;
+      else {
+         if ((nLength + MIN_MATCH_SIZE_V2) < 256) /* the byte form stores the actual match length, not the encoded one */
+            return 4+8;
+         else {
+            return 4 + 24;
+         }
+      }
+   }
+}
+
+/**
+ * Write extra encoded match length bytes to output (compressed) buffer. The caller must first check that there is enough
+ * room to write the bytes: only the nibble write is checked against nMaxOutDataSize here, the byte writes are not.
+ *
+ * Nothing is written when nLength is below MATCH_RUN_LEN_V2. Below MATCH_RUN_LEN_V2 + 15, one nibble holding
+ * nLength - MATCH_RUN_LEN_V2 is written. Otherwise a nibble of 15 is written, followed by the actual match length
+ * (nLength + MIN_MATCH_SIZE_V2) either as one byte (below 256) or as a 0 byte and two bytes, low byte first.
+ *
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset current write index into output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurNibbleOffset write index into output buffer, of current byte being filled with nibbles
+ * @param nCurFreeNibbles current number of free nibbles in byte
+ * @param nLength encoded match length (actual match length - MIN_MATCH_SIZE_V2)
+ *
+ * @return updated write index into output buffer, or -1 if the nibble could not be written
+ */
+static inline int lzsa_write_match_varlen_v2(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurNibbleOffset, int *nCurFreeNibbles, int nLength) {
+   if (nLength >= MATCH_RUN_LEN_V2) {
+      if (nLength < (MATCH_RUN_LEN_V2 + 15)) {
+         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nCurFreeNibbles, nLength - MATCH_RUN_LEN_V2);
+      }
+      else {
+         nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, nCurNibbleOffset, nCurFreeNibbles, 15);
+         if (nOutOffset < 0) return -1;
+
+         if ((nLength + MIN_MATCH_SIZE_V2) < 256)
+            pOutData[nOutOffset++] = nLength + MIN_MATCH_SIZE_V2;
+         else {
+            pOutData[nOutOffset++] = 0; /* a 0 byte announces that a 16-bit length follows */
+            pOutData[nOutOffset++] = (nLength + MIN_MATCH_SIZE_V2) & 0xff;
+            pOutData[nOutOffset++] = ((nLength + MIN_MATCH_SIZE_V2) >> 8) & 0xff;
+         }
+      }
+   }
+
+   return nOutOffset;
+}
+
+/**
+ * Attempt to pick optimal matches, so as to produce the smallest possible output that decompresses to the same input
+ *
+ * Works in two passes. The first pass walks backwards from nEndOffset - 2 down to nStartOffset. For every candidate slot
+ * at a position it stores the cheapest length found in pCompressor->match and its cost in pCompressor->slot_cost; the
+ * result of slot 0, or of a later slot that found a match costing no more, is stored in pCompressor->best_match and
+ * cost[]. A candidate whose offset equals the offset of the next chosen match after its end (looked up through
+ * prev_match[]) is costed with no offset bits. The second pass walks forward over the chosen matches and switches a
+ * match to the slot registered in repmatch_opt[] when that registration came from the chosen match right before it.
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ */
+static void lzsa_optimize_matches_v2(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
+   int *cost = (int*)pCompressor->pos_data;  /* Reuse the pos_data buffer as the per-position cost table */
+   int *prev_match = (int*)pCompressor->intervals; /* Reuse the intervals buffer: per position, nearest position at or after it where a match was chosen */
+   lzsa_repmatch_opt *repmatch_opt = pCompressor->repmatch_opt;
+   lzsa_match *pBestMatch = pCompressor->best_match;
+   int nLastLiteralsOffset;
+   int nMinMatchSize = pCompressor->min_match_size;
+   const int nFavorRatio = (pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) ? 1 : 0; /* when 1, a candidate match wins a cost tie against the current best */
+   int i;
+
+   cost[nEndOffset - 1] = 8; /* the last position is costed as a single literal */
+   prev_match[nEndOffset - 1] = nEndOffset; /* nEndOffset means "no match chosen from here on" */
+   nLastLiteralsOffset = nEndOffset;
+
+   pCompressor->best_match[nEndOffset - 1].length = 0;
+   pCompressor->best_match[nEndOffset - 1].offset = 0;
+
+   repmatch_opt[nEndOffset - 1].best_slot_for_incoming = -1;
+   repmatch_opt[nEndOffset - 1].incoming_offset = -1;
+   repmatch_opt[nEndOffset - 1].expected_repmatch = 0;
+
+   for (i = nEndOffset - 2; i != (nStartOffset - 1); i--) { /* first pass: walk backwards so every entry after i is already set */
+      int nLiteralsCost;
+
+      int nLiteralsLen = nLastLiteralsOffset - i;
+      nLiteralsCost = 8 + cost[i + 1];
+      if (nLiteralsLen == LITERALS_RUN_LEN_V2) {
+         /* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
+          * The cost automatically accumulates down the chain. */
+         nLiteralsCost += 4;
+      }
+      else if (nLiteralsLen == (LITERALS_RUN_LEN_V2 + 15)) {
+         /* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
+          * The cost automatically accumulates down the chain. */
+         nLiteralsCost += 8;
+      }
+      else if (nLiteralsLen == 256) {
+         /* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
+          * The cost automatically accumulates down the chain. */
+         nLiteralsCost += 16;
+      }
+      if (pCompressor->best_match[i + 1].length >= MIN_MATCH_SIZE_V2)
+         nLiteralsCost += MODESWITCH_PENALTY;
+
+      lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
+      int *pSlotCost = pCompressor->slot_cost + (i << MATCHES_PER_OFFSET_SHIFT);
+      int m;
+
+      cost[i] = nLiteralsCost; /* default until a candidate slot is evaluated: encode the byte at i as a literal */
+      pCompressor->best_match[i].length = 0;
+      pCompressor->best_match[i].offset = 0;
+
+      repmatch_opt[i].best_slot_for_incoming = -1;
+      repmatch_opt[i].incoming_offset = -1;
+      repmatch_opt[i].expected_repmatch = 0;
+
+      for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) { /* stop at the first slot shorter than the minimum match size */
+         int nBestCost, nBestMatchLen, nBestMatchOffset, nBestUpdatedSlot, nBestUpdatedIndex, nBestExpectedRepMatch;
+
+         nBestCost = nLiteralsCost; /* each slot is compared against the literal cost on its own */
+         nBestMatchLen = 0;
+         nBestMatchOffset = 0;
+         nBestUpdatedSlot = -1;
+         nBestUpdatedIndex = -1;
+         nBestExpectedRepMatch = 0;
+
+         if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) { /* long candidate: only its full (clamped) length is evaluated */
+            int nCurCost;
+            int nMatchLen = pMatch[m].length;
+
+            if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
+               nMatchLen = nEndOffset - LAST_LITERALS - i;
+
+            int nCurIndex = prev_match[i + nMatchLen]; /* position of the next chosen match after this candidate ends */
+
+            int nMatchOffsetSize = 0; /* stays 0 when the next chosen match uses the same offset */
+            int nCurExpectedRepMatch = 1;
+            if (nCurIndex >= nEndOffset || pCompressor->best_match[nCurIndex].length < MIN_MATCH_SIZE_V2 ||
+                pCompressor->best_match[nCurIndex].offset != pMatch[m].offset) {
+               nMatchOffsetSize = (pMatch[m].offset <= 32) ? 4 : ((pMatch[m].offset <= 512) ? 8 : ((pMatch[m].offset <= (8192 + 512)) ? 12 : 16));
+               nCurExpectedRepMatch = 0;
+            }
+
+            nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size_v2(nMatchLen - MIN_MATCH_SIZE_V2);
+            nCurCost += cost[i + nMatchLen];
+            if (pCompressor->best_match[i + nMatchLen].length >= MIN_MATCH_SIZE_V2)
+               nCurCost += MODESWITCH_PENALTY;
+
+            if (nBestCost > (nCurCost - nFavorRatio)) {
+               nBestCost = nCurCost;
+               nBestMatchLen = nMatchLen;
+               nBestMatchOffset = pMatch[m].offset;
+               nBestUpdatedSlot = -1;
+               nBestUpdatedIndex = -1;
+               nBestExpectedRepMatch = nCurExpectedRepMatch;
+            }
+         }
+         else { /* shorter candidate: try every length from nMinMatchSize up to the (clamped) match length */
+            int nMatchLen = pMatch[m].length;
+            int k, nMatchRunLen;
+
+            if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
+               nMatchLen = nEndOffset - LAST_LITERALS - i;
+
+            nMatchRunLen = nMatchLen;
+            if (nMatchRunLen > MATCH_RUN_LEN_V2)
+               nMatchRunLen = MATCH_RUN_LEN_V2;
+
+            for (k = nMinMatchSize; k < nMatchRunLen; k++) { /* lengths costed with no extra match length bits */
+               int nCurCost;
+
+               int nCurIndex = prev_match[i + k];
+               int nMatchOffsetSize = 0;
+               int nCurExpectedRepMatch = 1;
+               if (nCurIndex >= nEndOffset || pCompressor->best_match[nCurIndex].length < MIN_MATCH_SIZE_V2 ||
+                  pCompressor->best_match[nCurIndex].offset != pMatch[m].offset) {
+                  nMatchOffsetSize = (pMatch[m].offset <= 32) ? 4 : ((pMatch[m].offset <= 512) ? 8 : ((pMatch[m].offset <= (8192 + 512)) ? 12 : 16));
+                  nCurExpectedRepMatch = 0;
+               }
+
+               nCurCost = 8 + nMatchOffsetSize /* no extra match len bytes */;
+               nCurCost += cost[i + k];
+               if (pCompressor->best_match[i + k].length >= MIN_MATCH_SIZE_V2)
+                  nCurCost += MODESWITCH_PENALTY;
+
+               int nCurUpdatedSlot = -1;
+               int nCurUpdatedIndex = -1;
+
+               if (nMatchOffsetSize && nCurIndex < nEndOffset && pCompressor->best_match[nCurIndex].length >= MIN_MATCH_SIZE_V2 && !repmatch_opt[nCurIndex].expected_repmatch) {
+                  int r;
+
+                  /* Look for a slot of the next chosen match that has this candidate's offset, and cost the pair as if that slot were used */
+                  for (r = 0; r < NMATCHES_PER_OFFSET && pCompressor->match[(nCurIndex << MATCHES_PER_OFFSET_SHIFT) + r].length >= MIN_MATCH_SIZE_V2; r++) {
+                     if (pCompressor->match[(nCurIndex << MATCHES_PER_OFFSET_SHIFT) + r].offset == pMatch[m].offset) {
+                        int nAltCost = nCurCost - nMatchOffsetSize + pCompressor->slot_cost[(nCurIndex << MATCHES_PER_OFFSET_SHIFT) + r] - cost[nCurIndex];
+
+                        if (nAltCost <= nCurCost) {
+                           nCurUpdatedSlot = r;
+                           nCurUpdatedIndex = nCurIndex;
+                           nCurCost = nAltCost;
+                           nCurExpectedRepMatch = 2; /* 2 is resolved to 1 or 0 by the forward pass at the end of this function */
+                        }
+                     }
+                  }
+               }
+
+               if (nBestCost > (nCurCost - nFavorRatio)) {
+                  nBestCost = nCurCost;
+                  nBestMatchLen = k;
+                  nBestMatchOffset = pMatch[m].offset;
+                  nBestUpdatedSlot = nCurUpdatedSlot;
+                  nBestUpdatedIndex = nCurUpdatedIndex;
+                  nBestExpectedRepMatch = nCurExpectedRepMatch;
+               }
+            }
+
+            for (; k <= nMatchLen; k++) { /* remaining lengths, costed through lzsa_get_match_varlen_size_v2() */
+               int nCurCost;
+
+               int nCurIndex = prev_match[i + k];
+               int nMatchOffsetSize = 0;
+               int nCurExpectedRepMatch = 1;
+               if (nCurIndex >= nEndOffset || pCompressor->best_match[nCurIndex].length < MIN_MATCH_SIZE_V2 ||
+                  pCompressor->best_match[nCurIndex].offset != pMatch[m].offset) {
+                  nMatchOffsetSize = (pMatch[m].offset <= 32) ? 4 : ((pMatch[m].offset <= 512) ? 8 : ((pMatch[m].offset <= (8192 + 512)) ? 12 : 16));
+                  nCurExpectedRepMatch = 0;
+               }
+
+               nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size_v2(k - MIN_MATCH_SIZE_V2);
+               nCurCost += cost[i + k];
+               if (pCompressor->best_match[i + k].length >= MIN_MATCH_SIZE_V2)
+                  nCurCost += MODESWITCH_PENALTY;
+
+               int nCurUpdatedSlot = -1;
+               int nCurUpdatedIndex = -1;
+
+               if (nMatchOffsetSize && nCurIndex < nEndOffset && pCompressor->best_match[nCurIndex].length >= MIN_MATCH_SIZE_V2 && !repmatch_opt[nCurIndex].expected_repmatch) {
+                  int r;
+
+                  /* Same alternative costing as in the loop above */
+                  for (r = 0; r < NMATCHES_PER_OFFSET && pCompressor->match[(nCurIndex << MATCHES_PER_OFFSET_SHIFT) + r].length >= MIN_MATCH_SIZE_V2; r++) {
+                     if (pCompressor->match[(nCurIndex << MATCHES_PER_OFFSET_SHIFT) + r].offset == pMatch[m].offset) {
+                        int nAltCost = nCurCost - nMatchOffsetSize + pCompressor->slot_cost[(nCurIndex << MATCHES_PER_OFFSET_SHIFT) + r] - cost[nCurIndex];
+
+                        if (nAltCost <= nCurCost) {
+                           nCurUpdatedSlot = r;
+                           nCurUpdatedIndex = nCurIndex;
+                           nCurCost = nAltCost;
+                           nCurExpectedRepMatch = 2;
+                        }
+                     }
+                  }
+               }
+
+               if (nBestCost > (nCurCost - nFavorRatio)) {
+                  nBestCost = nCurCost;
+                  nBestMatchLen = k;
+                  nBestMatchOffset = pMatch[m].offset;
+                  nBestUpdatedSlot = nCurUpdatedSlot;
+                  nBestUpdatedIndex = nCurUpdatedIndex;
+                  nBestExpectedRepMatch = nCurExpectedRepMatch;
+               }
+            }
+         }
+
+         pSlotCost[m] = nBestCost; /* keep the per-slot result so that an earlier position can pick this slot later */
+         pMatch[m].length = nBestMatchLen;
+         pMatch[m].offset = nBestMatchOffset; /* not necessary */
+
+         if (m == 0 || (nBestMatchLen && cost[i] >= nBestCost)) { /* slot 0 always sets the choice; later slots replace it only with a match that costs no more */
+            cost[i] = nBestCost;
+            pCompressor->best_match[i].length = nBestMatchLen;
+            pCompressor->best_match[i].offset = nBestMatchOffset;
+
+            repmatch_opt[i].expected_repmatch = nBestExpectedRepMatch;
+
+            if (nBestUpdatedSlot >= 0 && nBestUpdatedIndex >= 0) { /* register the slot this choice wants the later match to use */
+               repmatch_opt[nBestUpdatedIndex].best_slot_for_incoming = nBestUpdatedSlot;
+               repmatch_opt[nBestUpdatedIndex].incoming_offset = i;
+            }
+         }
+      }
+      for (; m < NMATCHES_PER_OFFSET; m++) { /* zero the cost of the slots that were not evaluated */
+         pSlotCost[m] = 0;
+      }
+
+      if (pCompressor->best_match[i].length >= MIN_MATCH_SIZE_V2)
+         nLastLiteralsOffset = i;
+
+      prev_match[i] = nLastLiteralsOffset;
+   }
+
+   int nIncomingOffset = -1; /* position of the previous chosen match in the forward walk, -1 before the first one */
+   for (i = nStartOffset; i < nEndOffset; ) { /* second pass: walk forward over the chosen matches */
+      if (pCompressor->best_match[i].length >= MIN_MATCH_SIZE_V2) {
+         if (nIncomingOffset >= 0 && repmatch_opt[i].incoming_offset == nIncomingOffset && repmatch_opt[i].best_slot_for_incoming >= 0) {
+            lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT) + repmatch_opt[i].best_slot_for_incoming;
+            int *pSlotCost = pCompressor->slot_cost + (i << MATCHES_PER_OFFSET_SHIFT) + repmatch_opt[i].best_slot_for_incoming;
+
+            pCompressor->best_match[i].length = pMatch->length; /* switch to the slot registered by the previous chosen match */
+            pCompressor->best_match[i].offset = pMatch->offset;
+            cost[i] = *pSlotCost;
+
+            if (repmatch_opt[i].expected_repmatch == 2)
+               repmatch_opt[i].expected_repmatch = 1;
+         }
+         else {
+            if (repmatch_opt[i].expected_repmatch == 2)
+               repmatch_opt[i].expected_repmatch = 0;
+         }
+
+         nIncomingOffset = i;
+         i += pCompressor->best_match[i].length;
+      }
+      else {
+         i++;
+      }
+   }
+}
+
+/**
+ * Attempt to minimize the number of commands issued in the compressed data block, in order to speed up decompression without
+ * impacting the compression ratio.
+ *
+ * Walks best_match[] over [nStartOffset, nEndOffset). A match of at most 9 bytes is turned back into literals when the
+ * estimated size of its command (in bits) is at least the size of the same bytes emitted as literals and folded into the
+ * following command. A match that is kept may instead be merged with the match that immediately follows it, when both
+ * use the same offset of at most 32 bytes.
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ *
+ * @return non-zero if the number of tokens was reduced, 0 if it wasn't
+ */
+static int lzsa_optimize_command_count_v2(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
+   int i;
+   int nNumLiterals = 0;            /* literals pending since the last match that was kept */
+   int nDidReduce = 0;
+   int nPreviousMatchOffset = -1;   /* window position of the previously visited match, -1 if none yet */
+   lzsa_repmatch_opt *repmatch_opt = pCompressor->repmatch_opt;
+
+   for (i = nStartOffset; i < nEndOffset; ) {
+      lzsa_match *pMatch = pCompressor->best_match + i;
+
+      if (pMatch->length >= MIN_MATCH_SIZE_V2) {
+         int nMatchLen = pMatch->length;
+         int nReduce = 0;
+         int nCurrentMatchOffset = i;
+
+         if (nMatchLen <= 9 && (i + nMatchLen) < nEndOffset) /* max reducible command size: <token> <EE> <ll> <ll> <offset> <offset> <EE> <mm> <mm> */ {
+            int nMatchOffset = pMatch->offset;
+            int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE_V2;
+            /* NOTE(review): 16 bits are deducted from the command size when the previous match is flagged as expecting a
+             * rep-match; presumably this is the cost of a full 16-bit offset - confirm against the match optimizer. */
+            int nUndoRepMatchCost = (nPreviousMatchOffset < 0 || !repmatch_opt[nPreviousMatchOffset].expected_repmatch) ? 0 : 16;
+
+            if (pCompressor->best_match[i + nMatchLen].length >= MIN_MATCH_SIZE_V2) {
+               int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + lzsa_get_match_varlen_size_v2(nEncodedMatchLen) - nUndoRepMatchCost;
+
+               /* The offset is only charged when the next match does not use the same one */
+               if (pCompressor->best_match[i + nMatchLen].offset != nMatchOffset) {
+                  nCommandSize += (nMatchOffset <= 32) ? 4 : ((nMatchOffset <= 512) ? 8 : ((nMatchOffset <= (8192 + 512)) ? 12 : 16)) /* match offset */;
+               }
+
+               if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v2(nNumLiterals + nMatchLen))) {
+                  /* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
+                   * make the next command eat the cost of encoding the current number of literals, + nMatchLen extra literals. The size of the current match command is
+                   * at least as much as the number of literal bytes + the extra cost of encoding them in the next match command, so we can safely replace the current
+                   * match command by literals, the output size will not increase and it will remove one command. */
+                  nReduce = 1;
+               }
+            }
+            else {
+               int nCurIndex = i + nMatchLen;
+               int nNextNumLiterals = 0;
+               int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + lzsa_get_match_varlen_size_v2(nEncodedMatchLen) - nUndoRepMatchCost;
+
+               /* Count the literals that follow this match, up to the next match or the end of the range */
+               do {
+                  nCurIndex++;
+                  nNextNumLiterals++;
+               } while (nCurIndex < nEndOffset && pCompressor->best_match[nCurIndex].length < MIN_MATCH_SIZE_V2);
+
+               /* The offset is only charged when no following match uses the same one */
+               if (nCurIndex >= nEndOffset || pCompressor->best_match[nCurIndex].length < MIN_MATCH_SIZE_V2 ||
+                  pCompressor->best_match[nCurIndex].offset != nMatchOffset) {
+                  nCommandSize += (nMatchOffset <= 32) ? 4 : ((nMatchOffset <= 512) ? 8 : ((nMatchOffset <= (8192 + 512)) ? 12 : 16)) /* match offset */;
+               }
+
+               if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v2(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size_v2(nNextNumLiterals))) {
+                  /* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
+                   * more room than the match, and doesn't grow the next match command's literals encoding, go ahead and remove the command. */
+                  nReduce = 1;
+               }
+            }
+         }
+
+         if (nReduce) {
+            int j;
+
+            /* Turn every position covered by the match back into a literal */
+            for (j = 0; j < nMatchLen; j++) {
+               pCompressor->best_match[i + j].length = 0;
+            }
+            nNumLiterals += nMatchLen;
+            i += nMatchLen;
+
+            nDidReduce = 1;
+
+            if (nPreviousMatchOffset >= 0) {
+               repmatch_opt[nPreviousMatchOffset].expected_repmatch = 0;
+               /* NOTE(review): this -1 is overwritten by the unconditional assignment after the if/else below, so the next
+                * match sees the position of the match that was just removed - confirm that this is intended. */
+               nPreviousMatchOffset = -1;
+            }
+         }
+         else {
+            if ((i + nMatchLen) < nEndOffset && nMatchLen >= LCP_MAX &&
+               pMatch->offset && pMatch->offset <= 32 && pCompressor->best_match[i + nMatchLen].offset == pMatch->offset && (nMatchLen % pMatch->offset) == 0 &&
+               (nMatchLen + pCompressor->best_match[i + nMatchLen].length) <= MAX_OFFSET) {
+               /* Join: absorb the following same-offset match into this one, mark its slot as consumed, and re-examine
+                * the lengthened match at the same position */
+
+               pMatch->length += pCompressor->best_match[i + nMatchLen].length;
+               pCompressor->best_match[i + nMatchLen].offset = 0;
+               pCompressor->best_match[i + nMatchLen].length = -1;
+               continue;
+            }
+
+            /* The match is kept: the pending literals belong to its command */
+            nNumLiterals = 0;
+            i += nMatchLen;
+         }
+
+         nPreviousMatchOffset = nCurrentMatchOffset;
+      }
+      else {
+         nNumLiterals++;
+         i++;
+      }
+   }
+
+   return nDidReduce;
+}
+
+/**
+ * Emit block of compressed data
+ *
+ * Walks best_match[] over [nStartOffset, nEndOffset) and writes one command per match: a token byte, the extra literals
+ * count, the pending literal bytes, the match offset (omitted when it equals the previous command's offset) and the extra
+ * match length. A final literals-only command is always written, followed by an end-of-data marker when
+ * LZSA_FLAG_RAW_BLOCK is set. pCompressor->num_commands is incremented for every command written.
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ *
+ * @return size of compressed data in output buffer, or -1 if the data does not fit in nMaxOutDataSize bytes or a match
+ *         offset lies outside of [MIN_OFFSET, MAX_OFFSET]
+ */
+static int lzsa_write_block_v2(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, const int nMaxOutDataSize) {
+   int i;
+   int nNumLiterals = 0;
+   int nInFirstLiteralOffset = 0;
+   int nOutOffset = 0;
+   int nCurNibbleOffset = -1, nCurFreeNibbles = 0;   /* nibble packing state shared by the write helpers; -1 is tested below as "no nibble pending" */
+   int nRepMatchOffset = 0;                          /* offset used by the previous command, 0 before the first one */
+
+   for (i = nStartOffset; i < nEndOffset; ) {
+      lzsa_match *pMatch = pCompressor->best_match + i;
+
+      if (pMatch->length >= MIN_MATCH_SIZE_V2) {
+         int nMatchOffset = pMatch->offset;
+         int nMatchLen = pMatch->length;
+         int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE_V2;
+         int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V2) ? LITERALS_RUN_LEN_V2 : nNumLiterals;
+         int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN_V2) ? MATCH_RUN_LEN_V2 : nEncodedMatchLen;
+         int nTokenOffsetMode;
+         int nOffsetSize;
+
+         /* Select the offset encoding. Offsets are stored negated; the top bit of the negated value goes into the token
+          * and the remaining bits follow the literals. */
+         if (nMatchOffset == nRepMatchOffset) {
+            /* Same offset as the previous command: no offset bits are written */
+            nTokenOffsetMode = 0xe0;
+            nOffsetSize = 0;
+         }
+         else {
+            if (nMatchOffset <= 32) {
+               nTokenOffsetMode = 0x00 | (((-nMatchOffset) & 0x10) << 1);
+               nOffsetSize = 4;
+            }
+            else if (nMatchOffset <= 512) {
+               nTokenOffsetMode = 0x40 | (((-nMatchOffset) & 0x100) >> 3);
+               nOffsetSize = 8;
+            }
+            else if (nMatchOffset <= (8192 + 512)) {
+               nTokenOffsetMode = 0x80 | (((-(nMatchOffset - 512)) & 0x1000) >> 7);
+               nOffsetSize = 12;
+            }
+            else {
+               nTokenOffsetMode = 0xc0;
+               nOffsetSize = 16;
+            }
+         }
+
+         /* Size of the whole command in bits; rounded up to bytes for the capacity check */
+         int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + (nNumLiterals << 3) + nOffsetSize /* match offset */ + lzsa_get_match_varlen_size_v2(nEncodedMatchLen);
+
+         if ((nOutOffset + ((nCommandSize + 7) >> 3)) > nMaxOutDataSize)
+            return -1;
+         if (nMatchOffset < MIN_OFFSET || nMatchOffset > MAX_OFFSET)
+            return -1;
+
+         pOutData[nOutOffset++] = nTokenOffsetMode | (nTokenLiteralsLen << 3) | nTokenMatchLen;
+         nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nNumLiterals);
+         if (nOutOffset < 0) return -1;
+
+         if (nNumLiterals != 0) {
+            memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
+            nOutOffset += nNumLiterals;
+            nNumLiterals = 0;
+         }
+
+         if (nTokenOffsetMode == 0x00 || nTokenOffsetMode == 0x20) {
+            nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, (-nMatchOffset) & 0x0f);
+            if (nOutOffset < 0) return -1;
+         }
+         else if (nTokenOffsetMode == 0x40 || nTokenOffsetMode == 0x60) {
+            pOutData[nOutOffset++] = (-nMatchOffset) & 0xff;
+         }
+         else if (nTokenOffsetMode == 0x80 || nTokenOffsetMode == 0xa0) {
+            /* Work on the unsigned representation: right-shifting a negative int is implementation-defined */
+            const unsigned int nNegOffset = (unsigned int)(-(nMatchOffset - 512));
+
+            pOutData[nOutOffset++] = (unsigned char)(nNegOffset & 0xff);
+            nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, (int)((nNegOffset >> 8) & 0x0f));
+            if (nOutOffset < 0) return -1;
+         }
+         else if (nTokenOffsetMode == 0xc0) {
+            /* Same as above: extract the high byte from the unsigned representation; low byte is written first */
+            const unsigned int nNegOffset = (unsigned int)(-nMatchOffset);
+
+            pOutData[nOutOffset++] = (unsigned char)(nNegOffset & 0xff);
+            pOutData[nOutOffset++] = (unsigned char)((nNegOffset >> 8) & 0xff);
+         }
+         nRepMatchOffset = nMatchOffset;
+
+         nOutOffset = lzsa_write_match_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nEncodedMatchLen);
+         if (nOutOffset < 0) return -1;
+
+         i += nMatchLen;
+
+         pCompressor->num_commands++;
+      }
+      else {
+         /* Literal: remember where the run starts so that it can be copied in one go */
+         if (nNumLiterals == 0)
+            nInFirstLiteralOffset = i;
+         nNumLiterals++;
+         i++;
+      }
+   }
+
+   {
+      /* Final command: carries the remaining literals (possibly none) and no match */
+      int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN_V2) ? LITERALS_RUN_LEN_V2 : nNumLiterals;
+      int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + (nNumLiterals << 3);
+
+      if ((nOutOffset + ((nCommandSize + 7) >> 3)) > nMaxOutDataSize)
+         return -1;
+
+      /* For raw blocks the token also announces the offset byte and extended match length of the EOD marker below */
+      if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK)
+         pOutData[nOutOffset++] = (nTokenLiteralsLen << 3) | 0x47;
+      else
+         pOutData[nOutOffset++] = (nTokenLiteralsLen << 3) | 0x00;
+      nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nNumLiterals);
+      if (nOutOffset < 0) return -1;
+
+      if (nNumLiterals != 0) {
+         memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
+         nOutOffset += nNumLiterals;
+         nNumLiterals = 0;
+      }
+
+      pCompressor->num_commands++;
+   }
+
+   if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) {
+      /* Emit EOD marker for raw block */
+
+      if (nOutOffset >= nMaxOutDataSize)
+         return -1;
+      pOutData[nOutOffset++] = 0;      /* Match offset */
+
+      nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, 15);   /* Extended match length nibble */
+      if (nOutOffset < 0) return -1;
+
+      if ((nOutOffset + 3) > nMaxOutDataSize)
+         return -1;
+
+      pOutData[nOutOffset++] = 0;      /* Extended match length byte */
+      pOutData[nOutOffset++] = 0;      /* 16-bit match length */
+      pOutData[nOutOffset++] = 0;
+   }
+
+   /* A nibble is still pending: pad with a zero nibble; fail if the state is still not back to "none pending" */
+   if (nCurNibbleOffset != -1) {
+      nOutOffset = lzsa_write_nibble_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, 0);
+      if (nOutOffset < 0 || nCurNibbleOffset != -1)
+         return -1;
+   }
+
+   return nOutOffset;
+}
+
+/**
+ * Run the LZSA2 block pipeline: pick the matches to use, try to lower the number of commands, then write the
+ * compressed block to the output buffer
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nPreviousBlockSize offset in the input window at which the data to compress starts
+ * @param nInDataSize number of bytes to compress, starting at nPreviousBlockSize
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ *
+ * @return value returned by the block writer: size of compressed data in output buffer, or -1 on failure
+ */
+int lzsa_optimize_and_write_block_v2(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize) {
+   const int nBlockStart = nPreviousBlockSize;
+   const int nBlockEnd = nPreviousBlockSize + nInDataSize;
+   int nPass;
+
+   lzsa_optimize_matches_v2(pCompressor, nBlockStart, nBlockEnd);
+
+   /* Keep reducing until a pass changes nothing, for at most 20 passes (the first pass always runs) */
+   for (nPass = 0; nPass < 20; nPass++) {
+      if (!lzsa_optimize_command_count_v2(pCompressor, nBlockStart, nBlockEnd))
+         break;
+   }
+
+   return lzsa_write_block_v2(pCompressor, pInWindow, nBlockStart, nBlockEnd, pOutData, nMaxOutDataSize);
+}
--- a/src/shrink_v2.h
+++ b/src/shrink_v2.h
@ -0,0 +1,53 @@
+/*
+ * shrink_v2.h - LZSA2 block compressor definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help, ideas, optimizations and speed measurements by spke <zxintrospec@gmail.com>
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#ifndef _SHRINK_V2_H
+#define _SHRINK_V2_H
+
+/* Forward declaration; this header only handles the compressor context through a pointer */
+typedef struct _lsza_compressor lsza_compressor;
+
+/**
+ * Select the most optimal matches, reduce the token count if possible, and then emit a block of compressed LZSA2 data
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nPreviousBlockSize offset in the input window at which the data to compress starts (typically the number of previously compressed bytes)
+ * @param nInDataSize number of bytes to compress, starting at nPreviousBlockSize
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ *
+ * @return size of compressed data in output buffer, or -1 if the data is uncompressible
+ */
+int lzsa_optimize_and_write_block_v2(lsza_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize);
+
+#endif /* _SHRINK_V2_H */