2019-04-01 16:04:56 +00:00
/*
* shrink . c - block compressor implementation
*
* The following copying information applies to this specific source code file :
*
* Written in 2019 by Emmanuel Marty < marty . emmanuel @ gmail . com >
2019-04-06 22:01:22 +00:00
* With help , ideas , optimizations and speed measurements by spke < zxintrospec @ gmail . com >
2019-04-01 16:04:56 +00:00
* Portions written in 2014 - 2015 by Eric Biggers < ebiggers3 @ gmail . com >
*
* To the extent possible under law , the author ( s ) have dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide via the Creative Commons Zero 1.0 Universal Public Domain
* Dedication ( the " CC0 " ) .
*
* This software is distributed in the hope that it will be useful , but WITHOUT
* ANY WARRANTY ; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE . See the CC0 for more details .
*
* You should have received a copy of the CC0 along with this software ; if not
* see < http : //creativecommons.org/publicdomain/zero/1.0/>.
*/
/*
* Uses the libdivsufsort library Copyright ( c ) 2003 - 2008 Yuta Mori
*
* Inspired by LZ4 by Yann Collet . https : //github.com/lz4/lz4
* With ideas from Lizard by Przemyslaw Skibinski and Yann Collet . https : //github.com/inikep/lizard
2019-04-13 10:33:39 +00:00
* Also with ideas from smallz4 by Stephan Brumme . https : //create.stephan-brumme.com/smallz4/
2019-04-06 22:01:22 +00:00
*
2019-04-01 16:04:56 +00:00
*/
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include "shrink.h"
# include "format.h"
2019-05-03 11:45:37 +00:00
/* Layout of the packed 32-bit words kept in the intervals / pos_data arrays:
 * the top LCP_BITS bits carry an LCP (common prefix length) value and the low
 * LCP_SHIFT bits carry a position or an interval index. */
#define LCP_BITS 15
/* Largest LCP value stored; longer common prefixes are clamped to this */
#define LCP_MAX (1<<(LCP_BITS - 1))
#define LCP_SHIFT (32-LCP_BITS)
/* Built from unsigned operands: ((1<<LCP_BITS) - 1) << LCP_SHIFT does not fit
 * in a 32-bit signed int, so the all-signed form is an overflowing shift. */
#define LCP_MASK (((1U<<LCP_BITS) - 1) << LCP_SHIFT)
#define POS_MASK ((1<<LCP_SHIFT) - 1)

/* Match slots kept per input position. The slot of position p starts at index
 * (p << MATCHES_PER_OFFSET_SHIFT), so the count is derived from the shift. */
#define MATCHES_PER_OFFSET_SHIFT 3
#define NMATCHES_PER_OFFSET (1 << MATCHES_PER_OFFSET_SHIFT)

/* Matches at least this long are only costed at their full length by the optimizer */
#define LEAVE_ALONE_MATCH_SIZE 1000

/* No match may start later than this many bytes before the end of the data */
#define LAST_MATCH_OFFSET 4
/* Matches are truncated so that this many bytes remain after them */
#define LAST_LITERALS 1

/* Cost, in bits, charged by the optimizer when the position that follows a
 * candidate holds a match */
#define MODESWITCH_PENALTY 1
2019-04-02 10:12:12 +00:00
/**
 * One match candidate for a position of the input window
 */
typedef struct _lzsa_match {
   /* Number of matching bytes; values below MIN_MATCH_SIZE are treated as "no match" */
   unsigned short length;

   /* Distance from the current position back to the start of the matching bytes */
   unsigned short offset;
} lzsa_match;
/**
* Initialize compression context
*
* @ param pCompressor compression context to initialize
* @ param nMaxWindowSize maximum size of input data window ( previously compressed bytes + bytes to compress )
2019-04-21 07:41:12 +00:00
* @ param nMinMatchSize minimum match size ( cannot be less than MIN_MATCH_SIZE )
2019-05-02 09:23:57 +00:00
* @ param nFlags compression flags
2019-04-02 10:12:12 +00:00
*
* @ return 0 for success , non - zero for failure
*/
2019-05-02 09:23:57 +00:00
int lzsa_compressor_init ( lsza_compressor * pCompressor , const int nMaxWindowSize , const int nMinMatchSize , const int nFlags ) {
2019-04-06 22:01:22 +00:00
int nResult ;
nResult = divsufsort_init ( & pCompressor - > divsufsort_context ) ;
pCompressor - > intervals = NULL ;
2019-04-01 16:04:56 +00:00
pCompressor - > pos_data = NULL ;
pCompressor - > open_intervals = NULL ;
pCompressor - > match = NULL ;
2019-04-21 07:41:12 +00:00
pCompressor - > min_match_size = nMinMatchSize ;
if ( pCompressor - > min_match_size < MIN_MATCH_SIZE )
pCompressor - > min_match_size = MIN_MATCH_SIZE ;
else if ( pCompressor - > min_match_size > ( MATCH_RUN_LEN - 1 ) )
pCompressor - > min_match_size = MATCH_RUN_LEN - 1 ;
2019-05-02 09:23:57 +00:00
pCompressor - > flags = nFlags ;
2019-04-05 14:32:11 +00:00
pCompressor - > num_commands = 0 ;
2019-04-01 16:04:56 +00:00
2019-04-06 22:01:22 +00:00
if ( ! nResult ) {
pCompressor - > intervals = ( unsigned int * ) malloc ( nMaxWindowSize * sizeof ( unsigned int ) ) ;
2019-04-01 16:04:56 +00:00
2019-04-06 22:01:22 +00:00
if ( pCompressor - > intervals ) {
pCompressor - > pos_data = ( unsigned int * ) malloc ( nMaxWindowSize * sizeof ( unsigned int ) ) ;
2019-04-01 16:04:56 +00:00
2019-04-06 22:01:22 +00:00
if ( pCompressor - > pos_data ) {
pCompressor - > open_intervals = ( unsigned int * ) malloc ( ( LCP_MAX + 1 ) * sizeof ( unsigned int ) ) ;
2019-04-01 16:04:56 +00:00
2019-04-06 22:01:22 +00:00
if ( pCompressor - > open_intervals ) {
pCompressor - > match = ( lzsa_match * ) malloc ( nMaxWindowSize * NMATCHES_PER_OFFSET * sizeof ( lzsa_match ) ) ;
2019-04-01 16:04:56 +00:00
2019-04-06 22:01:22 +00:00
if ( pCompressor - > match )
return 0 ;
}
2019-04-01 16:04:56 +00:00
}
}
}
2019-04-06 22:01:22 +00:00
lzsa_compressor_destroy ( pCompressor ) ;
2019-04-01 16:04:56 +00:00
return 100 ;
}
2019-04-02 10:12:12 +00:00
/**
* Clean up compression context and free up any associated resources
*
* @ param pCompressor compression context to clean up
*/
2019-04-01 16:04:56 +00:00
void lzsa_compressor_destroy ( lsza_compressor * pCompressor ) {
2019-04-06 22:01:22 +00:00
divsufsort_destroy ( & pCompressor - > divsufsort_context ) ;
2019-04-01 16:04:56 +00:00
if ( pCompressor - > match ) {
free ( pCompressor - > match ) ;
pCompressor - > match = NULL ;
}
if ( pCompressor - > open_intervals ) {
free ( pCompressor - > open_intervals ) ;
pCompressor - > open_intervals = NULL ;
}
if ( pCompressor - > pos_data ) {
free ( pCompressor - > pos_data ) ;
pCompressor - > pos_data = NULL ;
}
if ( pCompressor - > intervals ) {
free ( pCompressor - > intervals ) ;
pCompressor - > intervals = NULL ;
}
}
2019-04-02 13:03:21 +00:00
/**
* Parse input data , build suffix array and overlaid data structures to speed up match finding
*
* @ param pCompressor compression context
* @ param pInWindow pointer to input data window ( previously compressed bytes + bytes to compress )
* @ param nInWindowSize total input size in bytes ( previously compressed bytes + bytes to compress )
*
* @ return 0 for success , non - zero for failure
*/
2019-04-01 16:04:56 +00:00
static int lzsa_build_suffix_array ( lsza_compressor * pCompressor , const unsigned char * pInWindow , const int nInWindowSize ) {
2019-04-03 08:16:12 +00:00
unsigned int * intervals = pCompressor - > intervals ;
2019-04-01 16:04:56 +00:00
/* Build suffix array from input data */
2019-04-06 22:01:22 +00:00
if ( divsufsort_build_array ( & pCompressor - > divsufsort_context , pInWindow , ( saidx_t * ) intervals , nInWindowSize ) ! = 0 ) {
2019-04-01 16:04:56 +00:00
return 100 ;
}
2019-04-03 08:16:12 +00:00
int * PLCP = ( int * ) pCompressor - > pos_data ; /* Use temporarily */
2019-04-01 16:04:56 +00:00
int * Phi = PLCP ;
int nCurLen = 0 ;
int i ;
/* Compute the permuted LCP first (K<> rkk<6B> inen method) */
Phi [ intervals [ 0 ] ] = - 1 ;
for ( i = 1 ; i < nInWindowSize ; i + + )
Phi [ intervals [ i ] ] = intervals [ i - 1 ] ;
for ( i = 0 ; i < nInWindowSize ; i + + ) {
if ( Phi [ i ] = = - 1 ) {
PLCP [ i ] = 0 ;
continue ;
}
int nMaxLen = ( i > Phi [ i ] ) ? ( nInWindowSize - i ) : ( nInWindowSize - Phi [ i ] ) ;
while ( nCurLen < nMaxLen & & pInWindow [ i + nCurLen ] = = pInWindow [ Phi [ i ] + nCurLen ] ) nCurLen + + ;
PLCP [ i ] = nCurLen ;
if ( nCurLen > 0 )
nCurLen - - ;
}
2019-04-02 13:03:21 +00:00
/* Rotate permuted LCP into the LCP. This has better cache locality than the direct Kasai LCP method. This also
* saves us from having to build the inverse suffix array index , as the LCP is calculated without it using this method ,
* and the interval builder below doesn ' t need it either . */
2019-04-01 16:04:56 +00:00
intervals [ 0 ] & = POS_MASK ;
2019-04-21 07:41:12 +00:00
int nMinMatchSize = pCompressor - > min_match_size ;
2019-04-03 08:16:12 +00:00
for ( i = 1 ; i < nInWindowSize - 1 ; i + + ) {
int nIndex = ( int ) ( intervals [ i ] & POS_MASK ) ;
2019-04-01 16:04:56 +00:00
int nLen = PLCP [ nIndex ] ;
2019-04-21 07:41:12 +00:00
if ( nLen < nMinMatchSize )
2019-04-01 16:04:56 +00:00
nLen = 0 ;
if ( nLen > LCP_MAX )
nLen = LCP_MAX ;
2019-04-03 08:16:12 +00:00
intervals [ i ] = ( ( unsigned int ) nIndex ) | ( ( ( unsigned int ) nLen ) < < LCP_SHIFT ) ;
2019-04-01 16:04:56 +00:00
}
2019-04-03 08:16:12 +00:00
if ( i < nInWindowSize )
intervals [ i ] & = POS_MASK ;
2019-04-01 16:04:56 +00:00
/**
* Build intervals for finding matches
*
* Methodology and code fragment taken from wimlib ( CC0 license ) :
* https : //wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
*/
2019-04-03 08:16:12 +00:00
unsigned int * const SA_and_LCP = intervals ;
unsigned int * pos_data = pCompressor - > pos_data ;
unsigned int next_interval_idx ;
unsigned int * top = pCompressor - > open_intervals ;
unsigned int prev_pos = SA_and_LCP [ 0 ] & POS_MASK ;
2019-04-01 16:04:56 +00:00
* top = 0 ;
intervals [ 0 ] = 0 ;
next_interval_idx = 1 ;
for ( int r = 1 ; r < nInWindowSize ; r + + ) {
2019-04-03 08:16:12 +00:00
const unsigned int next_pos = SA_and_LCP [ r ] & POS_MASK ;
const unsigned int next_lcp = SA_and_LCP [ r ] & LCP_MASK ;
const unsigned int top_lcp = * top & LCP_MASK ;
2019-04-01 16:04:56 +00:00
if ( next_lcp = = top_lcp ) {
/* Continuing the deepest open interval */
pos_data [ prev_pos ] = * top ;
}
else if ( next_lcp > top_lcp ) {
/* Opening a new interval */
* + + top = next_lcp | next_interval_idx + + ;
pos_data [ prev_pos ] = * top ;
}
else {
/* Closing the deepest open interval */
pos_data [ prev_pos ] = * top ;
for ( ; ; ) {
2019-04-03 08:16:12 +00:00
const unsigned int closed_interval_idx = * top - - & POS_MASK ;
const unsigned int superinterval_lcp = * top & LCP_MASK ;
2019-04-01 16:04:56 +00:00
if ( next_lcp = = superinterval_lcp ) {
/* Continuing the superinterval */
intervals [ closed_interval_idx ] = * top ;
break ;
}
else if ( next_lcp > superinterval_lcp ) {
/* Creating a new interval that is a
* superinterval of the one being
* closed , but still a subinterval of
* its superinterval */
* + + top = next_lcp | next_interval_idx + + ;
intervals [ closed_interval_idx ] = * top ;
break ;
}
else {
/* Also closing the superinterval */
intervals [ closed_interval_idx ] = * top ;
}
}
}
prev_pos = next_pos ;
}
/* Close any still-open intervals. */
pos_data [ prev_pos ] = * top ;
for ( ; top > pCompressor - > open_intervals ; top - - )
intervals [ * top & POS_MASK ] = * ( top - 1 ) ;
/* Success */
return 0 ;
}
2019-04-02 13:03:21 +00:00
/**
* Find matches at the specified offset in the input window
*
* @ param pCompressor compression context
* @ param nOffset offset to find matches at , in the input window
* @ param pMatches pointer to returned matches
* @ param nMaxMatches maximum number of matches to return ( 0 for none )
*
* @ return number of matches
*/
2019-04-01 16:04:56 +00:00
static int lzsa_find_matches_at ( lsza_compressor * pCompressor , const int nOffset , lzsa_match * pMatches , const int nMaxMatches ) {
2019-04-03 08:16:12 +00:00
unsigned int * intervals = pCompressor - > intervals ;
unsigned int * pos_data = pCompressor - > pos_data ;
unsigned int ref ;
unsigned int super_ref ;
unsigned int match_pos ;
2019-04-01 16:04:56 +00:00
lzsa_match * matchptr ;
/**
* Find matches using intervals
*
* Taken from wimlib ( CC0 license ) :
* https : //wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
*/
/* Get the deepest lcp-interval containing the current suffix. */
ref = pos_data [ nOffset ] ;
pos_data [ nOffset ] = 0 ;
/* Ascend until we reach a visited interval, the root, or a child of the
* root . Link unvisited intervals to the current suffix as we go . */
while ( ( super_ref = intervals [ ref & POS_MASK ] ) & LCP_MASK ) {
intervals [ ref & POS_MASK ] = nOffset ;
ref = super_ref ;
}
if ( super_ref = = 0 ) {
/* In this case, the current interval may be any of:
* ( 1 ) the root ;
* ( 2 ) an unvisited child of the root ;
* ( 3 ) an interval last visited by suffix 0
*
* We could avoid the ambiguity with ( 3 ) by using an lcp
* placeholder value other than 0 to represent " visited " , but
* it ' s fastest to use 0. So we just don ' t allow matches with
* position 0. */
if ( ref ! = 0 ) /* Not the root? */
intervals [ ref & POS_MASK ] = nOffset ;
return 0 ;
}
/* Ascend indirectly via pos_data[] links. */
match_pos = super_ref ;
matchptr = pMatches ;
for ( ; ; ) {
while ( ( super_ref = pos_data [ match_pos ] ) > ref )
match_pos = intervals [ super_ref & POS_MASK ] ;
intervals [ ref & POS_MASK ] = nOffset ;
pos_data [ match_pos ] = ref ;
if ( ( matchptr - pMatches ) < nMaxMatches ) {
2019-04-03 08:16:12 +00:00
int nMatchOffset = ( int ) ( nOffset - match_pos ) ;
2019-04-01 16:04:56 +00:00
if ( nMatchOffset < = MAX_OFFSET ) {
matchptr - > length = ( unsigned short ) ( ref > > LCP_SHIFT ) ;
matchptr - > offset = ( unsigned short ) nMatchOffset ;
matchptr + + ;
}
}
if ( super_ref = = 0 )
break ;
ref = super_ref ;
match_pos = intervals [ ref & POS_MASK ] ;
}
return ( int ) ( matchptr - pMatches ) ;
}
2019-04-02 13:03:21 +00:00
/**
* Skip previously compressed bytes
*
* @ param pCompressor compression context
* @ param nStartOffset current offset in input window ( typically 0 )
* @ param nEndOffset offset to skip to in input window ( typically the number of previously compressed bytes )
*/
2019-04-01 16:04:56 +00:00
static void lzsa_skip_matches ( lsza_compressor * pCompressor , const int nStartOffset , const int nEndOffset ) {
lzsa_match match ;
int i ;
2019-04-02 13:03:21 +00:00
/* Skipping still requires scanning for matches, as this also performs a lazy update of the intervals. However,
* we don ' t store the matches . */
2019-04-01 16:04:56 +00:00
for ( i = nStartOffset ; i < nEndOffset ; i + + ) {
lzsa_find_matches_at ( pCompressor , i , & match , 0 ) ;
}
}
2019-04-02 13:03:21 +00:00
/**
* Find all matches for the data to be compressed . Up to NMATCHES_PER_OFFSET matches are stored for each offset , for
* the optimizer to look at .
*
* @ param pCompressor compression context
* @ param nStartOffset current offset in input window ( typically the number of previously compressed bytes )
* @ param nEndOffset offset to end finding matches at ( typically the size of the total input window in bytes
*/
2019-04-01 16:04:56 +00:00
static void lzsa_find_all_matches ( lsza_compressor * pCompressor , const int nStartOffset , const int nEndOffset ) {
lzsa_match * pMatch = pCompressor - > match + ( nStartOffset < < MATCHES_PER_OFFSET_SHIFT ) ;
int i ;
for ( i = nStartOffset ; i < nEndOffset ; i + + ) {
int nMatches = lzsa_find_matches_at ( pCompressor , i , pMatch , NMATCHES_PER_OFFSET ) ;
int m ;
for ( m = 0 ; m < NMATCHES_PER_OFFSET ; m + + ) {
2019-04-05 07:28:16 +00:00
if ( nMatches < = m | | i > ( nEndOffset - LAST_MATCH_OFFSET ) ) {
2019-04-01 16:04:56 +00:00
pMatch - > length = 0 ;
pMatch - > offset = 0 ;
}
else {
2019-04-05 07:28:16 +00:00
int nMaxLen = ( nEndOffset - LAST_LITERALS ) - i ;
2019-04-01 16:04:56 +00:00
if ( nMaxLen < 0 )
nMaxLen = 0 ;
if ( pMatch - > length > nMaxLen )
pMatch - > length = ( unsigned short ) nMaxLen ;
}
pMatch + + ;
}
}
}
2019-04-02 13:03:21 +00:00
/**
2019-04-25 11:01:56 +00:00
* Get the number of extra bits required to represent a literals length
2019-04-02 13:03:21 +00:00
*
* @ param nLength literals length
*
2019-04-25 11:01:56 +00:00
* @ return number of extra bits required
2019-04-02 13:03:21 +00:00
*/
2019-04-01 16:04:56 +00:00
static inline int lzsa_get_literals_varlen_size ( const int nLength ) {
if ( nLength < LITERALS_RUN_LEN ) {
return 0 ;
}
else {
2019-04-24 07:47:40 +00:00
if ( nLength < 256 )
2019-04-25 11:01:56 +00:00
return 8 ;
2019-04-01 16:04:56 +00:00
else {
2019-04-24 07:47:40 +00:00
if ( nLength < 512 )
2019-04-25 11:01:56 +00:00
return 16 ;
2019-04-01 16:04:56 +00:00
else
2019-04-25 11:01:56 +00:00
return 24 ;
2019-04-01 16:04:56 +00:00
}
}
}
2019-04-02 13:03:21 +00:00
/**
* Write extra literals length bytes to output ( compressed ) buffer . The caller must first check that there is enough
* room to write the bytes .
*
* @ param pOutData pointer to output buffer
* @ param nOutOffset current write index into output buffer
* @ param nLength literals length
*/
2019-04-01 16:04:56 +00:00
static inline int lzsa_write_literals_varlen ( unsigned char * pOutData , int nOutOffset , int nLength ) {
if ( nLength > = LITERALS_RUN_LEN ) {
2019-04-24 07:47:40 +00:00
if ( nLength < 256 )
2019-04-05 08:42:06 +00:00
pOutData [ nOutOffset + + ] = nLength - LITERALS_RUN_LEN ;
2019-04-01 16:04:56 +00:00
else {
2019-04-24 07:47:40 +00:00
if ( nLength < 512 ) {
pOutData [ nOutOffset + + ] = 250 ;
pOutData [ nOutOffset + + ] = nLength - 256 ;
2019-04-01 16:04:56 +00:00
}
else {
2019-04-24 07:47:40 +00:00
pOutData [ nOutOffset + + ] = 249 ;
2019-04-05 08:42:06 +00:00
pOutData [ nOutOffset + + ] = nLength & 0xff ;
pOutData [ nOutOffset + + ] = ( nLength > > 8 ) & 0xff ;
2019-04-01 16:04:56 +00:00
}
}
}
return nOutOffset ;
}
2019-04-02 13:03:21 +00:00
/**
2019-04-25 11:01:56 +00:00
* Get the number of extra bits required to represent an encoded match length
2019-04-02 13:03:21 +00:00
*
* @ param nLength encoded match length ( actual match length - MIN_MATCH_SIZE )
*
2019-04-25 11:01:56 +00:00
* @ return number of extra bits required
2019-04-02 13:03:21 +00:00
*/
2019-04-01 16:04:56 +00:00
static inline int lzsa_get_match_varlen_size ( const int nLength ) {
if ( nLength < MATCH_RUN_LEN ) {
return 0 ;
}
else {
2019-04-24 07:47:40 +00:00
if ( ( nLength + MIN_MATCH_SIZE ) < 256 )
2019-04-25 11:01:56 +00:00
return 8 ;
2019-04-01 16:04:56 +00:00
else {
2019-04-24 07:47:40 +00:00
if ( ( nLength + MIN_MATCH_SIZE ) < 512 )
2019-04-25 11:01:56 +00:00
return 16 ;
2019-04-01 16:04:56 +00:00
else
2019-04-25 11:01:56 +00:00
return 24 ;
2019-04-01 16:04:56 +00:00
}
}
}
2019-04-02 13:03:21 +00:00
/**
* Write extra encoded match length bytes to output ( compressed ) buffer . The caller must first check that there is enough
* room to write the bytes .
*
* @ param pOutData pointer to output buffer
* @ param nOutOffset current write index into output buffer
* @ param nLength encoded match length ( actual match length - MIN_MATCH_SIZE )
*/
2019-04-01 16:04:56 +00:00
static inline int lzsa_write_match_varlen ( unsigned char * pOutData , int nOutOffset , int nLength ) {
if ( nLength > = MATCH_RUN_LEN ) {
2019-04-24 07:47:40 +00:00
if ( ( nLength + MIN_MATCH_SIZE ) < 256 )
2019-04-05 08:42:06 +00:00
pOutData [ nOutOffset + + ] = nLength - MATCH_RUN_LEN ;
2019-04-01 16:04:56 +00:00
else {
2019-04-24 07:47:40 +00:00
if ( ( nLength + MIN_MATCH_SIZE ) < 512 ) {
pOutData [ nOutOffset + + ] = 239 ;
pOutData [ nOutOffset + + ] = nLength + MIN_MATCH_SIZE - 256 ;
2019-04-01 16:04:56 +00:00
}
else {
2019-04-24 07:47:40 +00:00
pOutData [ nOutOffset + + ] = 238 ;
pOutData [ nOutOffset + + ] = ( nLength + MIN_MATCH_SIZE ) & 0xff ;
pOutData [ nOutOffset + + ] = ( ( nLength + MIN_MATCH_SIZE ) > > 8 ) & 0xff ;
2019-04-01 16:04:56 +00:00
}
}
}
return nOutOffset ;
}
2019-04-02 13:03:21 +00:00
/**
* Attempt to pick optimal matches , so as to produce the smallest possible output that decompresses to the same input
*
* @ param pCompressor compression context
* @ param nStartOffset current offset in input window ( typically the number of previously compressed bytes )
* @ param nEndOffset offset to end finding matches at ( typically the size of the total input window in bytes
*/
2019-04-01 16:04:56 +00:00
static void lzsa_optimize_matches ( lsza_compressor * pCompressor , const int nStartOffset , const int nEndOffset ) {
2019-04-03 08:16:12 +00:00
int * cost = ( int * ) pCompressor - > pos_data ; /* Reuse */
2019-04-01 16:04:56 +00:00
int nLastLiteralsOffset ;
2019-04-21 07:41:12 +00:00
int nMinMatchSize = pCompressor - > min_match_size ;
2019-05-02 09:23:57 +00:00
const int nFavorRatio = ( pCompressor - > flags & LZSA_FLAG_FAVOR_RATIO ) ? 1 : 0 ;
2019-04-01 16:04:56 +00:00
int i ;
2019-04-25 11:01:56 +00:00
cost [ nEndOffset - 1 ] = 8 ;
2019-04-13 10:33:39 +00:00
nLastLiteralsOffset = nEndOffset ;
2019-04-01 16:04:56 +00:00
for ( i = nEndOffset - 2 ; i ! = ( nStartOffset - 1 ) ; i - - ) {
int nBestCost , nBestMatchLen , nBestMatchOffset ;
int nLiteralsLen = nLastLiteralsOffset - i ;
2019-04-25 11:01:56 +00:00
nBestCost = 8 + cost [ i + 1 ] ;
2019-04-24 07:47:40 +00:00
if ( nLiteralsLen = = LITERALS_RUN_LEN | | nLiteralsLen = = 256 | | nLiteralsLen = = 512 ) {
2019-04-13 10:33:39 +00:00
/* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
* The cost automatically accumulates down the chain . */
2019-04-25 11:01:56 +00:00
nBestCost + = 8 ;
2019-04-13 10:33:39 +00:00
}
2019-04-25 11:01:56 +00:00
if ( pCompressor - > match [ ( i + 1 ) < < MATCHES_PER_OFFSET_SHIFT ] . length > = MIN_MATCH_SIZE )
nBestCost + = MODESWITCH_PENALTY ;
2019-04-01 16:04:56 +00:00
nBestMatchLen = 0 ;
nBestMatchOffset = 0 ;
lzsa_match * pMatch = pCompressor - > match + ( i < < MATCHES_PER_OFFSET_SHIFT ) ;
int m ;
2019-04-21 07:41:12 +00:00
for ( m = 0 ; m < NMATCHES_PER_OFFSET & & pMatch [ m ] . length > = nMinMatchSize ; m + + ) {
2019-04-25 11:01:56 +00:00
int nMatchOffsetSize = ( pMatch [ m ] . offset < = 256 ) ? 8 : 16 ;
2019-04-01 16:04:56 +00:00
if ( pMatch [ m ] . length > = LEAVE_ALONE_MATCH_SIZE ) {
int nCurCost ;
int nMatchLen = pMatch [ m ] . length ;
2019-04-11 21:47:57 +00:00
if ( ( i + nMatchLen ) > ( nEndOffset - LAST_LITERALS ) )
nMatchLen = nEndOffset - LAST_LITERALS - i ;
2019-04-25 11:01:56 +00:00
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size ( nMatchLen - MIN_MATCH_SIZE ) ;
2019-04-11 21:47:57 +00:00
nCurCost + = cost [ i + nMatchLen ] ;
2019-04-25 11:01:56 +00:00
if ( pCompressor - > match [ ( i + nMatchLen ) < < MATCHES_PER_OFFSET_SHIFT ] . length > = MIN_MATCH_SIZE )
nCurCost + = MODESWITCH_PENALTY ;
2019-04-01 16:04:56 +00:00
2019-05-02 09:23:57 +00:00
if ( nBestCost > ( nCurCost - nFavorRatio ) ) {
2019-04-01 16:04:56 +00:00
nBestCost = nCurCost ;
nBestMatchLen = nMatchLen ;
nBestMatchOffset = pMatch [ m ] . offset ;
}
}
else {
2019-04-20 14:24:54 +00:00
int nMatchLen = pMatch [ m ] . length ;
int k , nMatchRunLen ;
2019-04-01 16:04:56 +00:00
2019-04-20 14:24:54 +00:00
if ( ( i + nMatchLen ) > ( nEndOffset - LAST_LITERALS ) )
nMatchLen = nEndOffset - LAST_LITERALS - i ;
2019-04-11 21:47:57 +00:00
2019-04-20 14:24:54 +00:00
nMatchRunLen = nMatchLen ;
if ( nMatchRunLen > MATCH_RUN_LEN )
nMatchRunLen = MATCH_RUN_LEN ;
2019-04-08 07:44:07 +00:00
2019-04-21 07:41:12 +00:00
for ( k = nMinMatchSize ; k < nMatchRunLen ; k + + ) {
2019-04-20 14:24:54 +00:00
int nCurCost ;
2019-04-08 07:44:07 +00:00
2019-04-25 11:01:56 +00:00
nCurCost = 8 + nMatchOffsetSize /* no extra match len bytes */ ;
2019-04-20 14:24:54 +00:00
nCurCost + = cost [ i + k ] ;
2019-04-25 11:01:56 +00:00
if ( pCompressor - > match [ ( i + k ) < < MATCHES_PER_OFFSET_SHIFT ] . length > = MIN_MATCH_SIZE )
nCurCost + = MODESWITCH_PENALTY ;
2019-04-08 07:44:07 +00:00
2019-05-02 09:23:57 +00:00
if ( nBestCost > ( nCurCost - nFavorRatio ) ) {
2019-04-20 14:24:54 +00:00
nBestCost = nCurCost ;
nBestMatchLen = k ;
nBestMatchOffset = pMatch [ m ] . offset ;
2019-04-08 07:44:07 +00:00
}
2019-04-20 14:24:54 +00:00
}
2019-04-08 07:44:07 +00:00
2019-04-20 14:24:54 +00:00
for ( ; k < = nMatchLen ; k + + ) {
int nCurCost ;
2019-04-01 16:04:56 +00:00
2019-04-25 11:01:56 +00:00
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size ( k - MIN_MATCH_SIZE ) ;
2019-04-20 14:24:54 +00:00
nCurCost + = cost [ i + k ] ;
2019-04-25 11:01:56 +00:00
if ( pCompressor - > match [ ( i + k ) < < MATCHES_PER_OFFSET_SHIFT ] . length > = MIN_MATCH_SIZE )
nCurCost + = MODESWITCH_PENALTY ;
2019-04-01 16:04:56 +00:00
2019-05-02 09:23:57 +00:00
if ( nBestCost > ( nCurCost - nFavorRatio ) ) {
2019-04-20 14:24:54 +00:00
nBestCost = nCurCost ;
nBestMatchLen = k ;
nBestMatchOffset = pMatch [ m ] . offset ;
2019-04-01 16:04:56 +00:00
}
}
}
}
if ( nBestMatchLen > = MIN_MATCH_SIZE )
nLastLiteralsOffset = i ;
cost [ i ] = nBestCost ;
pMatch - > length = nBestMatchLen ;
pMatch - > offset = nBestMatchOffset ;
}
}
2019-04-05 14:32:11 +00:00
/**
* Attempt to minimize the number of commands issued in the compressed data block , in order to speed up decompression without
* impacting the compression ratio
*
* @ param pCompressor compression context
* @ param nStartOffset current offset in input window ( typically the number of previously compressed bytes )
* @ param nEndOffset offset to end finding matches at ( typically the size of the total input window in bytes
2019-04-20 08:26:45 +00:00
*
* @ return non - zero if the number of tokens was reduced , 0 if it wasn ' t
2019-04-05 14:32:11 +00:00
*/
2019-04-20 08:26:45 +00:00
static int lzsa_optimize_command_count ( lsza_compressor * pCompressor , const int nStartOffset , const int nEndOffset ) {
2019-04-05 14:32:11 +00:00
int i ;
int nNumLiterals = 0 ;
2019-04-20 08:26:45 +00:00
int nDidReduce = 0 ;
2019-04-05 14:32:11 +00:00
for ( i = nStartOffset ; i < nEndOffset ; ) {
lzsa_match * pMatch = pCompressor - > match + ( i < < MATCHES_PER_OFFSET_SHIFT ) ;
if ( pMatch - > length > = MIN_MATCH_SIZE ) {
int nMatchLen = pMatch - > length ;
2019-04-07 13:10:53 +00:00
int nReduce = 0 ;
2019-05-02 09:23:57 +00:00
if ( nMatchLen < = 9 & & ( i + nMatchLen ) < nEndOffset ) /* max reducable command size: <token> <EE> <ll> <ll> <offset> <offset> <EE> <mm> <mm> */ {
2019-04-08 07:44:07 +00:00
int nMatchOffset = pMatch - > offset ;
int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE ;
2019-04-25 11:01:56 +00:00
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size ( nNumLiterals ) + ( ( nMatchOffset < = 256 ) ? 8 : 16 ) /* match offset */ + lzsa_get_match_varlen_size ( nEncodedMatchLen ) ;
2019-04-20 08:26:45 +00:00
if ( pCompressor - > match [ ( i + nMatchLen ) < < MATCHES_PER_OFFSET_SHIFT ] . length > = MIN_MATCH_SIZE ) {
2019-04-25 11:01:56 +00:00
if ( nCommandSize > = ( ( nMatchLen < < 3 ) + lzsa_get_literals_varlen_size ( nNumLiterals + nMatchLen ) ) ) {
2019-04-20 08:26:45 +00:00
/* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
* make the next command eat the cost of encoding the current number of literals , + nMatchLen extra literals . The size of the current match command is
* at least as much as the number of literal bytes + the extra cost of encoding them in the next match command , so we can safely replace the current
* match command by literals , the output size will not increase and it will remove one command . */
nReduce = 1 ;
}
2019-04-07 13:10:53 +00:00
}
2019-04-20 08:26:45 +00:00
else {
2019-04-08 07:44:07 +00:00
int nCurIndex = i + nMatchLen ;
int nNextNumLiterals = 0 ;
do {
nCurIndex + + ;
nNextNumLiterals + + ;
} while ( nCurIndex < nEndOffset & & pCompressor - > match [ nCurIndex < < MATCHES_PER_OFFSET_SHIFT ] . length < MIN_MATCH_SIZE ) ;
2019-04-25 11:01:56 +00:00
if ( nCommandSize > = ( ( nMatchLen < < 3 ) + lzsa_get_literals_varlen_size ( nNumLiterals + nNextNumLiterals + nMatchLen ) - lzsa_get_literals_varlen_size ( nNextNumLiterals ) ) ) {
2019-04-08 07:44:07 +00:00
/* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
* more room than the match , and doesn ' t grow the next match command ' s literals encoding , go ahead and remove the command . */
nReduce = 1 ;
}
}
2019-04-07 13:10:53 +00:00
}
2019-04-05 14:32:11 +00:00
2019-04-07 13:10:53 +00:00
if ( nReduce ) {
2019-04-05 14:32:11 +00:00
int j ;
for ( j = 0 ; j < nMatchLen ; j + + ) {
pCompressor - > match [ ( i + j ) < < MATCHES_PER_OFFSET_SHIFT ] . length = 0 ;
}
nNumLiterals + = nMatchLen ;
i + = nMatchLen ;
2019-04-20 08:26:45 +00:00
nDidReduce = 1 ;
2019-04-05 14:32:11 +00:00
}
else {
2019-05-03 11:45:37 +00:00
if ( ( i + nMatchLen ) < nEndOffset & & nMatchLen > = LCP_MAX & &
pMatch - > offset & & pMatch - > offset < = 32 & & pCompressor - > match [ ( i + nMatchLen ) < < MATCHES_PER_OFFSET_SHIFT ] . offset = = pMatch - > offset & & ( nMatchLen % pMatch - > offset ) = = 0 & &
( nMatchLen + pCompressor - > match [ ( i + nMatchLen ) < < MATCHES_PER_OFFSET_SHIFT ] . length ) < = MAX_OFFSET ) {
2019-04-11 21:47:57 +00:00
/* Join */
pMatch - > length + = pCompressor - > match [ ( i + nMatchLen ) < < MATCHES_PER_OFFSET_SHIFT ] . length ;
pCompressor - > match [ ( i + nMatchLen ) < < MATCHES_PER_OFFSET_SHIFT ] . offset = 0 ;
pCompressor - > match [ ( i + nMatchLen ) < < MATCHES_PER_OFFSET_SHIFT ] . length = - 1 ;
continue ;
}
2019-04-05 14:32:11 +00:00
nNumLiterals = 0 ;
i + = nMatchLen ;
}
}
else {
nNumLiterals + + ;
i + + ;
}
}
2019-04-20 08:26:45 +00:00
return nDidReduce ;
2019-04-05 14:32:11 +00:00
}
2019-04-02 13:03:21 +00:00
/**
* Emit block of compressed data
*
* @ param pCompressor compression context
* @ param pInWindow pointer to input data window ( previously compressed bytes + bytes to compress )
* @ param nStartOffset current offset in input window ( typically the number of previously compressed bytes )
* @ param nEndOffset offset to end finding matches at ( typically the size of the total input window in bytes
* @ param pOutData pointer to output buffer
* @ param nMaxOutDataSize maximum size of output buffer , in bytes
*
* @ return size of compressed data in output buffer , or - 1 if the data is uncompressible
*/
2019-04-01 16:04:56 +00:00
static int lzsa_write_block ( lsza_compressor * pCompressor , const unsigned char * pInWindow , const int nStartOffset , const int nEndOffset , unsigned char * pOutData , const int nMaxOutDataSize ) {
int i ;
int nNumLiterals = 0 ;
int nInFirstLiteralOffset = 0 ;
int nOutOffset = 0 ;
for ( i = nStartOffset ; i < nEndOffset ; ) {
lzsa_match * pMatch = pCompressor - > match + ( i < < MATCHES_PER_OFFSET_SHIFT ) ;
if ( pMatch - > length > = MIN_MATCH_SIZE ) {
int nMatchOffset = pMatch - > offset ;
int nMatchLen = pMatch - > length ;
int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE ;
2019-04-07 13:10:53 +00:00
int nTokenLiteralsLen = ( nNumLiterals > = LITERALS_RUN_LEN ) ? LITERALS_RUN_LEN : nNumLiterals ;
int nTokenMatchLen = ( nEncodedMatchLen > = MATCH_RUN_LEN ) ? MATCH_RUN_LEN : nEncodedMatchLen ;
int nTokenLongOffset = ( nMatchOffset < = 256 ) ? 0x00 : 0x80 ;
2019-04-25 11:01:56 +00:00
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size ( nNumLiterals ) + ( nNumLiterals < < 3 ) + ( nTokenLongOffset ? 16 : 8 ) /* match offset */ + lzsa_get_match_varlen_size ( nEncodedMatchLen ) ;
2019-04-01 16:04:56 +00:00
2019-04-25 11:01:56 +00:00
if ( ( nOutOffset + ( nCommandSize > > 3 ) ) > nMaxOutDataSize )
2019-04-01 16:04:56 +00:00
return - 1 ;
if ( nMatchOffset < MIN_OFFSET | | nMatchOffset > MAX_OFFSET )
return - 1 ;
2019-04-07 13:10:53 +00:00
pOutData [ nOutOffset + + ] = nTokenLongOffset | ( nTokenLiteralsLen < < 4 ) | nTokenMatchLen ;
2019-04-01 16:04:56 +00:00
nOutOffset = lzsa_write_literals_varlen ( pOutData , nOutOffset , nNumLiterals ) ;
if ( nNumLiterals ! = 0 ) {
memcpy ( pOutData + nOutOffset , pInWindow + nInFirstLiteralOffset , nNumLiterals ) ;
nOutOffset + = nNumLiterals ;
nNumLiterals = 0 ;
}
2019-04-24 07:47:40 +00:00
pOutData [ nOutOffset + + ] = ( - nMatchOffset ) & 0xff ;
2019-04-07 13:10:53 +00:00
if ( nTokenLongOffset ) {
2019-04-24 07:47:40 +00:00
pOutData [ nOutOffset + + ] = ( - nMatchOffset ) > > 8 ;
2019-04-05 08:42:06 +00:00
}
2019-04-01 16:04:56 +00:00
nOutOffset = lzsa_write_match_varlen ( pOutData , nOutOffset , nEncodedMatchLen ) ;
i + = nMatchLen ;
2019-04-05 14:32:11 +00:00
pCompressor - > num_commands + + ;
2019-04-01 16:04:56 +00:00
}
else {
if ( nNumLiterals = = 0 )
nInFirstLiteralOffset = i ;
nNumLiterals + + ;
i + + ;
}
}
2019-04-05 07:28:16 +00:00
{
2019-04-07 13:10:53 +00:00
int nTokenLiteralsLen = ( nNumLiterals > = LITERALS_RUN_LEN ) ? LITERALS_RUN_LEN : nNumLiterals ;
2019-04-25 11:01:56 +00:00
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size ( nNumLiterals ) + ( nNumLiterals < < 3 ) ;
2019-04-01 16:04:56 +00:00
2019-04-25 11:01:56 +00:00
if ( ( nOutOffset + ( nCommandSize > > 3 ) ) > nMaxOutDataSize )
2019-04-01 16:04:56 +00:00
return - 1 ;
2019-05-02 09:23:57 +00:00
if ( pCompressor - > flags & LZSA_FLAG_RAW_BLOCK )
pOutData [ nOutOffset + + ] = ( nTokenLiteralsLen < < 4 ) | 0x0f ;
else
pOutData [ nOutOffset + + ] = ( nTokenLiteralsLen < < 4 ) | 0x00 ;
2019-04-01 16:04:56 +00:00
nOutOffset = lzsa_write_literals_varlen ( pOutData , nOutOffset , nNumLiterals ) ;
if ( nNumLiterals ! = 0 ) {
memcpy ( pOutData + nOutOffset , pInWindow + nInFirstLiteralOffset , nNumLiterals ) ;
nOutOffset + = nNumLiterals ;
nNumLiterals = 0 ;
}
2019-04-05 14:32:11 +00:00
pCompressor - > num_commands + + ;
2019-04-01 16:04:56 +00:00
}
2019-05-02 09:23:57 +00:00
if ( pCompressor - > flags & LZSA_FLAG_RAW_BLOCK ) {
/* Emit EOD marker for raw block */
if ( ( nOutOffset + 4 ) > nMaxOutDataSize )
return - 1 ;
pOutData [ nOutOffset + + ] = 0 ;
pOutData [ nOutOffset + + ] = 238 ;
pOutData [ nOutOffset + + ] = 0 ;
pOutData [ nOutOffset + + ] = 0 ;
}
2019-04-01 16:04:56 +00:00
return nOutOffset ;
}
2019-04-02 10:12:12 +00:00
/**
* Compress one block of data
*
* @ param pCompressor compression context
* @ param pInWindow pointer to input data window ( previously compressed bytes + bytes to compress )
* @ param nPreviousBlockSize number of previously compressed bytes ( or 0 for none )
* @ param nInDataSize number of input bytes to compress
* @ param pOutData pointer to output buffer
* @ param nMaxOutDataSize maximum size of output buffer , in bytes
*
* @ return size of compressed data in output buffer , or - 1 if the data is uncompressible
*/
2019-04-01 16:04:56 +00:00
int lzsa_shrink_block ( lsza_compressor * pCompressor , const unsigned char * pInWindow , const int nPreviousBlockSize , const int nInDataSize , unsigned char * pOutData , const int nMaxOutDataSize ) {
2019-04-06 22:01:22 +00:00
if ( lzsa_build_suffix_array ( pCompressor , pInWindow , nPreviousBlockSize + nInDataSize ) )
return - 1 ;
2019-04-01 16:04:56 +00:00
if ( nPreviousBlockSize ) {
lzsa_skip_matches ( pCompressor , 0 , nPreviousBlockSize ) ;
}
lzsa_find_all_matches ( pCompressor , nPreviousBlockSize , nPreviousBlockSize + nInDataSize ) ;
lzsa_optimize_matches ( pCompressor , nPreviousBlockSize , nPreviousBlockSize + nInDataSize ) ;
2019-04-20 08:26:45 +00:00
int nDidReduce ;
int nPasses = 0 ;
do {
nDidReduce = lzsa_optimize_command_count ( pCompressor , nPreviousBlockSize , nPreviousBlockSize + nInDataSize ) ;
nPasses + + ;
} while ( nDidReduce & & nPasses < 20 ) ;
2019-04-01 16:04:56 +00:00
return lzsa_write_block ( pCompressor , pInWindow , nPreviousBlockSize , nPreviousBlockSize + nInDataSize , pOutData , nMaxOutDataSize ) ;
}
2019-04-05 14:32:11 +00:00
/**
 * Get the number of compression commands issued in compressed data blocks
 *
 * @param pCompressor compression context
 *
 * @return number of commands written since the context was initialized
 */
int lzsa_compressor_get_command_count(lsza_compressor *pCompressor) {
   const int nCommandCount = pCompressor->num_commands;

   return nCommandCount;
}