mirror of
https://github.com/emmanuel-marty/lzsa.git
synced 2024-06-29 09:29:32 +00:00
Reduce token count by 1% without changing the ratio
This commit is contained in:
parent
b7967c3aa1
commit
9e75a55dd3
58
src/shrink.c
58
src/shrink.c
|
@ -49,6 +49,8 @@
|
||||||
#define LAST_MATCH_OFFSET 4
|
#define LAST_MATCH_OFFSET 4
|
||||||
#define LAST_LITERALS 1
|
#define LAST_LITERALS 1
|
||||||
|
|
||||||
|
#define MODESWITCH_PENALTY 1
|
||||||
|
|
||||||
/** One match */
|
/** One match */
|
||||||
typedef struct _lzsa_match {
|
typedef struct _lzsa_match {
|
||||||
unsigned short length;
|
unsigned short length;
|
||||||
|
@ -389,11 +391,11 @@ static void lzsa_find_all_matches(lsza_compressor *pCompressor, const int nStart
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the number of extra bytes required to represent a literals length
|
* Get the number of extra bits required to represent a literals length
|
||||||
*
|
*
|
||||||
* @param nLength literals length
|
* @param nLength literals length
|
||||||
*
|
*
|
||||||
* @return number of extra bytes required
|
* @return number of extra bits required
|
||||||
*/
|
*/
|
||||||
static inline int lzsa_get_literals_varlen_size(const int nLength) {
|
static inline int lzsa_get_literals_varlen_size(const int nLength) {
|
||||||
if (nLength < LITERALS_RUN_LEN) {
|
if (nLength < LITERALS_RUN_LEN) {
|
||||||
|
@ -401,12 +403,12 @@ static inline int lzsa_get_literals_varlen_size(const int nLength) {
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (nLength < 256)
|
if (nLength < 256)
|
||||||
return 1;
|
return 8;
|
||||||
else {
|
else {
|
||||||
if (nLength < 512)
|
if (nLength < 512)
|
||||||
return 2;
|
return 16;
|
||||||
else
|
else
|
||||||
return 3;
|
return 24;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -440,11 +442,11 @@ static inline int lzsa_write_literals_varlen(unsigned char *pOutData, int nOutOf
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the number of extra bytes required to represent an encoded match length
|
* Get the number of extra bits required to represent an encoded match length
|
||||||
*
|
*
|
||||||
* @param nLength encoded match length (actual match length - MIN_MATCH_SIZE)
|
* @param nLength encoded match length (actual match length - MIN_MATCH_SIZE)
|
||||||
*
|
*
|
||||||
* @return number of extra bytes required
|
* @return number of extra bits required
|
||||||
*/
|
*/
|
||||||
static inline int lzsa_get_match_varlen_size(const int nLength) {
|
static inline int lzsa_get_match_varlen_size(const int nLength) {
|
||||||
if (nLength < MATCH_RUN_LEN) {
|
if (nLength < MATCH_RUN_LEN) {
|
||||||
|
@ -452,12 +454,12 @@ static inline int lzsa_get_match_varlen_size(const int nLength) {
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if ((nLength + MIN_MATCH_SIZE) < 256)
|
if ((nLength + MIN_MATCH_SIZE) < 256)
|
||||||
return 1;
|
return 8;
|
||||||
else {
|
else {
|
||||||
if ((nLength + MIN_MATCH_SIZE) < 512)
|
if ((nLength + MIN_MATCH_SIZE) < 512)
|
||||||
return 2;
|
return 16;
|
||||||
else
|
else
|
||||||
return 3;
|
return 24;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -503,19 +505,21 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
|
||||||
int nMinMatchSize = pCompressor->min_match_size;
|
int nMinMatchSize = pCompressor->min_match_size;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
cost[nEndOffset - 1] = 1;
|
cost[nEndOffset - 1] = 8;
|
||||||
nLastLiteralsOffset = nEndOffset;
|
nLastLiteralsOffset = nEndOffset;
|
||||||
|
|
||||||
for (i = nEndOffset - 2; i != (nStartOffset - 1); i--) {
|
for (i = nEndOffset - 2; i != (nStartOffset - 1); i--) {
|
||||||
int nBestCost, nBestMatchLen, nBestMatchOffset;
|
int nBestCost, nBestMatchLen, nBestMatchOffset;
|
||||||
|
|
||||||
int nLiteralsLen = nLastLiteralsOffset - i;
|
int nLiteralsLen = nLastLiteralsOffset - i;
|
||||||
nBestCost = 1 + cost[i + 1];
|
nBestCost = 8 + cost[i + 1];
|
||||||
if (nLiteralsLen == LITERALS_RUN_LEN || nLiteralsLen == 256 || nLiteralsLen == 512) {
|
if (nLiteralsLen == LITERALS_RUN_LEN || nLiteralsLen == 256 || nLiteralsLen == 512) {
|
||||||
/* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
|
/* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
|
||||||
* The cost automatically accumulates down the chain. */
|
* The cost automatically accumulates down the chain. */
|
||||||
nBestCost++;
|
nBestCost += 8;
|
||||||
}
|
}
|
||||||
|
if (pCompressor->match[(i + 1) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
|
||||||
|
nBestCost += MODESWITCH_PENALTY;
|
||||||
nBestMatchLen = 0;
|
nBestMatchLen = 0;
|
||||||
nBestMatchOffset = 0;
|
nBestMatchOffset = 0;
|
||||||
|
|
||||||
|
@ -523,7 +527,7 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
|
||||||
int m;
|
int m;
|
||||||
|
|
||||||
for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) {
|
for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) {
|
||||||
int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 1 : 2;
|
int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 8 : 16;
|
||||||
|
|
||||||
if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) {
|
if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) {
|
||||||
int nCurCost;
|
int nCurCost;
|
||||||
|
@ -532,8 +536,10 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
|
||||||
if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
|
if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
|
||||||
nMatchLen = nEndOffset - LAST_LITERALS - i;
|
nMatchLen = nEndOffset - LAST_LITERALS - i;
|
||||||
|
|
||||||
nCurCost = 1 + nMatchOffsetSize + lzsa_get_match_varlen_size(nMatchLen - MIN_MATCH_SIZE);
|
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size(nMatchLen - MIN_MATCH_SIZE);
|
||||||
nCurCost += cost[i + nMatchLen];
|
nCurCost += cost[i + nMatchLen];
|
||||||
|
if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
|
||||||
|
nCurCost += MODESWITCH_PENALTY;
|
||||||
|
|
||||||
if (nBestCost >= nCurCost) {
|
if (nBestCost >= nCurCost) {
|
||||||
nBestCost = nCurCost;
|
nBestCost = nCurCost;
|
||||||
|
@ -555,8 +561,10 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
|
||||||
for (k = nMinMatchSize; k < nMatchRunLen; k++) {
|
for (k = nMinMatchSize; k < nMatchRunLen; k++) {
|
||||||
int nCurCost;
|
int nCurCost;
|
||||||
|
|
||||||
nCurCost = 1 + nMatchOffsetSize /* no extra match len bytes */;
|
nCurCost = 8 + nMatchOffsetSize /* no extra match len bytes */;
|
||||||
nCurCost += cost[i + k];
|
nCurCost += cost[i + k];
|
||||||
|
if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
|
||||||
|
nCurCost += MODESWITCH_PENALTY;
|
||||||
|
|
||||||
if (nBestCost >= nCurCost) {
|
if (nBestCost >= nCurCost) {
|
||||||
nBestCost = nCurCost;
|
nBestCost = nCurCost;
|
||||||
|
@ -568,8 +576,10 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
|
||||||
for (; k <= nMatchLen; k++) {
|
for (; k <= nMatchLen; k++) {
|
||||||
int nCurCost;
|
int nCurCost;
|
||||||
|
|
||||||
nCurCost = 1 + nMatchOffsetSize + lzsa_get_match_varlen_size(k - MIN_MATCH_SIZE);
|
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size(k - MIN_MATCH_SIZE);
|
||||||
nCurCost += cost[i + k];
|
nCurCost += cost[i + k];
|
||||||
|
if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
|
||||||
|
nCurCost += MODESWITCH_PENALTY;
|
||||||
|
|
||||||
if (nBestCost >= nCurCost) {
|
if (nBestCost >= nCurCost) {
|
||||||
nBestCost = nCurCost;
|
nBestCost = nCurCost;
|
||||||
|
@ -614,10 +624,10 @@ static int lzsa_optimize_command_count(lsza_compressor *pCompressor, const int n
|
||||||
if (nMatchLen <= 9 && (i + nMatchLen) < nEndOffset) /* max reducable command size: <token> <FF> <ll> <ll> <offset> <offset> <FF> <mm> <mm> */ {
|
if (nMatchLen <= 9 && (i + nMatchLen) < nEndOffset) /* max reducable command size: <token> <FF> <ll> <ll> <offset> <offset> <FF> <mm> <mm> */ {
|
||||||
int nMatchOffset = pMatch->offset;
|
int nMatchOffset = pMatch->offset;
|
||||||
int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE;
|
int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE;
|
||||||
int nCommandSize = 1 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + ((nMatchOffset <= 256) ? 1 : 2) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
|
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + ((nMatchOffset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
|
||||||
|
|
||||||
if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE) {
|
if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE) {
|
||||||
if (nCommandSize >= (nMatchLen + lzsa_get_literals_varlen_size(nNumLiterals + nMatchLen))) {
|
if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size(nNumLiterals + nMatchLen))) {
|
||||||
/* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
|
/* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
|
||||||
* make the next command eat the cost of encoding the current number of literals, + nMatchLen extra literals. The size of the current match command is
|
* make the next command eat the cost of encoding the current number of literals, + nMatchLen extra literals. The size of the current match command is
|
||||||
* at least as much as the number of literal bytes + the extra cost of encoding them in the next match command, so we can safely replace the current
|
* at least as much as the number of literal bytes + the extra cost of encoding them in the next match command, so we can safely replace the current
|
||||||
|
@ -634,7 +644,7 @@ static int lzsa_optimize_command_count(lsza_compressor *pCompressor, const int n
|
||||||
nNextNumLiterals++;
|
nNextNumLiterals++;
|
||||||
} while (nCurIndex < nEndOffset && pCompressor->match[nCurIndex << MATCHES_PER_OFFSET_SHIFT].length < MIN_MATCH_SIZE);
|
} while (nCurIndex < nEndOffset && pCompressor->match[nCurIndex << MATCHES_PER_OFFSET_SHIFT].length < MIN_MATCH_SIZE);
|
||||||
|
|
||||||
if (nCommandSize >= (nMatchLen + lzsa_get_literals_varlen_size(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size(nNextNumLiterals))) {
|
if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size(nNextNumLiterals))) {
|
||||||
/* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
|
/* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
|
||||||
* more room than the match, and doesn't grow the next match command's literals encoding, go ahead and remove the command. */
|
* more room than the match, and doesn't grow the next match command's literals encoding, go ahead and remove the command. */
|
||||||
nReduce = 1;
|
nReduce = 1;
|
||||||
|
@ -706,9 +716,9 @@ static int lzsa_write_block(lsza_compressor *pCompressor, const unsigned char *p
|
||||||
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
|
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
|
||||||
int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN) ? MATCH_RUN_LEN : nEncodedMatchLen;
|
int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN) ? MATCH_RUN_LEN : nEncodedMatchLen;
|
||||||
int nTokenLongOffset = (nMatchOffset <= 256) ? 0x00 : 0x80;
|
int nTokenLongOffset = (nMatchOffset <= 256) ? 0x00 : 0x80;
|
||||||
int nCommandSize = 1 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + nNumLiterals + (nTokenLongOffset ? 2 : 1) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
|
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + (nNumLiterals << 3) + (nTokenLongOffset ? 16 : 8) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
|
||||||
|
|
||||||
if ((nOutOffset + nCommandSize) > nMaxOutDataSize)
|
if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
|
||||||
return -1;
|
return -1;
|
||||||
if (nMatchOffset < MIN_OFFSET || nMatchOffset > MAX_OFFSET)
|
if (nMatchOffset < MIN_OFFSET || nMatchOffset > MAX_OFFSET)
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -741,9 +751,9 @@ static int lzsa_write_block(lsza_compressor *pCompressor, const unsigned char *p
|
||||||
|
|
||||||
{
|
{
|
||||||
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
|
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
|
||||||
int nCommandSize = 1 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + nNumLiterals;
|
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + (nNumLiterals << 3);
|
||||||
|
|
||||||
if ((nOutOffset + nCommandSize) > nMaxOutDataSize)
|
if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x0f;
|
pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x0f;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user