Reduce token count by 1% without changing the ratio

This commit is contained in:
emmanuel-marty 2019-04-25 13:01:56 +02:00
parent b7967c3aa1
commit 9e75a55dd3

View File

@ -49,6 +49,8 @@
#define LAST_MATCH_OFFSET 4
#define LAST_LITERALS 1
#define MODESWITCH_PENALTY 1
/** One match */
typedef struct _lzsa_match {
unsigned short length;
@ -389,11 +391,11 @@ static void lzsa_find_all_matches(lsza_compressor *pCompressor, const int nStart
}
/**
* Get the number of extra bytes required to represent a literals length
* Get the number of extra bits required to represent a literals length
*
* @param nLength literals length
*
* @return number of extra bytes required
* @return number of extra bits required
*/
static inline int lzsa_get_literals_varlen_size(const int nLength) {
if (nLength < LITERALS_RUN_LEN) {
@ -401,12 +403,12 @@ static inline int lzsa_get_literals_varlen_size(const int nLength) {
}
else {
if (nLength < 256)
return 1;
return 8;
else {
if (nLength < 512)
return 2;
return 16;
else
return 3;
return 24;
}
}
}
@ -440,11 +442,11 @@ static inline int lzsa_write_literals_varlen(unsigned char *pOutData, int nOutOf
}
/**
* Get the number of extra bytes required to represent an encoded match length
* Get the number of extra bits required to represent an encoded match length
*
* @param nLength encoded match length (actual match length - MIN_MATCH_SIZE)
*
* @return number of extra bytes required
* @return number of extra bits required
*/
static inline int lzsa_get_match_varlen_size(const int nLength) {
if (nLength < MATCH_RUN_LEN) {
@ -452,12 +454,12 @@ static inline int lzsa_get_match_varlen_size(const int nLength) {
}
else {
if ((nLength + MIN_MATCH_SIZE) < 256)
return 1;
return 8;
else {
if ((nLength + MIN_MATCH_SIZE) < 512)
return 2;
return 16;
else
return 3;
return 24;
}
}
}
@ -503,19 +505,21 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
int nMinMatchSize = pCompressor->min_match_size;
int i;
cost[nEndOffset - 1] = 1;
cost[nEndOffset - 1] = 8;
nLastLiteralsOffset = nEndOffset;
for (i = nEndOffset - 2; i != (nStartOffset - 1); i--) {
int nBestCost, nBestMatchLen, nBestMatchOffset;
int nLiteralsLen = nLastLiteralsOffset - i;
nBestCost = 1 + cost[i + 1];
nBestCost = 8 + cost[i + 1];
if (nLiteralsLen == LITERALS_RUN_LEN || nLiteralsLen == 256 || nLiteralsLen == 512) {
/* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
* The cost automatically accumulates down the chain. */
nBestCost++;
nBestCost += 8;
}
if (pCompressor->match[(i + 1) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
nBestCost += MODESWITCH_PENALTY;
nBestMatchLen = 0;
nBestMatchOffset = 0;
@ -523,7 +527,7 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
int m;
for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) {
int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 1 : 2;
int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 8 : 16;
if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) {
int nCurCost;
@ -532,8 +536,10 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
nMatchLen = nEndOffset - LAST_LITERALS - i;
nCurCost = 1 + nMatchOffsetSize + lzsa_get_match_varlen_size(nMatchLen - MIN_MATCH_SIZE);
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size(nMatchLen - MIN_MATCH_SIZE);
nCurCost += cost[i + nMatchLen];
if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
nCurCost += MODESWITCH_PENALTY;
if (nBestCost >= nCurCost) {
nBestCost = nCurCost;
@ -555,8 +561,10 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
for (k = nMinMatchSize; k < nMatchRunLen; k++) {
int nCurCost;
nCurCost = 1 + nMatchOffsetSize /* no extra match len bytes */;
nCurCost = 8 + nMatchOffsetSize /* no extra match len bytes */;
nCurCost += cost[i + k];
if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
nCurCost += MODESWITCH_PENALTY;
if (nBestCost >= nCurCost) {
nBestCost = nCurCost;
@ -568,8 +576,10 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
for (; k <= nMatchLen; k++) {
int nCurCost;
nCurCost = 1 + nMatchOffsetSize + lzsa_get_match_varlen_size(k - MIN_MATCH_SIZE);
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size(k - MIN_MATCH_SIZE);
nCurCost += cost[i + k];
if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
nCurCost += MODESWITCH_PENALTY;
if (nBestCost >= nCurCost) {
nBestCost = nCurCost;
@ -614,10 +624,10 @@ static int lzsa_optimize_command_count(lsza_compressor *pCompressor, const int n
if (nMatchLen <= 9 && (i + nMatchLen) < nEndOffset) /* max reducable command size: <token> <FF> <ll> <ll> <offset> <offset> <FF> <mm> <mm> */ {
int nMatchOffset = pMatch->offset;
int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE;
int nCommandSize = 1 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + ((nMatchOffset <= 256) ? 1 : 2) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + ((nMatchOffset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE) {
if (nCommandSize >= (nMatchLen + lzsa_get_literals_varlen_size(nNumLiterals + nMatchLen))) {
if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size(nNumLiterals + nMatchLen))) {
/* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
* make the next command eat the cost of encoding the current number of literals, + nMatchLen extra literals. The size of the current match command is
* at least as much as the number of literal bytes + the extra cost of encoding them in the next match command, so we can safely replace the current
@ -634,7 +644,7 @@ static int lzsa_optimize_command_count(lsza_compressor *pCompressor, const int n
nNextNumLiterals++;
} while (nCurIndex < nEndOffset && pCompressor->match[nCurIndex << MATCHES_PER_OFFSET_SHIFT].length < MIN_MATCH_SIZE);
if (nCommandSize >= (nMatchLen + lzsa_get_literals_varlen_size(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size(nNextNumLiterals))) {
if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size(nNextNumLiterals))) {
/* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
* more room than the match, and doesn't grow the next match command's literals encoding, go ahead and remove the command. */
nReduce = 1;
@ -706,9 +716,9 @@ static int lzsa_write_block(lsza_compressor *pCompressor, const unsigned char *p
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN) ? MATCH_RUN_LEN : nEncodedMatchLen;
int nTokenLongOffset = (nMatchOffset <= 256) ? 0x00 : 0x80;
int nCommandSize = 1 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + nNumLiterals + (nTokenLongOffset ? 2 : 1) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + (nNumLiterals << 3) + (nTokenLongOffset ? 16 : 8) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
if ((nOutOffset + nCommandSize) > nMaxOutDataSize)
if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
return -1;
if (nMatchOffset < MIN_OFFSET || nMatchOffset > MAX_OFFSET)
return -1;
@ -741,9 +751,9 @@ static int lzsa_write_block(lsza_compressor *pCompressor, const unsigned char *p
{
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
int nCommandSize = 1 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + nNumLiterals;
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + (nNumLiterals << 3);
if ((nOutOffset + nCommandSize) > nMaxOutDataSize)
if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
return -1;
pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x0f;