mirror of
https://github.com/emmanuel-marty/lzsa.git
synced 2025-01-06 07:32:57 +00:00
Merge pull request #3 from emmanuel-marty/master
Sync with E.Marty's branch
This commit is contained in:
commit
e3d7ec9c40
37
src/lzsa.c
37
src/lzsa.c
@ -46,6 +46,7 @@
|
||||
#define OPT_RAW 2
|
||||
#define OPT_FAVOR_RATIO 4
|
||||
#define OPT_RAW_BACKWARD 8
|
||||
#define OPT_STATS 16
|
||||
|
||||
#define TOOL_VERSION "1.1.0"
|
||||
|
||||
@ -104,6 +105,7 @@ static int do_compress(const char *pszInFilename, const char *pszOutFilename, co
|
||||
int nCommandCount = 0, nSafeDist = 0;
|
||||
int nFlags;
|
||||
lzsa_status_t nStatus;
|
||||
lzsa_stats stats;
|
||||
|
||||
nFlags = 0;
|
||||
if (nOptions & OPT_FAVOR_RATIO)
|
||||
@ -117,7 +119,7 @@ static int do_compress(const char *pszInFilename, const char *pszOutFilename, co
|
||||
nStartTime = do_get_time();
|
||||
}
|
||||
|
||||
nStatus = lzsa_compress_file(pszInFilename, pszOutFilename, pszDictionaryFilename, nFlags, nMinMatchSize, nFormatVersion, compression_progress, &nOriginalSize, &nCompressedSize, &nCommandCount, &nSafeDist);
|
||||
nStatus = lzsa_compress_file(pszInFilename, pszOutFilename, pszDictionaryFilename, nFlags, nMinMatchSize, nFormatVersion, compression_progress, &nOriginalSize, &nCompressedSize, &nCommandCount, &nSafeDist, &stats);
|
||||
|
||||
if ((nOptions & OPT_VERBOSE)) {
|
||||
nEndTime = do_get_time();
|
||||
@ -149,6 +151,32 @@ static int do_compress(const char *pszInFilename, const char *pszOutFilename, co
|
||||
}
|
||||
}
|
||||
|
||||
if (nOptions & OPT_STATS) {
|
||||
if (stats.literals_divisor > 0)
|
||||
fprintf(stdout, "Literals: min: %d avg: %d max: %d count: %d\n", stats.min_literals, stats.total_literals / stats.literals_divisor, stats.max_literals, stats.literals_divisor);
|
||||
else
|
||||
fprintf(stdout, "Literals: none\n");
|
||||
if (stats.match_divisor > 0) {
|
||||
fprintf(stdout, "Offsets: min: %d avg: %d max: %d reps: %d count: %d\n", stats.min_offset, stats.total_offsets / stats.match_divisor, stats.max_offset, stats.num_rep_offsets, stats.match_divisor);
|
||||
fprintf(stdout, "Match lens: min: %d avg: %d max: %d count: %d\n", stats.min_match_len, stats.total_match_lens / stats.match_divisor, stats.max_match_len, stats.match_divisor);
|
||||
}
|
||||
else {
|
||||
fprintf(stdout, "Offsets: none\n");
|
||||
fprintf(stdout, "Match lens: none\n");
|
||||
}
|
||||
if (stats.rle1_divisor > 0) {
|
||||
fprintf(stdout, "RLE1 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle1_len, stats.total_rle1_lens / stats.rle1_divisor, stats.max_rle1_len, stats.rle1_divisor);
|
||||
}
|
||||
else {
|
||||
fprintf(stdout, "RLE1 lens: none\n");
|
||||
}
|
||||
if (stats.rle2_divisor > 0) {
|
||||
fprintf(stdout, "RLE2 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle2_len, stats.total_rle2_lens / stats.rle2_divisor, stats.max_rle2_len, stats.rle2_divisor);
|
||||
}
|
||||
else {
|
||||
fprintf(stdout, "RLE2 lens: none\n");
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1009,6 +1037,13 @@ int main(int argc, char **argv) {
|
||||
else
|
||||
bArgsError = true;
|
||||
}
|
||||
else if (!strcmp(argv[i], "-stats")) {
|
||||
if ((nOptions & OPT_STATS) == 0) {
|
||||
nOptions |= OPT_STATS;
|
||||
}
|
||||
else
|
||||
bArgsError = true;
|
||||
}
|
||||
else {
|
||||
if (!pszInFilename)
|
||||
pszInFilename = argv[i];
|
||||
|
@ -98,7 +98,9 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
|
||||
nLen = 0;
|
||||
if (nLen > LCP_MAX)
|
||||
nLen = LCP_MAX;
|
||||
int nTaggedLen = (nLen << TAG_BITS) | (lzsa_get_index_tag((unsigned int)nIndex) & ((1 << TAG_BITS) - 1));
|
||||
int nTaggedLen = 0;
|
||||
if (nLen)
|
||||
nTaggedLen = (nLen << TAG_BITS) | (lzsa_get_index_tag((unsigned int)nIndex) & ((1 << TAG_BITS) - 1));
|
||||
intervals[i] = ((unsigned int)nIndex) | (((unsigned int)nTaggedLen) << LCP_SHIFT);
|
||||
}
|
||||
}
|
||||
|
@ -380,6 +380,13 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char
|
||||
pOutData[nOutOffset++] = nTokenLongOffset | (nTokenLiteralsLen << 4) | nTokenMatchLen;
|
||||
nOutOffset = lzsa_write_literals_varlen_v1(pOutData, nOutOffset, nNumLiterals);
|
||||
|
||||
if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1)
|
||||
pCompressor->stats.min_literals = nNumLiterals;
|
||||
if (nNumLiterals > pCompressor->stats.max_literals)
|
||||
pCompressor->stats.max_literals = nNumLiterals;
|
||||
pCompressor->stats.total_literals += nNumLiterals;
|
||||
pCompressor->stats.literals_divisor++;
|
||||
|
||||
if (nNumLiterals != 0) {
|
||||
memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
|
||||
nOutOffset += nNumLiterals;
|
||||
@ -391,6 +398,37 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char
|
||||
pOutData[nOutOffset++] = (-nMatchOffset) >> 8;
|
||||
}
|
||||
nOutOffset = lzsa_write_match_varlen_v1(pOutData, nOutOffset, nEncodedMatchLen);
|
||||
|
||||
if (nMatchOffset < pCompressor->stats.min_offset || pCompressor->stats.min_offset == -1)
|
||||
pCompressor->stats.min_offset = nMatchOffset;
|
||||
if (nMatchOffset > pCompressor->stats.max_offset)
|
||||
pCompressor->stats.max_offset = nMatchOffset;
|
||||
pCompressor->stats.total_offsets += nMatchOffset;
|
||||
|
||||
if (nMatchLen < pCompressor->stats.min_match_len || pCompressor->stats.min_match_len == -1)
|
||||
pCompressor->stats.min_match_len = nMatchLen;
|
||||
if (nMatchLen > pCompressor->stats.max_match_len)
|
||||
pCompressor->stats.max_match_len = nMatchLen;
|
||||
pCompressor->stats.total_match_lens += nMatchLen;
|
||||
pCompressor->stats.match_divisor++;
|
||||
|
||||
if (nMatchOffset == 1) {
|
||||
if (nMatchLen < pCompressor->stats.min_rle1_len || pCompressor->stats.min_rle1_len == -1)
|
||||
pCompressor->stats.min_rle1_len = nMatchLen;
|
||||
if (nMatchLen > pCompressor->stats.max_rle1_len)
|
||||
pCompressor->stats.max_rle1_len = nMatchLen;
|
||||
pCompressor->stats.total_rle1_lens += nMatchLen;
|
||||
pCompressor->stats.rle1_divisor++;
|
||||
}
|
||||
else if (nMatchOffset == 2) {
|
||||
if (nMatchLen < pCompressor->stats.min_rle2_len || pCompressor->stats.min_rle2_len == -1)
|
||||
pCompressor->stats.min_rle2_len = nMatchLen;
|
||||
if (nMatchLen > pCompressor->stats.max_rle2_len)
|
||||
pCompressor->stats.max_rle2_len = nMatchLen;
|
||||
pCompressor->stats.total_rle2_lens += nMatchLen;
|
||||
pCompressor->stats.rle2_divisor++;
|
||||
}
|
||||
|
||||
i += nMatchLen;
|
||||
|
||||
if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) {
|
||||
@ -422,6 +460,13 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char
|
||||
pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x00;
|
||||
nOutOffset = lzsa_write_literals_varlen_v1(pOutData, nOutOffset, nNumLiterals);
|
||||
|
||||
if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1)
|
||||
pCompressor->stats.min_literals = nNumLiterals;
|
||||
if (nNumLiterals > pCompressor->stats.max_literals)
|
||||
pCompressor->stats.max_literals = nNumLiterals;
|
||||
pCompressor->stats.total_literals += nNumLiterals;
|
||||
pCompressor->stats.literals_divisor++;
|
||||
|
||||
if (nNumLiterals != 0) {
|
||||
memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
|
||||
nOutOffset += nNumLiterals;
|
||||
|
@ -192,13 +192,17 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
|
||||
|
||||
memset(arrival + (nStartOffset << MATCHES_PER_OFFSET_SHIFT), 0, sizeof(lzsa_arrival) * ((nEndOffset - nStartOffset) << MATCHES_PER_OFFSET_SHIFT));
|
||||
|
||||
for (i = (nStartOffset << MATCHES_PER_OFFSET_SHIFT); i != (nEndOffset << MATCHES_PER_OFFSET_SHIFT); i++) {
|
||||
arrival[i].cost = 0x40000000;
|
||||
}
|
||||
|
||||
arrival[nStartOffset << MATCHES_PER_OFFSET_SHIFT].from_slot = -1;
|
||||
|
||||
for (i = nStartOffset; i != (nEndOffset - 1); i++) {
|
||||
int m, nMatches;
|
||||
|
||||
for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
|
||||
int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost;
|
||||
const int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost & 0x3fffffff;
|
||||
int nCodingChoiceCost = nPrevCost + 8 /* literal */;
|
||||
int nNumLiterals = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals + 1;
|
||||
|
||||
@ -215,34 +219,39 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
|
||||
if (!nFavorRatio && nNumLiterals == 1)
|
||||
nCodingChoiceCost += MODESWITCH_PENALTY;
|
||||
|
||||
int exists = 0;
|
||||
for (n = 0;
|
||||
n < NMATCHES_PER_OFFSET && arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n].from_slot && arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n].cost <= nCodingChoiceCost;
|
||||
n++) {
|
||||
if (arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n].rep_offset == arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset) {
|
||||
exists = 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (n = 0; !exists && n < NMATCHES_PER_OFFSET; n++) {
|
||||
lzsa_arrival *pDestArrival = &arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n];
|
||||
if (pDestArrival->from_slot == 0 ||
|
||||
nCodingChoiceCost <= pDestArrival->cost) {
|
||||
|
||||
if (pDestArrival->from_slot) {
|
||||
memmove(&arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n + 1],
|
||||
&arrival[((i + 1) << MATCHES_PER_OFFSET_SHIFT) + n],
|
||||
sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
|
||||
lzsa_arrival *pDestSlots = &arrival[(i + 1) << MATCHES_PER_OFFSET_SHIFT];
|
||||
if (nCodingChoiceCost <= pDestSlots[NMATCHES_PER_OFFSET - 1].cost) {
|
||||
int exists = 0;
|
||||
for (n = 0;
|
||||
n < NMATCHES_PER_OFFSET && pDestSlots[n].cost <= nCodingChoiceCost;
|
||||
n++) {
|
||||
if (pDestSlots[n].rep_offset == arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset) {
|
||||
exists = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
pDestArrival->cost = nCodingChoiceCost;
|
||||
pDestArrival->from_pos = i;
|
||||
pDestArrival->from_slot = j + 1;
|
||||
pDestArrival->match_offset = 0;
|
||||
pDestArrival->match_len = 0;
|
||||
pDestArrival->num_literals = nNumLiterals;
|
||||
pDestArrival->rep_offset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
|
||||
break;
|
||||
if (!exists) {
|
||||
for (n = 0; n < NMATCHES_PER_OFFSET; n++) {
|
||||
lzsa_arrival *pDestArrival = &pDestSlots[n];
|
||||
if (nCodingChoiceCost <= pDestArrival->cost) {
|
||||
|
||||
if (pDestArrival->from_slot) {
|
||||
memmove(&pDestSlots[n + 1],
|
||||
&pDestSlots[n],
|
||||
sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
|
||||
}
|
||||
|
||||
pDestArrival->cost = nCodingChoiceCost;
|
||||
pDestArrival->from_pos = i;
|
||||
pDestArrival->from_slot = j + 1;
|
||||
pDestArrival->match_offset = 0;
|
||||
pDestArrival->match_len = 0;
|
||||
pDestArrival->num_literals = nNumLiterals;
|
||||
pDestArrival->rep_offset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -254,109 +263,124 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
|
||||
int nMatchOffset = match[m].offset;
|
||||
int nNoRepmatchOffsetCost = (nMatchOffset <= 32) ? 4 : ((nMatchOffset <= 512) ? 8 : ((nMatchOffset <= (8192 + 512)) ? 12 : 16));
|
||||
int nStartingMatchLen, k;
|
||||
int nMaxRepLen[NMATCHES_PER_OFFSET];
|
||||
|
||||
if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
|
||||
nMatchLen = nEndOffset - LAST_LITERALS - i;
|
||||
|
||||
if (nMatchLen >= LEAVE_ALONE_MATCH_SIZE)
|
||||
nStartingMatchLen = nMatchLen;
|
||||
else
|
||||
nStartingMatchLen = nMinMatchSize;
|
||||
for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
|
||||
const int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost;
|
||||
int nRepOffset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
|
||||
int nMaxRepLen = 0;
|
||||
int nCurMaxRepLen = 0;
|
||||
|
||||
if (nMatchOffset != nRepOffset &&
|
||||
nRepOffset &&
|
||||
i >= nRepOffset &&
|
||||
(i - nRepOffset + nMatchLen) <= (nEndOffset - LAST_LITERALS)) {
|
||||
while (nMaxRepLen < nMatchLen && pInWindow[i - nRepOffset + nMaxRepLen] == pInWindow[i - nMatchOffset + nMaxRepLen])
|
||||
nMaxRepLen++;
|
||||
while (nCurMaxRepLen < nMatchLen && pInWindow[i - nRepOffset + nCurMaxRepLen] == pInWindow[i - nMatchOffset + nCurMaxRepLen])
|
||||
nCurMaxRepLen++;
|
||||
}
|
||||
|
||||
for (k = nStartingMatchLen; k <= nMatchLen; k++) {
|
||||
int nMatchLenCost = lzsa_get_match_varlen_size_v2(k - MIN_MATCH_SIZE_V2);
|
||||
lzsa_arrival *pDestSlots = &arrival[(i + k) << MATCHES_PER_OFFSET_SHIFT];
|
||||
nMaxRepLen[j] = nCurMaxRepLen;
|
||||
}
|
||||
while (j < NMATCHES_PER_OFFSET)
|
||||
nMaxRepLen[j++] = 0;
|
||||
|
||||
if (nMatchLen >= LEAVE_ALONE_MATCH_SIZE)
|
||||
nStartingMatchLen = nMatchLen;
|
||||
else
|
||||
nStartingMatchLen = nMinMatchSize;
|
||||
|
||||
for (k = nStartingMatchLen; k <= nMatchLen; k++) {
|
||||
int nMatchLenCost = lzsa_get_match_varlen_size_v2(k - MIN_MATCH_SIZE_V2);
|
||||
lzsa_arrival *pDestSlots = &arrival[(i + k) << MATCHES_PER_OFFSET_SHIFT];
|
||||
|
||||
for (j = 0; j < NMATCHES_PER_OFFSET && arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].from_slot; j++) {
|
||||
const int nPrevCost = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].cost & 0x3fffffff;
|
||||
int nRepOffset = arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].rep_offset;
|
||||
|
||||
int nMatchOffsetCost = (nMatchOffset == nRepOffset) ? 0 : nNoRepmatchOffsetCost;
|
||||
int nCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchOffsetCost + nMatchLenCost;
|
||||
int exists = 0;
|
||||
int nRepCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + nMatchLenCost;
|
||||
int nCodingChoiceCost = nRepCodingChoiceCost + nMatchOffsetCost;
|
||||
|
||||
if (!nFavorRatio && !arrival[(i << MATCHES_PER_OFFSET_SHIFT) + j].num_literals)
|
||||
nCodingChoiceCost += MODESWITCH_PENALTY;
|
||||
|
||||
for (n = 0;
|
||||
n < NMATCHES_PER_OFFSET && pDestSlots[n].from_slot && pDestSlots[n].cost <= nCodingChoiceCost;
|
||||
n++) {
|
||||
if (pDestSlots[n].rep_offset == nMatchOffset) {
|
||||
exists = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (nRepCodingChoiceCost <= pDestSlots[NMATCHES_PER_OFFSET - 1].cost) {
|
||||
if (nCodingChoiceCost <= pDestSlots[NMATCHES_PER_OFFSET - 1].cost) {
|
||||
int exists = 0;
|
||||
|
||||
for (n = 0; !exists && n < NMATCHES_PER_OFFSET; n++) {
|
||||
lzsa_arrival *pDestArrival = &pDestSlots[n];
|
||||
|
||||
if (pDestArrival->from_slot == 0 ||
|
||||
nCodingChoiceCost <= pDestArrival->cost) {
|
||||
|
||||
if (pDestArrival->from_slot) {
|
||||
memmove(&pDestSlots[n + 1],
|
||||
&pDestSlots[n],
|
||||
sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
|
||||
}
|
||||
|
||||
pDestArrival->cost = nCodingChoiceCost;
|
||||
pDestArrival->from_pos = i;
|
||||
pDestArrival->from_slot = j + 1;
|
||||
pDestArrival->match_offset = nMatchOffset;
|
||||
pDestArrival->match_len = k;
|
||||
pDestArrival->num_literals = 0;
|
||||
pDestArrival->rep_offset = nMatchOffset;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* If this coding choice doesn't rep-match, see if we still get a match by using the current repmatch offset for this arrival. This can occur (and not have the
|
||||
* matchfinder offer the offset in the first place, or have too many choices with the same cost to retain the repmatchable offset) when compressing regions
|
||||
* of identical bytes, for instance. Checking for this provides a big compression win on some files. */
|
||||
|
||||
if (i >= nRepOffset && nMaxRepLen >= k) {
|
||||
/* A match is possible at the rep offset; insert the extra coding choice. */
|
||||
|
||||
nCodingChoiceCost = nPrevCost + 8 /* token */ /* the actual cost of the literals themselves accumulates up the chain */ + /* rep match - no offset cost */ nMatchLenCost;
|
||||
exists = 0;
|
||||
|
||||
for (n = 0;
|
||||
n < NMATCHES_PER_OFFSET && pDestSlots[n].from_slot && pDestSlots[n].cost <= nCodingChoiceCost;
|
||||
n++) {
|
||||
if (pDestSlots[n].rep_offset == nRepOffset) {
|
||||
exists = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (n = 0; !exists && n < NMATCHES_PER_OFFSET; n++) {
|
||||
lzsa_arrival *pDestArrival = &pDestSlots[n];
|
||||
|
||||
if (pDestArrival->from_slot == 0 ||
|
||||
nCodingChoiceCost <= pDestArrival->cost) {
|
||||
|
||||
if (pDestArrival->from_slot) {
|
||||
memmove(&pDestSlots[n + 1],
|
||||
&pDestSlots[n],
|
||||
sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
|
||||
for (n = 0;
|
||||
n < NMATCHES_PER_OFFSET && pDestSlots[n].cost <= nCodingChoiceCost;
|
||||
n++) {
|
||||
if (pDestSlots[n].rep_offset == nMatchOffset) {
|
||||
exists = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
pDestArrival->cost = nCodingChoiceCost;
|
||||
pDestArrival->from_pos = i;
|
||||
pDestArrival->from_slot = j + 1;
|
||||
pDestArrival->match_offset = nRepOffset;
|
||||
pDestArrival->match_len = k;
|
||||
pDestArrival->num_literals = 0;
|
||||
pDestArrival->rep_offset = nRepOffset;
|
||||
break;
|
||||
if (!exists) {
|
||||
for (n = 0; n < NMATCHES_PER_OFFSET; n++) {
|
||||
lzsa_arrival *pDestArrival = &pDestSlots[n];
|
||||
|
||||
if (nCodingChoiceCost <= pDestArrival->cost) {
|
||||
if (pDestArrival->from_slot) {
|
||||
memmove(&pDestSlots[n + 1],
|
||||
&pDestSlots[n],
|
||||
sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
|
||||
}
|
||||
|
||||
pDestArrival->cost = nCodingChoiceCost;
|
||||
pDestArrival->from_pos = i;
|
||||
pDestArrival->from_slot = j + 1;
|
||||
pDestArrival->match_offset = nMatchOffset;
|
||||
pDestArrival->match_len = k;
|
||||
pDestArrival->num_literals = 0;
|
||||
pDestArrival->rep_offset = nMatchOffset;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* If this coding choice doesn't rep-match, see if we still get a match by using the current repmatch offset for this arrival. This can occur (and not have the
|
||||
* matchfinder offer the offset in the first place, or have too many choices with the same cost to retain the repmatchable offset) when compressing regions
|
||||
* of identical bytes, for instance. Checking for this provides a big compression win on some files. */
|
||||
|
||||
if (nMaxRepLen[j] >= k) {
|
||||
int exists = 0;
|
||||
|
||||
/* A match is possible at the rep offset; insert the extra coding choice. */
|
||||
|
||||
for (n = 0;
|
||||
n < NMATCHES_PER_OFFSET && pDestSlots[n].cost <= nRepCodingChoiceCost;
|
||||
n++) {
|
||||
if (pDestSlots[n].rep_offset == nRepOffset) {
|
||||
exists = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!exists) {
|
||||
for (n = 0; n < NMATCHES_PER_OFFSET; n++) {
|
||||
lzsa_arrival *pDestArrival = &pDestSlots[n];
|
||||
|
||||
if (nRepCodingChoiceCost <= pDestArrival->cost) {
|
||||
if (pDestArrival->from_slot) {
|
||||
memmove(&pDestSlots[n + 1],
|
||||
&pDestSlots[n],
|
||||
sizeof(lzsa_arrival) * (NMATCHES_PER_OFFSET - n - 1));
|
||||
}
|
||||
|
||||
pDestArrival->cost = nRepCodingChoiceCost;
|
||||
pDestArrival->from_pos = i;
|
||||
pDestArrival->from_slot = j + 1;
|
||||
pDestArrival->match_offset = nRepOffset;
|
||||
pDestArrival->match_len = k;
|
||||
pDestArrival->num_literals = 0;
|
||||
pDestArrival->rep_offset = nRepOffset;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -391,7 +415,10 @@ static void lzsa_optimize_forward_v2(lzsa_compressor *pCompressor, const unsigne
|
||||
static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const unsigned char *pInWindow, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset) {
|
||||
int i;
|
||||
int nNumLiterals = 0;
|
||||
int nPrevRepMatchOffset = 0;
|
||||
int nRepMatchOffset = 0;
|
||||
int nRepMatchLen = 0;
|
||||
int nRepIndex = 0;
|
||||
int nDidReduce = 0;
|
||||
|
||||
for (i = nStartOffset; i < nEndOffset; ) {
|
||||
@ -411,15 +438,17 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
|
||||
/* This command is a match, is followed by 'nNextLiterals' literals and then by another match */
|
||||
|
||||
if (nRepMatchOffset && pMatch->offset != nRepMatchOffset && (pBestMatch[nNextIndex].offset != pMatch->offset || pBestMatch[nNextIndex].offset == nRepMatchOffset ||
|
||||
((pMatch->offset <= 32) ? 4 : ((pMatch->offset <= 512) ? 8 : ((pMatch->offset <= (8192 + 512)) ? 12 : 16))) >=
|
||||
((pMatch->offset <= 32) ? 4 : ((pMatch->offset <= 512) ? 8 : ((pMatch->offset <= (8192 + 512)) ? 12 : 16))) >
|
||||
((pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16))))) {
|
||||
/* Check if we can change the current match's offset to be the same as the previous match's offset, and get an extra repmatch. This will occur when
|
||||
* matching large regions of identical bytes for instance, where there are too many offsets to be considered by the parser, and when not compressing to favor the
|
||||
* ratio (the forward arrivals parser already has this covered). */
|
||||
if (i >= nRepMatchOffset &&
|
||||
(i - nRepMatchOffset + pMatch->length) <= (nEndOffset - LAST_LITERALS) &&
|
||||
!memcmp(pInWindow + i - nRepMatchOffset, pInWindow + i - pMatch->offset, pMatch->length))
|
||||
if (i >= nRepMatchOffset &&
|
||||
(i - nRepMatchOffset + pMatch->length) <= (nEndOffset - LAST_LITERALS) &&
|
||||
!memcmp(pInWindow + i - nRepMatchOffset, pInWindow + i - pMatch->offset, pMatch->length)) {
|
||||
pMatch->offset = nRepMatchOffset;
|
||||
nDidReduce = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (pBestMatch[nNextIndex].offset && pMatch->offset != pBestMatch[nNextIndex].offset && nRepMatchOffset != pBestMatch[nNextIndex].offset) {
|
||||
@ -431,6 +460,7 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
|
||||
if (nMaxLen >= pMatch->length) {
|
||||
/* Replace */
|
||||
pMatch->offset = pBestMatch[nNextIndex].offset;
|
||||
nDidReduce = 1;
|
||||
}
|
||||
else if (nMaxLen >= 2 && pMatch->offset != nRepMatchOffset) {
|
||||
int nPartialSizeBefore, nPartialSizeAfter;
|
||||
@ -452,6 +482,7 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
|
||||
pBestMatch[i + j].length = 0;
|
||||
}
|
||||
pMatch->length = nMaxLen;
|
||||
nDidReduce = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -476,6 +507,15 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
|
||||
if (pBestMatch[nNextIndex].offset != nRepMatchOffset)
|
||||
nReducedCommandSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
|
||||
|
||||
int nReplaceRepOffset = 0;
|
||||
if (nRepMatchOffset && nRepMatchOffset != nPrevRepMatchOffset && nRepMatchLen >= MIN_MATCH_SIZE_V2 && nRepMatchOffset != pBestMatch[nNextIndex].offset && nRepIndex >= pBestMatch[nNextIndex].offset &&
|
||||
(nRepIndex - pBestMatch[nNextIndex].offset + nRepMatchLen) <= (nEndOffset - LAST_LITERALS) &&
|
||||
!memcmp(pInWindow + nRepIndex - nRepMatchOffset, pInWindow + nRepIndex - pBestMatch[nNextIndex].offset, nRepMatchLen)) {
|
||||
/* Replacing this match command by literals would let us create a repmatch */
|
||||
nReplaceRepOffset = 1;
|
||||
nReducedCommandSize -= (nRepMatchOffset <= 32) ? 4 : ((nRepMatchOffset <= 512) ? 8 : ((nRepMatchOffset <= (8192 + 512)) ? 12 : 16));
|
||||
}
|
||||
|
||||
if (nOriginalCombinedCommandSize >= nReducedCommandSize) {
|
||||
/* Reduce */
|
||||
int nMatchLen = pMatch->length;
|
||||
@ -486,6 +526,11 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
|
||||
}
|
||||
|
||||
nDidReduce = 1;
|
||||
|
||||
if (nReplaceRepOffset) {
|
||||
pBestMatch[nRepIndex].offset = pBestMatch[nNextIndex].offset;
|
||||
nRepMatchOffset = pBestMatch[nNextIndex].offset;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -502,10 +547,14 @@ static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, const un
|
||||
pMatch->length += pBestMatch[i + nMatchLen].length;
|
||||
pBestMatch[i + nMatchLen].offset = 0;
|
||||
pBestMatch[i + nMatchLen].length = -1;
|
||||
nDidReduce = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
nPrevRepMatchOffset = nRepMatchOffset;
|
||||
nRepMatchOffset = pMatch->offset;
|
||||
nRepMatchLen = pMatch->length;
|
||||
nRepIndex = i;
|
||||
|
||||
i += pMatch->length;
|
||||
nNumLiterals = 0;
|
||||
@ -586,6 +635,13 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
|
||||
nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nNumLiterals);
|
||||
if (nOutOffset < 0) return -1;
|
||||
|
||||
if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1)
|
||||
pCompressor->stats.min_literals = nNumLiterals;
|
||||
if (nNumLiterals > pCompressor->stats.max_literals)
|
||||
pCompressor->stats.max_literals = nNumLiterals;
|
||||
pCompressor->stats.total_literals += nNumLiterals;
|
||||
pCompressor->stats.literals_divisor++;
|
||||
|
||||
if (nNumLiterals != 0) {
|
||||
memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
|
||||
nOutOffset += nNumLiterals;
|
||||
@ -608,11 +664,45 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
|
||||
pOutData[nOutOffset++] = (-nMatchOffset) >> 8;
|
||||
pOutData[nOutOffset++] = (-nMatchOffset) & 0xff;
|
||||
}
|
||||
|
||||
if (nMatchOffset == nRepMatchOffset)
|
||||
pCompressor->stats.num_rep_offsets++;
|
||||
|
||||
nRepMatchOffset = nMatchOffset;
|
||||
|
||||
nOutOffset = lzsa_write_match_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nEncodedMatchLen);
|
||||
if (nOutOffset < 0) return -1;
|
||||
|
||||
if (nMatchOffset < pCompressor->stats.min_offset || pCompressor->stats.min_offset == -1)
|
||||
pCompressor->stats.min_offset = nMatchOffset;
|
||||
if (nMatchOffset > pCompressor->stats.max_offset)
|
||||
pCompressor->stats.max_offset = nMatchOffset;
|
||||
pCompressor->stats.total_offsets += nMatchOffset;
|
||||
|
||||
if (nMatchLen < pCompressor->stats.min_match_len || pCompressor->stats.min_match_len == -1)
|
||||
pCompressor->stats.min_match_len = nMatchLen;
|
||||
if (nMatchLen > pCompressor->stats.max_match_len)
|
||||
pCompressor->stats.max_match_len = nMatchLen;
|
||||
pCompressor->stats.total_match_lens += nMatchLen;
|
||||
pCompressor->stats.match_divisor++;
|
||||
|
||||
if (nMatchOffset == 1) {
|
||||
if (nMatchLen < pCompressor->stats.min_rle1_len || pCompressor->stats.min_rle1_len == -1)
|
||||
pCompressor->stats.min_rle1_len = nMatchLen;
|
||||
if (nMatchLen > pCompressor->stats.max_rle1_len)
|
||||
pCompressor->stats.max_rle1_len = nMatchLen;
|
||||
pCompressor->stats.total_rle1_lens += nMatchLen;
|
||||
pCompressor->stats.rle1_divisor++;
|
||||
}
|
||||
else if (nMatchOffset == 2) {
|
||||
if (nMatchLen < pCompressor->stats.min_rle2_len || pCompressor->stats.min_rle2_len == -1)
|
||||
pCompressor->stats.min_rle2_len = nMatchLen;
|
||||
if (nMatchLen > pCompressor->stats.max_rle2_len)
|
||||
pCompressor->stats.max_rle2_len = nMatchLen;
|
||||
pCompressor->stats.total_rle2_lens += nMatchLen;
|
||||
pCompressor->stats.rle2_divisor++;
|
||||
}
|
||||
|
||||
i += nMatchLen;
|
||||
|
||||
if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) {
|
||||
@ -645,6 +735,13 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa
|
||||
nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nNumLiterals);
|
||||
if (nOutOffset < 0) return -1;
|
||||
|
||||
if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1)
|
||||
pCompressor->stats.min_literals = nNumLiterals;
|
||||
if (nNumLiterals > pCompressor->stats.max_literals)
|
||||
pCompressor->stats.max_literals = nNumLiterals;
|
||||
pCompressor->stats.total_literals += nNumLiterals;
|
||||
pCompressor->stats.literals_divisor++;
|
||||
|
||||
if (nNumLiterals != 0) {
|
||||
memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals);
|
||||
nOutOffset += nNumLiterals;
|
||||
|
@ -69,6 +69,11 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
|
||||
pCompressor->flags = nFlags;
|
||||
pCompressor->safe_dist = 0;
|
||||
pCompressor->num_commands = 0;
|
||||
|
||||
memset(&pCompressor->stats, 0, sizeof(pCompressor->stats));
|
||||
pCompressor->stats.min_literals = -1;
|
||||
pCompressor->stats.min_match_len = -1;
|
||||
pCompressor->stats.min_offset = -1;
|
||||
|
||||
if (!nResult) {
|
||||
pCompressor->intervals = (unsigned int *)malloc(nMaxWindowSize * sizeof(unsigned int));
|
||||
@ -88,9 +93,6 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
|
||||
if (pCompressor->best_match) {
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -79,6 +79,35 @@ typedef struct {
|
||||
unsigned short match_len;
|
||||
} lzsa_arrival;
|
||||
|
||||
/** Compression statistics */
|
||||
typedef struct _lzsa_stats {
|
||||
int min_literals;
|
||||
int max_literals;
|
||||
int total_literals;
|
||||
|
||||
int min_offset;
|
||||
int max_offset;
|
||||
int num_rep_offsets;
|
||||
int total_offsets;
|
||||
|
||||
int min_match_len;
|
||||
int max_match_len;
|
||||
int total_match_lens;
|
||||
|
||||
int min_rle1_len;
|
||||
int max_rle1_len;
|
||||
int total_rle1_lens;
|
||||
|
||||
int min_rle2_len;
|
||||
int max_rle2_len;
|
||||
int total_rle2_lens;
|
||||
|
||||
int literals_divisor;
|
||||
int match_divisor;
|
||||
int rle1_divisor;
|
||||
int rle2_divisor;
|
||||
} lzsa_stats;
|
||||
|
||||
/** Compression context */
|
||||
typedef struct _lzsa_compressor {
|
||||
divsufsort_ctx_t divsufsort_context;
|
||||
@ -93,6 +122,7 @@ typedef struct _lzsa_compressor {
|
||||
int safe_dist;
|
||||
int num_commands;
|
||||
lzsa_hashmap_t cost_map;
|
||||
lzsa_stats stats;
|
||||
} lzsa_compressor;
|
||||
|
||||
/**
|
||||
|
@ -71,11 +71,12 @@ static void lzsa_delete_file(const char *pszInFilename) {
|
||||
* @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful
|
||||
* @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful
|
||||
* @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful
|
||||
* @param pStats pointer to compression stats that are filled if this function is successful, or NULL
|
||||
*
|
||||
* @return LZSA_OK for success, or an error value from lzsa_status_t
|
||||
*/
|
||||
lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion,
|
||||
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist) {
|
||||
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats) {
|
||||
lzsa_stream_t inStream, outStream;
|
||||
void *pDictionaryData = NULL;
|
||||
int nDictionaryDataSize = 0;
|
||||
@ -99,7 +100,7 @@ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFi
|
||||
return nStatus;
|
||||
}
|
||||
|
||||
nStatus = lzsa_compress_stream(&inStream, &outStream, pDictionaryData, nDictionaryDataSize, nFlags, nMinMatchSize, nFormatVersion, progress, pOriginalSize, pCompressedSize, pCommandCount, pSafeDist);
|
||||
nStatus = lzsa_compress_stream(&inStream, &outStream, pDictionaryData, nDictionaryDataSize, nFlags, nMinMatchSize, nFormatVersion, progress, pOriginalSize, pCompressedSize, pCommandCount, pSafeDist, pStats);
|
||||
|
||||
lzsa_dictionary_free(&pDictionaryData);
|
||||
outStream.close(&outStream);
|
||||
@ -129,12 +130,13 @@ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFi
|
||||
* @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful
|
||||
* @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful
|
||||
* @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful
|
||||
* @param pStats pointer to compression stats that are filled if this function is successful, or NULL
|
||||
*
|
||||
* @return LZSA_OK for success, or an error value from lzsa_status_t
|
||||
*/
|
||||
lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOutStream, const void *pDictionaryData, int nDictionaryDataSize,
|
||||
const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion,
|
||||
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist) {
|
||||
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats) {
|
||||
unsigned char *pInData, *pOutData;
|
||||
lzsa_compressor compressor;
|
||||
long long nOriginalSize = 0LL, nCompressedSize = 0LL;
|
||||
@ -289,6 +291,10 @@ lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOut
|
||||
|
||||
int nCommandCount = lzsa_compressor_get_command_count(&compressor);
|
||||
int nSafeDist = compressor.safe_dist;
|
||||
|
||||
if (pStats)
|
||||
*pStats = compressor.stats;
|
||||
|
||||
lzsa_compressor_destroy(&compressor);
|
||||
|
||||
free(pOutData);
|
||||
|
@ -41,6 +41,7 @@ extern "C" {
|
||||
|
||||
/* Forward declaration */
|
||||
typedef enum _lzsa_status_t lzsa_status_t;
|
||||
typedef struct _lzsa_stats lzsa_stats;
|
||||
|
||||
/*-------------- File API -------------- */
|
||||
|
||||
@ -58,12 +59,13 @@ typedef enum _lzsa_status_t lzsa_status_t;
|
||||
* @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful
|
||||
* @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful
|
||||
* @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful
|
||||
* @param pStats pointer to compression stats that are filled if this function is successful, or NULL
|
||||
*
|
||||
* @return LZSA_OK for success, or an error value from lzsa_status_t
|
||||
*/
|
||||
lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename,
|
||||
const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion,
|
||||
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist);
|
||||
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats);
|
||||
|
||||
/*-------------- Streaming API -------------- */
|
||||
|
||||
@ -82,12 +84,13 @@ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFi
|
||||
* @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful
|
||||
* @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful
|
||||
* @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful
|
||||
* @param pStats pointer to compression stats that are filled if this function is successful, or NULL
|
||||
*
|
||||
* @return LZSA_OK for success, or an error value from lzsa_status_t
|
||||
*/
|
||||
lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOutStream, const void *pDictionaryData, int nDictionaryDataSize,
|
||||
const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion,
|
||||
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist);
|
||||
void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user