diff --git a/src/lzsa.c b/src/lzsa.c index 0d85cf8..2574562 100755 --- a/src/lzsa.c +++ b/src/lzsa.c @@ -46,6 +46,7 @@ #define OPT_RAW 2 #define OPT_FAVOR_RATIO 4 #define OPT_RAW_BACKWARD 8 +#define OPT_STATS 16 #define TOOL_VERSION "1.1.0" @@ -104,6 +105,7 @@ static int do_compress(const char *pszInFilename, const char *pszOutFilename, co int nCommandCount = 0, nSafeDist = 0; int nFlags; lzsa_status_t nStatus; + lzsa_stats stats; nFlags = 0; if (nOptions & OPT_FAVOR_RATIO) @@ -117,7 +119,7 @@ static int do_compress(const char *pszInFilename, const char *pszOutFilename, co nStartTime = do_get_time(); } - nStatus = lzsa_compress_file(pszInFilename, pszOutFilename, pszDictionaryFilename, nFlags, nMinMatchSize, nFormatVersion, compression_progress, &nOriginalSize, &nCompressedSize, &nCommandCount, &nSafeDist); + nStatus = lzsa_compress_file(pszInFilename, pszOutFilename, pszDictionaryFilename, nFlags, nMinMatchSize, nFormatVersion, compression_progress, &nOriginalSize, &nCompressedSize, &nCommandCount, &nSafeDist, &stats); if ((nOptions & OPT_VERBOSE)) { nEndTime = do_get_time(); @@ -149,6 +151,32 @@ static int do_compress(const char *pszInFilename, const char *pszOutFilename, co } } + if (nOptions & OPT_STATS) { + if (stats.literals_divisor > 0) + fprintf(stdout, "Literals: min: %d avg: %d max: %d count: %d\n", stats.min_literals, stats.total_literals / stats.literals_divisor, stats.max_literals, stats.literals_divisor); + else + fprintf(stdout, "Literals: none\n"); + if (stats.match_divisor > 0) { + fprintf(stdout, "Offsets: min: %d avg: %d max: %d reps: %d count: %d\n", stats.min_offset, stats.total_offsets / stats.match_divisor, stats.max_offset, stats.num_rep_offsets, stats.match_divisor); + fprintf(stdout, "Match lens: min: %d avg: %d max: %d count: %d\n", stats.min_match_len, stats.total_match_lens / stats.match_divisor, stats.max_match_len, stats.match_divisor); + } + else { + fprintf(stdout, "Offsets: none\n"); + fprintf(stdout, "Match lens: none\n"); + } + if (stats.rle1_divisor > 0) { + fprintf(stdout, "RLE1 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle1_len, stats.total_rle1_lens / stats.rle1_divisor, stats.max_rle1_len, stats.rle1_divisor); + } + else { + fprintf(stdout, "RLE1 lens: none\n"); + } + if (stats.rle2_divisor > 0) { + fprintf(stdout, "RLE2 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle2_len, stats.total_rle2_lens / stats.rle2_divisor, stats.max_rle2_len, stats.rle2_divisor); + } + else { + fprintf(stdout, "RLE2 lens: none\n"); + } + } return 0; } @@ -1009,6 +1037,13 @@ int main(int argc, char **argv) { else bArgsError = true; } + else if (!strcmp(argv[i], "-stats")) { + if ((nOptions & OPT_STATS) == 0) { + nOptions |= OPT_STATS; + } + else + bArgsError = true; + } else { if (!pszInFilename) pszInFilename = argv[i]; diff --git a/src/shrink_block_v1.c b/src/shrink_block_v1.c index c785f70..1182c56 100644 --- a/src/shrink_block_v1.c +++ b/src/shrink_block_v1.c @@ -380,6 +380,13 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char pOutData[nOutOffset++] = nTokenLongOffset | (nTokenLiteralsLen << 4) | nTokenMatchLen; nOutOffset = lzsa_write_literals_varlen_v1(pOutData, nOutOffset, nNumLiterals); + if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1) + pCompressor->stats.min_literals = nNumLiterals; + if (nNumLiterals > pCompressor->stats.max_literals) + pCompressor->stats.max_literals = nNumLiterals; + pCompressor->stats.total_literals += nNumLiterals; + pCompressor->stats.literals_divisor++; + if (nNumLiterals != 0) { memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals); nOutOffset += nNumLiterals; @@ -391,6 +398,37 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char pOutData[nOutOffset++] = (-nMatchOffset) >> 8; } nOutOffset = lzsa_write_match_varlen_v1(pOutData, nOutOffset, nEncodedMatchLen); + + if (nMatchOffset < pCompressor->stats.min_offset || pCompressor->stats.min_offset == -1) + pCompressor->stats.min_offset = nMatchOffset; + if (nMatchOffset > pCompressor->stats.max_offset) + pCompressor->stats.max_offset = nMatchOffset; + pCompressor->stats.total_offsets += nMatchOffset; + + if (nMatchLen < pCompressor->stats.min_match_len || pCompressor->stats.min_match_len == -1) + pCompressor->stats.min_match_len = nMatchLen; + if (nMatchLen > pCompressor->stats.max_match_len) + pCompressor->stats.max_match_len = nMatchLen; + pCompressor->stats.total_match_lens += nMatchLen; + pCompressor->stats.match_divisor++; + + if (nMatchOffset == 1) { + if (nMatchLen < pCompressor->stats.min_rle1_len || pCompressor->stats.min_rle1_len == -1) + pCompressor->stats.min_rle1_len = nMatchLen; + if (nMatchLen > pCompressor->stats.max_rle1_len) + pCompressor->stats.max_rle1_len = nMatchLen; + pCompressor->stats.total_rle1_lens += nMatchLen; + pCompressor->stats.rle1_divisor++; + } + else if (nMatchOffset == 2) { + if (nMatchLen < pCompressor->stats.min_rle2_len || pCompressor->stats.min_rle2_len == -1) + pCompressor->stats.min_rle2_len = nMatchLen; + if (nMatchLen > pCompressor->stats.max_rle2_len) + pCompressor->stats.max_rle2_len = nMatchLen; + pCompressor->stats.total_rle2_lens += nMatchLen; + pCompressor->stats.rle2_divisor++; + } + i += nMatchLen; if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) { @@ -422,6 +460,13 @@ static int lzsa_write_block_v1(lzsa_compressor *pCompressor, const unsigned char pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x00; nOutOffset = lzsa_write_literals_varlen_v1(pOutData, nOutOffset, nNumLiterals); + if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1) + pCompressor->stats.min_literals = nNumLiterals; + if (nNumLiterals > pCompressor->stats.max_literals) + pCompressor->stats.max_literals = nNumLiterals; + pCompressor->stats.total_literals += nNumLiterals; + pCompressor->stats.literals_divisor++; + if (nNumLiterals != 0) { memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals); nOutOffset += nNumLiterals; diff --git a/src/shrink_block_v2.c b/src/shrink_block_v2.c index edddc10..9003e44 100644 --- a/src/shrink_block_v2.c +++ b/src/shrink_block_v2.c @@ -633,6 +633,13 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nNumLiterals); if (nOutOffset < 0) return -1; + if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1) + pCompressor->stats.min_literals = nNumLiterals; + if (nNumLiterals > pCompressor->stats.max_literals) + pCompressor->stats.max_literals = nNumLiterals; + pCompressor->stats.total_literals += nNumLiterals; + pCompressor->stats.literals_divisor++; + if (nNumLiterals != 0) { memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals); nOutOffset += nNumLiterals; @@ -655,11 +662,45 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa pOutData[nOutOffset++] = (-nMatchOffset) >> 8; pOutData[nOutOffset++] = (-nMatchOffset) & 0xff; } + + if (nMatchOffset == nRepMatchOffset) + pCompressor->stats.num_rep_offsets++; + nRepMatchOffset = nMatchOffset; nOutOffset = lzsa_write_match_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nEncodedMatchLen); if (nOutOffset < 0) return -1; + if (nMatchOffset < pCompressor->stats.min_offset || pCompressor->stats.min_offset == -1) + pCompressor->stats.min_offset = nMatchOffset; + if (nMatchOffset > pCompressor->stats.max_offset) + pCompressor->stats.max_offset = nMatchOffset; + pCompressor->stats.total_offsets += nMatchOffset; + + if (nMatchLen < pCompressor->stats.min_match_len || pCompressor->stats.min_match_len == -1) + pCompressor->stats.min_match_len = nMatchLen; + if (nMatchLen > pCompressor->stats.max_match_len) + pCompressor->stats.max_match_len = nMatchLen; + pCompressor->stats.total_match_lens += nMatchLen; + pCompressor->stats.match_divisor++; + + if (nMatchOffset == 1) { + if (nMatchLen < pCompressor->stats.min_rle1_len || pCompressor->stats.min_rle1_len == -1) + pCompressor->stats.min_rle1_len = nMatchLen; + if (nMatchLen > pCompressor->stats.max_rle1_len) + pCompressor->stats.max_rle1_len = nMatchLen; + pCompressor->stats.total_rle1_lens += nMatchLen; + pCompressor->stats.rle1_divisor++; + } + else if (nMatchOffset == 2) { + if (nMatchLen < pCompressor->stats.min_rle2_len || pCompressor->stats.min_rle2_len == -1) + pCompressor->stats.min_rle2_len = nMatchLen; + if (nMatchLen > pCompressor->stats.max_rle2_len) + pCompressor->stats.max_rle2_len = nMatchLen; + pCompressor->stats.total_rle2_lens += nMatchLen; + pCompressor->stats.rle2_divisor++; + } + i += nMatchLen; if (pCompressor->flags & LZSA_FLAG_RAW_BLOCK) { @@ -692,6 +733,13 @@ static int lzsa_write_block_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMa nOutOffset = lzsa_write_literals_varlen_v2(pOutData, nOutOffset, nMaxOutDataSize, &nCurNibbleOffset, &nCurFreeNibbles, nNumLiterals); if (nOutOffset < 0) return -1; + if (nNumLiterals < pCompressor->stats.min_literals || pCompressor->stats.min_literals == -1) + pCompressor->stats.min_literals = nNumLiterals; + if (nNumLiterals > pCompressor->stats.max_literals) + pCompressor->stats.max_literals = nNumLiterals; + pCompressor->stats.total_literals += nNumLiterals; + pCompressor->stats.literals_divisor++; + if (nNumLiterals != 0) { memcpy(pOutData + nOutOffset, pInWindow + nInFirstLiteralOffset, nNumLiterals); nOutOffset += nNumLiterals; diff --git a/src/shrink_context.c b/src/shrink_context.c index 0acc7db..e1cc9c6 100644 --- a/src/shrink_context.c +++ b/src/shrink_context.c @@ -69,6 +69,11 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize, pCompressor->flags = nFlags; pCompressor->safe_dist = 0; pCompressor->num_commands = 0; + + memset(&pCompressor->stats, 0, sizeof(pCompressor->stats)); + pCompressor->stats.min_literals = -1; + pCompressor->stats.min_match_len = -1; + pCompressor->stats.min_offset = -1; if (!nResult) { pCompressor->intervals = (unsigned int *)malloc(nMaxWindowSize * sizeof(unsigned int)); diff --git a/src/shrink_context.h b/src/shrink_context.h index a63ddac..fd8176b 100644 --- a/src/shrink_context.h +++ b/src/shrink_context.h @@ -79,6 +79,35 @@ typedef struct { unsigned short match_len; } lzsa_arrival; +/** Compression statistics */ +typedef struct _lzsa_stats { + int min_literals; + int max_literals; + int total_literals; + + int min_offset; + int max_offset; + int num_rep_offsets; + int total_offsets; + + int min_match_len; + int max_match_len; + int total_match_lens; + + int min_rle1_len; + int max_rle1_len; + int total_rle1_lens; + + int min_rle2_len; + int max_rle2_len; + int total_rle2_lens; + + int literals_divisor; + int match_divisor; + int rle1_divisor; + int rle2_divisor; +} lzsa_stats; + /** Compression context */ typedef struct _lzsa_compressor { divsufsort_ctx_t divsufsort_context; @@ -93,6 +122,7 @@ typedef struct _lzsa_compressor { int safe_dist; int num_commands; lzsa_hashmap_t cost_map; + lzsa_stats stats; } lzsa_compressor; /** diff --git a/src/shrink_streaming.c b/src/shrink_streaming.c index 821f528..2e1cf12 100644 --- a/src/shrink_streaming.c +++ b/src/shrink_streaming.c @@ -71,11 +71,12 @@ static void lzsa_delete_file(const char *pszInFilename) { * @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful * @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful * @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful + * @param pStats pointer to compression stats that are filled if this function is successful, or NULL * * @return LZSA_OK for success, or an error value from lzsa_status_t */ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion, - void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist) { + void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats) { lzsa_stream_t inStream, outStream; void *pDictionaryData = NULL; int nDictionaryDataSize = 0; @@ -99,7 +100,7 @@ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFi return nStatus; } - nStatus = lzsa_compress_stream(&inStream, &outStream, pDictionaryData, nDictionaryDataSize, nFlags, nMinMatchSize, nFormatVersion, progress, pOriginalSize, pCompressedSize, pCommandCount, pSafeDist); + nStatus = lzsa_compress_stream(&inStream, &outStream, pDictionaryData, nDictionaryDataSize, nFlags, nMinMatchSize, nFormatVersion, progress, pOriginalSize, pCompressedSize, pCommandCount, pSafeDist, pStats); lzsa_dictionary_free(&pDictionaryData); outStream.close(&outStream); @@ -129,12 +130,13 @@ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFi * @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful * @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful * @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful + * @param pStats pointer to compression stats that are filled if this function is successful, or NULL * * @return LZSA_OK for success, or an error value from lzsa_status_t */ lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOutStream, const void *pDictionaryData, int nDictionaryDataSize, const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion, - void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist) { + void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats) { unsigned char *pInData, *pOutData; lzsa_compressor compressor; long long nOriginalSize = 0LL, nCompressedSize = 0LL; @@ -289,6 +291,10 @@ lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOut int nCommandCount = lzsa_compressor_get_command_count(&compressor); int nSafeDist = compressor.safe_dist; + + if (pStats) + *pStats = compressor.stats; + lzsa_compressor_destroy(&compressor); free(pOutData); diff --git a/src/shrink_streaming.h b/src/shrink_streaming.h index 1e66bdc..0920edf 100644 --- a/src/shrink_streaming.h +++ b/src/shrink_streaming.h @@ -41,6 +41,7 @@ extern "C" { /* Forward declaration */ typedef enum _lzsa_status_t lzsa_status_t; +typedef struct _lzsa_stats lzsa_stats; /*-------------- File API -------------- */ @@ -58,12 +59,13 @@ typedef enum _lzsa_status_t lzsa_status_t; * @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful * @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful * @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful + * @param pStats pointer to compression stats that are filled if this function is successful, or NULL * * @return LZSA_OK for success, or an error value from lzsa_status_t */ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion, - void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist); + void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats); /*-------------- Streaming API -------------- */ @@ -82,12 +84,13 @@ lzsa_status_t lzsa_compress_file(const char *pszInFilename, const char *pszOutFi * @param pCompressedSize pointer to returned output(compressed) size, updated when this function is successful * @param pCommandCount pointer to returned token(compression commands) count, updated when this function is successful * @param pSafeDist pointer to return safe distance for raw blocks, updated when this function is successful + * @param pStats pointer to compression stats that are filled if this function is successful, or NULL * * @return LZSA_OK for success, or an error value from lzsa_status_t */ lzsa_status_t lzsa_compress_stream(lzsa_stream_t *pInStream, lzsa_stream_t *pOutStream, const void *pDictionaryData, int nDictionaryDataSize, const unsigned int nFlags, const int nMinMatchSize, const int nFormatVersion, - void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist); + void(*progress)(long long nOriginalSize, long long nCompressedSize), long long *pOriginalSize, long long *pCompressedSize, int *pCommandCount, int *pSafeDist, lzsa_stats *pStats); #ifdef __cplusplus }