diff --git a/src/main.c b/src/main.c
index 761f21e..57beb16 100755
--- a/src/main.c
+++ b/src/main.c
@@ -58,7 +58,7 @@ static long long lzsa_get_time() {
 
 /*---------------------------------------------------------------------------*/
 
-static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename, const unsigned int nOptions) {
+static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename, const unsigned int nOptions, const int nMinMatchSize) {
    FILE *f_in, *f_out;
    unsigned char *pInData, *pOutData;
    lsza_compressor compressor;
@@ -108,7 +108,7 @@ static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename,
    }
    memset(pOutData, 0, BLOCK_SIZE);
 
-   nResult = lzsa_compressor_init(&compressor, BLOCK_SIZE * 2);
+   nResult = lzsa_compressor_init(&compressor, BLOCK_SIZE * 2, nMinMatchSize);
    if (nResult != 0) {
       free(pOutData);
       pOutData = NULL;
@@ -251,8 +251,8 @@ static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename,
       double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
       double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
       int nCommands = lzsa_compressor_get_command_count(&compressor);
-      fprintf(stdout, "\rCompressed '%s' in %g seconds, %.02g Mb/s, %d tokens (%lld bytes/token), %lld into %lld bytes ==> %g %%\n",
-         pszInFilename, fDelta, fSpeed, nCommands, nOriginalSize / ((long long)nCommands),
+      fprintf(stdout, "\rCompressed '%s' in %g seconds, %.02g Mb/s, %d tokens (%g bytes/token), %lld into %lld bytes ==> %g %%\n",
+         pszInFilename, fDelta, fSpeed, nCommands, (double)nOriginalSize / (double)nCommands,
          nOriginalSize, nCompressedSize, (double)(nCompressedSize * 100.0 / nOriginalSize));
       fflush(stdout);
    }
@@ -695,7 +695,9 @@ int main(int argc, char **argv) {
    bool bArgsError = false;
    bool bCommandDefined = false;
    bool bVerifyCompression = false;
+   bool bMinMatchDefined = false; /* set once -m / --prefer-ratio / --prefer-speed has been seen */
    char cCommand = 'z';
+   int nMinMatchSize = MIN_MATCH_SIZE; /* default: same as --prefer-ratio */
    unsigned int nOptions = 0;
 
    for (i = 1; i < argc; i++) {
@@ -722,6 +724,52 @@ int main(int argc, char **argv) {
          else
             bArgsError = true;
       }
+      else if (!strcmp(argv[i], "-m")) { /* detached form: -m <value> */
+         if (!bMinMatchDefined && (i + 1) < argc) {
+            char *pEnd = NULL;
+            nMinMatchSize = (int)strtol(argv[i + 1], &pEnd, 10);
+            if (pEnd && pEnd != argv[i + 1] && *pEnd == '\0' && nMinMatchSize >= MIN_MATCH_SIZE && nMinMatchSize < MATCH_RUN_LEN) {
+               i++; /* the value was taken from the next argument: consume it */
+               bMinMatchDefined = true;
+            }
+            else {
+               bArgsError = true;
+            }
+         }
+         else
+            bArgsError = true;
+      }
+      else if (!strncmp(argv[i], "-m", 2)) { /* attached form: -m<value> */
+         if (!bMinMatchDefined) {
+            char *pEnd = NULL;
+            nMinMatchSize = (int)strtol(argv[i] + 2, &pEnd, 10);
+            if (pEnd && pEnd != (argv[i]+2) && *pEnd == '\0' && nMinMatchSize >= MIN_MATCH_SIZE && nMinMatchSize < MATCH_RUN_LEN) {
+               /* the value is part of this argument: do not skip the next one */
+               bMinMatchDefined = true;
+            }
+            else {
+               bArgsError = true;
+            }
+         }
+         else
+            bArgsError = true;
+      }
+      else if (!strcmp(argv[i], "--prefer-ratio")) {
+         if (!bMinMatchDefined) {
+            nMinMatchSize = MIN_MATCH_SIZE;
+            bMinMatchDefined = true;
+         }
+         else
+            bArgsError = true;
+      }
+      else if (!strcmp(argv[i], "--prefer-speed")) {
+         if (!bMinMatchDefined) {
+            nMinMatchSize = 4;
+            bMinMatchDefined = true;
+         }
+         else
+            bArgsError = true;
+      }
       else if (!strcmp(argv[i], "-v")) {
          if ((nOptions & OPT_VERBOSE) == 0) {
             nOptions |= OPT_VERBOSE;
@@ -754,11 +802,14 @@ int main(int argc, char **argv) {
       fprintf(stderr, " -d: decompress (default: compress)\n");
       fprintf(stderr, " -v: be verbose\n");
       fprintf(stderr, " -r: raw block format (max. 64 Kb files)\n");
+      fprintf(stderr, " -m <value>: minimum match size (3-14) (default: 3)\n");
+      fprintf(stderr, " --prefer-ratio: favor compression ratio (default, same as -m 3)\n");
+      fprintf(stderr, " --prefer-speed: favor decompression speed (same as -m 4)\n");
       return 100;
    }
 
    if (cCommand == 'z') {
-      int nResult = lzsa_compress(pszInFilename, pszOutFilename, nOptions);
+      int nResult = lzsa_compress(pszInFilename, pszOutFilename, nOptions, nMinMatchSize);
       if (nResult == 0 && bVerifyCompression) {
          nResult = lzsa_compare(pszOutFilename, pszInFilename, nOptions);
       }
diff --git a/src/shrink.c b/src/shrink.c
index 013eef2..207bf09 100755
--- a/src/shrink.c
+++ b/src/shrink.c
@@ -60,10 +60,11 @@ typedef struct _lzsa_match {
  *
  * @param pCompressor compression context to initialize
  * @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
+ * @param nMinMatchSize minimum match size (clamped to the range MIN_MATCH_SIZE .. MATCH_RUN_LEN - 1)
  *
  * @return 0 for success, non-zero for failure
  */
-int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize) {
+int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize, const int nMinMatchSize) {
    int nResult;
 
    nResult = divsufsort_init(&pCompressor->divsufsort_context);
@@ -71,6 +72,11 @@ int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize)
    pCompressor->pos_data = NULL;
    pCompressor->open_intervals = NULL;
    pCompressor->match = NULL;
+   pCompressor->min_match_size = nMinMatchSize;
+   if (pCompressor->min_match_size < MIN_MATCH_SIZE)
+      pCompressor->min_match_size = MIN_MATCH_SIZE;
+   else if (pCompressor->min_match_size > (MATCH_RUN_LEN - 1))
+      pCompressor->min_match_size = MATCH_RUN_LEN - 1;
    pCompressor->num_commands = 0;
 
    if (!nResult) {
@@ -167,10 +173,11 @@ static int lzsa_build_suffix_array(lsza_compressor *pCompressor, const unsigned
     * saves us from having to build the inverse suffix array index, as the LCP is calculated without it using this method,
     * and the interval builder below doesn't need it either. */
    intervals[0] &= POS_MASK;
+   int nMinMatchSize = pCompressor->min_match_size;
    for (i = 1; i < nInWindowSize - 1; i++) {
       int nIndex = (int)(intervals[i] & POS_MASK);
       int nLen = PLCP[nIndex];
-      if (nLen < MIN_MATCH_SIZE)
+      if (nLen < nMinMatchSize)
          nLen = 0;
       if (nLen > LCP_MAX)
          nLen = LCP_MAX;
@@ -493,6 +500,7 @@ static inline int lzsa_write_match_varlen(unsigned char *pOutData, int nOutOffse
 static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
    int *cost = (int*)pCompressor->pos_data;  /* Reuse */
    int nLastLiteralsOffset;
+   int nMinMatchSize = pCompressor->min_match_size;
    int i;
 
    cost[nEndOffset - 1] = 1;
@@ -514,7 +522,7 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
       lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
       int m;
 
-      for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= MIN_MATCH_SIZE; m++) {
+      for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) {
          int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 1 : 2;
 
          if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) {
@@ -544,7 +552,7 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
             if (nMatchRunLen > MATCH_RUN_LEN)
                nMatchRunLen = MATCH_RUN_LEN;
 
-            for (k = MIN_MATCH_SIZE; k < nMatchRunLen; k++) {
+            for (k = nMinMatchSize; k < nMatchRunLen; k++) {
                int nCurCost;
 
                nCurCost = 1 + nMatchOffsetSize /* no extra match len bytes */;
diff --git a/src/shrink.h b/src/shrink.h
index 7487fc8..bc4a3c4 100755
--- a/src/shrink.h
+++ b/src/shrink.h
@@ -35,6 +35,7 @@ typedef struct {
    unsigned int *pos_data;
    unsigned int *open_intervals;
    lzsa_match *match;
+   int min_match_size;
    int num_commands;
 } lsza_compressor;
 
@@ -43,10 +44,11 @@ typedef struct {
  *
  * @param pCompressor compression context to initialize
  * @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
+ * @param nMinMatchSize minimum match size (clamped to the range MIN_MATCH_SIZE .. MATCH_RUN_LEN - 1)
  *
  * @return 0 for success, non-zero for failure
  */
-int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize);
+int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize, const int nMinMatchSize);
 
 /**
  * Clean up compression context and free up any associated resources