Add --prefer-ratio, --prefer-speed, -m options

This commit is contained in:
emmanuel-marty 2019-04-21 09:41:12 +02:00
parent f837ed096e
commit 331d6f9911
3 changed files with 71 additions and 10 deletions

View File

@ -58,7 +58,7 @@ static long long lzsa_get_time() {
/*---------------------------------------------------------------------------*/
static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename, const unsigned int nOptions) {
static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename, const unsigned int nOptions, const int nMinMatchSize) {
FILE *f_in, *f_out;
unsigned char *pInData, *pOutData;
lsza_compressor compressor;
@ -108,7 +108,7 @@ static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename,
}
memset(pOutData, 0, BLOCK_SIZE);
nResult = lzsa_compressor_init(&compressor, BLOCK_SIZE * 2);
nResult = lzsa_compressor_init(&compressor, BLOCK_SIZE * 2, nMinMatchSize);
if (nResult != 0) {
free(pOutData);
pOutData = NULL;
@ -251,8 +251,8 @@ static int lzsa_compress(const char *pszInFilename, const char *pszOutFilename,
double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
int nCommands = lzsa_compressor_get_command_count(&compressor);
fprintf(stdout, "\rCompressed '%s' in %g seconds, %.02g Mb/s, %d tokens (%lld bytes/token), %lld into %lld bytes ==> %g %%\n",
pszInFilename, fDelta, fSpeed, nCommands, nOriginalSize / ((long long)nCommands),
fprintf(stdout, "\rCompressed '%s' in %g seconds, %.02g Mb/s, %d tokens (%g bytes/token), %lld into %lld bytes ==> %g %%\n",
pszInFilename, fDelta, fSpeed, nCommands, (double)nOriginalSize / (double)nCommands,
nOriginalSize, nCompressedSize, (double)(nCompressedSize * 100.0 / nOriginalSize));
fflush(stdout);
}
@ -695,7 +695,9 @@ int main(int argc, char **argv) {
bool bArgsError = false;
bool bCommandDefined = false;
bool bVerifyCompression = false;
bool bMinMatchDefined = false;
char cCommand = 'z';
int nMinMatchSize = MIN_MATCH_SIZE;
unsigned int nOptions = 0;
for (i = 1; i < argc; i++) {
@ -722,6 +724,52 @@ int main(int argc, char **argv) {
else
bArgsError = true;
}
else if (!strcmp(argv[i], "-m")) {
    /* Detached form "-m <value>": the minimum match size is the next argument. */
    if (!bMinMatchDefined && (i + 1) < argc) {
        char *pEnd = NULL;
        /* Keep the full long result so that out-of-int-range input cannot wrap into the valid range. */
        const long nValue = strtol(argv[i + 1], &pEnd, 10);
        /* Require a complete number (no trailing characters) that satisfies BOTH bounds:
         * MIN_MATCH_SIZE <= value < MATCH_RUN_LEN. */
        if (pEnd && pEnd != argv[i + 1] && *pEnd == '\0' && nValue >= MIN_MATCH_SIZE && nValue < MATCH_RUN_LEN) {
            nMinMatchSize = (int)nValue;
            i++; /* consume the value argument */
            bMinMatchDefined = true;
        }
        else {
            bArgsError = true;
        }
    }
    else
        bArgsError = true;
}
else if (!strncmp(argv[i], "-m", 2)) {
    /* Attached form "-m<value>": the number follows the option in the same argument. */
    if (!bMinMatchDefined) {
        char *pEnd = NULL;
        const long nValue = strtol(argv[i] + 2, &pEnd, 10);
        if (pEnd && pEnd != (argv[i] + 2) && *pEnd == '\0' && nValue >= MIN_MATCH_SIZE && nValue < MATCH_RUN_LEN) {
            /* Do not advance i here: the value was part of argv[i], so the next
             * argument is unrelated and must still be parsed by the loop. */
            nMinMatchSize = (int)nValue;
            bMinMatchDefined = true;
        }
        else {
            bArgsError = true;
        }
    }
    else
        bArgsError = true;
}
else if (!strcmp(argv[i], "--prefer-ratio")) {
if (!bMinMatchDefined) {
nMinMatchSize = MIN_MATCH_SIZE;
bMinMatchDefined = true;
}
else
bArgsError = true;
}
else if (!strcmp(argv[i], "--prefer-speed")) {
if (!bMinMatchDefined) {
nMinMatchSize = 4;
bMinMatchDefined = true;
}
else
bArgsError = true;
}
else if (!strcmp(argv[i], "-v")) {
if ((nOptions & OPT_VERBOSE) == 0) {
nOptions |= OPT_VERBOSE;
@ -754,11 +802,14 @@ int main(int argc, char **argv) {
fprintf(stderr, " -d: decompress (default: compress)\n");
fprintf(stderr, " -v: be verbose\n");
fprintf(stderr, " -r: raw block format (max. 64 Kb files)\n");
fprintf(stderr, " -m <value>: minimum match size (3-14) (default: 3)\n");
fprintf(stderr, " --prefer-ratio: favor compression ratio (default, same as -m 3)\n");
fprintf(stderr, " --prefer-speed: favor decompression speed (same as -m 4)\n");
return 100;
}
if (cCommand == 'z') {
int nResult = lzsa_compress(pszInFilename, pszOutFilename, nOptions);
int nResult = lzsa_compress(pszInFilename, pszOutFilename, nOptions, nMinMatchSize);
if (nResult == 0 && bVerifyCompression) {
nResult = lzsa_compare(pszOutFilename, pszInFilename, nOptions);
}

View File

@ -60,10 +60,11 @@ typedef struct _lzsa_match {
*
* @param pCompressor compression context to initialize
* @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
* @param nMinMatchSize minimum match size (clamped to the range MIN_MATCH_SIZE .. MATCH_RUN_LEN - 1)
*
* @return 0 for success, non-zero for failure
*/
int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize) {
int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize, const int nMinMatchSize) {
int nResult;
nResult = divsufsort_init(&pCompressor->divsufsort_context);
@ -71,6 +72,11 @@ int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize)
pCompressor->pos_data = NULL;
pCompressor->open_intervals = NULL;
pCompressor->match = NULL;
pCompressor->min_match_size = nMinMatchSize;
if (pCompressor->min_match_size < MIN_MATCH_SIZE)
pCompressor->min_match_size = MIN_MATCH_SIZE;
else if (pCompressor->min_match_size > (MATCH_RUN_LEN - 1))
pCompressor->min_match_size = MATCH_RUN_LEN - 1;
pCompressor->num_commands = 0;
if (!nResult) {
@ -167,10 +173,11 @@ static int lzsa_build_suffix_array(lsza_compressor *pCompressor, const unsigned
* saves us from having to build the inverse suffix array index, as the LCP is calculated without it using this method,
* and the interval builder below doesn't need it either. */
intervals[0] &= POS_MASK;
int nMinMatchSize = pCompressor->min_match_size;
for (i = 1; i < nInWindowSize - 1; i++) {
int nIndex = (int)(intervals[i] & POS_MASK);
int nLen = PLCP[nIndex];
if (nLen < MIN_MATCH_SIZE)
if (nLen < nMinMatchSize)
nLen = 0;
if (nLen > LCP_MAX)
nLen = LCP_MAX;
@ -493,6 +500,7 @@ static inline int lzsa_write_match_varlen(unsigned char *pOutData, int nOutOffse
static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
int *cost = (int*)pCompressor->pos_data; /* Reuse */
int nLastLiteralsOffset;
int nMinMatchSize = pCompressor->min_match_size;
int i;
cost[nEndOffset - 1] = 1;
@ -514,7 +522,7 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
int m;
for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= MIN_MATCH_SIZE; m++) {
for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) {
int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 1 : 2;
if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) {
@ -544,7 +552,7 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
if (nMatchRunLen > MATCH_RUN_LEN)
nMatchRunLen = MATCH_RUN_LEN;
for (k = MIN_MATCH_SIZE; k < nMatchRunLen; k++) {
for (k = nMinMatchSize; k < nMatchRunLen; k++) {
int nCurCost;
nCurCost = 1 + nMatchOffsetSize /* no extra match len bytes */;

View File

@ -35,6 +35,7 @@ typedef struct {
unsigned int *pos_data;
unsigned int *open_intervals;
lzsa_match *match;
int min_match_size;
int num_commands;
} lsza_compressor;
@ -43,10 +44,11 @@ typedef struct {
*
* @param pCompressor compression context to initialize
* @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
* @param nMinMatchSize minimum match size (values outside MIN_MATCH_SIZE .. MATCH_RUN_LEN - 1 are clamped to that range)
*
* @return 0 for success, non-zero for failure
*/
int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize);
int lzsa_compressor_init(lsza_compressor *pCompressor, const int nMaxWindowSize, const int nMinMatchSize);
/**
* Clean up compression context and free up any associated resources