From cf51f1775ad3a7d4dcb9a30922ece92fd8101365 Mon Sep 17 00:00:00 2001 From: emmanuel-marty Date: Thu, 11 Apr 2019 23:47:57 +0200 Subject: [PATCH] Fix small literals cost estimate issue in optimizer, gain a small amount of compression and reduction of token count --- src/shrink.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/src/shrink.c b/src/shrink.c index 931801e..e826ac9 100755 --- a/src/shrink.c +++ b/src/shrink.c @@ -502,6 +502,8 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart int nLiteralsLen = nLastLiteralsOffset - i; nBestCost = 1 + cost[i + 1] + lzsa_get_literals_varlen_size(nLiteralsLen); + if (nLiteralsLen > 1) + nBestCost -= lzsa_get_literals_varlen_size(nLiteralsLen - 1); nBestMatchLen = 0; nBestMatchOffset = 0; @@ -518,9 +520,13 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart if (nRemainingLiteralsLen < 0) nRemainingLiteralsLen = 0; + if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS)) + nMatchLen = nEndOffset - LAST_LITERALS - i; + nCurCost = 1 + lzsa_get_literals_varlen_size(nRemainingLiteralsLen) + nMatchOffsetSize + lzsa_get_match_varlen_size(nMatchLen - MIN_MATCH_SIZE); - if ((i + nMatchLen) < nEndOffset) - nCurCost += cost[i + nMatchLen]; + nCurCost += cost[i + nMatchLen]; + if (nRemainingLiteralsLen > 1) + nCurCost -= lzsa_get_literals_varlen_size(nRemainingLiteralsLen - 1); if (nBestCost >= nCurCost) { nBestCost = nCurCost; @@ -530,9 +536,13 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart } else { if (pMatch[m].length >= MIN_MATCH_SIZE) { + int nMatchLen = pMatch[m].length; int k, nMatchRunLen; - nMatchRunLen = pMatch[m].length; + if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS)) + nMatchLen = nEndOffset - LAST_LITERALS - i; + + nMatchRunLen = nMatchLen; if (nMatchRunLen > MATCH_RUN_LEN) nMatchRunLen = MATCH_RUN_LEN; @@ -543,8 +553,9 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart if (nRemainingLiteralsLen < 0) nRemainingLiteralsLen = 0; nCurCost = 1 + lzsa_get_literals_varlen_size(nRemainingLiteralsLen) + nMatchOffsetSize /* no extra match len bytes */; - if ((i + k) < nEndOffset) - nCurCost += cost[i + k]; + nCurCost += cost[i + k]; + if (nRemainingLiteralsLen > 1) + nCurCost -= lzsa_get_literals_varlen_size(nRemainingLiteralsLen - 1); if (nBestCost >= nCurCost) { nBestCost = nCurCost; @@ -553,15 +564,16 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart } } - for (; k <= pMatch[m].length; k++) { + for (; k <= nMatchLen; k++) { int nCurCost; int nRemainingLiteralsLen = nLastLiteralsOffset - (i + k); if (nRemainingLiteralsLen < 0) nRemainingLiteralsLen = 0; nCurCost = 1 + lzsa_get_literals_varlen_size(nRemainingLiteralsLen) + nMatchOffsetSize + lzsa_get_match_varlen_size(k - MIN_MATCH_SIZE); - if ((i + k) < nEndOffset) - nCurCost += cost[i + k]; + nCurCost += cost[i + k]; + if (nRemainingLiteralsLen > 1) + nCurCost -= lzsa_get_literals_varlen_size(nRemainingLiteralsLen - 1); if (nBestCost >= nCurCost) { nBestCost = nCurCost; @@ -642,6 +654,17 @@ static void lzsa_optimize_command_count(lsza_compressor *pCompressor, const int i += nMatchLen; } else { + if ((i + nMatchLen) < nEndOffset && nMatchLen >= LCP_MAX && pMatch->offset == 1 && + pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].offset == 1 && + (nMatchLen + pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length) <= 65535) { + /* Join */ + + pMatch->length += pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length; + pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].offset = 0; + pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length = -1; + continue; + } + nNumLiterals = 0; i += nMatchLen; }