Increase LZSA2 ratio. Decrease token count

2024-06-26 05:29:27 +00:00 · 2019-09-17 08:10:52 +02:00 · 2019-09-17 08:10:52 +02:00 · 8b7d0ab04d
commit 8b7d0ab04d
parent b1da9c1aee
5 changed files with 93 additions and 106 deletions
--- a/src/lzsa.c
+++ b/src/lzsa.c
@ -47,7 +47,7 @@
 #define OPT_FAVOR_RATIO    4
 #define OPT_RAW_BACKWARD   8

-#define TOOL_VERSION "1.0.7"
+#define TOOL_VERSION "1.0.8"

 /*---------------------------------------------------------------------------*/

--- a/src/matchfinder.c
+++ b/src/matchfinder.c
@ -35,6 +35,17 @@
 #include "format.h"
 #include "lib.h"

+/**
+ * Hash an index value into a TAG_BITS-wide tag: multiply it by a fixed 64-bit constant and keep the topmost TAG_BITS bits of the product
+ *
+ * @param nIndex index value to hash (widened to unsigned long long before the multiplication)
+ *
+ * @return tag taken from the top TAG_BITS bits of the product (assumes unsigned long long is 64 bits wide, so the shift leaves only TAG_BITS bits)
+ */
+static inline int lzsa_get_index_tag(unsigned int nIndex) {
+   return (int)(((unsigned long long)nIndex * 11400714819323198485ULL) >> (64ULL - TAG_BITS));
+}
+
 /**
 * Parse input data, build suffix array and overlaid data structures to speed up match finding
 *
@ -78,15 +89,31 @@ int lzsa_build_suffix_array(lzsa_compressor *pCompressor, const unsigned char *p
    * and the interval builder below doesn't need it either. */
   intervals[0] &= POS_MASK;
   int nMinMatchSize = pCompressor->min_match_size;
-   for (i = 1; i < nInWindowSize - 1; i++) {
-      int nIndex = (int)(intervals[i] & POS_MASK);
-      int nLen = PLCP[nIndex];
-      if (nLen < nMinMatchSize)
-         nLen = 0;
-      if (nLen > LCP_MAX)
-         nLen = LCP_MAX;
-      intervals[i] = ((unsigned int)nIndex) | (((unsigned int)nLen) << LCP_SHIFT);
+
+   if ((pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) && pCompressor->format_version >= 2) {
+      for (i = 1; i < nInWindowSize - 1; i++) {
+         int nIndex = (int)(intervals[i] & POS_MASK);
+         int nLen = PLCP[nIndex];
+         if (nLen < nMinMatchSize)
+            nLen = 0;
+         if (nLen > LCP_MAX)
+            nLen = LCP_MAX;
+         int nTaggedLen = (nLen << TAG_BITS) | (lzsa_get_index_tag((unsigned int)nIndex) & ((1 << TAG_BITS) - 1));
+         intervals[i] = ((unsigned int)nIndex) | (((unsigned int)nTaggedLen) << LCP_SHIFT);
+      }
   }
+   else {
+      for (i = 1; i < nInWindowSize - 1; i++) {
+         int nIndex = (int)(intervals[i] & POS_MASK);
+         int nLen = PLCP[nIndex];
+         if (nLen < nMinMatchSize)
+            nLen = 0;
+         if (nLen > LCP_AND_TAG_MAX)
+            nLen = LCP_AND_TAG_MAX;
+         intervals[i] = ((unsigned int)nIndex) | (((unsigned int)nLen) << LCP_SHIFT);
+      }
+   }
+
   if (i < nInWindowSize)
      intervals[i] &= POS_MASK;

@ -219,7 +246,12 @@ int lzsa_find_matches_at(lzsa_compressor *pCompressor, const int nOffset, lzsa_m
         int nMatchOffset = (int)(nOffset - match_pos);

         if (nMatchOffset <= MAX_OFFSET) {
-            matchptr->length = (unsigned short)(ref >> LCP_SHIFT);
+            if ((pCompressor->flags & LZSA_FLAG_FAVOR_RATIO) && pCompressor->format_version >= 2) {
+               matchptr->length = (unsigned short)(ref >> (LCP_SHIFT + TAG_BITS));
+            }
+            else {
+               matchptr->length = (unsigned short)(ref >> LCP_SHIFT);
+            }
            matchptr->offset = (unsigned short)nMatchOffset;
            matchptr++;
         }
--- a/src/shrink_block_v2.c
+++ b/src/shrink_block_v2.c
@ -600,121 +600,74 @@ static void lzsa_optimize_backward_v2(lzsa_compressor *pCompressor, const int nS
 static int lzsa_optimize_command_count_v2(lzsa_compressor *pCompressor, lzsa_match *pBestMatch, const int nStartOffset, const int nEndOffset) {
   int i;
   int nNumLiterals = 0;
-   int nDidReduce = 0;
-   int nPreviousMatchOffset = -1;
   int nRepMatchOffset = 0;
-   lzsa_repmatch_opt *repmatch_opt = pCompressor->repmatch_opt;
+   int nDidReduce = 0;

   for (i = nStartOffset; i < nEndOffset; ) {
      lzsa_match *pMatch = pBestMatch + i;

      if (pMatch->length >= MIN_MATCH_SIZE_V2) {
-         int nMatchLen = pMatch->length;
-         int nReduce = 0;
-         int nCurrentMatchOffset = i;
+         if (pMatch->length < 9 && /* Don't waste time considering large matches, they will always win over literals */
+            (i + pMatch->length) < nEndOffset /* Don't consider the last match in the block, we can only reduce a match inbetween other tokens */) {
+            int nNextIndex = i + pMatch->length;
+            int nNextLiterals = 0;

-         if (nMatchLen <= 9 && (i + nMatchLen) < nEndOffset) /* max reducable command size: <token> <EE> <ll> <ll> <offset> <offset> <EE> <mm> <mm> */ {
-            int nMatchOffset = pMatch->offset;
-            int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE_V2;
-            int nRepMatchSize = (nRepMatchOffset <= 32) ? 4 : ((nRepMatchOffset <= 512) ? 8 : ((nRepMatchOffset <= (8192 + 512)) ? 12 : 16)) /* match offset */;
-            int nUndoRepMatchCost = (nPreviousMatchOffset < 0 || !repmatch_opt[nPreviousMatchOffset].expected_repmatch) ? 0 : nRepMatchSize;
+            while (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length < MIN_MATCH_SIZE_V2) {
+               nNextLiterals++;
+               nNextIndex++;
+            }

-            if (pBestMatch[i + nMatchLen].length >= MIN_MATCH_SIZE_V2) {
-               int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + lzsa_get_match_varlen_size_v2(nEncodedMatchLen) - nUndoRepMatchCost;
+            if (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length >= MIN_MATCH_SIZE_V2) {
+               /* This command is a match, is followed by 'nNextLiterals' literals and then by another match. Calculate this command's current cost (excluding 'nNumLiterals' bytes) */

-               if (pBestMatch[i + nMatchLen].offset != nMatchOffset) {
-                  nCommandSize += (nMatchOffset <= 32) ? 4 : ((nMatchOffset <= 512) ? 8 : ((nMatchOffset <= (8192 + 512)) ? 12 : 16)) /* match offset */;
-               }
+               int nCurCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + lzsa_get_match_varlen_size_v2(pMatch->length - MIN_MATCH_SIZE_V2);
+               if (pMatch->offset != nRepMatchOffset)
+                  nCurCommandSize += (pMatch->offset <= 32) ? 4 : ((pMatch->offset <= 512) ? 8 : ((pMatch->offset <= (8192 + 512)) ? 12 : 16));

-               if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v2(nNumLiterals + nMatchLen))) {
-                  /* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
-                   * make the next command eat the cost of encoding the current number of literals, + nMatchLen extra literals. The size of the current match command is
-                   * at least as much as the number of literal bytes + the extra cost of encoding them in the next match command, so we can safely replace the current
-                   * match command by literals, the output size will not increase and it will remove one command. */
-                  nReduce = 1;
-               }
-               else {
-                  if (nMatchOffset != nRepMatchOffset &&
-                      pBestMatch[i + nMatchLen].offset == nRepMatchOffset) {
+               /* Calculate the next command's current cost */
+               int nNextCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNextLiterals) + (nNextLiterals << 3) + lzsa_get_match_varlen_size_v2(pBestMatch[nNextIndex].length - MIN_MATCH_SIZE_V2);
+               if (pBestMatch[nNextIndex].offset != pMatch->offset)
+                  nNextCommandSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));

-                     if (nCommandSize > ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v2(nNumLiterals + nMatchLen) - nRepMatchSize)) {
-                        /* Same case, replacing this command by literals alone isn't enough on its own to have savings, however this match command is inbetween two matches with
-                         * identical offsets, while this command has a different match offset. Replacing it with literals allows to use a rep-match for the two commands around it, and
-                         * that is enough for some savings. Replace. */
-                        nReduce = 1;
-                     }
+               int nOriginalCombinedCommandSize = nCurCommandSize + nNextCommandSize;
+
+               /* Calculate the cost of replacing this match command by literals + the next command with the cost of encoding these literals (excluding 'nNumLiterals' bytes) */
+               int nReducedCommandSize = (pMatch->length << 3) + 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals + pMatch->length + nNextLiterals) + (nNextLiterals << 3) + lzsa_get_match_varlen_size_v2(pBestMatch[nNextIndex].length - MIN_MATCH_SIZE_V2);
+               if (pBestMatch[nNextIndex].offset != nRepMatchOffset)
+                  nReducedCommandSize += (pBestMatch[nNextIndex].offset <= 32) ? 4 : ((pBestMatch[nNextIndex].offset <= 512) ? 8 : ((pBestMatch[nNextIndex].offset <= (8192 + 512)) ? 12 : 16));
+
+               if (nOriginalCombinedCommandSize >= nReducedCommandSize) {
+                  /* Reduce */
+                  int nMatchLen = pMatch->length;
+                  int j;
+
+                  for (j = 0; j < nMatchLen; j++) {
+                     pBestMatch[i + j].length = 0;
                  }
-               }
-            }
-            else {
-               int nCurIndex = i + nMatchLen;
-               int nNextNumLiterals = 0;
-               int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size_v2(nNumLiterals) + lzsa_get_match_varlen_size_v2(nEncodedMatchLen) - nUndoRepMatchCost;;

-               do {
-                  nCurIndex++;
-                  nNextNumLiterals++;
-               } while (nCurIndex < nEndOffset && pBestMatch[nCurIndex].length < MIN_MATCH_SIZE_V2);
-
-               if (nCurIndex >= nEndOffset || pBestMatch[nCurIndex].length < MIN_MATCH_SIZE_V2 ||
-                  pBestMatch[nCurIndex].offset != nMatchOffset) {
-                  nCommandSize += (nMatchOffset <= 32) ? 4 : ((nMatchOffset <= 512) ? 8 : ((nMatchOffset <= (8192 + 512)) ? 12 : 16)) /* match offset */;
-               }
-
-               if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v2(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size_v2(nNextNumLiterals))) {
-                  /* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
-                   * more room than the match, and doesn't grow the next match command's literals encoding, go ahead and remove the command. */
-                  nReduce = 1;
-               }
-               else {
-                  if (nCurIndex < nEndOffset && pBestMatch[nCurIndex].length >= MIN_MATCH_SIZE_V2 &&
-                     pBestMatch[nCurIndex].offset != nMatchOffset &&
-                     pBestMatch[nCurIndex].offset == nRepMatchOffset) {
-                     if (nCommandSize > ((nMatchLen << 3) + lzsa_get_literals_varlen_size_v2(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size_v2(nNextNumLiterals) - nRepMatchSize)) {
-                        /* Same case, but now replacing this command allows to use a rep-match and get savings, so do it */
-                        nReduce = 1;
-                     }
-                  }
+                  nDidReduce = 1;
+                  continue;
               }
            }
         }

-         if (nReduce) {
-            int j;
+         if ((i + pMatch->length) < nEndOffset && pMatch->length >= LCP_MAX &&
+            pMatch->offset && pMatch->offset <= 32 && pBestMatch[i + pMatch->length].offset == pMatch->offset && (pMatch->length % pMatch->offset) == 0 &&
+            (pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN) {
+            int nMatchLen = pMatch->length;

-            for (j = 0; j < nMatchLen; j++) {
-               pBestMatch[i + j].length = 0;
-            }
-            nNumLiterals += nMatchLen;
-            i += nMatchLen;
+            /* Join */

-            nDidReduce = 1;
-
-            if (nPreviousMatchOffset >= 0) {
-               repmatch_opt[nPreviousMatchOffset].expected_repmatch = 0;
-               nPreviousMatchOffset = -1;
-            }
-         }
-         else {
-            if (pMatch->length)
-               nRepMatchOffset = pMatch->offset;
-
-            if ((i + nMatchLen) < nEndOffset && nMatchLen >= LCP_MAX &&
-               pMatch->offset && pMatch->offset <= 32 && pBestMatch[i + nMatchLen].offset == pMatch->offset && (nMatchLen % pMatch->offset) == 0 &&
-               (nMatchLen + pBestMatch[i + nMatchLen].length) <= MAX_VARLEN) {
-               /* Join */
-
-               pMatch->length += pBestMatch[i + nMatchLen].length;
-               pBestMatch[i + nMatchLen].offset = 0;
-               pBestMatch[i + nMatchLen].length = -1;
-               continue;
-            }
-
-            nNumLiterals = 0;
-            i += nMatchLen;
+            pMatch->length += pBestMatch[i + nMatchLen].length;
+            pBestMatch[i + nMatchLen].offset = 0;
+            pBestMatch[i + nMatchLen].length = -1;
+            continue;
         }

-         nPreviousMatchOffset = nCurrentMatchOffset;
+         nRepMatchOffset = pMatch->offset;
+
+         i += pMatch->length;
+         nNumLiterals = 0;
      }
      else {
         nNumLiterals++;
--- a/src/shrink_context.c
+++ b/src/shrink_context.c
@ -81,7 +81,7 @@ int lzsa_compressor_init(lzsa_compressor *pCompressor, const int nMaxWindowSize,
         pCompressor->pos_data = (unsigned int *)malloc(nMaxWindowSize * sizeof(unsigned int));

         if (pCompressor->pos_data) {
-            pCompressor->open_intervals = (unsigned int *)malloc((LCP_MAX + 1) * sizeof(unsigned int));
+            pCompressor->open_intervals = (unsigned int *)malloc((LCP_AND_TAG_MAX + 1) * sizeof(unsigned int));

            if (pCompressor->open_intervals) {
               pCompressor->match = (lzsa_match *)malloc(nMaxWindowSize * NMATCHES_PER_OFFSET * sizeof(lzsa_match));
--- a/src/shrink_context.h
+++ b/src/shrink_context.h
@ -41,7 +41,9 @@ extern "C" {
 #endif

 #define LCP_BITS 14
-#define LCP_MAX (1U<<(LCP_BITS - 1))
+#define TAG_BITS 3
+#define LCP_MAX (1U<<(LCP_BITS - TAG_BITS - 1))
+#define LCP_AND_TAG_MAX (1U<<(LCP_BITS - 1))
 #define LCP_SHIFT (31-LCP_BITS)
 #define LCP_MASK (((1U<<LCP_BITS) - 1) << LCP_SHIFT)
 #define POS_MASK ((1U<<LCP_SHIFT) - 1)