Mirror of https://github.com/dwsJason/gsla.git (synced 2025-01-14 07:29:48 +00:00)
LZB Encoder: produces better compression (actually generates run/length pattern encoding)
commit 0c5c466f64
parent e2676ed7bd

source/lzb.cpp: 139 changes
@@ -16,6 +16,7 @@ static unsigned char Dictionary[ DICTIONARY_SIZE ];
 
 static int AddDictionary(const std::vector<unsigned char>&data, int dictionarySize);
 static int EmitLiteral(unsigned char *pDest, std::vector<unsigned char>& data);
+static int ConcatLiteral(unsigned char *pDest, std::vector<unsigned char>& data);
 static int EmitReference(unsigned char *pDest, int dictionaryOffset, std::vector<unsigned char>& data);
 static int DictionaryMatch(const std::vector<unsigned char>& data, int dictionarySize);
 
@@ -29,6 +30,10 @@ int LZB_Compress(unsigned char* pDest, unsigned char* pSource, int sourceSize)
     int bytesInDictionary = 0;
     int bytesEmitted = 0;
 
+    // dumb last emit is a literal stuff
+    bool bLastEmitIsLiteral = false;
+    int lastEmittedLiteralOffset = 0;
+
     std::vector<unsigned char> candidate_data;
 
     while (processedBytes < sourceSize)
@@ -39,8 +44,6 @@ int LZB_Compress(unsigned char* pDest, unsigned char* pSource, int sourceSize)
         // The dictionary only contains bytes that have been emitted, so we
         // can't add this byte until we've emitted it?
 
-        if (candidate_data.size() < 3) continue;
-
         if (DictionaryMatch(candidate_data, bytesInDictionary) < 0)
         {
             // Was there a dictionary match
@@ -49,28 +52,61 @@ int LZB_Compress(unsigned char* pDest, unsigned char* pSource, int sourceSize)
 
             int MatchOffset = DictionaryMatch(prev_data, bytesInDictionary);
 
-            if ((MatchOffset >= 0) && prev_data.size() > 2)
+            if ((MatchOffset >= 0) && prev_data.size() > 3)
             {
                 bytesInDictionary = AddDictionary(prev_data, bytesInDictionary);
                 bytesEmitted += EmitReference(pDest + bytesEmitted, MatchOffset, prev_data);
                 candidate_data[0] = candidate_data[ candidate_data.size() - 1 ];
                 candidate_data.resize(1);
+                bLastEmitIsLiteral = false;
             }
             else
             {
                 // Add Dictionary
                 bytesInDictionary = AddDictionary(candidate_data, bytesInDictionary);
-                bytesEmitted += EmitLiteral(pDest + bytesEmitted, candidate_data);
+
+                if (bLastEmitIsLiteral)
+                {
+                    // If the last emit was a literal, I want to concatenate
+                    // this literal into the previous opcode, to save space
+                    bytesEmitted += ConcatLiteral(pDest + lastEmittedLiteralOffset, candidate_data);
+                }
+                else
+                {
+                    lastEmittedLiteralOffset = bytesEmitted;
+                    bytesEmitted += EmitLiteral(pDest + bytesEmitted, candidate_data);
+                }
+
+                bLastEmitIsLiteral = true;
             }
         }
     }
 
     if (candidate_data.size() > 0)
     {
-        // Emit as a literal? (we have 1 more chance here for a match
-        // Add Dictionary
-        bytesInDictionary = AddDictionary(candidate_data, bytesInDictionary);
-        bytesEmitted += EmitLiteral(pDest + bytesEmitted, candidate_data);
+        int MatchOffset = DictionaryMatch(candidate_data, bytesInDictionary);
+
+        if ((MatchOffset >=0) && candidate_data.size() > 2)
+        {
+            bytesInDictionary = AddDictionary(candidate_data, bytesInDictionary);
+            bytesEmitted += EmitReference(pDest + bytesEmitted, MatchOffset, candidate_data);
+        }
+        else
+        {
+            // Add Dictionary
+            bytesInDictionary = AddDictionary(candidate_data, bytesInDictionary);
+
+            if (bLastEmitIsLiteral)
+            {
+                // If the last emit was a literal, I want to concatenate
+                // this literal into the previous opcode, to save space
+                bytesEmitted += ConcatLiteral(pDest + lastEmittedLiteralOffset, candidate_data);
+            }
+            else
+            {
+                bytesEmitted += EmitLiteral(pDest + bytesEmitted, candidate_data);
+            }
+        }
     }
 
     return bytesEmitted;
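Note on the hunk above (not part of the diff): the encoder now remembers whether the previous emit was a literal (bLastEmitIsLiteral) and where that literal's opcode starts (lastEmittedLiteralOffset), so back-to-back literals are folded into one opcode via ConcatLiteral instead of each paying their own opcode header. A minimal sketch of the saving, assuming the 2-byte literal header implied by ConcatLiteral later in this diff; encodedSize and the counts are illustrative only:

#include <cstdio>

// Hypothetical cost model: each literal opcode = 2 header bytes + its payload.
static int encodedSize(int literalOpcodes, int payloadBytes)
{
    return literalOpcodes * 2 + payloadBytes;
}

int main()
{
    // Ten 4-byte literals emitted separately vs. merged into one opcode.
    std::printf("separate: %d bytes\n", encodedSize(10, 40)); // 60
    std::printf("merged:   %d bytes\n", encodedSize(1, 40));  // 42
    return 0;
}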
@@ -98,7 +134,45 @@ static int AddDictionary(const std::vector<unsigned char>&data, int dictionarySize)
 //
 static int DictionaryMatch(const std::vector<unsigned char>& data, int dictionarySize)
 {
-    if(dictionarySize < data.size())
+    if( (0 == dictionarySize ) ||
+        (0 == data.size()) ||
+        (data.size() > 16384) ) // 16384 is largest string copy we can encode
+    {
+        return -1;
+    }
+
+    // Check the end of the dictionary, to see if this data could be a
+    // pattern "run" (where we can repeat a pattern for X many times for free
+    // using the memcpy with overlapping source/dest buffers)
+    // (This is a dictionary based pattern run/length)
+
+    {
+        // Check for pattern sizes, start small
+        int max_pattern_size = dictionarySize;
+
+        for (int pattern_size = 1; pattern_size <= max_pattern_size; ++pattern_size)
+        {
+            bool bMatch = true;
+            int pattern_start = dictionarySize - pattern_size;
+
+            for (int dataIndex = 0; dataIndex < data.size(); ++dataIndex)
+            {
+                if (data[ dataIndex ] == Dictionary[ pattern_start + (dataIndex % pattern_size) ])
+                    continue;
+
+                bMatch = false;
+                break;
+            }
+
+            if (bMatch)
+            {
+                // Return a RLE Style match result
+                return pattern_start;
+            }
+        }
+    }
+
+    if (dictionarySize < data.size())
     {
         return -1;
     }
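Not part of the diff: a standalone sketch of the pattern-run scan added above, run against a local buffer instead of the encoder's static Dictionary[] (FindPatternRun and the sample data are illustrative names). It shows how a candidate that merely repeats the tail of the dictionary still reports a match offset, which the decoder can later expand with an overlapping copy:

#include <cstdio>
#include <vector>

// Same scan as the new DictionaryMatch block: try pattern sizes from 1 up, each
// anchored at the end of the dictionary, and see if the candidate is that pattern repeated.
static int FindPatternRun(const unsigned char* dict, int dictSize,
                          const std::vector<unsigned char>& data)
{
    for (int pattern_size = 1; pattern_size <= dictSize; ++pattern_size)
    {
        int pattern_start = dictSize - pattern_size;
        bool bMatch = true;

        for (size_t i = 0; i < data.size(); ++i)
        {
            if (data[i] != dict[pattern_start + (i % pattern_size)])
            {
                bMatch = false;
                break;
            }
        }
        if (bMatch)
            return pattern_start;   // data is just the dictionary tail repeated
    }
    return -1;
}

int main()
{
    unsigned char dict[] = { 'x', 'A', 'B' };                       // dictionary ends in "AB"
    std::vector<unsigned char> data = { 'A','B','A','B','A','B' };  // candidate bytes
    std::printf("match offset = %d\n", FindPatternRun(dict, 3, data)); // prints 1
    return 0;
}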
@@ -130,6 +204,38 @@ static int DictionaryMatch(const std::vector<unsigned char>& data, int dictionarySize)
     return result;
 }
 
+//------------------------------------------------------------------------------
+//
+// Emit a literal, that appends itself to an existing literal
+//
+static int ConcatLiteral(unsigned char *pDest, std::vector<unsigned char>& data)
+{
+    // Return Size
+    int outSize = (int)data.size();
+
+    int opCode = pDest[0];
+    opCode |= (int)(((pDest[1])&0x7F)<<8);
+
+    int skip = opCode;
+    opCode += outSize;
+
+    // Opcode
+    *pDest++ = (unsigned char)(opCode & 0xFF);
+    *pDest++ = (unsigned char)((opCode >> 8) & 0x7F);
+
+    pDest += skip;
+
+    // Literal Data
+    for (int idx = 0; idx < data.size(); ++idx)
+    {
+        *pDest++ = data[ idx ];
+    }
+
+    data.clear();
+
+    return outSize;
+}
+
 //------------------------------------------------------------------------------
 
 static int EmitLiteral(unsigned char *pDest, std::vector<unsigned char>& data)
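Reading ConcatLiteral above: a literal opcode evidently stores its payload length in the low 15 bits of its first two bytes (low byte first), with the top bit of the second byte masked off, presumably reserved to flag dictionary-copy opcodes; that last point is an inference, not something this hunk shows. ConcatLiteral re-reads that length, skips over the existing payload, appends the new bytes, and rewrites the header with the combined length. A minimal sketch of just the header math:

#include <cstdio>

int main()
{
    unsigned char op[2];
    int oldLen = 5, addLen = 3;

    // Existing literal opcode header for a 5-byte payload (15-bit length, little-endian).
    op[0] = (unsigned char)(oldLen & 0xFF);
    op[1] = (unsigned char)((oldLen >> 8) & 0x7F);

    // ConcatLiteral re-reads the length, extends it, and rewrites the header in place.
    int len = op[0] | ((op[1] & 0x7F) << 8);
    len += addLen;
    op[0] = (unsigned char)(len & 0xFF);
    op[1] = (unsigned char)((len >> 8) & 0x7F);

    std::printf("patched literal length = %d\n", op[0] | ((op[1] & 0x7F) << 8)); // prints 8
    return 0;
}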
@@ -171,6 +277,19 @@ static int EmitReference(unsigned char *pDest, int dictionaryOffset, std::vector<unsigned char>& data)
     return outSize;
 }
 
+//------------------------------------------------------------------------------
+//
+// Std C memcpy seems to be stopping this from happening
+// probably for my protection
+//
+void mymemcpy(u8* pDest, u8* pSrc, int length)
+{
+    while (length-- > 0)
+    {
+        *pDest++ = *pSrc++;
+    }
+}
+
 //------------------------------------------------------------------------------
 //
 // Simple Decompress, for validation
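Why a hand-rolled byte copy: std::memcpy has undefined behavior when source and destination overlap, and memmove deliberately copies as if through a temporary buffer, which defeats the trick. Copying one byte at a time, front to back, means bytes just written are immediately re-read as source, so a short pattern expands into a run. A tiny standalone demo mirroring the mymemcpy added above:

#include <cstdio>

typedef unsigned char u8;

static void mymemcpy(u8* pDest, u8* pSrc, int length)
{
    while (length-- > 0)
        *pDest++ = *pSrc++;    // each written byte becomes source for later bytes
}

int main()
{
    u8 out[16] = { 'A', 'B' };         // already-decompressed output so far
    mymemcpy(&out[2], &out[0], 6);     // overlapping copy: 6 bytes from offset 0 to offset 2

    out[8] = 0;                        // NUL-terminate for printing
    std::printf("%s\n", (const char*)out);   // prints "ABABABAB"
    return 0;
}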
@@ -193,7 +312,7 @@ void LZB_Decompress(unsigned char* pDest, unsigned char* pSource, int destSize)
             u16 offset = *pSource++;
             offset |= ((u16)(*pSource++))<<8;
 
-            memcpy(&pDest[ decompressedBytes ], &pDest[ offset ], opcode);
+            mymemcpy(&pDest[ decompressedBytes ], &pDest[ offset ], opcode);
             decompressedBytes += opcode;
         }
         else
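A round-trip smoke test sketch for the new encoder path. The two declarations match the functions in this file, and the highly repetitive input is the case the new pattern run/length matching targets; the generous output-buffer sizing and declaring the functions directly (rather than including a particular header) are assumptions for illustration only:

#include <cstdio>
#include <cstring>
#include <vector>

int  LZB_Compress(unsigned char* pDest, unsigned char* pSource, int sourceSize);
void LZB_Decompress(unsigned char* pDest, unsigned char* pSource, int destSize);

int main()
{
    // Highly repetitive input, the case the run/length pattern encoding is meant to shrink.
    std::vector<unsigned char> src(4096);
    for (size_t i = 0; i < src.size(); ++i)
        src[i] = (unsigned char)("ABC"[i % 3]);

    std::vector<unsigned char> packed(src.size() * 2);   // assumed large enough for worst case
    std::vector<unsigned char> unpacked(src.size());

    int packedSize = LZB_Compress(packed.data(), src.data(), (int)src.size());
    LZB_Decompress(unpacked.data(), packed.data(), (int)unpacked.size());

    std::printf("%d -> %d bytes, round-trip %s\n",
                (int)src.size(), packedSize,
                std::memcmp(src.data(), unpacked.data(), src.size()) == 0 ? "ok" : "FAILED");
    return 0;
}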