337 lines
9.6 KiB
C++
337 lines
9.6 KiB
C++
/*
|
|
|
|
Arsenic algorithm: BWT and arithmetic coding.
|
|
|
|
The code in this file is based on TheUnarchiver.
|
|
See README.md and docs/licenses/TheUnarchiver.txt for more information.
|
|
|
|
Copyright (C) 2019, Guillaume Gonnet
|
|
|
|
This program is free software: you can redistribute it and/or modify it under
|
|
the terms of the GNU General Public License as published by the Free Software
|
|
Foundation, either version 3 of the License, or (at your option) any later
|
|
version.
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
|
PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along with
|
|
this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
#include "stuffit/methods/arsenic.h"
|
|
#include "commands.h"
|
|
|
|
#include <make_unique.hpp>
|
|
#include <string.h>
|
|
|
|
namespace maconv {
|
|
namespace stuffit {
|
|
|
|
|
|
|
|
// The randomization table.
|
|
static const uint16_t kRandomizationTable[] = {
|
|
0xEE, 0x56, 0xF8, 0xC3, 0x9D, 0x9F, 0xAE, 0x2C,
|
|
0xAD, 0xCD, 0x24, 0x9D, 0xA6, 0x101, 0x18, 0xB9,
|
|
0xA1, 0x82, 0x75, 0xE9, 0x9F, 0x55, 0x66, 0x6A,
|
|
0x86, 0x71, 0xDC, 0x84, 0x56, 0x96, 0x56, 0xA1,
|
|
0x84, 0x78, 0xB7, 0x32, 0x6A, 0x3, 0xE3, 0x2,
|
|
0x11, 0x101, 0x8, 0x44, 0x83, 0x100, 0x43, 0xE3,
|
|
0x1C, 0xF0, 0x86, 0x6A, 0x6B, 0xF, 0x3, 0x2D,
|
|
0x86, 0x17, 0x7B, 0x10, 0xF6, 0x80, 0x78, 0x7A,
|
|
0xA1, 0xE1, 0xEF, 0x8C, 0xF6, 0x87, 0x4B, 0xA7,
|
|
0xE2, 0x77, 0xFA, 0xB8, 0x81, 0xEE, 0x77, 0xC0,
|
|
0x9D, 0x29, 0x20, 0x27, 0x71, 0x12, 0xE0, 0x6B,
|
|
0xD1, 0x7C, 0xA, 0x89, 0x7D, 0x87, 0xC4, 0x101,
|
|
0xC1, 0x31, 0xAF, 0x38, 0x3, 0x68, 0x1B, 0x76,
|
|
0x79, 0x3F, 0xDB, 0xC7, 0x1B, 0x36, 0x7B, 0xE2,
|
|
0x63, 0x81, 0xEE, 0xC, 0x63, 0x8B, 0x78, 0x38,
|
|
0x97, 0x9B, 0xD7, 0x8F, 0xDD, 0xF2, 0xA3, 0x77,
|
|
0x8C, 0xC3, 0x39, 0x20, 0xB3, 0x12, 0x11, 0xE,
|
|
0x17, 0x42, 0x80, 0x2C, 0xC4, 0x92, 0x59, 0xC8,
|
|
0xDB, 0x40, 0x76, 0x64, 0xB4, 0x55, 0x1A, 0x9E,
|
|
0xFE, 0x5F, 0x6, 0x3C, 0x41, 0xEF, 0xD4, 0xAA,
|
|
0x98, 0x29, 0xCD, 0x1F, 0x2, 0xA8, 0x87, 0xD2,
|
|
0xA0, 0x93, 0x98, 0xEF, 0xC, 0x43, 0xED, 0x9D,
|
|
0xC2, 0xEB, 0x81, 0xE9, 0x64, 0x23, 0x68, 0x1E,
|
|
0x25, 0x57, 0xDE, 0x9A, 0xCF, 0x7F, 0xE5, 0xBA,
|
|
0x41, 0xEA, 0xEA, 0x36, 0x1A, 0x28, 0x79, 0x20,
|
|
0x5E, 0x18, 0x4E, 0x7C, 0x8E, 0x58, 0x7A, 0xEF,
|
|
0x91, 0x2, 0x93, 0xBB, 0x56, 0xA1, 0x49, 0x1B,
|
|
0x79, 0x92, 0xF3, 0x58, 0x4F, 0x52, 0x9C, 0x2,
|
|
0x77, 0xAF, 0x2A, 0x8F, 0x49, 0xD0, 0x99, 0x4D,
|
|
0x98, 0x101, 0x60, 0x93, 0x100, 0x75, 0x31, 0xCE,
|
|
0x49, 0x20, 0x56, 0x57, 0xE2, 0xF5, 0x26, 0x2B,
|
|
0x8A, 0xBF, 0xDE, 0xD0, 0x83, 0x34, 0xF4, 0x17
|
|
};
|
|
|
|
|
|
|
|
|
|
// Initialize the model with some values.
|
|
void ArithmeticModel::Initialize(int first_symbol, int last_symbol, int increment,
|
|
int frequency_limit)
|
|
{
|
|
this->increment = increment;
|
|
this->freq_limit = frequency_limit;
|
|
this->num_symbols = last_symbol - first_symbol + 1;
|
|
|
|
ResetModel();
|
|
for (int i = 0; i < num_symbols; i++)
|
|
symbols[i].symbol = i + first_symbol;
|
|
}
|
|
|
|
|
|
// Reset the model.
|
|
void ArithmeticModel::ResetModel()
|
|
{
|
|
total_freq = increment * num_symbols;
|
|
for (int i = 0; i < num_symbols; i++)
|
|
symbols[i].freq = increment;
|
|
}
|
|
|
|
|
|
// Increase the model frequency at |symindex| by |increment|.
|
|
void ArithmeticModel::IncreaseFrequency(int symindex)
|
|
{
|
|
symbols[symindex].freq += increment;
|
|
|
|
total_freq += increment;
|
|
if (total_freq <= freq_limit)
|
|
return;
|
|
|
|
total_freq = 0;
|
|
for (int i = 0; i < num_symbols; i++) {
|
|
symbols[i].freq++;
|
|
symbols[i].freq >>= 1;
|
|
total_freq += symbols[i].freq;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// Decoder constants.
|
|
constexpr int kDecoderNumBits = 26;
|
|
constexpr int kDecoderOne = (1 << (kDecoderNumBits - 1));
|
|
constexpr int kDecoderHalf = (1 << (kDecoderNumBits - 2));
|
|
|
|
|
|
// Initialize the decoder with some values.
|
|
void ArithmeticDecoder::Initialize(uint8_t *data, uint32_t length)
|
|
{
|
|
input.Load(data, length);
|
|
range = kDecoderOne;
|
|
code = input.ReadLongWord(kDecoderNumBits);
|
|
}
|
|
|
|
|
|
// Get the next arithmetic code.
|
|
void ArithmeticDecoder::NextCode(int symlow, int symsize, int symtot)
|
|
{
|
|
int renormf = range / symtot;
|
|
int lowincr = renormf * symlow;
|
|
|
|
code -= lowincr;
|
|
range = (symlow + symsize == symtot) ? (range - lowincr) : (symsize * renormf);
|
|
|
|
for (; range <= kDecoderHalf; range <<= 1)
|
|
code = (code << 1) | input.ReadBit();
|
|
}
|
|
|
|
|
|
// Get the next arithmetic symbol.
|
|
int ArithmeticDecoder::NextSymbol(ArithmeticModel *model)
|
|
{
|
|
int freq = code / (range / model->total_freq);
|
|
int cumulative = 0, n = 0;
|
|
|
|
for (; n < model->num_symbols - 1; n++) {
|
|
if (cumulative + model->symbols[n].freq > freq) break;
|
|
cumulative += model->symbols[n].freq;
|
|
}
|
|
|
|
NextCode(cumulative, model->symbols[n].freq, model->total_freq);
|
|
model->IncreaseFrequency(n);
|
|
|
|
return model->symbols[n].symbol;
|
|
}
|
|
|
|
|
|
// Get the next word (that has |n| bits).
|
|
int ArithmeticDecoder::NextWord(ArithmeticModel *model, int n)
|
|
{
|
|
int word = 0;
|
|
for (int i = 0; i < n; i++) {
|
|
if (NextSymbol(model))
|
|
word |= (1 << i);
|
|
}
|
|
|
|
return word;
|
|
}
|
|
|
|
|
|
|
|
|
|
// Initialize the algorithm.
|
|
void ArsenicMethod::Initialize()
|
|
{
|
|
decoder.Initialize(data, end - data);
|
|
|
|
initial_model.Initialize(0, 1, 1, 256);
|
|
selector_model.Initialize(0, 10, 8, 1024);
|
|
mtf_model[0].Initialize(2, 3, 8, 1024);
|
|
mtf_model[1].Initialize(4, 7, 4, 1024);
|
|
mtf_model[2].Initialize(8, 15, 4, 1024);
|
|
mtf_model[3].Initialize(16, 31, 4, 1024);
|
|
mtf_model[4].Initialize(32, 63, 2, 1024);
|
|
mtf_model[5].Initialize(64, 127, 2, 1024);
|
|
mtf_model[6].Initialize(128, 255, 1, 1024);
|
|
|
|
if (decoder.NextWord(&initial_model, 8) != 'A')
|
|
throw ExtractException("Arsenic: invalid compressed data [A]");
|
|
if (decoder.NextWord(&initial_model, 8) != 's')
|
|
throw ExtractException("Arsenic: invalid compressed data [s]");
|
|
|
|
block_bits = decoder.NextWord(&initial_model, 4) + 9;
|
|
block_size = (1 << block_bits);
|
|
|
|
num_bytes = 0; byte_count = 0; repeat = 0;
|
|
crc = 0xFFFFFFFF; compcrc = 0;
|
|
|
|
block = std::make_unique<uint8_t[]>(block_size);
|
|
end_of_blocks = decoder.NextSymbol(&initial_model); // Check first end marker.
|
|
}
|
|
|
|
|
|
|
|
// Read the next block.
|
|
void ArsenicMethod::ReadNextBlock()
|
|
{
|
|
mtf.ResetDecoder();
|
|
|
|
randomized = decoder.NextSymbol(&initial_model);
|
|
transform_index = decoder.NextWord(&initial_model, block_bits);
|
|
num_bytes = 0;
|
|
|
|
while (true) {
|
|
int sel = decoder.NextSymbol(&selector_model);
|
|
if (sel == 0 || sel == 1) { // Zero counting.
|
|
int zero_state = 1, zero_count = 0;
|
|
while (sel < 2) {
|
|
if (sel == 0) zero_count += zero_state;
|
|
else if (sel == 1) zero_count += (2 * zero_state);
|
|
zero_state *= 2;
|
|
sel = decoder.NextSymbol(&selector_model);
|
|
}
|
|
|
|
if (num_bytes + zero_count > block_size)
|
|
throw ExtractException("Arsenic: invalid block [zero count]");
|
|
|
|
memset(&block[num_bytes], mtf.Decode(0), zero_count);
|
|
num_bytes += zero_count;
|
|
}
|
|
|
|
int symbol;
|
|
if (sel == 10) break;
|
|
else if (sel == 2) symbol = 1;
|
|
else symbol = decoder.NextSymbol(&mtf_model[sel - 3]);
|
|
|
|
if (num_bytes >= block_size)
|
|
throw ExtractException("Arsenic: invalid block [num of bytes]");
|
|
block[num_bytes++] = mtf.Decode(symbol);
|
|
}
|
|
|
|
if (transform_index >= num_bytes)
|
|
throw ExtractException("Arsenic: invalid block [transform index]");
|
|
|
|
selector_model.ResetModel();
|
|
for (int i = 0; i < 7;i++)
|
|
mtf_model[i].ResetModel();
|
|
|
|
if (decoder.NextSymbol(&initial_model)) { // End marker.
|
|
compcrc = decoder.NextWord(&initial_model, 32);
|
|
end_of_blocks = true;
|
|
}
|
|
|
|
transform = std::make_unique<uint32_t[]>(num_bytes);
|
|
CalculateInverseBWT(transform.get(), block.get(), num_bytes);
|
|
}
|
|
|
|
|
|
|
|
// Read the next byte.
|
|
int32_t ArsenicMethod::ReadNextByte()
|
|
{
|
|
int byte, out_byte;
|
|
|
|
if (repeat) {
|
|
out_byte = last; repeat--;
|
|
goto end;
|
|
}
|
|
|
|
retry:
|
|
if (byte_count >= num_bytes) {
|
|
if (end_of_blocks) return -1;
|
|
|
|
ReadNextBlock();
|
|
byte_count = 0; count = 0; last = 0;
|
|
rand_index = 0; rand_count = kRandomizationTable[0];
|
|
}
|
|
|
|
transform_index = transform[transform_index];
|
|
byte = block[transform_index];
|
|
|
|
if (randomized && rand_count == byte_count) {
|
|
byte ^= 1;
|
|
rand_index = (rand_index + 1) & 0xFF;
|
|
rand_count += kRandomizationTable[rand_index];
|
|
}
|
|
|
|
byte_count++;
|
|
|
|
if (count == 4) {
|
|
count = 0;
|
|
if (byte == 0) goto retry;
|
|
repeat = byte - 1;
|
|
out_byte = last;
|
|
}
|
|
else {
|
|
if (byte == last) count++;
|
|
else { count = 1; last = byte; }
|
|
out_byte = byte;
|
|
}
|
|
|
|
end:
|
|
crc = CalcCRC(crc, out_byte, CRCTable_edb88320);
|
|
return out_byte;
|
|
}
|
|
|
|
|
|
|
|
// Read the next bytes.
|
|
int32_t ArsenicMethod::ReadBytes(uint8_t *buffer, uint32_t length)
|
|
{
|
|
if (end_of_blocks) {
|
|
// if (compcrc != ~crc) // FIX ME
|
|
// throw ExtractException("Arsenic: invalid CRC after uncompressing");
|
|
return -1;
|
|
}
|
|
|
|
uint8_t *start = buffer;
|
|
uint8_t *end_capacity = buffer + length;
|
|
|
|
int32_t byte;
|
|
while (buffer != end_capacity && (byte = ReadNextByte()) != -1)
|
|
*(buffer++) = byte;
|
|
|
|
return buffer - start;
|
|
}
|
|
|
|
|
|
|
|
} // namespace stuffit
|
|
} // namespace maconv
|