From 460eb6361280feca8c0299036503fd19cc40aaec Mon Sep 17 00:00:00 2001 From: Reid Spencer Date: Mon, 4 Oct 2004 10:49:41 +0000 Subject: [PATCH] First version of a support utility to provide generalized compression in LLVM that handles availability and unavailability of bzip2 and zlib. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@16648 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/Compressor.h | 99 ++++++ lib/Support/Compressor.cpp | 526 ++++++++++++++++++++++++++++++ 2 files changed, 625 insertions(+) create mode 100644 include/llvm/Support/Compressor.h create mode 100644 lib/Support/Compressor.cpp diff --git a/include/llvm/Support/Compressor.h b/include/llvm/Support/Compressor.h new file mode 100644 index 00000000000..eea98ebe13b --- /dev/null +++ b/include/llvm/Support/Compressor.h @@ -0,0 +1,99 @@ +//===- llvm/Support/Compressor.h --------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Reid Spencer and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the llvm::Compressor class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_COMPRESSOR_H +#define LLVM_SUPPORT_COMPRESSOR_H + +#include + +namespace llvm { + + /// This class provides an abstraction for compressing a block of memory using + /// a standard compression utility such as bzip2 or libz. This interface + /// allos us to abstraction the notion of compression and deal with alternate + /// compression scheme availability depending on the configured platform. This + /// facility will always favor a bzip2 implementation if its available. + /// Otherwise, libz will be used if its available. If neither zlib nor bzip2 + /// are available, a very simple algorithm provided by the Compressor class + /// will be used The type of compression used can be determined by inspecting + /// the first byte of the compressed output. ASCII values '0', '1', and '2', + /// denote the compression type as given in the Algorithm enumeration below. + /// The Compressor is intended for use with memory mapped files where the + /// entire data block to be compressed or decompressed is available in + /// memory. Output, however, can be gathered in repeated calls to a callback. + /// @since 1.4 + /// @brief An abstraction for memory to memory data compression + class Compressor { + /// @name Types + /// @{ + public: + enum Algorithm { + COMP_TYPE_SIMPLE = '0', ///< Use simple but ubiquitous algorithm + COMP_TYPE_ZLIB = '1', ///< Use zlib algorithm, if available + COMP_TYPE_BZIP2 = '2', ///< Use bzip2 algorithm (preferred) + }; + + /// A callback function type used by the Compressor to get the next chunk + /// of data to which (de)compressed output will be written. This function + /// must be written by the caller to provide the buffering of the output + /// data. + /// @returns 0 for success, 1 for failure + /// @throws nothing + /// @brief Output callback function type + typedef unsigned (OutputDataCallback)(char*& buffer, unsigned& size); + + /// @} + /// @name Methods + /// @{ + public: + /// This function does the compression work. The block of memory starting + /// at \p in and extending for \p size bytes is compressed. The compressed + /// output is written to memory blocks returned by the \p cb callback. The + /// caller must provide an implementation of the OutputDataCallback + /// function type and provide its address as \p cb. Note that the callback + /// function will be called as many times as necessary to complete the + /// compression of the \p in block but that the total size will generally + /// be less than \p size. It is a good idea to provide as large a value to + /// the callback's \p size parameter as possible so that fewer calls to + /// the callback are made. The \p hint parameter tells the function which + /// kind of compression to start with. However, if its not available on + /// the platform, the algorithm "falls back" from bzip2 -> zlib -> simple. + /// @throws std::string if an error occurs + /// @returns the total size of the compressed data + /// @brief Compress a block of memory. + static uint64_t compress(char* in, unsigned size, OutputDataCallback* cb, + Algorithm hint = COMP_TYPE_BZIP2); + + /// This function does the decompression work. The block of memory + /// starting at \p in and extending for \p size bytes is decompressed. The + /// decompressed output is written to memory blocks returned by the \p cb + /// callback. The caller must provide an implementation of the + /// OutputDataCallback function type and provide its address as \p cb. + /// Note that the callback function will be called as many times as + /// necessary to complete the compression of the \p in block but that the + /// total size will generally be greater than \p size. It is a good idea + /// to provide as large a value to the callback's \p size parameter as + /// possible so that fewer calls to the callback are made. + /// @throws std::string if an error occurs + /// @returns the total size of the decompressed data + /// @brief Decompress a block of memory. + static uint64_t decompress(char *in, unsigned size, + OutputDataCallback* cb); + + /// @} + }; +} + +// vim: sw=2 ai + +#endif diff --git a/lib/Support/Compressor.cpp b/lib/Support/Compressor.cpp new file mode 100644 index 00000000000..972a85d11be --- /dev/null +++ b/lib/Support/Compressor.cpp @@ -0,0 +1,526 @@ +//===- lib/Support/Compressor.cpp -------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Reid Spencer and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the llvm::Compressor class, an abstraction for memory +// block compression. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Config/config.h" +#include "llvm/Support/Compressor.h" +#include "llvm/ADT/StringExtras.h" +#include +#include + +#ifdef HAVE_BZIP2 +#include +#endif + +#ifdef HAVE_ZLIB +#include +#endif + +#ifndef HAVE_BZIP2 +# ifndef HAVE_ZLIB +# warning No compression library is available!! +# endif +#endif + +namespace { + +inline int getdata(char*& buffer, unsigned& size, + llvm::Compressor::OutputDataCallback* cb) { + buffer = 0; + size = 0; + int result = (*cb)(buffer, size); + assert(buffer != 0 && "Invalid result from Compressor callback"); + assert(size != 0 && "Invalid result from Compressor callback"); + return result; +} + +//===----------------------------------------------------------------------===// +//=== RLCOMP - very simple run length compression scheme +//=== The code below transforms the input into blocks that are either +//=== compressed or not. Each block starts with a header byte that provides +//=== the length of the block. Values < 128 are uncompressed, values >128 +//=== are compressed. The value 128 is illegal. Currently, the algorithm is +//=== not completed and is #if'd out. +//===----------------------------------------------------------------------===// + +enum { + MAX_RLCOMP_OUT = 32768 +}; + +struct RLCOMP_stream { + // User provided fields + char* next_in; + unsigned avail_in; + char* next_out; + unsigned avail_out; + + // Information fields + uint64_t output_count; // Total count of output bytes + +#if 0 + // Algorithm fields + unsigned block_len; // Length of current block + unsigned compression; // State of compression 0=no, 1=yes, 2=indeterminate + char buffer[128]; // compression buffer (not used for decompress) + unsigned buflen; // bytes in compression buffer + bool pending; // is data pending to be written + char pending_data; // the pending data + unsigned clen; // length of the compressed block +#endif +}; + +void RLCOMP_init(RLCOMP_stream* s) { + s->output_count = 0; +#if 0 + s->block_len = 0; + s->compression = 2; + s->buflen = 0; + s->pending = false; + s->pending_data = 0; + s->clen = 0; +#endif +} + +inline bool RLCOMP_getchar(RLCOMP_stream* s, unsigned& data) { +#if 0 + if (s->avail_in) { + data = *s->next_in++; + s->avail_in--; + return true; + } +#endif + return false; +} + +inline bool RLCOMP_putchar(RLCOMP_stream* s, unsigned data) { +#if 0 + if (s->avail_out) { + *s->next_out++ = data; + s->avail_out--; + s->output_count++; + return true; + } else { + s->pending = true; + s->pending_data = data; + return false; + } +#else + return false; +#endif +} + +bool RLCOMP_compress(RLCOMP_stream* s) { + assert(s && "Invalid RLCOMP_stream"); + assert(s->next_in != 0); + assert(s->next_out != 0); + assert(s->avail_in >= 1); + assert(s->avail_out >= 1); + +#if 0 + + // Handle pending data from the last time in + if (s->pending) { + RLCOMP_putchar(s,s->pending_data); + s->pending = false; + } + + unsigned c = 0; + unsigned lastc = 0; + // Handle the degenerate len=1 case + if (!RLCOMP_getchar(s,lastc)) { + RLCOMP_putchar(s,1); + return RLCOMP_putchar(s,lastc); + } + + while (RLCOMP_getchar(s,c)) { + switch(s->compression) { + case 0: + if (lastc == c) { + s->compression = 1; + s->clen = 2 ; + } else { + if (!RLCOMP_putchar(s, c)) + return false; + } + break; + + case 1: + if (lastc != c) { + s->compression = 2; + if (!RLCOMP_putchar(s, s->clen)) + return false; + } else { + s->clen++; + } + break; + + case 2: + break; + } + lastc = c; + } +#endif + if (s->avail_out >= s->avail_in) { + ::memcpy(s->next_out, s->next_in, s->avail_in); + s->output_count += s->avail_in; + s->avail_out -= s->avail_in; + s->next_in += s->avail_in; + s->avail_in = 0; + return true; + } else { + ::memcpy(s->next_out, s->next_in, s->avail_out); + s->output_count += s->avail_out; + s->avail_in -= s->avail_out; + s->next_in += s->avail_out; + s->avail_out = 0; + return false; + } +} + +bool RLCOMP_decompress(RLCOMP_stream* s) { + assert(s && "Invalid RLCOMP_stream"); + assert(s->next_in != 0); + assert(s->next_out != 0); + assert(s->avail_in >= 1); + assert(s->avail_out >= 1); + +#if 0 + unsigned c = 0; + while (RLCOMP_getchar(s,c)) { + switch(s->compression) { + case 0: // This is not a compressed block + s->block_len--; + if (!RLCOMP_putchar(s,c)) + return false; + break; + + case 1: // This is a comperssed block + while (s->block_len-- > 0) + if (!RLCOMP_putchar(s,c)) + return false; + break; + + case 2: // This is the length field + if (c < 128) { + s->compression = 0; + s->block_len = c; + } else { + s->compression = 1; + s->block_len = c - 128; + } + continue; + + default: // oops! + throw std::string("Invalid compression state"); + } + if (s->block_len <= 0) + s->compression = 2; + } + + if (s->repeat > 0) + throw std::string("Invalid compression state"); +#endif + if (s->avail_out >= s->avail_in) { + ::memcpy(s->next_out, s->next_in, s->avail_in); + s->output_count += s->avail_in; + s->avail_out -= s->avail_in; + s->next_in += s->avail_in; + s->avail_in = 0; + return true; + } else { + ::memcpy(s->next_out, s->next_in, s->avail_out); + s->output_count += s->avail_out; + s->avail_in -= s->avail_out; + s->next_in += s->avail_out; + s->avail_out = 0; + return false; + } +} + +void RLCOMP_end(RLCOMP_stream* strm) { +} + +} + +namespace llvm { + +// Compress in one of three ways +uint64_t Compressor::compress(char* in, unsigned size, + OutputDataCallback* cb, Algorithm hint) { + assert(in && "Can't compress null buffer"); + assert(size && "Can't compress empty buffer"); + assert(cb && "Can't compress without a callback function"); + + uint64_t result = 0; + + switch (hint) { + case COMP_TYPE_BZIP2: { +#if defined(HAVE_BZIP2) + // Set up the bz_stream + bz_stream bzdata; + bzdata.bzalloc = 0; + bzdata.bzfree = 0; + bzdata.opaque = 0; + bzdata.next_in = in; + bzdata.avail_in = size; + bzdata.next_out = 0; + bzdata.avail_out = 0; + switch ( BZ2_bzCompressInit(&bzdata, 9, 0, 0) ) { + case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled"); + case BZ_PARAM_ERROR: throw std::string("Compressor internal error"); + case BZ_MEM_ERROR: throw std::string("Out of memory"); + case BZ_OK: + default: + break; + } + + // Get a block of memory + if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb)) { + BZ2_bzCompressEnd(&bzdata); + throw std::string("Can't allocate output buffer"); + } + + // Put compression code in first byte + (*bzdata.next_out++) = COMP_TYPE_BZIP2; + bzdata.avail_out--; + + // Compress it + int bzerr = BZ_FINISH_OK; + while (BZ_FINISH_OK == (bzerr = BZ2_bzCompress(&bzdata, BZ_FINISH))) { + if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb)) { + BZ2_bzCompressEnd(&bzdata); + throw std::string("Can't allocate output buffer"); + } + } + switch (bzerr) { + case BZ_SEQUENCE_ERROR: + case BZ_PARAM_ERROR: throw std::string("Param/Sequence error"); + case BZ_FINISH_OK: + case BZ_STREAM_END: break; + default: throw std::string("Oops: ") + utostr(unsigned(bzerr)); + } + + // Finish + result = (static_cast(bzdata.total_out_hi32) << 32) | + bzdata.total_out_lo32 + 1; + + BZ2_bzCompressEnd(&bzdata); + break; +#else + // FALL THROUGH +#endif + } + + case COMP_TYPE_ZLIB: { +#if defined(HAVE_ZLIB) + z_stream zdata; + zdata.zalloc = Z_NULL; + zdata.zfree = Z_NULL; + zdata.opaque = Z_NULL; + zdata.next_in = reinterpret_cast(in); + zdata.avail_in = size; + if (Z_OK != deflateInit(&zdata,Z_BEST_COMPRESSION)) + throw std::string(zdata.msg ? zdata.msg : "zlib error"); + + if (0 != getdata((char*&)(zdata.next_out), zdata.avail_out,cb)) { + deflateEnd(&zdata); + throw std::string("Can't allocate output buffer"); + } + + (*zdata.next_out++) = COMP_TYPE_ZLIB; + zdata.avail_out--; + + int flush = 0; + while ( Z_OK == deflate(&zdata,0) && zdata.avail_out == 0) { + if (0 != getdata((char*&)zdata.next_out, zdata.avail_out, cb)) { + deflateEnd(&zdata); + throw std::string("Can't allocate output buffer"); + } + } + + while ( Z_STREAM_END != deflate(&zdata, Z_FINISH)) { + if (0 != getdata((char*&)zdata.next_out, zdata.avail_out, cb)) { + deflateEnd(&zdata); + throw std::string("Can't allocate output buffer"); + } + } + + result = static_cast(zdata.total_out) + 1; + deflateEnd(&zdata); + break; + +#else + // FALL THROUGH +#endif + } + + case COMP_TYPE_SIMPLE: { + RLCOMP_stream sdata; + sdata.next_in = in; + sdata.avail_in = size; + RLCOMP_init(&sdata); + + if (0 != getdata(sdata.next_out, sdata.avail_out,cb)) { + throw std::string("Can't allocate output buffer"); + } + + *(sdata.next_out++) = COMP_TYPE_SIMPLE; + sdata.avail_out--; + + while (!RLCOMP_compress(&sdata)) { + if (0 != getdata(sdata.next_out, sdata.avail_out,cb)) { + throw std::string("Can't allocate output buffer"); + } + } + + result = sdata.output_count + 1; + RLCOMP_end(&sdata); + break; + } + default: + throw std::string("Invalid compression type hint"); + } + return result; +} + +// Decompress in one of three ways +uint64_t Compressor::decompress(char *in, unsigned size, + OutputDataCallback* cb) { + assert(in && "Can't decompress null buffer"); + assert(size > 1 && "Can't decompress empty buffer"); + assert(cb && "Can't decompress without a callback function"); + + uint64_t result = 0; + + switch (*in++) { + case COMP_TYPE_BZIP2: { +#if !defined(HAVE_BZIP2) + throw std::string("Can't decompress BZIP2 data"); +#else + // Set up the bz_stream + bz_stream bzdata; + bzdata.bzalloc = 0; + bzdata.bzfree = 0; + bzdata.opaque = 0; + bzdata.next_in = in; + bzdata.avail_in = size - 1; + bzdata.next_out = 0; + bzdata.avail_out = 0; + switch ( BZ2_bzDecompressInit(&bzdata, 0, 0) ) { + case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled"); + case BZ_PARAM_ERROR: throw std::string("Compressor internal error"); + case BZ_MEM_ERROR: throw std::string("Out of memory"); + case BZ_OK: + default: + break; + } + + // Get a block of memory + if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb)) { + BZ2_bzDecompressEnd(&bzdata); + throw std::string("Can't allocate output buffer"); + } + + // Decompress it + int bzerr = BZ_OK; + while (BZ_OK == (bzerr = BZ2_bzDecompress(&bzdata))) { + if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb)) { + BZ2_bzDecompressEnd(&bzdata); + throw std::string("Can't allocate output buffer"); + } + } + + switch (bzerr) { + case BZ_PARAM_ERROR: throw std::string("Compressor internal error"); + case BZ_MEM_ERROR: throw std::string("Out of memory"); + case BZ_DATA_ERROR: throw std::string("Data integrity error"); + case BZ_DATA_ERROR_MAGIC:throw std::string("Data is not BZIP2"); + default: throw("Ooops"); + case BZ_STREAM_END: + break; + } + + // Finish + result = (static_cast(bzdata.total_out_hi32) << 32) | + bzdata.total_out_lo32; + BZ2_bzDecompressEnd(&bzdata); + break; + } +#endif + + case COMP_TYPE_ZLIB: { +#if !defined(HAVE_ZLIB) + throw std::string("Can't decompress ZLIB data"); +#else + z_stream zdata; + zdata.zalloc = Z_NULL; + zdata.zfree = Z_NULL; + zdata.opaque = Z_NULL; + zdata.next_in = reinterpret_cast(in); + zdata.avail_in = size - 1; + if ( Z_OK != inflateInit(&zdata)) + throw std::string(zdata.msg ? zdata.msg : "zlib error"); + + if (0 != getdata((char*&)zdata.next_out, zdata.avail_out,cb)) { + inflateEnd(&zdata); + throw std::string("Can't allocate output buffer"); + } + + int zerr = Z_OK; + while (Z_OK == (zerr = inflate(&zdata,0))) { + if (0 != getdata((char*&)zdata.next_out, zdata.avail_out,cb)) { + inflateEnd(&zdata); + throw std::string("Can't allocate output buffer"); + } + } + + if (zerr != Z_STREAM_END) + throw std::string(zdata.msg?zdata.msg:"zlib error"); + + result = static_cast(zdata.total_out); + inflateEnd(&zdata); + break; +#endif + } + + case COMP_TYPE_SIMPLE: { + RLCOMP_stream sdata; + sdata.next_in = in; + sdata.avail_in = size - 1; + RLCOMP_init(&sdata); + + if (0 != getdata(sdata.next_out, sdata.avail_out,cb)) { + throw std::string("Can't allocate output buffer"); + } + + while (!RLCOMP_decompress(&sdata)) { + if (0 != getdata(sdata.next_out, sdata.avail_out,cb)) { + throw std::string("Can't allocate output buffer"); + } + } + + result = sdata.output_count; + RLCOMP_end(&sdata); + break; + } + + default: + throw std::string("Unknown type of compressed data"); + } + + return result; +} + +} + +// vim: sw=2 ai