2019-10-01 10:09:00 +02:00
import io
2019-09-29 16:06:06 +02:00
import typing
2019-08-22 21:19:10 +02:00
from . import common
# Lookup table for codes in range(0xd5, 0xfe).
# This table was obtained by decompressing a manually created compressed resource with the following contents:
# b'\xa8\x9fer\x00\x12\x08\x01\x00\x00\x00R\x80\x03\x00\x01\x00\x00' + bytes(range(0xd5, 0xfe)) + b'\xff'
# Raw contents of the fixed lookup table: 41 two-byte entries stored back to back.
TABLE_DATA = b"".join([
    # First row corresponds to codes in range(0xd5, 0xd8).
    b"\x00\x00\x00\x01\x00\x02",
    # All following rows correspond to 8 codes each.
    b"\x00\x03.\x01>\x01\x01\x01\x1e\x01\xff\xff\x0e\x011\x00",
    b"\x11\x12\x01\x0732\x129\xed\x10\x01'#\"\x017",
    b"\x07\x06\x01\x17\x01#\x00\xff\x00/\x07\x0e\xfd<\x015",
    b"\x01\x15\x01\x02\x00\x07\x00>\x05\xd5\x02\x01\x06\x07\x07\x08",
    # Last row corresponds to codes in range(0xf8, 0xfe).
    b"0\x01\x013\x00\x10\x17\x167>67",
])

# Split the raw data into its two-byte entries by pairing every even-offset byte with the odd-offset byte after it.
# Note: index 0 in this table corresponds to code 0xd5, index 1 to 0xd6, etc.
TABLE = [bytes(pair) for pair in zip(TABLE_DATA[0::2], TABLE_DATA[1::2])]

# Sanity check: there must be exactly one entry for every code in range(0xd5, 0xfe).
assert len(TABLE) == 0xfe - 0xd5
2019-10-01 10:09:00 +02:00
def _lookup_stored_literal(prev_literals: typing.List[bytes], table_index: int) -> bytes:
    """Return the stored literal number table_index, raising DecompressError if no such literal has been stored yet."""

    try:
        return prev_literals[table_index]
    except IndexError as e:
        # A backreference past the end of the stored literals means the compressed data is invalid.
        # Report that as a decompression error instead of leaking a bare IndexError to the caller.
        raise common.DecompressError(
            f"Backreference to literal number 0x{table_index:x}, "
            f"but only 0x{len(prev_literals):x} literals have been stored"
        ) from e


def decompress_stream_inner(header_info: common.CompressedHeaderInfo, stream: typing.BinaryIO, *, debug: bool = False) -> typing.Iterator[bytes]:
    """Internal helper function, implements the main decompression algorithm. Only called from decompress_stream, which adds debug logging around the yielded chunks.

    The compressed data is read from stream one tag byte at a time.
    Each tag byte selects one of the following operations:

    * 0x00-0x1f: literal with 1-byte header (length in the low nibble, stored for later reference if the tag is >= 0x10)
    * 0x20-0xcf: backreference to stored literal number (tag - 0x20)
    * 0xd0, 0xd1: literal with 2-byte header (length in the following byte, stored if the tag is 0xd1)
    * 0xd2: backreference to stored literal number (following byte + 0xb0)
    * 0xd5-0xfd: reference into the fixed table of two-byte literals (TABLE)
    * 0xfe: extended code, only kind 0x02 (repeat a single byte) is known
    * 0xff: end of data marker

    :param header_info: The header of the compressed resource. Must be a common.CompressedType8HeaderInfo.
    :param stream: The binary stream to read the compressed data from.
    :param debug: Whether to print a trace of every decoded operation.
    :return: An iterator over the decompressed data, one chunk per decoded operation.
    :raises common.DecompressError: If the header has the wrong type, a tag byte or extended code is unknown,
        a backreference points to a literal that was never stored, a repeat value or count is out of range,
        or there is extra data after the end marker.
    """

    if not isinstance(header_info, common.CompressedType8HeaderInfo):
        raise common.DecompressError(f"Incorrect header type: {type(header_info).__qualname__}")

    # Literals that the compressed data asked to store, in order of appearance.
    # Backreferences refer to these by index.
    prev_literals: typing.List[bytes] = []

    while True:  # Loop is terminated when the EOF marker (0xff) is encountered
        (byte,) = common.read_exact(stream, 1)

        if debug:
            print(f"Tag byte 0x{byte:>02x}")

        if 0x00 <= byte < 0x20:
            # Literal byte sequence, 1-byte header.
            # The length of the literal data is stored in the low nibble of the tag byte.
            count = (byte & 0xf) + 1
            # Controls whether or not the literal is stored so that it can be referenced again later.
            do_store = byte >= 0x10
            literal = common.read_exact(stream, count)
            if debug:
                print(f"Literal (1-byte header, storing: {do_store})")
            if do_store:
                if debug:
                    print(f"\t-> storing as literal number 0x{len(prev_literals):x}")
                prev_literals.append(literal)
            yield literal
        elif 0x20 <= byte < 0xd0:
            # Backreference to a previous literal, 1-byte form.
            # This can reference literals with indices in range(0xb0).
            table_index = byte - 0x20
            if debug:
                print(f"Backreference (1-byte form) to 0x{table_index:>02x}")
            yield _lookup_stored_literal(prev_literals, table_index)
        elif byte in (0xd0, 0xd1):
            # Literal byte sequence, 2-byte header.
            # The length of the literal data is stored in the following byte.
            (count,) = common.read_exact(stream, 1)
            # Controls whether or not the literal is stored so that it can be referenced again later.
            do_store = byte == 0xd1
            literal = common.read_exact(stream, count)
            if debug:
                print(f"Literal (2-byte header, storing: {do_store})")
            if do_store:
                if debug:
                    print(f"\t-> storing as literal number 0x{len(prev_literals):x}")
                prev_literals.append(literal)
            yield literal
        elif byte == 0xd2:
            # Backreference to a previous literal, 2-byte form.
            # This can reference literals with indices in range(0xb0, 0x1b0).
            (next_byte,) = common.read_exact(stream, 1)
            table_index = next_byte + 0xb0
            if debug:
                print(f"Backreference (2-byte form) to 0x{table_index:>02x}")
            yield _lookup_stored_literal(prev_literals, table_index)
        elif 0xd5 <= byte < 0xfe:
            # Reference into a fixed table of two-byte literals.
            # All compressed resources use the same table.
            table_index = byte - 0xd5
            if debug:
                print(f"Fixed table reference to 0x{table_index:>02x}")
            yield TABLE[table_index]
        elif byte == 0xfe:
            # Extended code, whose meaning is controlled by the following byte.
            (kind,) = common.read_exact(stream, 1)
            if debug:
                print(f"Extended code: 0x{kind:>02x}")
            if kind == 0x02:
                # Repeat 1 byte a certain number of times.
                byte_count = 1  # Unlike with 'dcmp' (0) compression, there doesn't appear to be a 2-byte repeat (or if there is, it's never used in practice).
                if debug:
                    print(f"Repeat {byte_count}-byte value")
                # The byte(s) to repeat, stored as a variable-length integer. The value is treated as unsigned, i. e. the integer is never negative.
                to_repeat_int = common.read_variable_length_integer(stream)
                try:
                    to_repeat = to_repeat_int.to_bytes(byte_count, "big", signed=False)
                except OverflowError as e:
                    # Raised when the value is negative or does not fit into byte_count bytes.
                    raise common.DecompressError(f"Value to repeat out of range for {byte_count}-byte repeat: {to_repeat_int:#x}") from e
                # The stored count is one less than the actual number of repetitions.
                count = common.read_variable_length_integer(stream) + 1
                if count <= 0:
                    raise common.DecompressError(f"Repeat count must be positive: {count}")
                if debug:
                    print(f"\t-> {to_repeat} * {count}")
                yield to_repeat * count
            else:
                raise common.DecompressError(f"Unknown extended code: 0x{kind:>02x}")
        elif byte == 0xff:
            # End of data marker, always occurs exactly once as the last byte of the compressed data.
            if debug:
                print("End marker")
            # Check that there really is no more data left.
            extra = stream.read(1)
            if extra:
                raise common.DecompressError(f"Extra data encountered after end of data marker (first extra byte: {extra})")
            return
        else:
            # Remaining codes (0xd3 and 0xd4) have no known meaning.
            raise common.DecompressError(f"Unknown tag byte: 0x{byte:>02x}")
def decompress_stream(header_info: common.CompressedHeaderInfo, stream: typing.BinaryIO, *, debug: bool = False) -> typing.Iterator[bytes]:
    """Decompress data compressed in the 'dcmp' (1) format, yielding it chunk by chunk.

    This wraps decompress_stream_inner and passes its chunks through unchanged.
    When debug is true, every chunk and the running total of decompressed bytes are printed as well.
    """

    # Running total of the number of bytes yielded, only reported in the debug output.
    total_length = 0
    chunks = decompress_stream_inner(header_info, stream, debug=debug)
    for piece in chunks:
        if debug:
            print(f"\t-> {piece}")
        total_length += len(piece)
        yield piece
        # Runs once the consumer asks for the next chunk.
        if debug:
            print(f"Decompressed {total_length:#x} bytes so far")