2019-10-01 10:09:00 +02:00
import io
2019-09-29 16:06:06 +02:00
import typing
2019-08-22 21:19:10 +02:00
from . import common
# Lookup table for codes in range(0x4b, 0xfe).
# Each code in that range expands to a fixed two-byte sequence; the sequences are stored back to back in TABLE_DATA, in code order.
# This table was obtained by decompressing a manually created compressed resource with the following contents:
# b'\xa8\x9fer\x00\x12\x08\x01\x00\x00\x01f\x80\x03\x00\x00\x00\x00' + bytes(range(0x4b, 0xfe)) + b'\xff'
# NOTE(review): every literal below must contribute exactly two bytes per code. If any whitespace inside the literals is a formatting artifact rather than a real 0x20 byte, the length assertion at the end of this section will fail - confirm against the original data.
TABLE_DATA = (
# First line corresponds to codes in range(0x4b, 0x50).
b " \x00 \x00 N \xba \x00 \x08 Nu \x00 \x0c "
# All following lines correspond to 8 codes each.
b " N \xad S/ \x0b a \x00 \x00 \x10 p \x00 / \x00 Hn "
b " P n/. \xff \xfc H \xe7 ?< \x00 \x04 \xff \xf8 "
b " / \x0c \x06 N \xed NV hN^ \x00 \x01 X \x8f "
b " O \xef \x00 \x02 \x00 \x18 ` \x00 \xff \xff P \x8f N \x90 \x00 \x06 "
b " &n \x00 \x14 \xff \xf4 L \xee \x00 \n \x00 \x0e A \xee L \xdf "
b " H \xc0 \xff \xf0 -@ \x00 \x12 0.p \x01 /( T "
b " g \x00 \x00 \x00 \x1c _ \x18 \x00 &oHx \x00 \x16 "
b " A \xfa 0<(@r \x00 (n \x0c f \x00 k "
b " / \x07 U \x8f \x00 ( \xff \xfe \xff \xec \" \xd8 \x0b \x00 \x0f "
b " Y \x8f /< \xff \x00 \x01 \x18 \x81 \xe1 J \x00 N \xb0 \xff \xe8 "
b " H \xc7 \x00 \x03 \x00 \" \x00 \x07 \x00 \x1a g \x06 g \x08 N \xf9 "
b " \x00 $ x \x08 \x00 f \x04 \x00 *N \xd0 0(&_ "
b " g \x04 \x00 0C \xee ? \x00 \x1f \x00 \x1e \xff \xf6 . "
b " B \xa7 \x07 \xff \xfa ` \x02 =@ \x0c @f \x06 \x00 & "
b " -H/ \x01 p \xff ` \x04 \x18 \x80 J@ \x00 @ \x00 , "
b " / \x08 \x00 \x11 \xff \xe4 !@&@ \xff \xf2 BnN \xb9 "
b " =| \x00 8 \x00 \r ` \x06 B. <g \x0c -h "
b " f \x08 J.J \xae \x00 .H@ \" _ \" \x00 g \n "
b " 0 \x07 Bg \x00 2 ( \x00 \t Hz \x02 \x00 /+ "
b " \x00 \x05 \" nf \x02 \xe5 \x80 g \x0e f \n \x00 P> \x00 "
b " f \x0c . \x00 \xff \xee m @ \xff \xe0 S@` \x08 "
# Last line corresponds to codes in range(0xf8, 0xfe).
b " \x04 \x80 \x00 h \x0b |D \x00 A \xe8 HA "
)
# Split the concatenated data into consecutive two-byte slices, one per code.
# Note: index 0 in this table corresponds to code 0x4b, index 1 to 0x4c, etc.
TABLE = [ TABLE_DATA [ i : i + 2 ] for i in range ( 0 , len ( TABLE_DATA ) , 2 ) ]
# Import-time sanity check: there must be exactly one table entry for every code in range(0x4b, 0xfe).
assert len ( TABLE ) == len ( range ( 0x4b , 0xfe ) )
2019-10-01 10:09:00 +02:00
def _get_prev_literal(prev_literals: typing.List[bytes], table_index: int) -> bytes:
    """Return the stored literal number table_index, raising DecompressError if no such literal has been stored (yet)."""

    # A backreference to a literal that was never stored means the compressed data is corrupt.
    # Report that as a regular decompression error instead of leaking an IndexError from the list lookup.
    if not 0 <= table_index < len(prev_literals):
        raise common.DecompressError(f"Backreference to literal number 0x{table_index:x}, but only 0x{len(prev_literals):x} literals have been stored")
    return prev_literals[table_index]


def decompress_stream_inner(header_info: common.CompressedHeaderInfo, stream: typing.BinaryIO, *, debug: bool = False) -> typing.Iterator[bytes]:
    """Internal helper function, implements the main decompression algorithm. Only called from decompress_stream, which performs some extra checks and debug logging.

    The compressed data is a sequence of codes, each introduced by a single tag byte:

    * 0x00-0x1f: literal data (0x10-0x1f additionally store the literal for later backreferences)
    * 0x20-0x22: backreference to a stored literal (2-byte and 3-byte forms)
    * 0x23-0x4a: backreference to a stored literal (1-byte form)
    * 0x4b-0xfd: reference into the fixed table of two-byte sequences (TABLE)
    * 0xfe: extended code, selected by the following byte (0x00, 0x02, 0x03, 0x04 or 0x06)
    * 0xff: end of data marker

    :param header_info: The header of the compressed resource. Must be a common.CompressedType8HeaderInfo.
    :param stream: The stream from which the compressed data is read, positioned directly after the header.
    :param debug: Whether to print a trace of every decoded code to standard output.
    :return: An iterator over the decompressed data, yielded in chunks of varying size as the codes are decoded.
    :raises common.DecompressError: If the header has the wrong type, an unknown tag byte or extended code is encountered,
        a backreference points to a literal that has not been stored, a count or value is out of range,
        or there is extra data after the end marker.
    """

    if not isinstance(header_info, common.CompressedType8HeaderInfo):
        raise common.DecompressError(f"Incorrect header type: {type(header_info).__qualname__}")

    # All literals that were marked as "to be stored", in the order in which they appeared.
    prev_literals: typing.List[bytes] = []

    while True:  # Loop is terminated when the EOF marker (0xff) is encountered
        (byte,) = common.read_exact(stream, 1)

        if debug:
            print(f"Tag byte 0x{byte:>02x}")

        if byte in range(0x00, 0x20):
            # Literal byte sequence.
            if byte in (0x00, 0x10):
                # The length of the literal data is stored in the next byte.
                (count_div2,) = common.read_exact(stream, 1)
            else:
                # The length of the literal data is stored in the low nibble of the tag byte.
                count_div2 = byte & 0xf
            # The stored length counts 2-byte units, not single bytes.
            count = 2 * count_div2
            # Controls whether or not the literal is stored so that it can be referenced again later.
            do_store = byte >= 0x10
            literal = common.read_exact(stream, count)
            if debug:
                print(f"Literal (storing: {do_store})")
            if do_store:
                if debug:
                    print(f"\t-> storing as literal number 0x{len(prev_literals):x}")
                prev_literals.append(literal)
            yield literal
        elif byte in (0x20, 0x21):
            # Backreference to a previous literal, 2-byte form.
            # This can reference literals with index in range(0x28, 0x228).
            (next_byte,) = common.read_exact(stream, 1)
            # The low bit of the tag byte is the 9th bit of the index.
            table_index = 0x28 + ((byte - 0x20) << 8 | next_byte)
            if debug:
                print(f"Backreference (2-byte form) to 0x{table_index:>02x}")
            yield _get_prev_literal(prev_literals, table_index)
        elif byte == 0x22:
            # Backreference to a previous literal, 3-byte form.
            # This can reference any literal with index 0x28 and higher, but is only necessary for literals with index 0x228 and higher.
            table_index = 0x28 + int.from_bytes(common.read_exact(stream, 2), "big", signed=False)
            if debug:
                print(f"Backreference (3-byte form) to 0x{table_index:>02x}")
            yield _get_prev_literal(prev_literals, table_index)
        elif byte in range(0x23, 0x4b):
            # Backreference to a previous literal, 1-byte form.
            # This can reference literals with indices in range(0x28).
            table_index = byte - 0x23
            if debug:
                print(f"Backreference (1-byte form) to 0x{table_index:>02x}")
            yield _get_prev_literal(prev_literals, table_index)
        elif byte in range(0x4b, 0xfe):
            # Reference into a fixed table of two-byte literals.
            # All compressed resources use the same table.
            table_index = byte - 0x4b
            if debug:
                print(f"Fixed table reference to 0x{table_index:>02x}")
            yield TABLE[table_index]
        elif byte == 0xfe:
            # Extended code, whose meaning is controlled by the following byte.
            (kind,) = common.read_exact(stream, 1)
            if debug:
                print(f"Extended code: 0x{kind:>02x}")
            if kind == 0x00:
                # Compact representation of (part of) a segment loader jump table, as used in 'CODE' (0) resources.
                if debug:
                    print(f"Segment loader jump table entries")
                # All generated jump table entries have the same segment number.
                segment_number_int = common.read_variable_length_integer(stream)
                if debug:
                    print(f"\t-> segment number: {segment_number_int:#x}")
                try:
                    segment_number = segment_number_int.to_bytes(2, "big", signed=True)
                except OverflowError:
                    raise common.DecompressError(f"Segment number out of range for 16-bit signed integer: {segment_number_int:#x}")
                # The tail part of all jump table entries (i. e. everything except for the address).
                entry_tail = b"?<" + segment_number + b"\xa9\xf0"
                # The tail is output once *without* an address in front, i. e. the first entry's address must be generated manually by a previous code.
                yield entry_tail

                count = common.read_variable_length_integer(stream)
                if count <= 0:
                    raise common.DecompressError(f"Jump table entry count must be greater than 0, not {count}")
                # The second entry's address is stored explicitly.
                current_int = common.read_variable_length_integer(stream)
                if debug:
                    print(f"\t-> address of second entry: {current_int:#x}")
                try:
                    current = current_int.to_bytes(2, "big", signed=False)
                except OverflowError:
                    raise common.DecompressError(f"Jump table entry address out of range for 16-bit unsigned integer: {current_int:#x}")
                yield current + entry_tail
                for _ in range(1, count):
                    # All further entries' addresses are stored as differences relative to the previous entry's address.
                    diff = common.read_variable_length_integer(stream)
                    # For some reason, each difference is 6 higher than it should be.
                    diff -= 6
                    # Simulate 16-bit integer wraparound.
                    current_int = (current_int + diff) & 0xffff
                    if debug:
                        print(f"\t-> difference {diff:#x}: {current_int:#x}")
                    yield current_int.to_bytes(2, "big", signed=False) + entry_tail
            elif kind in (0x02, 0x03):
                # Repeat 1 or 2 bytes a certain number of times.
                if kind == 0x02:
                    byte_count = 1
                elif kind == 0x03:
                    byte_count = 2
                else:
                    raise AssertionError()
                if debug:
                    print(f"Repeat {byte_count}-byte value")
                # The byte(s) to repeat, stored as a variable-length integer. The value is treated as unsigned, i. e. the integer is never negative.
                to_repeat_int = common.read_variable_length_integer(stream)
                try:
                    to_repeat = to_repeat_int.to_bytes(byte_count, "big", signed=False)
                except OverflowError:
                    raise common.DecompressError(f"Value to repeat out of range for {byte_count}-byte repeat: {to_repeat_int:#x}")
                # The stored count is one less than the actual number of repetitions.
                count = common.read_variable_length_integer(stream) + 1
                if count <= 0:
                    raise common.DecompressError(f"Repeat count must be positive: {count}")
                if debug:
                    print(f"\t-> {to_repeat} * {count}")
                yield to_repeat * count
            elif kind == 0x04:
                # A sequence of 16-bit signed integers, with each integer encoded as a difference relative to the previous integer. The first integer is stored explicitly.
                if debug:
                    print(f"Difference-encoded 16-bit integers")
                # The first integer is stored explicitly, as a signed value.
                initial_int = common.read_variable_length_integer(stream)
                try:
                    initial = initial_int.to_bytes(2, "big", signed=True)
                except OverflowError:
                    raise common.DecompressError(f"Initial value out of range for 16-bit integer difference encoding: {initial_int:#x}")
                if debug:
                    print(f"\t-> initial: 0x{initial_int:>04x}")
                yield initial

                count = common.read_variable_length_integer(stream)
                if count < 0:
                    raise common.DecompressError(f"Count cannot be negative: {count}")
                # To make the following calculations simpler, the signed initial_int value is converted to unsigned.
                current_int = initial_int & 0xffff
                for _ in range(count):
                    # The difference to the previous integer is stored as an 8-bit signed integer.
                    # The usual variable-length integer format is *not* used here.
                    diff = int.from_bytes(common.read_exact(stream, 1), "big", signed=True)
                    # Simulate 16-bit integer wraparound.
                    current_int = (current_int + diff) & 0xffff
                    if debug:
                        print(f"\t-> difference {diff:#x}: 0x{current_int:>04x}")
                    yield current_int.to_bytes(2, "big", signed=False)
            elif kind == 0x06:
                # A sequence of 32-bit signed integers, with each integer encoded as a difference relative to the previous integer. The first integer is stored explicitly.
                if debug:
                    print(f"Difference-encoded 32-bit integers")
                # The first integer is stored explicitly, as a signed value.
                initial_int = common.read_variable_length_integer(stream)
                try:
                    initial = initial_int.to_bytes(4, "big", signed=True)
                except OverflowError:
                    raise common.DecompressError(f"Initial value out of range for 32-bit integer difference encoding: {initial_int:#x}")
                if debug:
                    print(f"\t-> initial: 0x{initial_int:>08x}")
                yield initial

                count = common.read_variable_length_integer(stream)
                # The count comes from the (untrusted) compressed data, so it is validated with a real error rather than an assert, which would be skipped under "python -O".
                if count < 0:
                    raise common.DecompressError(f"Count cannot be negative: {count}")
                # To make the following calculations simpler, the signed initial_int value is converted to unsigned.
                current_int = initial_int & 0xffffffff
                for _ in range(count):
                    # The difference to the previous integer is stored as a variable-length integer, whose value may be negative.
                    diff = common.read_variable_length_integer(stream)
                    # Simulate 32-bit integer wraparound.
                    current_int = (current_int + diff) & 0xffffffff
                    if debug:
                        print(f"\t-> difference {diff:#x}: 0x{current_int:>08x}")
                    yield current_int.to_bytes(4, "big", signed=False)
            else:
                raise common.DecompressError(f"Unknown extended code: 0x{kind:>02x}")
        elif byte == 0xff:
            # End of data marker, always occurs exactly once as the last byte of the compressed data.
            if debug:
                print("End marker")
            # Check that there really is no more data left.
            extra = stream.read(1)
            if extra:
                raise common.DecompressError(f"Extra data encountered after end of data marker (first extra byte: {extra})")
            break
        else:
            raise common.DecompressError(f"Unknown tag byte: 0x{byte:>02x}")
def decompress_stream(header_info: common.CompressedHeaderInfo, stream: typing.BinaryIO, *, debug: bool = False) -> typing.Iterator[bytes]:
    """Decompress compressed data in the format used by 'dcmp' (0).

    Wraps decompress_stream_inner: forwards every decompressed chunk, keeps a running total of the bytes produced,
    and trims one surplus trailing byte when the header declares an odd decompressed length.
    With debug enabled, each chunk and the running total are printed.
    """

    decompressed_length = 0

    for chunk in decompress_stream_inner(header_info, stream, debug=debug):
        if debug:
            print(f"\t-> {chunk}")

        # Nearly all codes generate data in groups of 2 or 4 bytes, so data with an odd length can basically not be represented in this compression format.
        # If the length stored in the header is odd and this chunk would overshoot it by exactly one byte, that surplus byte is dropped.
        overshoots_odd_length_by_one = (
            header_info.decompressed_length % 2 != 0
            and decompressed_length + len(chunk) == header_info.decompressed_length + 1
        )
        trim = 1 if overshoots_odd_length_by_one else 0

        decompressed_length += len(chunk) - trim
        yield chunk[:-1] if trim else chunk

        if debug:
            print(f"Decompressed {decompressed_length:#x} bytes so far")