From 2fb1d02064501434fcc38b7e7aa63dc236046bd4 Mon Sep 17 00:00:00 2001
From: dgelessus <dgelessus@users.noreply.github.com>
Date: Sun, 14 Jul 2019 02:16:49 +0200
Subject: [PATCH] Add initial support for compressed resources

Not all compression formats are supported yet.
---
 README.rst           |   6 +-
 rsrcfork/__init__.py |   3 +-
 rsrcfork/__main__.py |  63 +++++++++----
 rsrcfork/api.py      |  43 +++++++--
 rsrcfork/compress.py | 209 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 298 insertions(+), 26 deletions(-)
 create mode 100644 rsrcfork/compress.py

diff --git a/README.rst b/README.rst
index 45b781e..8aff7e1 100644
--- a/README.rst
+++ b/README.rst
@@ -121,7 +121,11 @@ Changelog
 Version 1.2.0 (next version)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-* (no changes yet)
+* Added support for compressed resources.
+	* Compressed resource data is automatically decompressed, both in the Python API and on the command line.
+	* This is technically a breaking change, since in previous versions the compressed resource data was returned directly. However, this change will not affect end users negatively, unless one has already implemented custom handling for compressed resources.
+	* Currently, only one of the two standard Mac OS resource compression formats is supported. Attempting to access a resource compressed in an unsupported format results in a ``DecompressError``.
+	* To access the raw resource data as stored in the file, without automatic decompression, use the ``res.data_raw`` attribute (for the Python API), or the ``--no-decompress`` option (for the command-line interface). This can be used to read the resource data in its compressed form, even if the compression format is not supported.
 
 Version 1.1.3.post1
 ^^^^^^^^^^^^^^^^^^^
diff --git a/rsrcfork/__init__.py b/rsrcfork/__init__.py
index cceed40..184ef2d 100644
--- a/rsrcfork/__init__.py
+++ b/rsrcfork/__init__.py
@@ -12,10 +12,11 @@ __all__ = [
 	"ResourceAttrs",
 	"ResourceFile",
 	"ResourceFileAttrs",
+	"compress",
 	"open",
 ]
 
-from . import api
+from . import api, compress
 from .api import Resource, ResourceAttrs, ResourceFile, ResourceFileAttrs
 
 # noinspection PyShadowingBuiltins
diff --git a/rsrcfork/__main__.py b/rsrcfork/__main__.py
index 0f9863c..b383a1d 100644
--- a/rsrcfork/__main__.py
+++ b/rsrcfork/__main__.py
@@ -5,7 +5,7 @@ import sys
 import textwrap
 import typing
 
-from . import __version__, api
+from . import __version__, api, compress
 
 # Translation table to replace ASCII non-printable characters with periods.
 _TRANSLATE_NONPRINTABLES = {k: "." for k in [*range(0x20), 0x7f]}
@@ -186,6 +186,7 @@ def main():
 	ap.add_argument("--version", action="version", version=__version__, help="Display version information and exit")
 	ap.add_argument("-a", "--all", action="store_true", help="When no filters are given, show all resources in full, instead of an overview")
 	ap.add_argument("-f", "--fork", choices=["auto", "data", "rsrc"], default="auto", help="The fork from which to read the resource data, or auto to guess (default: %(default)s)")
+	ap.add_argument("--no-decompress", action="store_false", dest="decompress", help="Do not decompress compressed resources, output compressed resource data as-is")
 	ap.add_argument("--format", choices=["dump", "hex", "raw", "derez"], default="dump", help="How to output the resources - human-readable info with hex dump (dump), data only as hex (hex), data only as raw bytes (raw), or like DeRez with no resource definitions (derez)")
 	ap.add_argument("--header-system", action="store_true", help="Output system-reserved header data and nothing else")
 	ap.add_argument("--header-application", action="store_true", help="Output application-specific header data and nothing else")
@@ -239,6 +240,11 @@ def main():
 				sys.exit(1)
 			
 			for res in resources:
+				if ns.decompress:
+					data = res.data
+				else:
+					data = res.data_raw
+				
 				if ns.format == "dump":
 					# Human-readable info and hex dump
 					
@@ -254,24 +260,37 @@ def main():
 					else:
 						attrdesc = "no attributes"
 					
+					if ns.decompress:
+						length_desc = f"{len(res.data)} bytes (stored in {len(res.data_raw)} bytes)"
+					else:
+						length_desc = f"{len(data)} bytes"
+					
 					restype = _bytes_escape(res.resource_type, quote="'")
-					print(f"Resource '{restype}' ({res.resource_id}), {name}, {attrdesc}, {len(res.data)} bytes:")
-					_hexdump(res.data)
+					print(f"Resource '{restype}' ({res.resource_id}), {name}, {attrdesc}, {length_desc}:")
+					_hexdump(data)
 					print()
 				elif ns.format == "hex":
 					# Data only as hex
 					
-					_raw_hexdump(res.data)
+					_raw_hexdump(data)
 				elif ns.format == "raw":
 					# Data only as raw bytes
 					
-					sys.stdout.buffer.write(res.data)
+					sys.stdout.buffer.write(data)
 				elif ns.format == "derez":
 					# Like DeRez with no resource definitions
 					
-					attrs = [_REZ_ATTR_NAMES[attr] for attr in _decompose_flags(res.attributes)]
-					if None in attrs:
-						attrs[:] = [f"${res.attributes.value:02X}"]
+					attrs = list(_decompose_flags(res.attributes))
+					
+					if ns.decompress and api.ResourceAttrs.resCompressed in attrs:
+						attrs.remove(api.ResourceAttrs.resCompressed)
+						attrs_comment = " /* was compressed */"
+					else:
+						attrs_comment = ""
+					
+					attr_descs = [_REZ_ATTR_NAMES[attr] for attr in attrs]
+					if None in attr_descs:
+						attr_descs[:] = [f"${res.attributes.value:02X}"]
 					
 					parts = [str(res.resource_id)]
 					
@@ -279,24 +298,24 @@ def main():
 						name = _bytes_escape(res.name, quote='"')
 						parts.append(f'"{name}"')
 					
-					parts += attrs
+					parts += attr_descs
 					
 					restype = _bytes_escape(res.resource_type, quote="'")
-					print(f"data '{restype}' ({', '.join(parts)}) {{")
+					print(f"data '{restype}' ({', '.join(parts)}{attrs_comment}) {{")
 					
-					for i in range(0, len(res.data), 16):
+					for i in range(0, len(data), 16):
 						# Two-byte grouping is really annoying to implement.
 						groups = []
 						for j in range(0, 16, 2):
-							if i+j >= len(res.data):
+							if i+j >= len(data):
 								break
-							elif i+j+1 >= len(res.data):
-								groups.append(f"{res.data[i+j]:02X}")
+							elif i+j+1 >= len(data):
+								groups.append(f"{data[i+j]:02X}")
 							else:
-								groups.append(f"{res.data[i+j]:02X}{res.data[i+j+1]:02X}")
+								groups.append(f"{data[i+j]:02X}{data[i+j+1]:02X}")
 						
 						s = f'$"{" ".join(groups)}"'
-						comment = "/* " + res.data[i:i + 16].decode("MacRoman").translate(_TRANSLATE_NONPRINTABLES) + " */"
+						comment = "/* " + data[i:i + 16].decode("MacRoman").translate(_TRANSLATE_NONPRINTABLES) + " */"
 						print(f"\t{s:<54s}{comment}")
 					
 					print("};")
@@ -340,7 +359,17 @@ def main():
 						else:
 							attrdesc = "no attributes"
 						
-						print(f"({resid}), {name}, {attrdesc}, {len(res.data)} bytes")
+						if ns.decompress and api.ResourceAttrs.resCompressed in attrs:
+							try:
+								res.data
+							except compress.DecompressError:
+								length_desc = f"decompression failed ({len(res.data_raw)} bytes compressed)"
+							else:
+								length_desc = f"{len(res.data)} bytes ({len(res.data_raw)} bytes compressed)"
+						else:
+							length_desc = f"{len(res.data_raw)} bytes"
+						
+						print(f"({resid}), {name}, {attrdesc}, {length_desc}")
 					print()
 			else:
 				print("No resource types (empty resource file)")
diff --git a/rsrcfork/api.py b/rsrcfork/api.py
index 2e14058..ac2dd16 100644
--- a/rsrcfork/api.py
+++ b/rsrcfork/api.py
@@ -5,6 +5,8 @@ import os
 import struct
 import typing
 
+from . import compress
+
 # The formats of all following structures is as described in the Inside Macintosh book (see module docstring).
 # Signedness and byte order of the integers is never stated explicitly in IM.
 # All integers are big-endian, as this is the native byte order of the 68k and PowerPC processors used in old Macs.
@@ -89,9 +91,9 @@ class ResourceAttrs(enum.Flag):
 class Resource(object):
 	"""A single resource from a resource file."""
 	
-	__slots__ = ("resource_type", "resource_id", "name", "attributes", "data")
+	__slots__ = ("resource_type", "resource_id", "name", "attributes", "data_raw", "_data_decompressed")
 	
-	def __init__(self, resource_type: bytes, resource_id: int, name: typing.Optional[bytes], attributes: ResourceAttrs, data: bytes):
+	def __init__(self, resource_type: bytes, resource_id: int, name: typing.Optional[bytes], attributes: ResourceAttrs, data_raw: bytes):
 		"""Create a new resource with the given type code, ID, name, attributes, and data."""
 		
 		super().__init__()
@@ -100,15 +102,42 @@ class Resource(object):
 		self.resource_id: int = resource_id
 		self.name: typing.Optional[bytes] = name
 		self.attributes: ResourceAttrs = attributes
-		self.data: bytes = data
+		self.data_raw: bytes = data_raw
 	
 	def __repr__(self):
-		if len(self.data) > 32:
-			data = f"<{len(self.data)} bytes: {self.data[:32]}...>"
+		try:
+			data = self.data
+		except compress.DecompressError:
+			decompress_ok = False
+			data = self.data_raw
 		else:
-			data = repr(self.data)
+			decompress_ok = True
 		
-		return f"{type(self).__module__}.{type(self).__qualname__}(resource_type={self.resource_type}, resource_id={self.resource_id}, name={self.name}, attributes={self.attributes}, data={data})"
+		if len(data) > 32:
+			data_repr = f"<{len(data)} bytes: {data[:32]}...>"
+		else:
+			data_repr = repr(data)
+		
+		if not decompress_ok:
+			data_repr = f"<decompression failed - compressed data: {data_repr}>"
+		
+		return f"{type(self).__module__}.{type(self).__qualname__}(resource_type={self.resource_type}, resource_id={self.resource_id}, name={self.name}, attributes={self.attributes}, data={data_repr})"
+	
+	@property
+	def data(self) -> bytes:
+		"""The resource data, decompressed if necessary.
+		
+		Accessing this attribute may raise a DecompressError if the resource data is compressed and could not be decompressed. To access the compressed resource data, use the data_raw attribute.
+		"""
+		
+		if ResourceAttrs.resCompressed in self.attributes:
+			try:
+				return self._data_decompressed
+			except AttributeError:
+				self._data_decompressed = compress.decompress(self.data_raw)
+				return self._data_decompressed
+		else:
+			return self.data_raw
 
 class ResourceFile(collections.abc.Mapping):
 	"""A resource file reader operating on a byte stream."""
diff --git a/rsrcfork/compress.py b/rsrcfork/compress.py
new file mode 100644
index 0000000..eff07e2
--- /dev/null
+++ b/rsrcfork/compress.py
@@ -0,0 +1,209 @@
+import enum
+import struct
+import typing
+
+__all__ = [
+	"DecompressError",
+	"decompress",
+]
+
+# The signature of all compressed resource data, 0xa89f6572 in hex, or "®üer" in MacRoman.
+COMPRESSED_SIGNATURE = b"\xa8\x9fer"
+# The compression type commonly used for System file resources.
+COMPRESSED_TYPE_SYSTEM = 0x0901
+
+# Header for a compressed resource.
+# 4 bytes: Signature (see above).
+# 2 bytes: Length of the header. (This meaning is just a guess - the field's value is always 0x0012, so there's no way to know for certain what it means.)
+# 2 bytes: Compression type. Known so far: 0x0901 is used in the System file's resources. 0x0801 is used in other files' resources. Currently only the first type is supported.
+# 4 bytes: Length of the data after decompression.
+# 2 bytes: The ID of the 'dcmp' resource that can decompress this resource. Currently only ID 2 is supported.
+# 2 bytes: Unknown meaning, doesn't appear to have any effect on the decompression algorithm. Usually zero, sometimes set to a small integer (< 10). On 'lpch' resources, the value is always nonzero, and sometimes larger than usual.
+# 1 byte: Number of entries in the custom lookup table minus one. Set to zero if the default lookup table is used.
+# 1 byte: Flags. See the CompressedFlags enum below for details.
+STRUCT_COMPRESSED_HEADER = struct.Struct(">4sHHIhHBB")
+
+# Default lookup table for compressed resources.
+# If the custom table flag is set, a custom table (usually with fewer than 256 entries) is used instead of this one.
+# This table was obtained by decompressing a manually created compressed resource that refers to every possible table entry. Detailed steps:
+# 1. Create a file with a resource fork
+# 2. Add a resource with the following contents: b'\xa8\x9fer\x00\x12\t\x01\x00\x00\x02\x00\x00\x02\x00\x00\x00\x00' + bytes(range(256))
+# 3. Set the "compressed" flag (0x01) on the resource
+# 4. Open the file in ResEdit
+# 5. Duplicate the resource - this will decompress the original resource and write its contents uncompressed into the duplicate
+# 6. Read the data from the duplicated resource
+COMPRESSED_DEFAULT_TABLE_DATA = (
+	b"\x00\x00\x00\x08N\xba nNu\x00\x0c\x00\x04p\x00"
+	b"\x00\x10\x00\x02Hn\xff\xfc`\x00\x00\x01H\xe7/."
+	b"NV\x00\x06N^/\x00a\x00\xff\xf8/\x0b\xff\xff"
+	b"\x00\x14\x00\n\x00\x18 _\x00\x0e P?<\xff\xf4"
+	b"L\xee0.g\x00L\xdf&n\x00\x12\x00\x1cBg"
+	b"\xff\xf00</\x0c\x00\x03N\xd0\x00 p\x01\x00\x16"
+	b"-@H\xc0 xr\x00X\x8ff\x00O\xefB\xa7"
+	b"g\x06\xff\xfaU\x8f(n?\x00\xff\xfe/<g\x04"
+	b"Y\x8f k\x00$ \x1fA\xfa\x81\xe1f\x04g\x08"
+	b"\x00\x1aN\xb9P\x8f .\x00\x07N\xb0\xff\xf2=@"
+	b"\x00\x1e hf\x06\xff\xf6N\xf9\x08\x00\x0c@=|"
+	b"\xff\xec\x00\x05 <\xff\xe8\xde\xfcJ.\x000\x00("
+	b"/\x08 \x0b`\x02Bn-H S @\x18\x00"
+	b"`\x04A\xee/(/\x01g\nH@ \x07f\x08"
+	b"\x01\x18/\x070(?.0+\"n/+\x00,"
+	b"g\x0c\"_`\x06\x00\xff0\x07\xff\xeeS@\x00@"
+	b"\xff\xe4J@f\n\x00\x0fN\xadp\xff\"\xd8Hk"
+	b"\x00\" Kg\x0eJ\xaeN\x90\xff\xe0\xff\xc0\x00*"
+	b"'@g\x02Q\xc8\x02\xb6Hz\"x\xb0n\xff\xe6"
+	b"\x00\t2.>\x00HA\xff\xeaC\xeeNqt\x00"
+	b"/, l\x00<\x00&\x00P\x18\x800\x1f\"\x00"
+	b"f\x0c\xff\xda\x008f\x020, \x0c-nB@"
+	b"\xff\xe2\xa9\xf0\xff\x007|\xe5\x80\xff\xdcHhYO"
+	b"\x004>\x1f`\x08/\x06\xff\xde`\np\x02\x002"
+	b"\xff\xcc\x00\x80\"Q\x10\x1f1|\xa0)\xff\xd8R@"
+	b"\x01\x00g\x10\xa0#\xff\xce\xff\xd4 \x06Hx\x00."
+	b"POC\xfag\x12v\x00A\xe8Jn \xd9\x00Z"
+	b"\x7f\xffQ\xca\x00\\.\x00\x02@H\xc7g\x14\x0c\x80"
+	b".\x9f\xff\xd6\x80\x00\x10\x00HBJk\xff\xd2\x00H"
+	b"JGN\xd1 o\x00A`\x0c*xB.2\x00"
+	b"etg\x16\x00DHm \x08Hl\x0b|&@"
+	b"\x04\x00\x00h m\x00\r*@\x00\x0b\x00>\x02 "
+)
+COMPRESSED_DEFAULT_TABLE = [COMPRESSED_DEFAULT_TABLE_DATA[i:i + 2] for i in range(0, len(COMPRESSED_DEFAULT_TABLE_DATA), 2)]
+
+
+class CompressedFlags(enum.Flag):
+	TAGGED = 1 << 1 # The compressed data is tagged, meaning that it consists of "blocks" of a tag byte followed by 8 table references and/or literals. See comments in the decompress function for details.
+	CUSTOM_TABLE = 1 << 0 # A custom lookup table is included before the compressed data, which is used instead of the default table.
+
+
+class DecompressError(Exception):
+	"""Raised when resource data decompression fails, because the data is invalid or the compression type is not supported."""
+
+
+def _split_bits(i: int) -> typing.Tuple[bool, bool, bool, bool, bool, bool, bool, bool]:
+	"""Split a byte (an int) into its 8 bits (a tuple of 8 bools)."""
+	
+	assert i in range(256)
+	return (
+		bool(i & (1 << 7)),
+		bool(i & (1 << 6)),
+		bool(i & (1 << 5)),
+		bool(i & (1 << 4)),
+		bool(i & (1 << 3)),
+		bool(i & (1 << 2)),
+		bool(i & (1 << 1)),
+		bool(i & (1 << 0)),
+	)
+
+
+def _decompress_untagged(data: bytes, decompressed_length: int, table: typing.Sequence[bytes], *, debug: bool=False) -> bytes:
+	parts = []
+	i = 0
+	while i < len(data):
+		if i == len(data) - 1 and decompressed_length % 2 != 0:
+			# Special case: if we are at the last byte of the compressed data, and the decompressed data has an odd length, the last byte is a single literal byte, and not a table reference.
+			if debug:
+				print(f"Last byte: {data[-1:]}")
+			parts.append(data[-1:])
+			break
+		
+		# Compressed data is untagged, every byte is a table reference.
+		if debug:
+			print(f"Reference: {data[i]} -> {table[data[i]]}")
+		parts.append(table[data[i]])
+		i += 1
+	
+	return b"".join(parts)
+
+def _decompress_tagged(data: bytes, decompressed_length: int, table: typing.Sequence[bytes], *, debug: bool=False) -> bytes:
+	parts = []
+	i = 0
+	while i < len(data):
+		if i == len(data) - 1 and decompressed_length % 2 != 0:
+			# Special case: if we are at the last byte of the compressed data, and the decompressed data has an odd length, the last byte is a single literal byte, and not a tag or a table reference.
+			if debug:
+				print(f"Last byte: {data[-1:]}")
+			parts.append(data[-1:])
+			break
+		
+		# Compressed data is tagged, each tag byte is followed by 8 table references and/or literals.
+		tag = data[i]
+		if debug:
+			print(f"Tag: 0b{tag:>08b}")
+		i += 1
+		for is_ref in _split_bits(tag):
+			if is_ref:
+				# This is a table reference (a single byte that is an index into the table).
+				if debug:
+					print(f"Reference: {data[i]} -> {table[data[i]]}")
+				parts.append(table[data[i]])
+				i += 1
+			else:
+				# This is a literal (two uncompressed bytes that are literally copied into the output).
+				# Note: if i == len(data)-1, the literal is actually only a single byte long.
+				# This case is handled automatically - the slice extends one byte past the end of the data, and only one byte is returned.
+				if debug:
+					print(f"Literal: {data[i:i+2]}")
+				parts.append(data[i:i + 2])
+				i += 2
+			
+			# If the end of the compressed data is reached in the middle of a chunk, all further tag bits are ignored (they should be zero) and decompression ends.
+			if i >= len(data):
+				break
+	
+	return b"".join(parts)
+
+
+def decompress(data: bytes, *, debug: bool=False) -> bytes:
+	"""Decompress the given compressed resource data."""
+	
+	try:
+		signature, header_length, compression_type, decompressed_length, dcmp_id, unknown, table_count_m1, flags_raw = STRUCT_COMPRESSED_HEADER.unpack_from(data, offset=0)
+	except struct.error:
+		raise DecompressError(f"Invalid header")
+	if signature != COMPRESSED_SIGNATURE:
+		raise DecompressError(f"Invalid signature: {signature!r}, expected {COMPRESSED_SIGNATURE}")
+	if header_length != STRUCT_COMPRESSED_HEADER.size:
+		raise DecompressError(f"Unsupported header length: 0x{header_length:>04x}, expected 0x{STRUCT_COMPRESSED_HEADER.size:>04x}")
+	if compression_type != COMPRESSED_TYPE_SYSTEM:
+		raise DecompressError(f"Unsupported compression type: 0x{compression_type:>04x}, expected 0x{COMPRESSED_TYPE_SYSTEM:>04x}")
+	if dcmp_id != 2:
+		raise DecompressError(f"Unsupported 'dcmp' ID: {dcmp_id}, expected 2")
+	if debug:
+		print(f"Value of unknown field at bytes 0xc-0xe: 0x{unknown:>04x}")
+	
+	table_count = table_count_m1 + 1
+	if debug:
+		print(f"Table has {table_count} entries")
+	
+	try:
+		flags = CompressedFlags(flags_raw)
+	except ValueError:
+		raise DecompressError(f"Unsupported flags set: 0b{flags_raw:>08b}, currently only bits 0 and 1 are supported")
+	
+	if debug:
+		print(f"Flags: {flags}")
+	
+	if CompressedFlags.CUSTOM_TABLE in flags:
+		table_start = STRUCT_COMPRESSED_HEADER.size
+		data_start = table_start + table_count * 2
+		table = []
+		for i in range(table_start, data_start, 2):
+			table.append(data[i:i + 2])
+		if debug:
+			print(f"Using custom table: {table}")
+	else:
+		if table_count_m1 != 0:
+			raise DecompressError(f"table_count_m1 field is {table_count_m1}, but must be zero when the default table is used")
+		table = COMPRESSED_DEFAULT_TABLE
+		data_start = STRUCT_COMPRESSED_HEADER.size
+		if debug:
+			print("Using default table")
+	
+	if CompressedFlags.TAGGED in flags:
+		decompress_func = _decompress_tagged
+	else:
+		decompress_func = _decompress_untagged
+	
+	decompressed = decompress_func(data[data_start:], decompressed_length, table, debug=debug)
+	if len(decompressed) != decompressed_length:
+		raise DecompressError(f"Actual length of decompressed data ({len(decompressed)}) does not match length stored in resource ({decompressed_length})")
+	return decompressed