From 11264214ab3bb4afab95e935060db8061760b571 Mon Sep 17 00:00:00 2001 From: Elliot Nunn Date: Sat, 9 Mar 2019 15:53:05 +0800 Subject: [PATCH] Initial commit --- .gitignore | 8 + bin/prclc | 24 +++ bin/prcldump | 53 ++++++ setup.py | 31 ++++ speedups/fast_lzss.c | 370 +++++++++++++++++++++++++++++++++++++++ tbxi/__init__.py | 0 tbxi/lowlevel.py | 13 ++ tbxi/namedtuplestruct.py | 29 +++ tbxi/prclc.py | 199 +++++++++++++++++++++ tbxi/prcldump.py | 190 ++++++++++++++++++++ tbxi/slow_lzss.py | 263 ++++++++++++++++++++++++++++ tbxi/stringstruct.py | 24 +++ 12 files changed, 1204 insertions(+) create mode 100644 .gitignore create mode 100644 bin/prclc create mode 100644 bin/prcldump create mode 100644 setup.py create mode 100644 speedups/fast_lzss.c create mode 100644 tbxi/__init__.py create mode 100644 tbxi/lowlevel.py create mode 100644 tbxi/namedtuplestruct.py create mode 100644 tbxi/prclc.py create mode 100644 tbxi/prcldump.py create mode 100644 tbxi/slow_lzss.py create mode 100644 tbxi/stringstruct.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1907893 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +MANIFEST +dist/ +build/ +*.dmg +*.pyc +*egg-info/ +__pycache__/ +.DS_Store diff --git a/bin/prclc b/bin/prclc new file mode 100644 index 0000000..eba45cd --- /dev/null +++ b/bin/prclc @@ -0,0 +1,24 @@ +import argparse +import os +from os import path +from sys import stderr + +from tbxi.prclc import compile + + +parser = argparse.ArgumentParser(description=''' + Parcel blob compiler +''') + +parser.add_argument('source', nargs='?', default=os.getcwd(), help='Parcelfile or directory') +parser.add_argument('-o', metavar='dest-file', default='MacOSROM', help='output file (default: MacOSROM)') + +args = parser.parse_args() + +if path.isdir(args.source): + args.source = path.join(args.source, 'Parcelfile') + +result = compile(args.source) + +with open(args.o, 'wb') as f: + f.write(result) diff --git a/bin/prcldump b/bin/prcldump new file mode 100644 index 0000000..9ee73c4 --- /dev/null +++ b/bin/prcldump @@ -0,0 +1,53 @@ +import argparse +import os +from os import path +from sys import stderr + +from tbxi.lowlevel import MAGIC +from tbxi.prcldump import dump + + +parser = argparse.ArgumentParser(description=''' + Dump a MacOS parcel blob (magic number 0x7072636C 'prcl') to a + plain-text Parcelfile and several decompressed binaries. This output + can be rebuilt using the Parcel Compiler (prclc). Usually parcel + blobs are found embedded inside a file called "Mac OS ROM", although + the Blue Box uses them in isolation. As a convenience this utility + will search for the magic number inside any input file (with a + warning). 
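+    Use -d to choose the directory that the Parcelfile and binaries are
+    dumped into, or -f to choose the exact path of the Parcelfile itself.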
+''') + +parser.add_argument('source', nargs=1, help='file to be decompiled') + +meg = parser.add_mutually_exclusive_group() +meg.add_argument('-d', metavar='dest-dir', help='output directory (Parcelfile will be created within)') +meg.add_argument('-f', metavar='dest-file', help='output file (binaries will go in parent directory)') + +args = parser.parse_args() + +with open(args.source[0], 'rb') as f: + binary = f.read() + +if not binary.startswith(MAGIC): + try: + offset = binary.index(MAGIC) + except ValueError: + print('Not a parcels file', file=stderr) + exit(1) + else: + print('Warning: parcel blob wrapped at offset 0x%x' % offset) + binary = binary[offset:] + +if args.f: + dest_file = path.abspath(args.f) + dest_dir = path.dirname(dest_file) +elif args.d: + dest_dir = path.abspath(args.d) + dest_file = path.join(dest_dir, 'Parcelfile') +else: + dest_dir = path.abspath(args.source[0].rstrip(path.sep) + '-dump') + dest_file = path.join(dest_dir, 'Parcelfile') + +os.makedirs(dest_dir, exist_ok=True) + +dump(binary, dest_file, dest_dir) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4c3bbc9 --- /dev/null +++ b/setup.py @@ -0,0 +1,31 @@ +from setuptools import setup, Extension + +setup_args = dict( + name='tbxi', + version='0.1', + author='Elliot Nunn', + author_email='elliotnunn@fastmail.com', + description='Tools to compile and inspect Mac OS 8/9 NewWorld ROM images', + url='https://github.com/elliotnunn/tbxi', + classifiers=[ + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: C', + 'Operating System :: OS Independent', + 'Development Status :: 3 - Alpha', + ], + packages=['tbxi'], + scripts=['bin/prclc', 'bin/prcldump'], + ext_modules=[Extension('tbxi.fast_lzss', ['speedups/fast_lzss.c'])], +) + +# http://charlesleifer.com/blog/misadventures-in-python-packaging-optional-c-extensions/ + +# Yes, it might be a bit extreme to catch SystemExit to find a compiler error... + +try: + setup(**setup_args) +except (SystemExit, Exception): + setup_args.pop('ext_modules') + setup(**setup_args) +else: + exit() diff --git a/speedups/fast_lzss.c b/speedups/fast_lzss.c new file mode 100644 index 0000000..357f8ca --- /dev/null +++ b/speedups/fast_lzss.c @@ -0,0 +1,370 @@ +#define PY_SSIZE_T_CLEAN 1 +#include + +#define LARGE_BUFFER 0x1000000 + +#include +#include +#include + +/************************************************************** + LZSS.C -- A Data Compression Program +*************************************************************** + 4/6/1989 Haruhiko Okumura + Use, distribute, and modify this program freely. + Please send me your improved versions. + PC-VAN SCIENCE + NIFTY-Serve PAF01022 + CompuServe 74050,1022 +**************************************************************/ + +#define N 4096 /* size of ring buffer - must be power of 2 */ +#define F 18 /* upper limit for match_length */ +#define THRESHOLD 2 /* encode string into position and length + if match_length is greater than this */ +#define NIL N /* index for root of binary search trees */ + +struct encode_state { + /* + * left & right children & parent. These constitute binary search trees. + */ + int lchild[N + 1], rchild[N + 257], parent[N + 1]; + + /* ring buffer of size N, with extra F-1 bytes to aid string comparison */ + uint8_t text_buf[N + F - 1]; + + /* + * match_length of longest match. + * These are set by the insert_node() procedure. 
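+     * (match_position indexes into text_buf; match_length is at most F.)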
+ */ + int match_position, match_length; +}; + + +int +decompress_lzss(uint8_t *dst, uint8_t *src, uint32_t srclen) +{ + /* ring buffer of size N, with extra F-1 bytes to aid string comparison */ + uint8_t text_buf[N + F - 1]; + uint8_t *dststart = dst; + uint8_t *srcend = src + srclen; + int i, j, k, r, c; + unsigned int flags; + + dst = dststart; + srcend = src + srclen; + for (i = 0; i < N - F; i++) + text_buf[i] = ' '; + r = N - F; + flags = 0; + for ( ; ; ) { + if (((flags >>= 1) & 0x100) == 0) { + if (src < srcend) c = *src++; else break; + flags = c | 0xFF00; /* uses higher byte cleverly */ + } /* to count eight */ + if (flags & 1) { + if (src < srcend) c = *src++; else break; + *dst++ = c; + text_buf[r++] = c; + r &= (N - 1); + } else { + if (src < srcend) i = *src++; else break; + if (src < srcend) j = *src++; else break; + i |= ((j & 0xF0) << 4); + j = (j & 0x0F) + THRESHOLD; + for (k = 0; k <= j; k++) { + c = text_buf[(i + k) & (N - 1)]; + *dst++ = c; + text_buf[r++] = c; + r &= (N - 1); + } + } + } + + return dst - dststart; +} + +/* + * initialize state, mostly the trees + * + * For i = 0 to N - 1, rchild[i] and lchild[i] will be the right and left + * children of node i. These nodes need not be initialized. Also, parent[i] + * is the parent of node i. These are initialized to NIL (= N), which stands + * for 'not used.' For i = 0 to 255, rchild[N + i + 1] is the root of the + * tree for strings that begin with character i. These are initialized to NIL. + * Note there are 256 trees. */ +static void init_state(struct encode_state *sp) +{ + int i; + + memset(sp, 0, sizeof(*sp)); + + for (i = 0; i < N - F; i++) + sp->text_buf[i] = ' '; + for (i = N + 1; i <= N + 256; i++) + sp->rchild[i] = NIL; + for (i = 0; i < N; i++) + sp->parent[i] = NIL; +} + +/* + * Inserts string of length F, text_buf[r..r+F-1], into one of the trees + * (text_buf[r]'th tree) and returns the longest-match position and length + * via the global variables match_position and match_length. + * If match_length = F, then removes the old node in favor of the new one, + * because the old one will be deleted sooner. Note r plays double role, + * as tree node and position in buffer. 
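+ * Each of the 256 trees is rooted at rchild[N + 1 + c], where c is the
+ * first byte of the inserted string.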
+ */ +static void insert_node(struct encode_state *sp, int r) +{ + int i, p, cmp; + uint8_t *key; + + cmp = 1; + key = &sp->text_buf[r]; + p = N + 1 + key[0]; + sp->rchild[r] = sp->lchild[r] = NIL; + sp->match_length = 0; + for ( ; ; ) { + if (cmp >= 0) { + if (sp->rchild[p] != NIL) + p = sp->rchild[p]; + else { + sp->rchild[p] = r; + sp->parent[r] = p; + return; + } + } else { + if (sp->lchild[p] != NIL) + p = sp->lchild[p]; + else { + sp->lchild[p] = r; + sp->parent[r] = p; + return; + } + } + for (i = 1; i < F; i++) { + if ((cmp = key[i] - sp->text_buf[p + i]) != 0) + break; + } + if (i > sp->match_length) { + sp->match_position = p; + if ((sp->match_length = i) >= F) + break; + } + } + sp->parent[r] = sp->parent[p]; + sp->lchild[r] = sp->lchild[p]; + sp->rchild[r] = sp->rchild[p]; + sp->parent[sp->lchild[p]] = r; + sp->parent[sp->rchild[p]] = r; + if (sp->rchild[sp->parent[p]] == p) + sp->rchild[sp->parent[p]] = r; + else + sp->lchild[sp->parent[p]] = r; + sp->parent[p] = NIL; /* remove p */ +} + +/* deletes node p from tree */ +static void delete_node(struct encode_state *sp, int p) +{ + int q; + + if (sp->parent[p] == NIL) + return; /* not in tree */ + if (sp->rchild[p] == NIL) + q = sp->lchild[p]; + else if (sp->lchild[p] == NIL) + q = sp->rchild[p]; + else { + q = sp->lchild[p]; + if (sp->rchild[q] != NIL) { + do { + q = sp->rchild[q]; + } while (sp->rchild[q] != NIL); + sp->rchild[sp->parent[q]] = sp->lchild[q]; + sp->parent[sp->lchild[q]] = sp->parent[q]; + sp->lchild[q] = sp->lchild[p]; + sp->parent[sp->lchild[p]] = q; + } + sp->rchild[q] = sp->rchild[p]; + sp->parent[sp->rchild[p]] = q; + } + sp->parent[q] = sp->parent[p]; + if (sp->rchild[sp->parent[p]] == p) + sp->rchild[sp->parent[p]] = q; + else + sp->lchild[sp->parent[p]] = q; + sp->parent[p] = NIL; +} + +uint8_t * +compress_lzss(uint8_t *dst, size_t dstlen, uint8_t *src, size_t srcLen) +{ + /* Encoding state, mostly tree but some current match stuff */ + struct encode_state *sp; + + int i, c, len, r, s, last_match_length, code_buf_ptr; + uint8_t code_buf[17], mask; + uint8_t *srcend = src + srcLen; + uint8_t *dstend = dst + dstlen; + + /* initialize trees */ + sp = (struct encode_state *) malloc(sizeof(*sp)); + init_state(sp); + + /* + * code_buf[1..16] saves eight units of code, and code_buf[0] works + * as eight flags, "1" representing that the unit is an unencoded + * letter (1 byte), "" a position-and-length pair (2 bytes). + * Thus, eight units require at most 16 bytes of code. + */ + code_buf[0] = 0; + code_buf_ptr = mask = 1; + + /* Clear the buffer with any character that will appear often. */ + s = 0; r = N - F; + + /* Read F bytes into the last F bytes of the buffer */ + for (len = 0; len < F && src < srcend; len++) + sp->text_buf[r + len] = *src++; + if (!len) { + free(sp); + return (void *) 0; /* text of size zero */ + } + /* + * Insert the F strings, each of which begins with one or more + * 'space' characters. Note the order in which these strings are + * inserted. This way, degenerate trees will be less likely to occur. + */ + for (i = 1; i <= F; i++) + insert_node(sp, r - i); + + /* + * Finally, insert the whole string just read. + * The global variables match_length and match_position are set. + */ + insert_node(sp, r); + do { + /* match_length may be spuriously long near the end of text. */ + if (sp->match_length > len) + sp->match_length = len; + if (sp->match_length <= THRESHOLD) { + sp->match_length = 1; /* Not long enough match. Send one byte. 
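+                (Flag bit 1 marks a literal like this one; flag bit 0
+                marks a position/length pair, packed as byte1 = position
+                bits 7..0, byte2 = position bits 11..8 in the high nibble
+                | (length - 3) in the low nibble.)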
*/ + code_buf[0] |= mask; /* 'send one byte' flag */ + code_buf[code_buf_ptr++] = sp->text_buf[r]; /* Send uncoded. */ + } else { + /* Send position and length pair. Note match_length > THRESHOLD. */ + code_buf[code_buf_ptr++] = (uint8_t) sp->match_position; + code_buf[code_buf_ptr++] = (uint8_t) + ( ((sp->match_position >> 4) & 0xF0) + | (sp->match_length - (THRESHOLD + 1)) ); + } + if ((mask <<= 1) == 0) { /* Shift mask left one bit. */ + /* Send at most 8 units of code together */ + for (i = 0; i < code_buf_ptr; i++) + if (dst < dstend) + *dst++ = code_buf[i]; + else { + free(sp); + return (void *) 0; + } + code_buf[0] = 0; + code_buf_ptr = mask = 1; + } + last_match_length = sp->match_length; + for (i = 0; i < last_match_length && src < srcend; i++) { + delete_node(sp, s); /* Delete old strings and */ + c = *src++; + sp->text_buf[s] = c; /* read new bytes */ + + /* + * If the position is near the end of buffer, extend the buffer + * to make string comparison easier. + */ + if (s < F - 1) + sp->text_buf[s + N] = c; + + /* Since this is a ring buffer, increment the position modulo N. */ + s = (s + 1) & (N - 1); + r = (r + 1) & (N - 1); + + /* Register the string in text_buf[r..r+F-1] */ + insert_node(sp, r); + } + while (i++ < last_match_length) { + delete_node(sp, s); + + /* After the end of text, no need to read, */ + s = (s + 1) & (N - 1); + r = (r + 1) & (N - 1); + /* but buffer may not be empty. */ + if (--len) + insert_node(sp, r); + } + } while (len > 0); /* until length of string to be processed is zero */ + + if (code_buf_ptr > 1) { /* Send remaining code. */ + for (i = 0; i < code_buf_ptr; i++) + if (dst < dstend) + *dst++ = code_buf[i]; + else { + free(sp); + return (void *) 0; + } + } + + free(sp); + return dst; +} + + +/* Python wrapper stuff happens here */ + + +static PyObject *wrap_compress(PyObject *self, PyObject *args) +{ + uint8_t *src, *dst, *returned; + Py_ssize_t src_len, dst_len; + + if(!PyArg_ParseTuple(args, "y#", &src, &src_len)) { + PyErr_SetString(PyExc_ValueError, "bad args"); return NULL; + } + + /* Now, we guess how long the object goes (naughty!) 
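+       That is, we assume the compressed result fits in LARGE_BUFFER
+       (16 MiB). compress_lzss returns NULL rather than overrun the
+       buffer (or when the input is empty), and that surfaces below
+       as a MemoryError.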
*/ + dst = malloc(LARGE_BUFFER); + if (dst == NULL) { + return PyErr_NoMemory(); + } + + returned = compress_lzss(dst, (size_t)LARGE_BUFFER, src, (size_t)src_len); + if (returned == NULL) { + free(dst); + return PyErr_NoMemory(); + } + + dst_len = returned - dst; + + PyObject *retval = PyBytes_FromStringAndSize((const char *)dst, dst_len); + free(dst); + + return retval; +} + +static PyMethodDef module_methods[] = { + {"compress", wrap_compress, METH_VARARGS, NULL}, + {NULL, NULL, 0, NULL} +}; + +static struct PyModuleDef this_module = { + PyModuleDef_HEAD_INIT, + "fast_lzss", + "Fast Lempel-Ziv-Storer-Szymanski compression", + -1, + module_methods +}; + +PyMODINIT_FUNC PyInit_fast_lzss(void) +{ + return PyModule_Create(&this_module); +} diff --git a/tbxi/__init__.py b/tbxi/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tbxi/lowlevel.py b/tbxi/lowlevel.py new file mode 100644 index 0000000..1ac329e --- /dev/null +++ b/tbxi/lowlevel.py @@ -0,0 +1,13 @@ +from .stringstruct import StringStruct +from .namedtuplestruct import NamedTupleStruct + +class MyParcelStruct(NamedTupleStruct, StringStruct): + pass + +MAGIC = b'prcl\x01\x00\x00\x00' + +PrclNodeStruct = MyParcelStruct('>I 4s I I I I 32s 32s', name='PrclNodeStruct', + fields=['link', 'ostype', 'hdr_size', 'flags', 'n_children', 'child_size', 'a', 'b']) + +PrclChildStruct = MyParcelStruct('>4s I 4s I I I I 32s', name='PrclChildStruct', + fields=['ostype', 'flags', 'compress', 'unpackedlen', 'cksum', 'packedlen', 'ptr', 'name']) diff --git a/tbxi/namedtuplestruct.py b/tbxi/namedtuplestruct.py new file mode 100644 index 0000000..9b1991d --- /dev/null +++ b/tbxi/namedtuplestruct.py @@ -0,0 +1,29 @@ +import struct +from collections import namedtuple + +class NamedTupleStruct(struct.Struct): + """A Struct that works with namedtuple instead of tuple""" + + def __init__(self, *args, name=None, fields=None, **kwargs): + self.__namedtuple = namedtuple(name, fields) + super().__init__(*args, **kwargs) + + def __tuplify(self, *args, **kwargs): + kwargs = {k:v for (k,v) in kwargs.items() if k in self.__namedtuple._fields} + return self.__namedtuple(*args, **kwargs) + + def unpack(self, *args, **kwargs): + orig = super().unpack(*args, **kwargs) + return self.__namedtuple(*orig) + + def unpack_from(self, *args, **kwargs): + orig = super().unpack_from(*args, **kwargs) + return self.__namedtuple(*orig) + + def pack(self, *args, **kwargs): + nt = self.__tuplify(*args, **kwargs) + return super().pack(*nt) + + def pack_into(self, buf, offset, *args, **kwargs): + nt = self.__tuplify(*args, **kwargs) + return super().pack_into(buf, offset, *nt) diff --git a/tbxi/prclc.py b/tbxi/prclc.py new file mode 100644 index 0000000..85766d8 --- /dev/null +++ b/tbxi/prclc.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 + +from shlex import split +from os import path +import struct +from binascii import crc32 + +from .lowlevel import PrclNodeStruct, PrclChildStruct, MAGIC + +try: + from .fast_lzss import compress +except ImportError: + from .slow_lzss import compress + +class CodeLine(dict): + def __getattr__(self, attrname): + return self[attrname] + + def __setattr__(self, attrname, attrval): + self[attrname] = attrval + + +def get_indent_level(from_str): + if from_str.startswith('\t\t'): + return 2 + elif from_str.startswith('\t'): + return 1 + else: + return 0 + +def get_keys(from_list, **available): + ret = CodeLine() + + for k, v in available.items(): + ret[k] = v('') + + for i in from_list: + k, _, v = i.partition('=') + fmt = available[k] + ret[k] 
= fmt(v) + + return ret + +def gethex(from_str): + if not from_str: return 0 + if from_str.lower().startswith('0x'): + return int(from_str[2:], base=16) + else: + return int(from_str) + +def getbool(from_str): + from_str = from_str.lower() + if from_str.strip() in ('', 'no', 'n', 'false', 'f', '0'): + return False + else: + return True + +class PdslParseError(Exception): + pass + +def load_and_cache_path(from_path): + # No compression, easy + if not from_path.lower().endswith('.lzss'): + with open(from_path, 'rb') as f: + return f.read() + + # Compression, first try to read cached file + try: + f = open(from_path, 'rb') + except FileNotFoundError: + pass + else: + try: + orig_t = path.getmtime(from_path[:-5]) + except FileNotFoundError: + orig_t = None + + if orig_t is None or orig_t < path.getmtime(from_path): + data = f.read() + f.close() + return data + + # Compression, no valid cached file available + with open(from_path[:-5], 'rb') as f: + data = compress(f.read()) + + with open(from_path, 'wb') as f: + f.write(data) + + return data + +def compile(src): + parent = path.dirname(path.abspath(src)) + node_list = [] + + with open(src) as f: + try: + for line_num, line in enumerate(f, start=1): + level = get_indent_level(line) + pieces = split(line, comments=True, posix=True) + + if not pieces: continue + + if level == 0: + # parcel node + new = get_keys(pieces[1:], flags=gethex, a=str, b=str) + new.ostype = pieces[0] + new.children = [] + node_list.append(new) + + elif level == 1: + # parcel child + new = get_keys(pieces[1:], flags=gethex, name=str, src=str, deduplicate=getbool) + new.ostype = pieces[0] + new.data = bytearray() + new.compress = '' + + if new.src: + if not path.isabs(new.src): # look rel to Parcelfile + new.src = path.join(path.dirname(src), new.src) + + if new.src.lower().endswith('.lzss'): + new.compress = 'lzss' + + new.data = load_and_cache_path(new.src) + + node_list[-1].children.append(new) + + elif level == 2: + # some C strings to add to the data + assert not node_list[-1].children[-1].src + for x in pieces: + node_list[-1].children[-1].data.extend(x.encode('mac_roman') + b'\0') + + except: + raise PdslParseError('Line %d' % line_num) + + # Great! Now that we have this cool data structure, turn it into parcels... + accum = bytearray() + + accum.extend(MAGIC) + accum.extend(b'\x00\x00\x00\x14') + hdr_ptr = len(accum) + accum.extend(bytes(4)) + accum.extend(bytes(4)) + + dedup_dict = {} + + for node in node_list: + # Link previous member to this one + struct.pack_into('>I', accum, hdr_ptr, len(accum)) + + hdr_ptr = len(accum) + hdr_size = PrclNodeStruct.size + len(node.children)*PrclChildStruct.size + accum.extend(b'!' * hdr_size) + + # okay, now start blatting data! 
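+        # Layout: each child's bytes are appended to the blob and padded
+        # to a 4-byte boundary with 0x99; a child with deduplicate=1
+        # reuses the (ptr, packedlen) of an identical earlier blob
+        # instead of storing a second copy.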
+ for child in node.children: + child.data = bytes(child.data) # no more mutability + + dedup_tpl = (child.compress, child.data) + + child.unpackedlen = len(child.data) + + if child.deduplicate and dedup_tpl in dedup_dict: + child.ptr, child.packedlen = dedup_dict[dedup_tpl] + continue + + child.ptr = len(accum) + + accum.extend(child.data) + + child.packedlen = len(accum) - child.ptr + + while len(accum) % 4 != 0: + accum.append(0x99) # this is the only place we pad + + if child.deduplicate: + dedup_dict[dedup_tpl] = (child.ptr, child.packedlen) + + PrclNodeStruct.pack_into(accum, hdr_ptr, + link=0, ostype=node.ostype, hdr_size=hdr_size, flags=node.flags, + n_children=len(node.children), child_size=PrclChildStruct.size, + a=node.a, b=node.b, + ) + + pack_ptr = hdr_ptr + PrclNodeStruct.size + + for child in node.children: + if child.flags & 4: + data = accum[child.ptr:child.ptr+child.packedlen] + child.cksum = crc32(data) + else: + child.cksum = 0 + + PrclChildStruct.pack_into(accum, pack_ptr, **child) + pack_ptr += PrclChildStruct.size + + return bytes(accum) diff --git a/tbxi/prcldump.py b/tbxi/prcldump.py new file mode 100644 index 0000000..9067185 --- /dev/null +++ b/tbxi/prcldump.py @@ -0,0 +1,190 @@ +from collections import defaultdict, Counter +import os +from os import path +from shlex import quote +import struct + +from .lzss import decompress +from .lowlevel import PrclNodeStruct, PrclChildStruct + +def walk_tree(binary): + """Get low level representation of tree + + e.g. [(prclnodetuple, [prclchildtuple, ...]), ...] + """ + + if not binary.startswith(b'prcl'): + raise ValueError('binary does not start with magic number') + + prclnode = None + + parents = [] + for i in iter(lambda: prclnode.link if prclnode else struct.unpack_from('>12xI', binary)[0], 0): + prclnode = PrclNodeStruct.unpack_from(binary, offset=i) + + children = [] + for j in range(i + PrclNodeStruct.size, i + prclnode.hdr_size, prclnode.child_size): + prclchild = PrclChildStruct.unpack_from(binary, offset=j) + + children.append(prclchild) + + parents.append((prclnode, children)) + + return parents + + +def unique_binary_tpl(prclchild): + return (prclchild.ptr, prclchild.packedlen, prclchild.compress) + + +def suggest_names_to_dump(parent, child, code_name): + # We yield heaps of suggested filenames, and the shortest non-empty unique one gets chosen + + if parent.ostype == child.ostype == 'rom ': + yield 'ROM' + return + + if 'AAPL,MacOS,PowerPC' in child.name and code_name == 'PowerMgrPlugin': + if parent.a == 'cuda' and parent.b == 'via-cuda': + yield 'PowerMgrPlugin.CUDA' + elif parent.a == 'pmu' and parent.b == 'power-mgt': + yield 'PowerMgrPlugin.PMU' + elif parent.a == 'via-pmu-99' and parent.b == 'power-mgt': + yield 'PowerMgrPlugin.PMU99' + elif parent.a == 'via-pmu-2000' and parent.b == 'power-mgt': + yield 'PowerMgrPlugin.PMU2000' + elif parent.a == 'bal' and parent.b == 'power-mgt': + yield 'PowerMgrPlugin.BlueBox' + + if ',' not in child.name: # All property names except driver,AAPL,MacOS,pef et al + yield child.name + + if child.flags & 0x80: # special-node stuff + yield child.name + yield squish_name(child.name, parent.a, parent.b) + + if 'AAPL,MacOS,PowerPC' in child.name: + if code_name: + yield squish_name(code_name, parent.a, parent.b) + else: + yield squish_name(parent.a, parent.b) + + +def squish_name(*parts): + squeeze = lambda x: x.lower().replace('-', '').replace('_', '') + + parts = list(parts) + keepmask = [True] * len(parts) + + for i in range(len(parts)): + for j in range(len(parts)): + 
if i == j: continue + if squeeze(parts[j]) == squeeze(parts[i]): + if j > i: keepmask[j] = False + elif squeeze(parts[j]) in squeeze(parts[i]): + keepmask[j] = False + + truelist = [] + for i in range(len(parts)): + if keepmask[i]: truelist.append(parts[i]) + + return '.'.join(truelist) + + +def settle_name_votes(vote_dict): + # Forbid duplicate names + duplicate_names = set(['']) + for ka, va in vote_dict.items(): + for kb, vb in vote_dict.items(): + if ka is kb: continue + + for x in va: + if x in vb: + duplicate_names.add(x) + + # Pick the shortest non-duplicate name + decision = {} + for k, v in vote_dict.items(): + allowed_names = [x for x in v if x not in duplicate_names] + if allowed_names: + decision[k] = min(allowed_names, key=len) + + return decision + + +def dump(binary, dest, dest_dir): + if path.isdir(dest) or dest.endswith(os.sep): + dest = path.join(dest, 'Parcelfile') + + basic_structure = walk_tree(binary) + + # Decompress everything + unpacked_dict = {} + binary_counts = Counter() + for prclnode, children in basic_structure: + for prclchild in children: + binary_counts[unique_binary_tpl(prclchild)] += 1 + + data = binary[prclchild.ptr:prclchild.ptr+prclchild.packedlen] + if prclchild.compress == 'lzss': data = decompress(data) + + unpacked_dict[unique_binary_tpl(prclchild)] = data + + # Suggest possible filenames for each blob + name_vote_dict = defaultdict(list) + for prclnode, children in basic_structure: + # is there a prop that gives contextual name information? + for check_child in children: + if check_child.name == 'code,AAPL,MacOS,name': + code_name = unpacked_dict[unique_binary_tpl(check_child)].rstrip(b'\0').decode('ascii') + break + else: + code_name = None + + # now use that name to suggest names for all the children + for prclchild in children: + if prclchild.ostype in ('cstr', 'csta'): continue + votes = suggest_names_to_dump(prclnode, prclchild, code_name) + name_vote_dict[unique_binary_tpl(prclchild)].extend(votes) + + # Decide on filenames + decision = settle_name_votes(name_vote_dict) + + # Dump blobs to disk + for tpl, filename in decision.items(): + with open(path.join(dest_dir, filename), 'wb') as f: + f.write(unpacked_dict[tpl]) + + # Get printing!!! 
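+    # The text written here mirrors what prclc parses back in: an
+    # unindented line opens a parcel node, one tab of indent declares a
+    # child, and two tabs give the C strings of a cstr/csta child.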
+ with open(dest, 'w') as f: + for prclnode, children in basic_structure: + line = quote(prclnode.ostype) + line += ' flags=0x%05x' % prclnode.flags + if prclnode.a: line += ' a=%s' % quote(prclnode.a) + if prclnode.b: line += ' b=%s' % quote(prclnode.b) + + print(line, file=f) + + for prclchild in children: + line = '\t%s' % quote(prclchild.ostype) + line += ' flags=0x%05x' % prclchild.flags + if prclchild.name: line += ' name=%s' % quote(prclchild.name) + + if prclchild.ostype not in ('cstr', 'csta'): + filename = decision[unique_binary_tpl(prclchild)] + if prclchild.compress == 'lzss': filename += '.lzss' + line += ' src=%s' % quote(path.relpath(path.join(dest_dir, filename), path.dirname(dest))) + + if binary_counts[unique_binary_tpl(prclchild)] > 1: + line += ' deduplicate=1' + + print(line, file=f) + + if prclchild.ostype in ('cstr', 'csta'): + strangs = unpacked_dict[unique_binary_tpl(prclchild)].split(b'\0')[:-1] + for s in strangs: + line = '\t\t%s' % quote(s.decode('ascii')) + + print(line, file=f) + + print(file=f) diff --git a/tbxi/slow_lzss.py b/tbxi/slow_lzss.py new file mode 100644 index 0000000..ae81d6d --- /dev/null +++ b/tbxi/slow_lzss.py @@ -0,0 +1,263 @@ +# This file is adapted from LZSS.C by Haruhiko Okumura 4/6/1989 + +# Decompression is pretty quick +# Compression is pretty slow: +# about 50s to compress a 4 MB rom on my machine + +from warnings import warn +have_warned_about_slowness = False + +N = 0x1000 +F = 18 +THRESHOLD = 2 +NIL = N + + +def memset(buf, start, stop, to): + for i in range(start, stop): + buf[i] = to + + +def decompress(lzss): + lzss = iter(lzss) + plain = bytearray() + + lzdict = bytearray(b' ' * N) + + dict_i = N - F + def push(byte): + nonlocal dict_i + lzdict[dict_i % N] = byte + dict_i += 1 + + plain.append(byte) + + # Iterate through byte-headed "runs" + try: + for headerbyte in lzss: + for bitnum in range(8): + if (headerbyte >> bitnum) & 1: + # Copy a single byte verbatim + push(next(lzss)) + else: + # Copy 3-18 bytes from the dictionary + byte1 = next(lzss) + byte2 = next(lzss) + lookup_i = (byte2 << 4) & 0xf00 | byte1 + lookup_len = (byte2 & 0x0f) + 3 + + for i in range(lookup_i, lookup_i+lookup_len): + push(lzdict[i % N]) + + except StopIteration: + # Means the last header had <8 real bits, no problem + pass + + return bytes(plain) + + +def compress(plain): + global have_warned_about_slowness + + if not have_warned_about_slowness: + have_warned_about_slowness = True + warn('Using slow pure-Python LZSS compression') + + if not plain: return b'' + + # Init the variables that get shared with the two closures below + lchild = [0] * (N + 1) + rchild = [0] * (N + 257); memset(rchild, N + 1, N + 256 + 1, NIL) + parent = [0] * (N + 1); memset(parent, 0, N, NIL) + text_buf = bytearray(N + F - 1); memset(text_buf, 0, N - F, ord(' ')) + match_length = match_position = 0 + + + # Inserts string of length F, text_buf[r..r+F-1], into one of the trees + # (text_buf[r]'th tree) and returns the longest-match position and length + # via the global variables match_position and match_length. + # If match_length = F, then removes the old node in favor of the new one, + # because the old one will be deleted sooner. Note r plays double role, + # as tree node and position in buffer. 
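+    # ("Global" is inherited from the C original; in this port they are
+    # the enclosing compress() locals declared nonlocal below.)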
+ def insert_node(r): + nonlocal lchild, rchild, parent, text_buf, match_length, match_position + + cmp = 1 + key = text_buf[r:] + p = N + 1 + key[0] + rchild[r] = lchild[r] = NIL + + match_length = 0 + + while 1: + if cmp >= 0: + if rchild[p] != NIL: + p = rchild[p] + else: + rchild[p] = r + parent[r] = p + return + else: + if lchild[p] != NIL: + p = lchild[p] + else: + lchild[p] = r + parent[r] = p + return + + i = 1 + while i < F: + cmp = key[i] - text_buf[p + i] + if cmp != 0: break + i += 1 + + if i > match_length: + match_position = p + match_length = i + if match_length >= F: break # out of while loop + + parent[r] = parent[p] + lchild[r] = lchild[p] + rchild[r] = rchild[p] + parent[lchild[p]] = r + parent[rchild[p]] = r + + if rchild[parent[p]] == p: + rchild[parent[p]] = r + else: + lchild[parent[p]] = r + + parent[p] = NIL; + + + # deletes node p from tree + def delete_node(p): + nonlocal lchild, rchild, parent + + if parent[p] == NIL: return + + if rchild[p] == NIL: + q = lchild[p] + elif lchild[p] == NIL: + q = rchild[p] + else: + q = lchild[p] + if rchild[q] != NIL: + while 1: + q = rchild[q] + if rchild[q] == NIL: break + + rchild[parent[q]] = lchild[q] + parent[lchild[q]] = parent[q] + lchild[q] = lchild[p] + parent[lchild[p]] = q + + rchild[q] = rchild[p] + parent[rchild[p]] = q + + parent[q] = parent[p] + + if rchild[parent[p]] == p: + rchild[parent[p]] = q + else: + lchild[parent[p]] = q + + parent[p] = NIL + + + # End of function defs, now onto the main attraction + plain_len = len(plain) + plain_i = 0 + + # code_buf[1..16] saves eight units of code, and code_buf[0] works + # as eight flags, "1" representing that the unit is an unencoded + # letter (1 byte), "" a position-and-length pair (2 bytes). + # Thus, eight units require at most 16 bytes of code. + code_buf = bytearray(1) + code_buf_list = [code_buf] + mask = 1 + + # Clear the buffer with any character that will appear often. + s = 0; r = N - F + + # Read F bytes into the last F bytes of the buffer + tblen = 0 + while tblen < F and plain_i < plain_len: + text_buf[r + tblen] = plain[plain_i] + tblen += 1 + plain_i += 1 + + # Insert the F strings, each of which begins with one or more + # 'space' characters. Note the order in which these strings are + # inserted. This way, degenerate trees will be less likely to occur. + for i in range(1, F+1): + insert_node(r - i) + + # Finally, insert the whole string just read. + # The global variables match_length and match_position are set. + insert_node(r) + while 1: + match_length = min(match_length, tblen) + + if match_length <= THRESHOLD: + # Not long enough match. Send one byte. + match_length = 1 + code_buf[0] |= mask # 'send one byte' flag + code_buf.append(text_buf[r]) # Send uncoded. + else: + # Send position and length pair. Note match_length > THRESHOLD. + byte1 = match_position & 0xFF + byte2 = (match_position >> 4 & 0xF0) | (match_length - THRESHOLD - 1) + code_buf.append(byte1) + code_buf.append(byte2) + + # Shift mask left one bit. + mask = (mask << 1) & 0xFF + # Send at most 8 units of code together + if mask == 0: + code_buf = bytearray(1) + code_buf_list.append(code_buf) + mask = 1 + + last_match_length = match_length + i = 0 + while i < last_match_length and plain_i < plain_len: + delete_node(s) # Delete old strings and + c = plain[plain_i]; plain_i += 1 + text_buf[s] = c # read new bytes + + # If the position is near the end of buffer, extend the buffer + # to make string comparison easier. 
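+            # (text_buf is N + F - 1 bytes long: positions 0..F-2 are
+            # mirrored at N..N+F-2 so that an F-byte comparison window
+            # never has to wrap around the ring.)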
+ if s < F - 1: + text_buf[s + N] = c + + # Since this is a ring buffer, increment the position modulo N. + s = (s + 1) % N + r = (r + 1) % N + + # Register the string in text_buf[r..r+F-1] + insert_node(r) + + i += 1 + + while i < last_match_length: + delete_node(s) + + # After the end of text, no need to read, + s = (s + 1) % N + r = (r + 1) % N + + # but buffer may not be empty. + tblen -= 1 + if tblen: + insert_node(r) + + i += 1 + + # until length of string to be processed is zero + if tblen == 0: break + + if len(code_buf_list[-1]) == 1: + code_buf_list.pop() + + return b''.join(code_buf_list) diff --git a/tbxi/stringstruct.py b/tbxi/stringstruct.py new file mode 100644 index 0000000..392e839 --- /dev/null +++ b/tbxi/stringstruct.py @@ -0,0 +1,24 @@ +import struct + +def tuple_str2bytes(tpl): + return tuple(x.encode('ascii') if isinstance(x, str) else x for x in tpl) + +def tuple_bytes2str(tpl): + return tuple(x.rstrip(b'\0').decode('ascii') if isinstance(x, bytes) else x for x in tpl) + +class StringStruct(struct.Struct): + """A Struct that works with str instead of bytes""" + + def unpack(self, *args, **kwargs): + orig = super().unpack(*args, **kwargs) + return orig.__class__(tuple_bytes2str(orig)) + + def unpack_from(self, *args, **kwargs): + orig = super().unpack_from(*args, **kwargs) + return orig.__class__(tuple_bytes2str(orig)) + + def pack(self, *args, **kwargs): + return super().pack(*tuple_str2bytes(args), **kwargs) + + def pack_into(self, buf, offset, *args, **kwargs): + return super().pack_into(buf, offset, *tuple_str2bytes(args), **kwargs)
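
A quick round-trip sanity check for the codec above (a usage sketch, not
part of the patch; 'testfile' is a placeholder for any input at hand):

    from tbxi.slow_lzss import compress, decompress

    with open('testfile', 'rb') as f:   # placeholder path
        data = f.read()

    # LZSS is lossless, so compress followed by decompress is an identity
    assert decompress(compress(data)) == data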
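Likewise, a minimal sketch of walking a parcel blob's node list with the
structs from tbxi/lowlevel.py ('parcels.bin' is a placeholder path):

    import struct
    from tbxi.lowlevel import MAGIC, PrclNodeStruct

    with open('parcels.bin', 'rb') as f:
        blob = f.read()
    assert blob.startswith(MAGIC)

    offset = struct.unpack_from('>12xI', blob)[0]   # link to first node
    while offset:
        node = PrclNodeStruct.unpack_from(blob, offset=offset)
        print(node.ostype, hex(node.flags), node.n_children)
        offset = node.link                          # link == 0 ends the list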