New Python library/tool for manipulating PEFs

2025-03-03 19:29:20 +00:00 · 2019-10-03 15:48:02 +08:00 · 2019-10-03 15:48:02 +08:00 · 1b38a27766
commit 1b38a27766
parent a2dfc8d98d
1 changed files with 392 additions and 0 deletions
--- a/cfmtool.py
+++ b/cfmtool.py
@ -0,0 +1,392 @@
+#!/usr/bin/env python3
+
+# This is a single-file library for manipulating Preferred Executable Format files
+# A command line-interface is available (just call cfmtool.py --help)
+
+
+import argparse
+import datetime
+import struct
+import os
+import textwrap
+from os import path
+from ast import literal_eval as eval
+
+
+def dump(from_binary_or_path, to_path):
+    """Dump a CFM/PEF binary to a directory
+
+    Command line usage: cfmtool.py BINARY DIRECTORY
+
+    The first argument can be a bytes-like object, or a path to read from.
+    """
+
+    def write_txt(name, text):
+        with open(path.join(to_path, name + '.txt'), 'w') as f:
+            f.write(text + '\n')
+
+    try:
+        bytes(from_binary_or_path)
+        from_binary = from_binary_or_path
+    except TypeError:
+        with open(from_binary_or_path, 'rb') as f:
+            from_binary = f.read()
+
+    if not from_binary.startswith(b'J o y ! peffpwpc\x00\x00\x00\x01'.replace(b' ', b'')):
+        raise ValueError('not a pef (PowerPC, v1)')
+
+    os.makedirs(to_path, exist_ok=True)
+
+    dateTimeStamp, *versions = struct.unpack_from('>4L', from_binary, 16)
+
+    write_txt('date', format_mac_date(dateTimeStamp))
+    write_txt('version', _fmt_dict(zip(('oldDefVersion', 'oldImpVersion', 'currentVersion'), versions)))
+
+    section_list = []
+    section_count, = struct.unpack_from('>H', from_binary, 32)
+    offset = 40
+    for i in range(section_count):
+        sec = dict(zip(
+            ('name', 'defaultAddress', 'totalLength', 'unpackedLength', 'containerLength',
+                'containerOffset', 'sectionKind', 'shareKind', 'alignment'),
+            struct.unpack_from('>lLLLLLbbb', from_binary, offset)))
+
+        section_list.append(sec)
+
+        offset += 28
+
+    # Now offset points to the nasty table of section names
+
+    for i, sec in enumerate(section_list):
+        if sec['name'] > 0:
+            name_offset = offset
+            for j in range(sec['name']): name_offset = from_binary.index(b'\0', name_offset) + 1
+            sec['name'] = from_binary[name_offset:from_binary.index(b'\0', name_offset)].decode('mac_roman')
+        else:
+            sec['name'] = ''
+
+    for i, sec in enumerate(section_list):
+        sec['sectionKind'] = ('code', 'data', 'pidata', 'rodata', 'loader',
+            'debug', 'codedata', 'exception', 'traceback')[sec['sectionKind']]
+
+        sec['shareKind'] = ('', 'process', '', '', 'global', 'protected')[sec['shareKind']]
+
+    # What to call the final file...
+    used_basenames = []
+    for i, sec in enumerate(section_list):
+        basename = sec['sectionKind']
+        used_basenames.append(basename)
+        if used_basenames.count(basename) > 1:
+            basename += '-%d' % used_basenames.count(basename)
+
+        sec['filename'] = basename
+
+    # Now the conversion of sec keys to their readable form is complete
+
+    # Are the damn sections ordered the wrong way?
+    sorted_section_list = sorted(section_list, key=lambda sec: sec['containerOffset'])
+    if sorted_section_list != section_list:
+        for i, sec in enumerate(sorted_section_list):
+            sec['_hackPackOrder'] = i
+
+    should_end = sorted_section_list[-1]['containerOffset'] + sorted_section_list[-1]['containerLength']
+    if should_end < len(from_binary):
+        sorted_section_list[-1]['_hackPostAlign'] = _possible_intended_alignments(len(from_binary))[-1]
+
+    for i, sec in enumerate(section_list):
+        raw = from_binary[sec['containerOffset']:sec['containerOffset']+sec['containerLength']]
+
+        possible_aligns = _possible_intended_alignments(sec['containerOffset'])
+        if possible_aligns[-1] > _sec_kind_min_align(sec['sectionKind']):
+            sec['_hackUnexpectedAlign'] = possible_aligns[-1]
+
+        # Do we need to keep the packed data around?    
+        unpacked = packed = raw
+
+        if sec['sectionKind'] == 'pidata':
+            packed = raw
+            unpacked = unpack_pidata(raw)
+        else:
+            packed = None
+            unpacked = raw
+
+            if unpacked.endswith(b'\0'):
+                sec['_hackExplicitTrailingZeros'] = len(unpacked) - len(unpacked.rstrip(b'\0'))
+
+        if sec['unpackedLength']:
+            zeropad = sec['totalLength'] - len(unpacked); unpacked += bytes(zeropad)
+
+        with open(path.join(to_path, sec['filename']), 'wb') as f: f.write(unpacked)
+
+        if packed is not None:
+            with open(path.join(to_path, 'packed-' + sec['filename']), 'wb') as f: f.write(packed)
+
+        del sec['totalLength']
+        del sec['unpackedLength']
+        del sec['containerLength']
+        del sec['containerOffset']
+
+    write_txt('sections', _fmt_list(_fmt_dict(d) for d in section_list))
+
+
+def build(from_path, to_path=None):
+    """Rebuild a directory into a CFM/PEF binary
+
+    Command line usage: cfmtool.py DIRECTORY BINARY
+
+    If a second argument is supplied, the result will be written to that path
+    instead of being returned as a bytes object.
+    """
+
+    def read_txt(name):
+        with open(path.join(from_path, name + '.txt')) as f:
+            return f.read()
+
+    try:
+        dateTimeStamp = parse_mac_date(read_txt('date'))
+    except:
+        raise
+        dateTimeStamp = 0
+
+    try:
+        versions = eval(read_txt('version'))
+        versions = (versions['oldDefVersion'], versions['oldImpVersion'], versions['currentVersion'])
+    except:
+        raise
+        versions = (0, 0, 0)
+
+    section_list = eval(read_txt('sections'))
+
+    # Hit the ground running
+    pef = bytearray(b'J o y ! peffpwpc\x00\x00\x00\x01'.replace(b' ', b''))
+
+    pef.extend(struct.pack('>4L', dateTimeStamp, *versions)) # leaves us at offset 0x20
+    instSectionCount = len([sec for sec in section_list if _sec_kind_is_instantiated(sec['sectionKind'])])
+    pef.extend(struct.pack('>HHL', len(section_list), instSectionCount, 0)) # leaves us at offset 0x28, ready for the sections
+
+    # Pad the section headers out with zeroes, and fill in a bit later
+    offset = 40
+    for sec in section_list:
+        sec['_hack_header_offset'] = offset
+        offset += 28
+    pef.extend(bytes(offset - len(pef)))
+
+    # Now do the stupid section name table (yuck)
+    namecnt = 0
+    for sec in section_list:
+        if sec['name']:
+            pef.extend(sec['name'].encode('mac_roman') + b'\0')
+            sec['name'] = namecnt
+            namecnt += 1
+        else:
+            sec['name'] = -1
+
+    # Stable sort, so won't do anything if unnecessary
+    section_list.sort(key=lambda sec: sec.get('_hackPackOrder', 0))
+
+    # Now put in the section data (easier said than done!)
+    for sec in section_list:
+        with open(path.join(from_path, sec['filename']), 'rb') as f:
+            data_total = f.read()
+
+        data_packed = data_inited = _strip_zeroes_leaving_some(data_total, sec.get('_hackExplicitTrailingZeros', 0))
+
+        # Special case the damned pidata
+        if sec['sectionKind'] == 'pidata':
+            with open(path.join(from_path, 'packed-' + sec['filename']), 'rb') as f:
+                data_packed = f.read()
+                data_inited = unpack_pidata(data_packed)
+
+            # Check that we got that right (we cannot pack the data ourselves)
+            if not data_total.startswith(data_inited) or any(data_total[len(data_inited):]):
+                data_packed = data_inited = _strip_zeroes_leaving_some(data_total, 0)
+                sec['sectionKind'] = 'data'
+
+        align_now = max(_sec_kind_min_align(sec['sectionKind']), sec.get('_hackUnexpectedAlign', 1))
+
+        while len(pef) % align_now != 0: pef.append(0)
+
+        struct.pack_into('>l5L3B', pef, sec['_hack_header_offset'],
+            sec['name'],
+            sec['defaultAddress'],
+            len(data_total) if _sec_kind_is_instantiated(sec['sectionKind']) else 0,
+            len(data_inited) if _sec_kind_is_instantiated(sec['sectionKind']) else 0,
+            len(data_packed),
+            len(pef),
+            ('code', 'data', 'pidata', 'rodata', 'loader',
+            'debug', 'codedata', 'exception', 'traceback').index(sec['sectionKind']),
+            ('', 'process', '', '', 'global', 'protected').index(sec['shareKind']),
+            sec['alignment'],
+        )
+
+        pef.extend(data_packed)
+
+    post_align = max(sec.get('_hackPostAlign', 1) for sec in section_list)
+    while len(pef) % post_align != 0: pef.append(0)
+
+    if to_path is None:
+        return bytes(pef)
+    else:
+        with open(to_path, 'wb') as f:
+            f.write(pef)
+
+
+def unpack_pidata(packed):
+    """Unpack pattern-initialized (compressed) data
+    """
+
+    def pullarg(from_iter):
+        arg = 0
+        for i in range(4):
+            cont = next(from_iter)
+            arg <<= 7
+            arg |= cont & 0x7f
+            if not (cont & 0x80): break
+        else:
+            raise ValueError('arg spread over too many bytes')
+        return arg
+
+    packed = iter(packed)
+    unpacked = bytearray()
+
+    for b in packed:
+        opcode = b >> 5
+        arg = b & 0b11111 or pullarg(packed)
+
+        if opcode == 0b000: # zero
+            count = arg
+            unpacked.extend(b'\0' * count)
+
+        elif opcode == 0b001: # blockCopy
+            blockSize = arg
+            for i in range(blockSize):
+                unpacked.append(next(packed))
+
+        elif opcode == 0b010: # repeatedBlock
+            blockSize = arg
+            repeatCount = pullarg(packed) + 1
+            rawData = bytes(next(packed) for n in range(blockSize))
+            for n in range(repeatCount):
+                unpacked.extend(rawData)
+
+        elif opcode == 0b011 or opcode == 0b100: # interleaveRepeatBlockWithBlockCopy
+            commonSize = arg                     # or interleaveRepeatBlockWithZero
+            customSize = pullarg(packed)
+            repeatCount = pullarg(packed)
+
+            if opcode == 0b011:
+                commonData = bytes(next(packed) for n in range(commonSize))
+            else:
+                commonData = b'\0' * commonSize
+
+            for i in range(repeatCount):
+                unpacked.extend(commonData)
+                for j in range(customSize):
+                    unpacked.append(next(packed))
+            unpacked.extend(commonData)
+
+        else:
+            raise ValueError('unknown pidata opcode/arg %s/%d' % (bin(opcode), arg))
+            return
+
+    return bytes(unpacked)
+
+
+def format_mac_date(srcint):
+    """Render a 32-bit MacOS date to ISO 8601 format
+    """
+
+    dt = datetime.datetime(1904, 1, 1) + datetime.timedelta(seconds=srcint)
+    return dt.isoformat().replace('T', ' ')
+
+
+def parse_mac_date(x):
+    """Pack an ISO 8601 date into a 32-bit MacOS date
+    """
+
+    epoch = '19040101000000' # ISO8601 with the non-numerics stripped
+
+    # strip non-numerics and pad out using the epoch (cheeky)
+    stripped = ''.join(c for c in x if c in '0123456789')
+    stripped = stripped[:len(epoch)] + epoch[len(stripped):]
+
+    tformat = '%Y%m%d%H%M%S'
+
+    delta = datetime.datetime.strptime(stripped, tformat) - datetime.datetime.strptime(epoch, tformat)
+    delta = int(delta.total_seconds())
+
+    delta = min(delta, 0xFFFFFFFF)
+    delta = max(delta, 0)
+
+    return delta
+
+
+def _fmt_dict(tuple_iterator):
+    try:
+        tuple_iterator = tuple_iterator.items()
+    except AttributeError:
+        pass
+
+    accum = '{\n'
+    for k, v in tuple_iterator:
+        if k == 'defaultAddress':
+            v = ('0x%08x' % v)
+        elif k in '_hackUnexpectedAlign _hackPostAlign':
+            v = hex(v)
+        else:
+            v = repr(v)
+        accum += textwrap.indent('%r: %s,' % (k, v), '  ') + '\n'
+    accum += '}'
+    return accum
+
+
+def _fmt_list(iterator):
+    accum = '[\n'
+    for el in iterator:
+        accum += textwrap.indent(el + ',', '  ') + '\n'
+    accum += ']'
+    return accum
+
+
+def _sec_kind_is_instantiated(sec_kind):
+    return sec_kind not in ('loader', 'debug', 'exception', 'traceback')
+
+
+def _strip_zeroes_leaving_some(data, leaving):
+    stripped = data.rstrip(b'\0')
+
+    while len(stripped) < len(data) and data[len(stripped)] == 0:
+        stripped += b'\0'
+
+    return stripped
+
+
+def _possible_intended_alignments(offset):
+    possible = list(1 << n for n in range(32))
+
+    possible = [p for p in possible if offset % p == 0]
+
+    return possible
+
+
+def _sec_kind_min_align(sec_kind):
+    if sec_kind in ('code', 'data', 'rodata', 'codedata'):
+        return 16
+    else:
+        return 4
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='''
+        Convert between a Code Fragment Manager binary and an easily-edited dump directory.
+    ''')
+
+    parser.add_argument('src', metavar='SOURCE', action='store', help='Binary or directory')
+    parser.add_argument('dest', metavar='DEST', action='store', help='Directory or binary')
+
+    args = parser.parse_args()
+
+    if path.isdir(args.src):
+        build(args.src, args.dest)
+    else:
+        dump(args.src, args.dest)