From 9a320c71f7898c74a09fc480cdc15ae9e05da1c2 Mon Sep 17 00:00:00 2001
From: Elliot Nunn <elliotnunn@me.com>
Date: Tue, 24 Sep 2019 16:40:26 +0800
Subject: [PATCH] Use the actual names of ndrv parcels, don't guess

---
 tbxi/parcels_dump.py | 143 +++++++++++--------------------
 tbxi/pef_info.py     | 194 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 243 insertions(+), 94 deletions(-)
 create mode 100644 tbxi/pef_info.py

diff --git a/tbxi/parcels_dump.py b/tbxi/parcels_dump.py
index 4ee6eb2..bef4d0b 100644
--- a/tbxi/parcels_dump.py
+++ b/tbxi/parcels_dump.py
@@ -3,11 +3,13 @@ import os
 from os import path
 from shlex import quote
 import struct
+import hashlib
 
 from . import dispatcher
 
 from .slow_lzss import decompress
 from .lowlevel import PrclNodeStruct, PrclChildStruct
+from .pef_info import suggest_name
 
 
 HEADER_COMMENT = """
@@ -44,6 +46,10 @@ HEADER_COMMENT = """
 """.strip()
 
 
+def quickhash(foo):  # hex SHA-512 digest of a bytes-like object; dump() uses it to make filenames unique
+    return hashlib.new('sha512', foo).hexdigest()
+
+
 def walk_tree(binary):
     """Get low level representation of tree
 
@@ -71,83 +77,28 @@ def unique_binary_tpl(prclchild):
     return (prclchild.ptr, prclchild.packedlen, prclchild.compress)
 
 
-def suggest_names_to_dump(parent, child, code_name):
-    # We yield heaps of suggested filenames, and the shortest non-empty unique one gets chosen
+def guess_binary_name(parent_struct, child_struct, adjacent_name, data):
+    # 4 MB ROM-in-RAM image
+    if parent_struct.ostype == child_struct.ostype == 'rom ':
+        return 'MacROM'
 
-    if parent.ostype == child.ostype == 'rom ':
-        yield 'MacROM'
-        return
+    # Native (PCI) driver with an embedded name and version
+    ndrv_name = suggest_name(data)
+    if ndrv_name: return ndrv_name
 
-    if 'AAPL,MacOS,PowerPC' in child.name and code_name == 'PowerMgrPlugin':
-        if parent.a == 'cuda' and parent.b == 'via-cuda':
-            yield 'PowerMgrPlugin.CUDA'
-        elif parent.a == 'pmu' and parent.b == 'power-mgt':
-            yield 'PowerMgrPlugin.PMU'
-        elif parent.a == 'via-pmu-99' and parent.b == 'power-mgt':
-            yield 'PowerMgrPlugin.PMU99'
-        elif parent.a == 'via-pmu-2000' and parent.b == 'power-mgt':
-            yield 'PowerMgrPlugin.PMU2000'
-        elif parent.a == 'bal' and parent.b == 'power-mgt':
-            yield 'PowerMgrPlugin.BlueBox'
+    # A "special" property called by its actual name
+    if parent_struct.flags & 0xF0000 or child_struct.flags & 0x80:
+        return child_struct.name
 
-    if ',' not in child.name: # All property names except driver,AAPL,MacOS,pef et al
-        yield child.name
+    # A driver property with an adjacent name property
+    if 'AAPL,MacOS,PowerPC' in child_struct.name and adjacent_name:
+        return adjacent_name
 
-    if child.flags & 0x80: # special-node stuff
-        yield child.name
-        yield squish_name(child.name, parent.a, parent.b)
+    # A lanLib (for netbooting)
+    if child_struct.name == 'lanLib,AAPL,MacOS,PowerPC':
+        return parent_struct.a
 
-    if 'AAPL,MacOS,PowerPC' in child.name:
-        if code_name:
-            yield squish_name(code_name, parent.a, parent.b)
-        else:
-            yield squish_name(parent.a, parent.b)
-
-
-def squish_name(*parts):
-    squeeze = lambda x: x.lower().replace('-', '').replace('_', '')
-
-    parts = list(parts)
-    keepmask = [True] * len(parts)
-
-    for i in range(len(parts)):
-        for j in range(len(parts)):
-            if i == j: continue
-            if squeeze(parts[j]) == squeeze(parts[i]):
-                if j > i: keepmask[j] = False
-            elif squeeze(parts[j]) in squeeze(parts[i]):
-                keepmask[j] = False
-
-    truelist = []
-    for i in range(len(parts)):
-        if keepmask[i]: truelist.append(parts[i])
-
-    return '.'.join(truelist)
-
-
-def settle_name_votes(vote_dict):
-    # Forbid duplicate names
-    duplicate_names = set([''])
-    for ka, va in vote_dict.items():
-        for kb, vb in vote_dict.items():
-            if ka is kb: continue
-
-            for x in va:
-                if x in vb:
-                    duplicate_names.add(x)
-
-    # Pick the shortest non-duplicate name
-    decision = {}
-    for k, v in vote_dict.items():
-        allowed_names = [x for x in v if x not in duplicate_names]
-        if allowed_names:
-            decision[k] = min(allowed_names, key=len)
-
-    return decision
-
-
-def is_parcels(binary):
-    return binary.startswith(b'prcl')
+    return ''
 
 
 def dump(binary, dest_dir):
@@ -159,6 +110,7 @@ def dump(binary, dest_dir):
 
     # Decompress everything
     unpacked_dict = {}
+    binary_of = lambda child: unpacked_dict[unique_binary_tpl(child)]
     binary_counts = Counter()
     for prclnode, children in basic_structure:
         for prclchild in children:
@@ -169,36 +121,39 @@ def dump(binary, dest_dir):
 
             unpacked_dict[unique_binary_tpl(prclchild)] = data
 
-    # Suggest possible filenames for each blob
-    name_vote_dict = defaultdict(list)
+    filename_dict = {} # maps binary data to a filename
     for prclnode, children in basic_structure:
-        # is there a prop that gives contextual name information?
+        # A fragment prop may have an adjacent prop giving it a name, get this ready
+        adjacent_name = None
         for check_child in children:
             if check_child.name == 'code,AAPL,MacOS,name':
-                code_name = unpacked_dict[unique_binary_tpl(check_child)].rstrip(b'\0').decode('ascii')
-                break
-        else:
-            code_name = None
+                adjacent_name = unpacked_dict[unique_binary_tpl(check_child)].rstrip(b'\0').decode('ascii')
 
-        # now use that name to suggest names for all the children
+        # Best guess original-ish name for this binary
         for prclchild in children:
-            if prclchild.ostype in ('cstr', 'csta'): continue
-            votes = suggest_names_to_dump(prclnode, prclchild, code_name)
-            if unpacked_dict[unique_binary_tpl(prclchild)].startswith(b'Joy!'):
-                votes = [v + '.pef' for v in votes]
-            name_vote_dict[unique_binary_tpl(prclchild)].extend(votes)
+            if prclchild.ostype not in ('cstr', 'csta'):
+                base = guess_binary_name(
+                    parent_struct=prclnode,
+                    child_struct=prclchild,
+                    adjacent_name=adjacent_name,
+                    data=binary_of(prclchild),
+                )
+                filename_dict[binary_of(prclchild)] = base
 
-    # Decide on filenames
-    decision = settle_name_votes(name_vote_dict)
+    # Post-process to ensure that all names are unique
+    used_names = Counter(filename_dict.values())
+    for binary, filename in list(filename_dict.items()):
+        if used_names[filename] > 1:
+            if filename: filename += '-'
+            filename += quickhash(binary)
+            filename_dict[binary] = filename 
+
+    filename_dict = {b: (fn+'.pef' if b.startswith(b'Joy!peff') else fn) for (b, fn) in filename_dict.items()}
 
     # Dump blobs to disk
-    for tpl, filename in decision.items():
-        keep_this = True
-
-        data = unpacked_dict[tpl]
+    for data, filename in filename_dict.items():
         dispatcher.dump(data, path.join(dest_dir, filename))
 
-
     # Get printing!!!
     with open(path.join(dest_dir, 'Parcelfile'), 'w') as f:
         f.write(HEADER_COMMENT + '\n\n')
@@ -217,9 +172,9 @@ def dump(binary, dest_dir):
                 if prclchild.name: line += ' name=%s' % quote(prclchild.name)
 
                 if prclchild.ostype not in ('cstr', 'csta'):
-                    filename = decision[unique_binary_tpl(prclchild)]
+                    filename = filename_dict[binary_of(prclchild)]
                     if prclchild.compress == 'lzss': filename += '.lzss'
-                    line += ' src=%s' % filename
+                    line += ' src=%s' % quote(filename)
 
                 if binary_counts[unique_binary_tpl(prclchild)] > 1:
                     line += ' deduplicate=1'
diff --git a/tbxi/pef_info.py b/tbxi/pef_info.py
new file mode 100644
index 0000000..3a99282
--- /dev/null
+++ b/tbxi/pef_info.py
@@ -0,0 +1,194 @@
+# Some scrounged code to give name/version suggestions for NDRVs
+
+
+import struct
+
+
+MAGIC = b'Joy!peff'
+
+
+class PEF:  # Parsed view of a PEF container: header bytes plus each section's payload and kind
+    CONT_HEAD_FMT = '>4s4s4s5I2HI'  # big-endian container header, unpacked field by field in __init__
+    CONT_HEAD_LEN = struct.calcsize(CONT_HEAD_FMT)
+    # Section headers are read back to back, starting right after the container header
+    SEC_HEAD_FMT = '>i5I4B'
+    SEC_HED_LEN = struct.calcsize(SEC_HEAD_FMT)
+
+    def __init__(self, data):  # data: bytes-like that starts with MAGIC, otherwise ValueError is raised
+        if not data.startswith(MAGIC): raise ValueError('not a pef')
+
+        (magic, fourcc, arch, ver,
+        timestamp, old_def_ver, old_imp_ver, cur_ver,
+        sec_count, inst_sec_count, reserv) = struct.unpack_from(self.CONT_HEAD_FMT, data)
+        # Of the header fields only sec_count is used; these two track the span covered by payloads
+        sec_earliest = len(data)
+        sec_latest = 0
+        # Parallel lists with one entry per section header, in header order
+        self.sections = []
+        self.sectypes = []
+        self.headeroffsets = []
+        # Set in the loop to the bytearray of a regionKind-0 section whose three size fields agree
+        self.code = None
+
+        for i in range(sec_count):
+            sh_offset = self.CONT_HEAD_LEN + self.SEC_HED_LEN*i
+
+            (sectionName, sectionAddress, execSize,
+            initSize, rawSize, containerOffset,
+            regionKind, shareKind, alignment, reserved) = struct.unpack_from(self.SEC_HEAD_FMT, data, sh_offset)
+            # No bounds check here: an out-of-range offset simply yields a short or empty slice
+            the_sec = data[containerOffset : containerOffset + rawSize]
+            # Such a section is copied into a mutable bytearray and remembered as self.code
+            if regionKind == 0 and execSize == initSize == rawSize:
+                the_sec = bytearray(the_sec)
+                self.code = the_sec
+
+            self.sections.append(the_sec)
+            self.sectypes.append(regionKind)
+            self.headeroffsets.append(sh_offset)
+
+            sec_earliest = min(sec_earliest, containerOffset)
+            sec_latest = max(sec_latest, containerOffset + rawSize)
+        # Bytes after the last payload are not kept by this object, hence the warning
+        if any(data[sec_latest:]):
+            print('nonzero trailing data from', hex(sec_latest), 'to', hex(len(data)), ' ... will cause incorrect output')
+        # padmult ends up as the largest power of two that divides len(data); __bytes__ pads to it
+        self.padmult = 1
+        while len(data) % (self.padmult * 2) == 0:
+            self.padmult *= 2
+        # Everything before the earliest section payload is kept verbatim as the header
+        self.header = data[:sec_earliest]
+
+    def __bytes__(self):  # re-serialise: header, then each section 16-byte aligned, with offsets patched
+        accum = bytearray(self.header)
+
+        for i in range(len(self.sections)):
+            the_sec = self.sections[i]
+            hoff = self.headeroffsets[i]
+            # Zero-pad so that every section payload starts on a 16-byte boundary
+            while len(accum) % 16:
+                accum.append(0)
+
+            new_off = len(accum)
+            new_len = len(the_sec)
+
+            accum.extend(the_sec)
+            # Byte 20 of a section header is containerOffset (see the SEC_HEAD_FMT unpack order)
+            struct.pack_into('>I', accum, hoff + 20, new_off)
+            # For self.code, rewrite execSize/initSize/rawSize (header bytes 8, 12, 16) to its current length
+            if the_sec is self.code:
+                for i in range(8, 20, 4):  # reuses i; harmless, the outer loop rebinds it each iteration
+                    struct.pack_into('>I', accum, hoff + i, new_len)
+        # Zero-pad the result up to a multiple of padmult
+        while len(accum) % self.padmult != 0:
+            accum.extend(b'\x00')
+
+        return bytes(accum)
+
+
+def pidata(packed):
+    """Unpack PEF pattern-initialized data into plain bytes.
+
+    packed is a bytes-like instruction stream: each instruction starts with a
+    byte holding a 3-bit opcode and a 5-bit argument (0 = read the argument
+    from the following bytes). Raises ValueError for an unknown opcode, an
+    over-long argument, or a stream that ends in mid-instruction.
+    """
+    def pullarg(from_iter): # base-128, most significant group first; high bit = "more follows"
+        arg = 0
+        for i in range(4):
+            cont = next(from_iter, None)
+            if cont is None: raise ValueError('pidata ends inside an argument')
+            arg <<= 7
+            arg |= cont & 0x7f
+            if not (cont & 0x80): break
+        else:
+            raise ValueError('arg spread over too many bytes')
+        return arg
+
+    def take(count):
+        # zip() asks range() first, so no byte beyond `count` is ever consumed
+        chunk = bytes(byte for _, byte in zip(range(count), packed))
+        if len(chunk) != count: raise ValueError('pidata ends inside a data block')
+        return chunk
+
+    packed = iter(packed)
+    unpacked = bytearray()
+
+    for b in packed:
+        opcode = b >> 5
+        arg = b & 0b11111 or pullarg(packed)
+        if opcode == 0b000: # zero
+            unpacked.extend(b'\0' * arg)
+
+        elif opcode == 0b001: # blockCopy
+            unpacked.extend(take(arg))
+
+        elif opcode == 0b010: # repeatedBlock
+            repeatCount = pullarg(packed) + 1
+            unpacked.extend(take(arg) * repeatCount)
+
+        elif opcode == 0b011 or opcode == 0b100: # interleaveRepeatBlockWithBlockCopy
+            commonSize = arg                     # or interleaveRepeatBlockWithZero
+            customSize = pullarg(packed)
+            repeatCount = pullarg(packed)
+            commonData = take(commonSize) if opcode == 0b011 else b'\0' * commonSize
+            for i in range(repeatCount):
+                unpacked.extend(commonData)
+                unpacked.extend(take(customSize))
+            unpacked.extend(commonData)
+
+        else:
+            raise ValueError('unknown pidata opcode/arg %s/%d' % (bin(opcode), arg))
+
+    return bytes(unpacked)
+
+
+def parse_version(num):
+    """Format a packed 32-bit version number as text, e.g. 0x01236004 -> '1.2.3b4'.
+
+    From the most significant byte down, num holds: the major version, the
+    minor and bugfix digits (one hex nibble each), a stage code and an
+    'unreleased' counter. Stage 'f' with counter 0 produces no suffix.
+    """
+    major_byte, minor_bugfix_byte, stage_byte, counter_byte = num.to_bytes(4, byteorder='big')
+
+    # '%02x' always yields exactly two characters for a byte, one per nibble
+    minor_digit, bugfix_digit = '%02x' % minor_bugfix_byte
+
+    # Any stage code outside this table is rendered as '?'
+    stage_letters = {0x80: 'f', 0x60: 'b', 0x40: 'a', 0x20: 'd'}
+    stage_letter = stage_letters.get(stage_byte, '?')
+
+    counter = '%0x' % counter_byte
+
+    pieces = ['%x' % major_byte, '.', minor_digit]
+
+    if bugfix_digit != '0':
+        pieces += ['.', bugfix_digit]
+
+    # Only the exact pair ('f', '0') suppresses the stage suffix
+    if not (stage_letter == 'f' and counter == '0'):
+        pieces += [stage_letter, counter]
+
+    return ''.join(pieces)
+
+
+def suggest_name(pef):
+    """Return '<name>-<version>' read from a PEF's 'mtej' header, or None if there is none."""
+    if not pef.startswith(MAGIC): return None
+
+    try:
+        parsed = PEF(pef)
+
+        for sectype, section in zip(parsed.sectypes, parsed.sections):
+            if sectype == 2: section = pidata(section) # kind-2 payloads are packed; expand before searching
+
+            if section and sectype in (1, 2):
+                hdr_ofs = section.find(b'mtej')
+                if hdr_ofs != -1:
+                    sig, strvers, devnam, drvvers = struct.unpack_from('>4s L 32p L', section, hdr_ofs)
+                    return devnam.decode('mac_roman') + '-' + parse_version(drvvers)
+    except Exception:
+        pass # best effort: a corrupt PEF gives no suggestion, but Ctrl-C / SystemExit still propagate
+    return None