TOC guesser: use reloc opcodes to ID real TVecs

Reloc table opcodes are now used to filter out data that only coincidentally looks like a TVector. This problem was identified while debugging XTOC glue that didn't make sense. Such glue, whose TOC offset does not correspond to any import, is now ignored as well.
2019-10-18 13:51:53 +08:00 · 2019-10-18 13:51:53 +08:00 · e8a3012b41
parent cf5af0f5f1
commit e8a3012b41
1 changed file with 16 additions and 3 deletions
--- a/cfmtool.py
+++ b/cfmtool.py
@ -484,6 +484,7 @@ def dump_lowlevel(basepath):
                        for i in range(runLength):
                            relocations.append(dict(section=sectionIndex, offset=relocAddress, to=('section', sectionC))); relocAddress += 4
                            relocations.append(dict(section=sectionIndex, offset=relocAddress, to=('section', sectionD))); relocAddress += 4
+                            if 'code' in sectionC and 'data' in sectionD: relocations[-2]['likelytv'] = 1
                            relocAddress += 4

                    elif subopcode == 0b0011: # RelocTVector8
@ -491,6 +492,7 @@ def dump_lowlevel(basepath):
                        for i in range(runLength):
                            relocations.append(dict(section=sectionIndex, offset=relocAddress, to=('section', sectionC))); relocAddress += 4
                            relocations.append(dict(section=sectionIndex, offset=relocAddress, to=('section', sectionD))); relocAddress += 4
+                            if 'code' in sectionC and 'data' in sectionD: relocations[-2]['likelytv'] = 1

                    elif subopcode == 0b0100: # RelocVTable8
                        #print('RelocVTable8 runLength=%d' % (runLength))
@ -634,6 +636,7 @@ def dump_highlevel(basepath):

    # Relocations in lookup-able form
    relocs = read_python(basepath, 'ldump', 'relocations.txt')
+    likelytv = set((rl['section'], rl['offset']) for rl in relocs if rl.get('likelytv', False))
    relocs = {(rl['section'], rl['offset']): rl['to'] for rl in relocs}

    # The base of the TOC is not guaranteed to be the base of the data section... what is the TOC of our exported funcs?
@ -654,11 +657,16 @@ def dump_highlevel(basepath):
            break


-    # Sometimes we need to fall back on an educated guess based on our apparent tvectors
+    # When we export even a single TVector, the TOC can be easily found as
+    # above. But some fragments, e.g. native sifters (nifts) and some USB
+    # code, only export some sort of dispatch table in which TVector pointers
+    # are difficult to identify. So we scan the entire relocation table to
+    # find things that look like TVectors, then try to identify a consensus
+    # among the real-looking TVectors.
    if not table_of_contents:
        guesses = []
        for (reloc_sec, reloc_offset), (reloc_kind, reloc_targ_section) in relocs.items():
-            if 'data' in reloc_sec and reloc_kind == 'section' and 'code' in reloc_targ_section:
+            if 'data' in reloc_sec and reloc_kind == 'section' and 'code' in reloc_targ_section and (reloc_sec, reloc_offset) in likelytv:
                toc_reloc_kind, toc_reloc_targ_section = relocs.get((reloc_sec, reloc_offset+4), (None, None))
                if toc_reloc_kind == 'section' and 'data' in toc_reloc_targ_section:
                    secdata = read_bin(basepath, reloc_sec)
@ -726,7 +734,12 @@ def dump_highlevel(basepath):
                    if a != b and b != 0xFF: break
                else:
                    toc_ofs, = struct.unpack_from('>h', code, ofs+2)
-                    codelocs_xtocglue.append(dict(section=sec['filename'], offset=ofs, function=toc_imports[toc_ofs]))
+                    try:
+                        codelocs_xtocglue.append(dict(section=sec['filename'], offset=ofs, function=toc_imports[toc_ofs]))
+                    except KeyError:
+                        # The glue points inwards. This is quite rare, so just ignore it
+                        pass
+

    codelocs_xtocglue.sort(key=lambda dct: tuple(dct.values()))
    write_python(codelocs_xtocglue, basepath, 'hdump', 'codelocs-xtocglue.txt')