Add OpenFirmware detokenizer.

2021-08-07 22:00:29 +02:00 · 2021-08-07 22:00:29 +02:00 · bb39f6de07
parent b23fd9d47b
commit bb39f6de07
3 changed files with 581 additions and 0 deletions
--- a/detok.py
+++ b/detok.py
@ -0,0 +1,335 @@
 '''
    FCode DeTokenizer in pure Python.
    Author: Max Poliakovski 2019-2021
 '''
 import struct
 FORTH_WORDS = {
    0x10  : ('b(lit)',       ['num32']),
    0x11  : ("b(')",         ['fcode_num']),
    0x12  : ('b(")',         ['p_string']),
    0x13  : ('bbranch',      ['offset']),
    0x14  : ('b?branch',     ['offset']),
    0x15  : ('b(loop)',      ['offset']),
    0x16  : ('b(+loop)',     ['offset']),
    0x17  : ('b(do)',        ['offset']),
    0x18  : ('b(?do)',       ['offset']),
    0x19  : ('i',            []),
    0x1A  : ('j',            []),
    0x1B  : ('b(leave)',     []),
    0x1C  : ('b(of)',        ['offset']),
    0x1D  : ('execute',      []),
    0x1E  : ('+',            []),
    0x1F  : ('-',            []),
    0x20  : ('*',            []),
    0x21  : ('/',            []),
    0x22  : ('mod',          []),
    0x23  : ('and',          []),
    0x24  : ('or',           []),
    0x25  : ('xor',          []),
    0x26  : ('invert',       []),
    0x27  : ('lshift',       []),
    0x28  : ('rshift',       []),
    0x29  : ('>>a',          []),
    0x2A  : ('/mod',         []),
    0x2B  : ('u/mod',        []),
    0x2C  : ('negate',       []),
    0x2D  : ('abs',          []),
    0x2E  : ('min',          []),
    0x2F  : ('max',          []),
    0x30  : ('>r',           []),
    0x31  : ('r>',           []),
    0x32  : ('r@',           []),
    0x33  : ('exit',         []),
    0x34  : ('0=',           []),
    0x35  : ('0<>',          []),
    0x36  : ('0<',           []),
    0x37  : ('0<=',          []),
    0x38  : ('0>',           []),
    0x39  : ('0>=',          []),
    0x3A  : ('<',            []),
    0x3B  : ('>',            []),
    0x3C  : ('=',            []),
    0x3D  : ('<>',           []),
    0x3E  : ('u>',           []),
    0x3F  : ('u<=',          []),
    0x40  : ('u<',           []),
    0x41  : ('u>=',          []),
    0x42  : ('>=',           []),
    0x43  : ('<=',           []),
    0x44  : ('between',      []),
    0x45  : ('within',       []),
    0x46  : ('drop',         []),
    0x47  : ('dup',          []),
    0x48  : ('over',         []),
    0x49  : ('swap',         []),
    0x4A  : ('rot',          []),
    0x4B  : ('-rot',         []),
    0x4C  : ('tuck',         []),
    0x4D  : ('nip',          []),
    0x4E  : ('pick',         []),
    0x4F  : ('roll',         []),
    0x50  : ('?dup',         []),
    0x51  : ('depth',        []),
    0x52  : ('2drop',        []),
    0x53  : ('2dup',         []),
    0x54  : ('2over',        []),
    0x55  : ('2swap',        []),
    0x57  : ('2/',           []),
    0x59  : ('2*',           []),
    0x5C  : ('/l',           []),
    0x5E  : ('ca+',          []),
    0x5F  : ('wa+',          []),
    0x60  : ('la+',          []),
    0x61  : ('na+',          []),
    0x62  : ('char+',        []),
    0x63  : ('wa1+',         []),
    0x64  : ('la1+',         []),
    0x65  : ('cell+',        []),
    0x68  : ('/l*',          []),
    0x69  : ('cells',        []),
    0x6A  : ('on',           []),
    0x6B  : ('off',          []),
    0x6C  : ('+!',           []),
    0x6D  : ('@',            []),
    0x6E  : ('l@',           []),
    0x6F  : ('w@',           []),
    0x71  : ('c@',           []),
    0x72  : ('!',            []),
    0x73  : ('l!',           []),
    0x74  : ('w!',           []),
    0x75  : ('c!',           []),
    0x76  : ('2@',           []),
    0x77  : ('2!',           []),
    0x78  : ('move',         []),
    0x79  : ('fill',         []),
    0x7A  : ('comp',         []),
    0x7C  : ('lwsplit',      []),
    0x7D  : ('wljoin',       []),
    0x7E  : ('lbsplit',      []),
    0x7F  : ('bljoin',       []),
    0x80  : ('wbflip',       []),
    0x83  : ('pack',         []),
    0x84  : ('count',        []),
    0x85  : ('body>',        []),
    0x86  : ('>body',        []),
    0x89  : ('unloop',       []),
    0x8B  : ('alloc-mem',    []),
    0x8C  : ('free-mem',     []),
    0x8D  : ('key?',         []),
    0x8E  : ('key',          []),
    0x90  : ('type',         []),
    0x92  : ('cr',           []),
    0x9D  : ('.',            []),
    0xA0  : ('base',         []),
    0xA4  : ('-1',           []),
    0xA5  : ('0',            []),
    0xA6  : ('1',            []),
    0xA7  : ('2',            []),
    0xA8  : ('3',            []),
    0xA9  : ('bl',           []),
    0xAA  : ('bs',           []),
    0xAB  : ('bell',         []),
    0xAC  : ('bounds',       []),
    0xAD  : ('here',         []),
    0xAE  : ('aligned',      []),
    0xAF  : ('wbsplit',      []),
    0xB0  : ('bwjoin',       []),
    0xB1  : ('b(<mark)',     []),
    0xB2  : ('b(>resolve)',  []),
    0xB5  : ('new-token',    ['unnamed_tok']),
    0xB7  : ('b(:)',         []),
    0xB8  : ('b(value)',     ['line_break']),
    0xB9  : ('b(variable)',  ['line_break']),
    0xBA  : ('b(constant)',  ['line_break']),
    0xBB  : ('b(create)',    ['line_break']),
    0xBC  : ('b(defer)',     ['line_break']),
    0xBD  : ('b(buffer:)',   ['line_break']),
    0xBE  : ('b(field)',     ['line_break']),
    0xC0  : ('instance',     []),
    0xC2  : ('b(;)',         ['line_break']),
    0xC3  : ('b(to)',        ['fcode_num']),
    0xC4  : ('b(case)',      []),
    0xC5  : ('b(endcase)',   []),
    0xC6  : ('b(endof)',     ['offset']),
    0xCA  : ('external-token', ['named_tok']),
    0xD0  : ('c,',           []),
    0xD1  : ('w,',           []),
    0xD2  : ('l,',           []),
    0xD3  : (',',            []),
    0xD4  : ('um*',          []),
    0xD5  : ('um/mod',       []),
    0xD8  : ('d+',           []),
    0xD9  : ('d-',           []),
    0xDA  : ('get-token',    []),
    0xDB  : ('set-token',    []),
    0xDC  : ('state',        []),
    0xF1  : ('start1',       ['fcode_hdr', 'offset16']),
    0xFD  : ('version1',     ['fcode_hdr', 'offset8']),
    0x102 : ('my-address',   []),
    0x103 : ('my-space',     []),
    0x110 : ('property',     []),
    0x111 : ('encode-int',   []),
    0x112 : ('encode+',      []),
    0x113 : ('encode-phys',  []),
    0x114 : ('encode-string', []),
    0x115 : ('encode-bytes', []),
    0x119 : ('model',        []),
    0x11A : ('device-type',  []),
    0x11C : ('is-install',   []),
    0x11D : ('is-remove',    []),
    0x125 : ('get-msecs',    []),
    0x126 : ('ms',           []),
    0x128 : ('decode-phys',  []),
    0x150 : ('#lines',       []),
    0x15A : ('erase-screen', []),
    0x166 : ('window-left',  []),
    0x16A : ('default-font', []),
    0x16B : ('set-font',     []),
    0x16C : ('char-height',  []),
    0x16D : ('char-width',   []),
    0x18B : ('fb8-install',  []),
    0x201 : ('device-name',  []),
    0x203 : ('my-self',      []),
    0x207 : ('find-method',  []),
    0x209 : ('$call-parent', []),
    0x20A : ('my-parent',    []),
    0x20B : ('ihandle>phandle', []),
    0x216 : ('abort',        []),
    0x21A : ('get-my-property', []),
    0x21B : ('decode-int',   []),
    0x21D : ('get-inherited-property', []),
    0x21E : ('delete-property', []),
    0x226 : ('lwflip',       []),
    0x227 : ('lbflip',       []),
    0x230 : ('rb@',          []),
    0x231 : ('rb!',          []),
    0x232 : ('rw@',          []),
    0x233 : ('rw!',          []),
    0x234 : ('rl@',          []),
    0x235 : ('rl!',          [])
 }
 class DeTokenizer():
    def __init__(self, code_stream, code_len, pos = 0):
        self.pos = pos
        self.code_stream = code_stream
        self.code_length = code_len
        self.offset_bits = 8
        self.builtin_dict = FORTH_WORDS
        self.user_dict = {}
        self.new_line = False
    def reinit(self, code_stream, code_len, pos = 0):
        self.pos = pos
        self.code_stream = code_stream
        self.code_length = code_len
        self.offset_bits = 8
        self.new_line = False
    def next_toknum(self):
        tok_num = self.code_stream[self.pos]
        self.pos += 1
        if tok_num > 0 and tok_num <= 0xF:
            tok_num = (tok_num << 8) | self.code_stream[self.pos]
            self.pos += 1
        return tok_num
    def fcode_hdr(self):
        fcode_hdr = struct.unpack('>BHL', self.code_stream[self.pos:self.pos+7])
        self.pos += 7
        print("FCode header:")
        print("- format   = 0x%X" % fcode_hdr[0])
        print("- checksum = 0x%X" % fcode_hdr[1])
        print("- prog_len = 0x%X\n" % fcode_hdr[2])
    def offset8(self):
        self.offset_bits = 8
    def offset16(self):
        self.offset_bits = 16
    def offset(self):
        if self.offset_bits == 8:
            val = self.code_stream[self.pos]
            self.pos += 1
        elif self.offset_bits == 16:
            val = (self.code_stream[self.pos] << 8) | (self.code_stream[self.pos+1])
            self.pos += 2
        sign = 1 << (self.offset_bits - 1)
        offset = (val & (sign - 1)) - (val & sign)
        print("- offset: %d" % offset)
    def num32(self):
        num = (self.code_stream[self.pos] << 24)   | \
              (self.code_stream[self.pos+1] << 16) | \
              (self.code_stream[self.pos+2] << 8)  | \
               self.code_stream[self.pos+3]
        self.pos += 4
        print("- number: 0x%X" % num)
    def fcode_num(self):
        num = self.next_toknum()
        print("- FCode #: 0x%X" % num)
        return num
    def unnamed_tok(self):
        tok_num = self.fcode_num()
        tok_name = 'unnamed_' + format(tok_num, 'x')
        #if tok_num not in self.user_dict:
        self.user_dict[tok_num] = tok_name
    def named_tok(self):
        tok_name = self.p_string()
        tok_num = self.fcode_num()
        #if tok_num not in self.user_dict:
        self.user_dict[tok_num] = tok_name
    def p_string(self):
        len = self.code_stream[self.pos]
        self.pos += 1
        try:
            str = struct.unpack('%ds' % len, \
                  self.code_stream[self.pos:self.pos+len])[0].decode('utf-8')
            self.pos += len
            print('- String: " %s"' % str)
            return str
        except UnicodeDecodeError: # Forth string may contain non-printable chars!
            bytes = struct.unpack('%dB' % len, \
                    self.code_stream[self.pos:self.pos+len])
            self.pos += len
            print(' '.join(format(x, '02x') for x in bytes))
            return bytes
    def line_break(self):
            print("")
            self.new_line = True
    def insert_newline(self, tok_num):
        if tok_num == 0xCA or tok_num == 0xB5:
            if not self.new_line:
                self.line_break()
        else:
            self.new_line = False
    def decode_stream(self):
        while self.pos < self.code_length:
            tok_num = self.next_toknum()
            if tok_num == 0:
                print('0x00 ; end0')
                break
            self.insert_newline(tok_num)
            if tok_num in self.builtin_dict:
                dict_entry = self.builtin_dict[tok_num]
                print("0x%X ; %s" % (tok_num, dict_entry[0]))
                for fun in dict_entry[1]:
                    fun_obj = getattr(self, fun)
                    fun_obj()
            elif tok_num in self.user_dict: # check user dictionary
                print("0x%X ; %s" % (tok_num, self.user_dict[tok_num]))
            else:
                print("Token %X not found at offset 0x%X!" % (tok_num, self.pos))
                crash()
--- a/detok_of.py
+++ b/detok_of.py
@ -0,0 +1,128 @@
 '''
    DeTokenizer for Apple OpenFirmware.
    Author: Max Poliakovski 2019-2021
 '''
 import struct
 from argparse import ArgumentParser
 from extractdict import parse_coff_container, scan_forth_dict, print_dict
 from detok import DeTokenizer
 def get_fcode_prog(infile):
    # try to get FCode program header
    fpos = infile.tell()
    fcode_hdr = struct.unpack('>BBHL', infile.read(8))
    infile.seek(fpos)
    if fcode_hdr[0] != 0xFD and fcode_hdr[0] != 0xF1:
        #print("Unsupported FCode header function 0x%X" % fcode_hdr[0])
        return (0,0)
    prog_len = fcode_hdr[3]
    prog_stream = infile.read(prog_len)
    return (prog_stream, prog_len)
 def decode_package_header(infile):
    pkg_hdr = struct.unpack('>LHHLL', infile.read(16))
    print("Device package header:")
    print("----------------------")
    print("Next package offset: %X" % pkg_hdr[0])
    print("Device ID: %X" % pkg_hdr[1])
    print("Vendor ID: %X" % pkg_hdr[2])
    print("Device class: %X" % pkg_hdr[3])
    print("Package header size: %X" % pkg_hdr[4])
    return (pkg_hdr[0], pkg_hdr[4])
 def populate_user_dict(src_dict, dst_dict):
    for tok_num, word in src_dict.items():
        if tok_num >= 0x100:
            dst_dict[tok_num] = word['name']
    # add Apple specific FCodes for managing stack frames
    for i in range(0,9):
        dst_dict[0x407 + i] = '(pushlocals_%s)' % i
    for i in range(0,8):
        dst_dict[0x410 + i] = '(local@%s)' % i
        dst_dict[0x418 + i] = '(local!%s)' % i
 def main():
    parser = ArgumentParser()
    parser.add_argument('--rom_path', type=str,
                        dest='rom_path',
                        help='path to ROM file to process',
                        metavar='ROM_PATH', required=True)
    parser.add_argument('--offset', type=lambda x: int(x,0),
                        dest='of_offset',
                        help='offset to OF container (autodetect attempt if omitted)',
                        metavar='OF_OFFSET', required=True)
    opts = parser.parse_args()
    with open(opts.rom_path, 'rb') as infile:
        pos, size = parse_coff_container(infile, opts.of_offset);
        if size == 0:
            print("No valid OF binary found at offset %X" % opts.of_offset)
            exit(1)
        print("pos = 0x%X, size = 0x%X" % (pos, size))
        dict = scan_forth_dict(infile, opts.of_offset + pos, pos + size)
        #print_dict(dict)
        print("Detokenizing main OF package...")
        print("-------------------------------\n")
        infile.seek(opts.of_offset + pos + 8)
        prog_offset = struct.unpack('>L', infile.read(4))[0]
        print("FCode program offset: %X" % (prog_offset + pos))
        infile.seek(opts.of_offset + prog_offset + pos)
        prog_stream, prog_size = get_fcode_prog(infile)
        detokenizer = DeTokenizer(prog_stream, prog_size)
        populate_user_dict(dict, detokenizer.user_dict)
        detokenizer.decode_stream()
        print("\nDetokenizing device packages...")
        print("-------------------------------\n")
        infile.seek(opts.of_offset + pos + 0x40)
        pkg_offset = struct.unpack('>L', infile.read(4))[0] + pos + opts.of_offset
        print("Last OF device package offset: %X" % (pkg_offset))
        prev_pkg_offset = pkg_offset
        while True:
            print("\n")
            infile.seek(pkg_offset)
            next_pkg_offset, hdr_size = decode_package_header(infile)
            prog_stream, prog_size = get_fcode_prog(infile)
            if prog_size == 0:
                prog_size = prev_pkg_offset - pkg_offset - hdr_size
                #print("Headerless FCode program size: %X" % prog_size)
                #print("File pos: %X" % infile.tell())
                prog_stream = infile.read(prog_size)
            print("\nDetokenizing package at offset %X...\n" % pkg_offset)
            detokenizer.reinit(prog_stream, prog_size)
            detokenizer.decode_stream()
            # navigate to previous package or exit if there is no more packages
            if next_pkg_offset == 0:
                break
            prev_pkg_offset = pkg_offset
            pkg_offset = (pkg_offset + next_pkg_offset) & 0xFFFFFFFF
 if __name__ == '__main__':
    main()
--- a/extractdict.py
+++ b/extractdict.py
@ -0,0 +1,118 @@
 '''
    Dictionary extraction script for Apple OpenFirmware.
    Author: Max Poliakovski 2019-2021
 '''
 import os
 import struct
 from argparse import ArgumentParser
 class OFWordHeader:
    def __init__(self, infile, pos):
        infile.seek(pos)
        # get common fields
        hdr = struct.unpack('>iBBH', infile.read(8))
        self.prev    = hdr[0]
        self.flags   = hdr[1]
        self.type    = hdr[2]
        self.tok_num = hdr[3]
        if self.flags & 0x20: # bit 5 means nameless word
            # generate artificial name
            self.name = 'unnamed_' + format(self.tok_num, 'x')
        else:
            len = struct.unpack('B', infile.read(1))[0]
            self.name = struct.unpack('%ds' % len,
                        infile.read(len))[0].decode('utf-8')
        self.pos = pos
 def parse_coff_container(infile, cont_offset):
    infile.seek(cont_offset)
    # read COFF header
    coff_hdr = struct.unpack('>HHL', infile.read(8))
    n_sections = coff_hdr[1]
    # COFF magic and at least one section are required
    if coff_hdr[0] != 0x1DF or n_sections < 1:
        print("No valid COFF header found at offset %X" % cont_offset)
        return (0, 0)
    if coff_hdr[2] == 0x47617279:
        print("Detected Macintosh OldWorld OF binary...")
    infile.seek(cont_offset + 20) # rewind to sections array
    # search for executable code section
    for sect in range(n_sections):
        sect_desc = struct.unpack('>8sLLLLLLHHL', infile.read(40))
        sect_name = sect_desc[0].decode('utf-8').strip('\x00')
        if sect_name == '.text':
            return (sect_desc[4], sect_desc[3])
    return (0, 0)
 def scan_forth_dict(infile, pos, end_pos):
    # try offset at code_section[0x48] that usually points
    # to the header of the last word (cold-load)
    infile.seek(pos + 0x48)
    dict_last_offset = struct.unpack('>L', infile.read(4))[0]
    if (dict_last_offset + 20) >= end_pos:
        return 0
    word = OFWordHeader(infile, dict_last_offset + pos)
    if word.name == 'cold-load':
        print("cold-load found at offset %X" % word.pos)
    else:
        print('Scanning for cold-load not implemented yet')
        return 0
    print('\n')
    forth_dict = {}
    word_pos = dict_last_offset + pos
    while 1:
        forth_dict[word.tok_num] = {'name' : word.name, 'type' : word.type, 'pos' : word.pos}
        if word.prev >= 0:
            return forth_dict
        word_pos += word.prev
        del word
        word = OFWordHeader(infile, word_pos)
 def print_dict(dict):
    for tok_num, word in dict.items():
        print("Word: %04X, name: %s, type: %02X, offset = %08X" % (tok_num, word['name'], word['type'], word['pos']))
 def main():
    parser = ArgumentParser()
    parser.add_argument('--rom_path', type=str,
                        dest='rom_path',
                        help='path to ROM file to process',
                        metavar='ROM_PATH', required=True)
    parser.add_argument('--offset', type=lambda x: int(x,0),
                        dest='of_offset',
                        help='offset to OF container (autodetect attempt if omitted)',
                        metavar='OF_OFFSET', required=True)
    opts = parser.parse_args()
    with open(opts.rom_path, 'rb') as infile:
        pos, size = parse_coff_container(infile, opts.of_offset);
        if size == 0:
            print("No valid OF binary found at offset %X" % opts.of_offset)
            exit(1)
        dict = scan_forth_dict(infile, opts.of_offset + pos, pos + size)
        print_dict(dict)
 if __name__ == '__main__':
    main()