diff --git a/detok.py b/detok.py
new file mode 100644
index 0000000..33d0714
--- /dev/null
+++ b/detok.py
@@ -0,0 +1,335 @@
+'''
+    FCode DeTokenizer in pure Python.
+
+    Author: Max Poliakovski 2019-2021
+'''
+import struct
+
+FORTH_WORDS = {
+    0x10 : ('b(lit)', ['num32']),
+    0x11 : ("b(')", ['fcode_num']),
+    0x12 : ('b(")', ['p_string']),
+    0x13 : ('bbranch', ['offset']),
+    0x14 : ('b?branch', ['offset']),
+    0x15 : ('b(loop)', ['offset']),
+    0x16 : ('b(+loop)', ['offset']),
+    0x17 : ('b(do)', ['offset']),
+    0x18 : ('b(?do)', ['offset']),
+    0x19 : ('i', []),
+    0x1A : ('j', []),
+    0x1B : ('b(leave)', []),
+    0x1C : ('b(of)', ['offset']),
+    0x1D : ('execute', []),
+    0x1E : ('+', []),
+    0x1F : ('-', []),
+    0x20 : ('*', []),
+    0x21 : ('/', []),
+    0x22 : ('mod', []),
+    0x23 : ('and', []),
+    0x24 : ('or', []),
+    0x25 : ('xor', []),
+    0x26 : ('invert', []),
+    0x27 : ('lshift', []),
+    0x28 : ('rshift', []),
+    0x29 : ('>>a', []),
+    0x2A : ('/mod', []),
+    0x2B : ('u/mod', []),
+    0x2C : ('negate', []),
+    0x2D : ('abs', []),
+    0x2E : ('min', []),
+    0x2F : ('max', []),
+    0x30 : ('>r', []),
+    0x31 : ('r>', []),
+    0x32 : ('r@', []),
+    0x33 : ('exit', []),
+    0x34 : ('0=', []),
+    0x35 : ('0<>', []),
+    0x36 : ('0<', []),
+    0x37 : ('0<=', []),
+    0x38 : ('0>', []),
+    0x39 : ('0>=', []),
+    0x3A : ('<', []),
+    0x3B : ('>', []),
+    0x3C : ('=', []),
+    0x3D : ('<>', []),
+    0x3E : ('u>', []),
+    0x3F : ('u<=', []),
+    0x40 : ('u<', []),
+    0x41 : ('u>=', []),
+    0x42 : ('>=', []),
+    0x43 : ('<=', []),
+    0x44 : ('between', []),
+    0x45 : ('within', []),
+    0x46 : ('drop', []),
+    0x47 : ('dup', []),
+    0x48 : ('over', []),
+    0x49 : ('swap', []),
+    0x4A : ('rot', []),
+    0x4B : ('-rot', []),
+    0x4C : ('tuck', []),
+    0x4D : ('nip', []),
+    0x4E : ('pick', []),
+    0x4F : ('roll', []),
+    0x50 : ('?dup', []),
+    0x51 : ('depth', []),
+    0x52 : ('2drop', []),
+    0x53 : ('2dup', []),
+    0x54 : ('2over', []),
+    0x55 : ('2swap', []),
+    0x57 : ('2/', []),
+    0x59 : ('2*', []),
+    0x5C : ('/l', []),
+    0x5E : ('ca+', []),
+    0x5F : ('wa+', []),
+    0x60 : ('la+', []),
+    0x61 : ('na+', []),
+    0x62 : ('char+', []),
+    0x63 : ('wa1+', []),
+    0x64 : ('la1+', []),
+    0x65 : ('cell+', []),
+    0x68 : ('/l*', []),
+    0x69 : ('cells', []),
+    0x6A : ('on', []),
+    0x6B : ('off', []),
+    0x6C : ('+!', []),
+    0x6D : ('@', []),
+    0x6E : ('l@', []),
+    0x6F : ('w@', []),
+    0x71 : ('c@', []),
+    0x72 : ('!', []),
+    0x73 : ('l!', []),
+    0x74 : ('w!', []),
+    0x75 : ('c!', []),
+    0x76 : ('2@', []),
+    0x77 : ('2!', []),
+    0x78 : ('move', []),
+    0x79 : ('fill', []),
+    0x7A : ('comp', []),
+    0x7C : ('lwsplit', []),
+    0x7D : ('wljoin', []),
+    0x7E : ('lbsplit', []),
+    0x7F : ('bljoin', []),
+    0x80 : ('wbflip', []),
+    0x83 : ('pack', []),
+    0x84 : ('count', []),
+    0x85 : ('body>', []),
+    0x86 : ('>body', []),
+    0x89 : ('unloop', []),
+    0x8B : ('alloc-mem', []),
+    0x8C : ('free-mem', []),
+    0x8D : ('key?', []),
+    0x8E : ('key', []),
+    0x90 : ('type', []),
+    0x92 : ('cr', []),
+    0x9D : ('.', []),
+    0xA0 : ('base', []),
+    0xA4 : ('-1', []),
+    0xA5 : ('0', []),
+    0xA6 : ('1', []),
+    0xA7 : ('2', []),
+    0xA8 : ('3', []),
+    0xA9 : ('bl', []),
+    0xAA : ('bs', []),
+    0xAB : ('bell', []),
+    0xAC : ('bounds', []),
+    0xAD : ('here', []),
+    0xAE : ('aligned', []),
+    0xAF : ('wbsplit', []),
+    0xB0 : ('bwjoin', []),
+    0xB1 : ('b(resolve)', []),
+    0xB5 : ('new-token', ['unnamed_tok']),
+    0xB7 : ('b(:)', []),
+    0xB8 : ('b(value)', ['line_break']),
+    0xB9 : ('b(variable)', ['line_break']),
+    0xBA : ('b(constant)', ['line_break']),
+    0xBB : ('b(create)', ['line_break']),
+    0xBC : ('b(defer)', ['line_break']),
+    0xBD : ('b(buffer:)', ['line_break']),
+    0xBE : ('b(field)', ['line_break']),
+    0xC0 : ('instance', []),
+    0xC2 : ('b(;)', ['line_break']),
+    0xC3 : ('b(to)', ['fcode_num']),
+    0xC4 : ('b(case)', []),
+    0xC5 : ('b(endcase)', []),
+    0xC6 : ('b(endof)', ['offset']),
+    0xCA : ('external-token', ['named_tok']),
+    0xD0 : ('c,', []),
+    0xD1 : ('w,', []),
+    0xD2 : ('l,', []),
+    0xD3 : (',', []),
+    0xD4 : ('um*', []),
+    0xD5 : ('um/mod', []),
+    0xD8 : ('d+', []),
+    0xD9 : ('d-', []),
+    0xDA : ('get-token', []),
+    0xDB : ('set-token', []),
+    0xDC : ('state', []),
+    0xF1 : ('start1', ['fcode_hdr', 'offset16']),
+    0xFD : ('version1', ['fcode_hdr', 'offset8']),
+    0x102 : ('my-address', []),
+    0x103 : ('my-space', []),
+    0x110 : ('property', []),
+    0x111 : ('encode-int', []),
+    0x112 : ('encode+', []),
+    0x113 : ('encode-phys', []),
+    0x114 : ('encode-string', []),
+    0x115 : ('encode-bytes', []),
+    0x119 : ('model', []),
+    0x11A : ('device-type', []),
+    0x11C : ('is-install', []),
+    0x11D : ('is-remove', []),
+    0x125 : ('get-msecs', []),
+    0x126 : ('ms', []),
+    0x128 : ('decode-phys', []),
+    0x150 : ('#lines', []),
+    0x15A : ('erase-screen', []),
+    0x166 : ('window-left', []),
+    0x16A : ('default-font', []),
+    0x16B : ('set-font', []),
+    0x16C : ('char-height', []),
+    0x16D : ('char-width', []),
+    0x18B : ('fb8-install', []),
+    0x201 : ('device-name', []),
+    0x203 : ('my-self', []),
+    0x207 : ('find-method', []),
+    0x209 : ('$call-parent', []),
+    0x20A : ('my-parent', []),
+    0x20B : ('ihandle>phandle', []),
+    0x216 : ('abort', []),
+    0x21A : ('get-my-property', []),
+    0x21B : ('decode-int', []),
+    0x21D : ('get-inherited-property', []),
+    0x21E : ('delete-property', []),
+    0x226 : ('lwflip', []),
+    0x227 : ('lbflip', []),
+    0x230 : ('rb@', []),
+    0x231 : ('rb!', []),
+    0x232 : ('rw@', []),
+    0x233 : ('rw!', []),
+    0x234 : ('rl@', []),
+    0x235 : ('rl!', [])
+}
+
+class DeTokenizer():
+    def __init__(self, code_stream, code_len, pos = 0):
+        self.pos = pos
+        self.code_stream = code_stream
+        self.code_length = code_len
+        self.offset_bits = 8
+        self.builtin_dict = FORTH_WORDS
+        self.user_dict = {}
+        self.new_line = False
+
+    def reinit(self, code_stream, code_len, pos = 0):
+        self.pos = pos
+        self.code_stream = code_stream
+        self.code_length = code_len
+        self.offset_bits = 8
+        self.new_line = False
+
+    def next_toknum(self):
+        tok_num = self.code_stream[self.pos]
+        self.pos += 1
+        if tok_num > 0 and tok_num <= 0xF:
+            tok_num = (tok_num << 8) | self.code_stream[self.pos]
+            self.pos += 1
+        return tok_num
+
+    def fcode_hdr(self):
+        fcode_hdr = struct.unpack('>BHL', self.code_stream[self.pos:self.pos+7])
+        self.pos += 7
+        print("FCode header:")
+        print("- format = 0x%X" % fcode_hdr[0])
+        print("- checksum = 0x%X" % fcode_hdr[1])
+        print("- prog_len = 0x%X\n" % fcode_hdr[2])
+
+    def offset8(self):
+        self.offset_bits = 8
+
+    def offset16(self):
+        self.offset_bits = 16
+
+    def offset(self):
+        if self.offset_bits == 8:
+            val = self.code_stream[self.pos]
+            self.pos += 1
+        elif self.offset_bits == 16:
+            val = (self.code_stream[self.pos] << 8) | (self.code_stream[self.pos+1])
+            self.pos += 2
+        sign = 1 << (self.offset_bits - 1)
+        offset = (val & (sign - 1)) - (val & sign)
+        print("- offset: %d" % offset)
+
+    def num32(self):
+        num = (self.code_stream[self.pos] << 24) | \
+              (self.code_stream[self.pos+1] << 16) | \
+              (self.code_stream[self.pos+2] << 8) | \
+              self.code_stream[self.pos+3]
+        self.pos += 4
+        print("- number: 0x%X" % num)
+
+    def fcode_num(self):
+        num = self.next_toknum()
+        print("- FCode #: 0x%X" % num)
+        return num
+
+    def unnamed_tok(self):
+        tok_num = self.fcode_num()
+        tok_name = 'unnamed_' + format(tok_num, 'x')
+        #if tok_num not in self.user_dict:
+        self.user_dict[tok_num] = tok_name
+
+    def named_tok(self):
+        tok_name = self.p_string()
+        tok_num = self.fcode_num()
+        #if tok_num not in self.user_dict:
+        self.user_dict[tok_num] = tok_name
+
+    def p_string(self):
+        len = self.code_stream[self.pos]
+        self.pos += 1
+        try:
+            str = struct.unpack('%ds' % len, \
+                self.code_stream[self.pos:self.pos+len])[0].decode('utf-8')
+            self.pos += len
+            print('- String: " %s"' % str)
+            return str
+        except UnicodeDecodeError: # Forth string may contain non-printable chars!
+            bytes = struct.unpack('%dB' % len, \
+                self.code_stream[self.pos:self.pos+len])
+            self.pos += len
+            print(' '.join(format(x, '02x') for x in bytes))
+            return bytes
+
+    def line_break(self):
+        print("")
+        self.new_line = True
+
+    def insert_newline(self, tok_num):
+        if tok_num == 0xCA or tok_num == 0xB5:
+            if not self.new_line:
+                self.line_break()
+        else:
+            self.new_line = False
+
+    def decode_stream(self):
+        while self.pos < self.code_length:
+            tok_num = self.next_toknum()
+            if tok_num == 0:
+                print('0x00 ; end0')
+                break
+
+            self.insert_newline(tok_num)
+
+            if tok_num in self.builtin_dict:
+                dict_entry = self.builtin_dict[tok_num]
+                print("0x%X ; %s" % (tok_num, dict_entry[0]))
+                for fun in dict_entry[1]:
+                    fun_obj = getattr(self, fun)
+                    fun_obj()
+            elif tok_num in self.user_dict: # check user dictionary
+                print("0x%X ; %s" % (tok_num, self.user_dict[tok_num]))
+            else:
+                print("Token %X not found at offset 0x%X!" % (tok_num, self.pos))
+                crash()
diff --git a/detok_of.py b/detok_of.py
new file mode 100644
index 0000000..ae819b1
--- /dev/null
+++ b/detok_of.py
@@ -0,0 +1,128 @@
+'''
+    DeTokenizer for Apple OpenFirmware.
+
+    Author: Max Poliakovski 2019-2021
+'''
+import struct
+from argparse import ArgumentParser
+
+from extractdict import parse_coff_container, scan_forth_dict, print_dict
+
+from detok import DeTokenizer
+
+def get_fcode_prog(infile):
+    # try to get FCode program header
+    fpos = infile.tell()
+    fcode_hdr = struct.unpack('>BBHL', infile.read(8))
+    infile.seek(fpos)
+
+    if fcode_hdr[0] != 0xFD and fcode_hdr[0] != 0xF1:
+        #print("Unsupported FCode header function 0x%X" % fcode_hdr[0])
+        return (0,0)
+    prog_len = fcode_hdr[3]
+    prog_stream = infile.read(prog_len)
+    return (prog_stream, prog_len)
+
+
+def decode_package_header(infile):
+    pkg_hdr = struct.unpack('>LHHLL', infile.read(16))
+    print("Device package header:")
+    print("----------------------")
+    print("Next package offset: %X" % pkg_hdr[0])
+    print("Device ID: %X" % pkg_hdr[1])
+    print("Vendor ID: %X" % pkg_hdr[2])
+    print("Device class: %X" % pkg_hdr[3])
+    print("Package header size: %X" % pkg_hdr[4])
+    return (pkg_hdr[0], pkg_hdr[4])
+
+
+def populate_user_dict(src_dict, dst_dict):
+    for tok_num, word in src_dict.items():
+        if tok_num >= 0x100:
+            dst_dict[tok_num] = word['name']
+
+    # add Apple-specific FCodes for managing stack frames
+    for i in range(0,9):
+        dst_dict[0x407 + i] = '(pushlocals_%s)' % i
+
+    for i in range(0,8):
+        dst_dict[0x410 + i] = '(local@%s)' % i
+        dst_dict[0x418 + i] = '(local!%s)' % i
+
+
+def main():
+    parser = ArgumentParser()
+    parser.add_argument('--rom_path', type=str,
+                        dest='rom_path',
+                        help='path to ROM file to process',
+                        metavar='ROM_PATH', required=True)
+    parser.add_argument('--offset', type=lambda x: int(x,0),
+                        dest='of_offset',
+                        help='offset to OF container (autodetect attempt if omitted)',
+                        metavar='OF_OFFSET', required=True)
+    opts = parser.parse_args()
+
+    with open(opts.rom_path, 'rb') as infile:
+        pos, size = parse_coff_container(infile, opts.of_offset);
+        if size == 0:
+            print("No valid OF binary found at offset %X" % opts.of_offset)
+            exit(1)
+
+        print("pos = 0x%X, size = 0x%X" % (pos, size))
+
+        dict = scan_forth_dict(infile, opts.of_offset + pos, pos + size)
+
+        #print_dict(dict)
+
+        print("Detokenizing main OF package...")
+        print("-------------------------------\n")
+
+        infile.seek(opts.of_offset + pos + 8)
+        prog_offset = struct.unpack('>L', infile.read(4))[0]
+        print("FCode program offset: %X" % (prog_offset + pos))
+
+        infile.seek(opts.of_offset + prog_offset + pos)
+
+        prog_stream, prog_size = get_fcode_prog(infile)
+
+        detokenizer = DeTokenizer(prog_stream, prog_size)
+
+        populate_user_dict(dict, detokenizer.user_dict)
+
+        detokenizer.decode_stream()
+
+        print("\nDetokenizing device packages...")
+        print("-------------------------------\n")
+
+        infile.seek(opts.of_offset + pos + 0x40)
+        pkg_offset = struct.unpack('>L', infile.read(4))[0] + pos + opts.of_offset
+        print("Last OF device package offset: %X" % (pkg_offset))
+
+        prev_pkg_offset = pkg_offset
+
+        while True:
+            print("\n")
+            infile.seek(pkg_offset)
+            next_pkg_offset, hdr_size = decode_package_header(infile)
+
+            prog_stream, prog_size = get_fcode_prog(infile)
+
+            if prog_size == 0:
+                prog_size = prev_pkg_offset - pkg_offset - hdr_size
+                #print("Headerless FCode program size: %X" % prog_size)
+                #print("File pos: %X" % infile.tell())
+                prog_stream = infile.read(prog_size)
+
+            print("\nDetokenizing package at offset %X...\n" % pkg_offset)
+
+            detokenizer.reinit(prog_stream, prog_size)
+            detokenizer.decode_stream()
+
+            # navigate to the previous package, or exit if there are no more packages
+            if next_pkg_offset == 0:
+                break
+
+            prev_pkg_offset = pkg_offset
+            pkg_offset = (pkg_offset + next_pkg_offset) & 0xFFFFFFFF
+
+if __name__ == '__main__':
+    main()
diff --git a/extractdict.py b/extractdict.py
new file mode 100644
index 0000000..9e50313
--- /dev/null
+++ b/extractdict.py
@@ -0,0 +1,118 @@
+'''
+    Dictionary extraction script for Apple OpenFirmware.
+
+    Author: Max Poliakovski 2019-2021
+'''
+import os
+import struct
+from argparse import ArgumentParser
+
+class OFWordHeader:
+    def __init__(self, infile, pos):
+        infile.seek(pos)
+
+        # get common fields
+        hdr = struct.unpack('>iBBH', infile.read(8))
+        self.prev = hdr[0]
+        self.flags = hdr[1]
+        self.type = hdr[2]
+        self.tok_num = hdr[3]
+
+        if self.flags & 0x20: # bit 5 means nameless word
+            # generate artificial name
+            self.name = 'unnamed_' + format(self.tok_num, 'x')
+        else:
+            len = struct.unpack('B', infile.read(1))[0]
+            self.name = struct.unpack('%ds' % len,
+                infile.read(len))[0].decode('utf-8')
+
+        self.pos = pos
+
+
+def parse_coff_container(infile, cont_offset):
+    infile.seek(cont_offset)
+
+    # read COFF header
+    coff_hdr = struct.unpack('>HHL', infile.read(8))
+    n_sections = coff_hdr[1]
+
+    # COFF magic and at least one section are required
+    if coff_hdr[0] != 0x1DF or n_sections < 1:
+        print("No valid COFF header found at offset %X" % cont_offset)
+        return (0, 0)
+
+    if coff_hdr[2] == 0x47617279:
+        print("Detected Macintosh OldWorld OF binary...")
+
+    infile.seek(cont_offset + 20) # rewind to sections array
+
+    # search for executable code section
+    for sect in range(n_sections):
+        sect_desc = struct.unpack('>8sLLLLLLHHL', infile.read(40))
+        sect_name = sect_desc[0].decode('utf-8').strip('\x00')
+        if sect_name == '.text':
+            return (sect_desc[4], sect_desc[3])
+
+    return (0, 0)
+
+
+def scan_forth_dict(infile, pos, end_pos):
+    # try offset at code_section[0x48] that usually points
+    # to the header of the last word (cold-load)
+    infile.seek(pos + 0x48)
+    dict_last_offset = struct.unpack('>L', infile.read(4))[0]
+    if (dict_last_offset + 20) >= end_pos:
+        return 0
+
+    word = OFWordHeader(infile, dict_last_offset + pos)
+    if word.name == 'cold-load':
+        print("cold-load found at offset %X" % word.pos)
+    else:
+        print('Scanning for cold-load not implemented yet')
+        return 0
+
+    print('\n')
+
+    forth_dict = {}
+
+    word_pos = dict_last_offset + pos
+
+    while 1:
+        forth_dict[word.tok_num] = {'name' : word.name, 'type' : word.type, 'pos' : word.pos}
+        if word.prev >= 0:
+            return forth_dict
+        word_pos += word.prev
+        del word
+        word = OFWordHeader(infile, word_pos)
+
+
+def print_dict(dict):
+    for tok_num, word in dict.items():
+        print("Word: %04X, name: %s, type: %02X, offset = %08X" % (tok_num, word['name'], word['type'], word['pos']))
+
+
+def main():
+    parser = ArgumentParser()
+    parser.add_argument('--rom_path', type=str,
+                        dest='rom_path',
+                        help='path to ROM file to process',
+                        metavar='ROM_PATH', required=True)
+    parser.add_argument('--offset', type=lambda x: int(x,0),
+                        dest='of_offset',
+                        help='offset to OF container (autodetect attempt if omitted)',
+                        metavar='OF_OFFSET', required=True)
+    opts = parser.parse_args()
+
+    with open(opts.rom_path, 'rb') as infile:
+        pos, size = parse_coff_container(infile, opts.of_offset);
+        if size == 0:
+            print("No valid OF binary found at offset %X" % opts.of_offset)
+            exit(1)
+
+        dict = scan_forth_dict(infile, opts.of_offset + pos, pos + size)
+
+        print_dict(dict)
+
+
+if __name__ == '__main__':
+    main()
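
Note: for quick experiments outside a full ROM image, the DeTokenizer class added above can also be fed a raw byte string directly. The following is a minimal usage sketch; the variable names and the FCode bytes (a version1 header followed by "1 2 + ." and end0) are purely illustrative and not taken from any real ROM:

    from detok import DeTokenizer

    # Illustrative FCode program: version1 header, then a few tokens.
    prog = bytes([
        0xFD, 0x08, 0x00, 0x00,   # version1 token, format byte, checksum (2 bytes)
        0x00, 0x00, 0x00, 0x0D,   # program length = 13 bytes, header included
        0xA6,                     # 1
        0xA7,                     # 2
        0x1E,                     # +
        0x9D,                     # .
        0x00                      # end0
    ])

    detok = DeTokenizer(prog, len(prog))
    detok.decode_stream()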