From e5dfcf65e788084ea90958b7c54d9727fb7252d7 Mon Sep 17 00:00:00 2001
From: Elliot Nunn <elliotnunn@me.com>
Date: Sun, 19 Jan 2020 14:50:41 +0800
Subject: [PATCH] Speed up Rez code by a lot!

And rip out some non-features
---
 bin/SimpleDeRez          |   3 +-
 bin/SimpleRez            |   2 +-
 macresources/__init__.py |   2 +-
 macresources/main.py     | 332 +++++++++++++++++++++------------------
 test_all.py              |   7 -
 5 files changed, 185 insertions(+), 161 deletions(-)

diff --git a/bin/SimpleDeRez b/bin/SimpleDeRez
index 62ca3a3..70e3d5d 100755
--- a/bin/SimpleDeRez
+++ b/bin/SimpleDeRez
@@ -15,13 +15,12 @@ parser = argparse.ArgumentParser(description='''
 
 parser.add_argument('resourceFile', help='file to be decompiled')
 parser.add_argument('-ascii', action='store_true', help='[!] guarantee ASCII output')
-parser.add_argument('-fakehdr', action='store_true', help='[!] save 225b file header as fake resource')
 parser.add_argument('-useDF', action='store_true', help='ignored: data fork is always used')
 
 args = parser.parse_args()
 
 with open(args.resourceFile, 'rb') as f:
-    resources = macresources.parse_file(f.read(), fake_header_rsrc=args.fakehdr)
+    resources = macresources.parse_file(f.read())
 
 try:
 	rez = macresources.make_rez_code(resources, ascii_clean=args.ascii)
diff --git a/bin/SimpleRez b/bin/SimpleRez
index a8a9652..1ba9fdb 100755
--- a/bin/SimpleRez
+++ b/bin/SimpleRez
@@ -39,7 +39,7 @@ args = parser.parse_args()
 resources = []
 for in_path in args.rezFile:
     with open(in_path, 'rb') as f:
-        resources.extend(macresources.parse_rez_code(f.read()))
+        resources.extend(macresources.parse_rez_code(f.read(), original_file=f.name))
 
 with open(args.o, 'wb') as f:
     f.write(macresources.make_file(resources, align=args.align))
diff --git a/macresources/__init__.py b/macresources/__init__.py
index 6e6aaf5..fc69cd6 100644
--- a/macresources/__init__.py
+++ b/macresources/__init__.py
@@ -1,2 +1,2 @@
-from .main import parse_rez_code, parse_file, make_rez_code, make_file, Resource, ResourceAttrs
 from . import binhex
+from .main import parse_rez_code, parse_file, make_rez_code, make_file, Resource
diff --git a/macresources/main.py b/macresources/main.py
index 786be19..c8940c7 100644
--- a/macresources/main.py
+++ b/macresources/main.py
@@ -1,9 +1,41 @@
 import collections
 import struct
 import enum
+import re
 
 
-FAKE_HEADER_RSRC_TYPE = b'header' # obviously invalid
+# The allowed token sequence when parsing Rez code (quite restrictive)
+rez_tokens = [
+    ((),         r'(\s|//.*?\n|/\*.*?\*/)+'),                                   # 0 whitespace/comment (gets ignored)
+    ((1,11),     r'\$"\s*((?:[0-9A-Fa-f]{2}\s*)*)"'),                           # 1 hex data
+    ((3,),       r'(data)'),                                                    # 2 start of raw resource
+    ((4,),       r"('(?:[^'\\]|\\0x[0-9A-Fa-f]{2}|\\[\\'\\?btrvfn])*')"),       # 3 type
+    ((5,),       r'(\()'),                                                      # 4 start of bracketed resource info
+    ((6,7,8,9),  r'(-?\d+)'),                                                   # 5 ID
+    ((7,8,9),    r',gap("(?:[^"\\]|\\0x[0-9A-Fa-f]{2}|\\[\\"\\?btrvfn])*")'),   # 6 name
+    ((9,),       r',gap\$([0-9a-fA-F]{1,2})'),                                  # 7 attribs (hex)
+    ((8,9),      r',gap(sysheap|purgeable|locked|protected|preload)'),          # 8 attribs (specific)
+    ((10,),      r'(\))'),                                                      # 9 end of bracketed resource info
+    ((1,11),     r'(\{)'),                                                      # 10 start of hex block
+    ((12,),      r'(\})'),                                                      # 11 end of hex block
+    ((2,-1),     r'(;)'),                                                       # 12 the end for real
+    ((),         r'(.)'),                                                       # 13 unexpected character (always errors)
+]
+
+allowed_to_follow_kind, token_regexen = zip(*rez_tokens)
+
+# The 'gap' hack turns ', sysheap' etc into a single token
+gap = r'(?:\s|//.*?\n|/\*.*?\*/)*'
+rez_tokenizer = '|'.join(token_regexen).replace('gap', gap).encode('ascii')
+rez_tokenizer = re.compile(rez_tokenizer)
+
+
+class RezSyntaxError(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __str__(self):
+       return self.msg
 
 
 MAP = bytearray(range(256))
@@ -46,57 +78,15 @@ def _rez_escape(src, singlequote=False, ascii_clean=False):
     return b''.join(chars)
 
 
-def _rez_unescape(src):
-    the_quote = src[0:1]
-    src = src[1:]
-
-    backslash_dict = {
-        b'b': 8,
-        b't': 9,
-        b'r': 10,
-        b'v': 11,
-        b'f': 12,
-        b'n': 13,
-        b'?': 127,
-    }
-
-    chars = []
-    while not src.startswith(the_quote):
-        if src.startswith(b'\\'):
-            src = src[1:]
-            if src.startswith(b'0x'):
-                ch = int(src[2:4].decode('ascii'), 16)
-                src = src[4:]
-            else:
-                ch = backslash_dict.get(src[0:1], src[0])
-                src = src[1:]
-        else:
-            ch = src[0]
-            src = src[1:]
-        chars.append(ch)
-    src = src[1:] # cut off the final quote
-    chars = bytes(chars)
-    return chars, src # return leftover in tuple
-
-
-class ResourceAttrs(enum.IntFlag):
-    """Resource attibutes byte."""
-    
-    _sysref = 0x80 # "reference to system/local reference" (unclear significance)
-    sysheap = 0x40 # load into System heap instead of app heap
-    purgeable = 0x20 # Memory Mgr may remove from heap to free up memory
-    locked = 0x10 # Memory Mgr may not move the block to reduce fragmentation
-    protected = 0x08 # prevents app from changing resource
-    preload = 0x04 # causes resource to be read into heap as soon as file is opened
-    _changed = 0x02 # marks a resource that has been changes since loading from file (should not be seen on disk)
-    _compressed = 0x01 # "indicates that the resource data is compressed" (only documented in https://github.com/kreativekorp/ksfl/wiki/Macintosh-Resource-File-Format)
-
-    def _for_derez(self):
-        mylist = [p.name for p in self.__class__ if self & p]
-        if any(p.startswith('_') for p in mylist):
-            arg = '$%02X' % self
-            mylist = [arg]
-        return mylist
+def attribs_for_derez(attribs):
+    if attribs & ~0x7C:
+        yield '$%02X' % attribs
+    else:
+        if attribs & 0x40: yield 'sysheap'
+        if attribs & 0x20: yield 'purgeable'
+        if attribs & 0x10: yield 'locked'
+        if attribs & 0x08: yield 'protected'
+        if attribs & 0x04: yield 'preload'
 
 
 class Resource(bytearray):
@@ -106,21 +96,12 @@ class Resource(bytearray):
     optional.
     """
 
-    ALL_ATTRIBS = [
-        'sysheap',
-        'purgeable',
-        'locked',
-        'protected',
-        'preload',
-    ]
-
-    def __init__(self, type, id, name=None, attribs=0, data=None):
+    def __init__(self, type, id, name=None, attribs=0, data=b''):
         self.type = type
         self.id = id
-        self.data = data or bytearray()
+        self.data = data
         self.name = name
-        self.attribs = ResourceAttrs(0)
-        self.attribs |= attribs
+        self.attribs = attribs
 
     def __repr__(self):
         datarep = repr(bytes(self.data[:4]))
@@ -136,15 +117,12 @@ class Resource(bytearray):
         self[:] = set_to
 
 
-def parse_file(from_resfile, fake_header_rsrc=False):
+def parse_file(from_resfile):
     """Get an iterator of Resource objects from a binary resource file."""
 
     if not from_resfile: # empty resource forks are fine
         return
 
-    if fake_header_rsrc and any(from_resfile[16:256]):
-        yield Resource(FAKE_HEADER_RSRC_TYPE, 0, name='Header as fake resource (not for Rez)', data=from_resfile[16:256])
-
     data_offset, map_offset, data_len, map_len = struct.unpack_from('>4L', from_resfile)
 
     typelist_offset, namelist_offset, numtypes = struct.unpack_from('>24xHHH', from_resfile, map_offset)
@@ -182,7 +160,36 @@ def parse_file(from_resfile, fake_header_rsrc=False):
             yield Resource(type=rtype, id=rid, name=name, attribs=rattribs, data=bytearray(rdata))
 
 
-def parse_rez_code(from_rezcode):
+def string_surrogate(m):
+    m = m.group(0)
+
+    if len(m) == 5: # \0xFF is the most common
+        return bytes([int(m[3:], 16)])
+    elif m == b'\\"':
+        return b'"'
+    elif m == b"\\'":
+        return b"'"
+    elif m == b'\\b':
+        return b'\x08' # backspace
+    elif m == b'\\t':
+        return b'\t'
+    elif m == b'\\r':
+        return b'\n'
+    elif m == b'\\v':
+        return b'\x0b' # vertical tab
+    elif m == b'\\f':
+        return b'\x0c' # form feed
+    elif m == b'\\n':
+        return b'\r'
+    elif m == b'\\?':
+        return b'\x7f' # del
+
+
+def string_literal(string):
+    return re.sub(rb'(\\0x..|\\.)', string_surrogate, string[1:-1])
+
+
+def parse_rez_code(from_rezcode, original_file='<string>'):
     """Get an iterator of Resource objects from code in a subset of the Rez language (bytes or str)."""
 
     try:
@@ -192,63 +199,74 @@ def parse_rez_code(from_rezcode):
 
     from_rezcode = from_rezcode.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
 
-    for line in from_rezcode.split(b'\n'):
-        line = line.lstrip()
+    # Slightly faster than finditer
+    all_tokens = rez_tokenizer.findall(from_rezcode)
+    def line_no_for_error(token_idx):
+        # Redo all the lexing with finditer, which is slower but
+        # gives us Match objects with a byte offset
+        work_redoer = rez_tokenizer.finditer(from_rezcode)
+        match_obj = next(m for i, m in enumerate(work_redoer) if i == token_idx)
+        line_no = from_rezcode[:match_obj.start()].count(ord('\n')) + 1
 
-        if line.startswith(b'data '):
-            try:
-                yield cur_resource
-            except NameError:
-                pass
+    allowed_token_kinds = (2,-1)
+    for token_idx, token_captures in enumerate(all_tokens):
+        # Which single capture is non-empty?
+        for token_kind, payload in enumerate(token_captures):
+            if payload: break
 
-            _, _, line = line.partition(b' ')
-            rsrctype, line = _rez_unescape(line)
-            _, _, line = line.partition(b'(')
+        # Ignore whitespace
+        if not token_kind: continue
 
-            args = []
-            while True:
-                line = line.lstrip(b' ,\t')
-                if line.startswith(b')'): break
-                if line.startswith(b'"'):
-                    arg, line = _rez_unescape(line)
-                    args.append(('string', arg))
-                else:
-                    arg = bytearray()
-                    while line and line[0:1] not in b' ,\t)':
-                        arg.append(line[0])
-                        line = line[1:]
-                    args.append(('nonstring', arg))
+        # Unexpected token!
+        if token_kind not in allowed_token_kinds:
+            raise RezSyntaxError('File %r, line %r' % (original_file, line_no_for_error(token_idx)))
 
-            rsrcname = None
-            rsrcattrs = ResourceAttrs(0)
+        elif token_kind == 1:
+            hex_accum.append(payload)
 
-            for i, (argtype, arg) in enumerate(args):
-                if i == 0 and argtype == 'nonstring':
-                    rsrcid = int(arg)
+        elif token_kind == 2:
+            res = Resource(b'', 0)
+            hex_accum = []
 
-                elif i > 0:
-                    if argtype == 'string':
-                        rsrcname = arg.decode('mac_roman')
-                    else:
-                        if arg.startswith(b'$'):
-                            newattr = int(arg[1:], 16)
-                        elif arg and arg[0] in b'0123456789':
-                            newattr = int(arg)
-                        else:
-                            newattr = getattr(ResourceAttrs, arg.decode('ascii'))
-                        rsrcattrs |= newattr
+        elif token_kind == 3:
+            res.type = string_literal(payload)
+            if len(res.type) != 4:
+                raise RezSyntaxError('File %r, line %r, type not 4 chars' % (original_file, line_no_for_error(token_idx)))
 
-            cur_resource = Resource(type=rsrctype, id=rsrcid, name=rsrcname, attribs=rsrcattrs)
+        elif token_kind == 5:
+            res.id = int(payload)
+            if not (-65536 <= res.id < 65536):
+                raise RezSyntaxError('File %r, line %r, ID out of 16-bit range' % (original_file, line_no_for_error(token_idx)))
 
-        elif line.startswith(b'$"'):
-            hexdat = line[2:].partition(b'"')[0]
-            bindat = bytes.fromhex(hexdat.decode('ascii'))
-            cur_resource.data.extend(bindat)
+        elif token_kind == 6:
+            res.name = string_literal(payload).decode('mac_roman')
+            if len(res.name) > 255:
+                raise RezSyntaxError('File %r, line %r, name > 255 chars' % (original_file, line_no_for_error(token_idx)))
 
-    try:
-        yield cur_resource
-    except NameError:
-        pass
+        elif token_kind == 7:
+            res.attribs = int(payload, 16)
+
+        elif token_kind == 8:
+            if payload == b'sysheap':
+                res.attribs |= 0x40
+            elif payload == b'purgeable':
+                res.attribs |= 0x20
+            elif payload == b'locked':
+                res.attribs |= 0x10
+            elif payload == b'protected':
+                res.attribs |= 0x08
+            elif payload == b'preload':
+                res.attribs |= 0x04
+
+        elif token_kind == 12:
+            res[:] = bytes.fromhex(b''.join(hex_accum).decode('ascii'))
+            yield res
+
+        allowed_token_kinds = allowed_to_follow_kind[token_kind]
+
+    # Premature EOF
+    if -1 not in allowed_token_kinds:
+        raise RezSyntaxError('File %r, unexpected end of file' % original_file)
 
 
 def make_file(from_iter, align=1):
@@ -263,12 +281,6 @@ def make_file(from_iter, align=1):
     data_offset = len(accum)
     bigdict = collections.OrderedDict() # maintain order of types, but manually order IDs
     for r in from_iter:
-        if r.type == FAKE_HEADER_RSRC_TYPE:
-            if len(r.data) > 256-16:
-                raise ValueError('Special resource length (%r) too long' % len(r.data))
-            accum[16:16+len(r.data)] = r.data
-            continue
-
         wrapped = wrap(r)
 
         while len(accum) % align:
@@ -357,43 +369,63 @@ def make_rez_code(from_iter, ascii_clean=False):
         args.append(str(resource.id).encode('ascii'))
         if resource.name is not None:
             args.append(_rez_escape(resource.name.encode('mac_roman'), singlequote=False, ascii_clean=ascii_clean))
-        args.extend(x.encode('ascii') for x in resource.attribs._for_derez())
+        args.extend(x.encode('ascii') for x in attribs_for_derez(resource.attribs))
         args = b', '.join(args)
 
         fourcc = _rez_escape(resource.type, singlequote=True, ascii_clean=ascii_clean)
 
-        if resource.type == FAKE_HEADER_RSRC_TYPE:
-            lines.append(b'#if 0')
         lines.append(b'data %s (%s) {' % (fourcc, args))
 
-        step = 16
+        # Create a template bytearray
+        numlines = (len(resource) + 15) // 16
+        overhang = numlines * 16 - len(resource)
+        fulllines = numlines - bool(overhang)
+        fl_bytes = fulllines * 78
+        guts = numlines * bytearray(b'\t$"                                                    /*                    \n')
+        del guts[-1:] # no trailing newline
 
-        star, slash, dot, space = b'*/. '
-        whole_preview = bytearray(resource.data)
-        for i in range(len(whole_preview)):
-            if not i % step: mode = False
-            thisone = whole_preview[i]
-            if mode and thisone == slash:
-                thisone = dot
-                mode = False
-            elif thisone == star:
-                mode = True
-            elif thisone >= space:
-                mode = False
-            whole_preview[i] = themap[thisone]
+        # The hex inside the $"" literals
+        hex_column = resource.hex().upper().encode('ascii')
+        if overhang:
+            hex_column += (2 * overhang) * b' '
 
-        for ofs in range(0, len(resource.data), step):
-            linedat = resource.data[ofs:ofs+step]
-            line = ' '.join(linedat[i:i+2].hex() for i in range(0, len(linedat), 2)).encode('ascii')
-            line = line.upper()
-            line = b'\t$"%s"' % line
-            line = line.ljust(55)
-            line += b'/* %s */' % whole_preview[ofs:ofs+step]
-            lines.append(line)
+        # Insert the hex column
+        for i in range(8):
+            for j in range(4):
+                guts[3+i*5+j::78] = hex_column[i*4+j::32]
+
+        # Close the hex literal
+        guts[42:fl_bytes:78] = b'"' * fulllines
+        if overhang: # slightly hacky -- searches for spaces!
+            guts[fl_bytes+guts[fl_bytes:].index(b'  ')] = ord('"')
+
+        # Prevent star-slash from ending the comment column prematurely
+        def comment_end_fixer(m):
+            start, stop = m.span()
+            stop -= 1
+            if start & -16 == stop & -16:
+                return m.group()[:-1] + b'.'
+            else:
+                return m.group()
+        comment_column = re.sub(rb'\*[\x00-\x1F]{0,14}/', comment_end_fixer, resource)
+        comment_column = comment_column.translate(themap)
+        if overhang:
+            comment_column += overhang * b' '
+
+        # Insert the comment column
+        for i in range(16):
+            guts[58+i::78] = comment_column[i::16]
+
+        # Close the comment
+        guts[75:fl_bytes:78] = b'*' * fulllines
+        guts[76:fl_bytes:78] = b'/' * fulllines
+        if overhang:
+            del guts[-overhang-2:]
+            guts.extend(b'*/')
+
+        if guts: lines.append(guts)
 
         lines.append(b'};')
-        if resource.type == FAKE_HEADER_RSRC_TYPE:
-            lines.append(b'#endif')
         lines.append(b'')
     if lines: lines.append(b'') # hack, because all posix lines end with a newline
 
diff --git a/test_all.py b/test_all.py
index 5d2e026..feac430 100644
--- a/test_all.py
+++ b/test_all.py
@@ -7,13 +7,6 @@ data 'elmo' (123, "lamename") {
 };
 """.strip()
 
-def test_enum():
-    r = ResourceAttrs
-    r1 = ResourceAttrs.sysheap
-    r2 = ResourceAttrs.purgeable
-    assert int(r1 | r2) == 0x60
-    assert bool((r1 | r2) & r1)
-
 def test_parse_file():
     l = list(parse_file(RF))