macresources/macresources/main.py

import collections
import struct
import enum
import re


# The allowed token sequence when parsing Rez code (quite restrictive)
rez_tokens = [
    ((),         r'(\s|//.*?\n|/\*.*?\*/)+'),                                   # 0 whitespace/comment (gets ignored)
    ((1,11),     r'\$"\s*((?:[0-9A-Fa-f]{2}\s*)*)"'),                           # 1 hex data
    ((3,),       r'(data)'),                                                    # 2 start of raw resource
    ((4,),       r"('(?:[^'\\]|\\0x[0-9A-Fa-f]{2}|\\[\\'\\?btrvfn])*')"),       # 3 type
    ((5,),       r'(\()'),                                                      # 4 start of bracketed resource info
    ((6,7,8,9),  r'(-?\d+)'),                                                   # 5 ID
    ((7,8,9),    r',gap("(?:[^"\\]|\\0x[0-9A-Fa-f]{2}|\\[\\"\\?btrvfn])*")'),   # 6 name
    ((9,),       r',gap\$([0-9a-fA-F]{1,2})'),                                  # 7 attribs (hex)
    ((8,9),      r',gap(sysheap|purgeable|locked|protected|preload)'),          # 8 attribs (specific)
    ((10,),      r'(\))'),                                                      # 9 end of bracketed resource info
    ((1,11),     r'(\{)'),                                                      # 10 start of hex block
    ((12,),      r'(\})'),                                                      # 11 end of hex block
    ((2,-1),     r'(;)'),                                                       # 12 the end for real
    ((),         r'(.)'),                                                       # 13 unexpected character (always errors)
]

allowed_to_follow_kind, token_regexen = zip(*rez_tokens)

# The 'gap' hack turns ', sysheap' etc into a single token
gap = r'(?:\s|//.*?\n|/\*.*?\*/)*'
rez_tokenizer = '|'.join(token_regexen).replace('gap', gap).encode('ascii')
rez_tokenizer = re.compile(rez_tokenizer)


class RezSyntaxError(Exception):
    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
       return self.msg


MAP = bytearray(range(256))
for i in range(32): MAP[i] = ord('.')
MAP[127] = ord('.')
MAP[9] = 0xC6 # tab -> greek delta
MAP[10] = 0xC2 # lf -> logical not

CLEANMAP = bytearray(MAP)
for i in range(256):
    if CLEANMAP[i] >= 128:
        CLEANMAP[i] = ord('.')


def _rez_escape(src, singlequote=False, ascii_clean=False):
    if singlequote:
        the_quote = b"'"
    else:
        the_quote = b'"'

    chars = [the_quote]
    for ch in src:
        if 8 <= ch <= 13:
            nu = b'\\' + b'btrvfn'[ch-8:][:1]
        elif ch < 32 or (ascii_clean and ch >= 128):
            nu = b'\\0x%02X' % ch
        elif ch == ord('\\'):
            nu = b'\\\\' # two backslashes
        elif ch == 127: # DEL character
            nu = b'\\?'
        elif ch == ord("'") and singlequote:
            nu = b"\\'"
        elif ch == ord('"') and not singlequote:
            nu = b'\\"'
        else:
            nu = bytes([ch])
        chars.append(nu)
    chars.append(the_quote)

    return b''.join(chars)


def attribs_for_derez(attribs):
    if attribs & ~0x7C:
        yield '$%02X' % attribs
    else:
        if attribs & 0x40: yield 'sysheap'
        if attribs & 0x20: yield 'purgeable'
        if attribs & 0x10: yield 'locked'
        if attribs & 0x08: yield 'protected'
        if attribs & 0x04: yield 'preload'


class Resource(bytearray):
    """
    A single Mac resource. A four-byte type, a numeric id and some
    binary data are essential. Extra attributes and a name string are
    optional.
    """

    def __init__(self, type, id, name=None, attribs=0, data=b''):
        self.type = type
        self.id = id
        self.data = data
        self.name = name
        self.attribs = attribs

    def __repr__(self):
        datarep = repr(bytes(self.data[:4]))
        if len(self.data) > len(datarep): datarep += '...%sb' % len(self.data)
        return '%s(type=%r, id=%r, name=%r, attribs=%r, data=%s)' % (self.__class__.__name__, self.type, self.id, self.name, self.attribs, datarep)

    @property
    def data(self):
        return self

    @data.setter
    def data(self, set_to):
        self[:] = set_to


def parse_file(from_resfile):
    """Get an iterator of Resource objects from a binary resource file."""

    if not from_resfile: # empty resource forks are fine
        return

    data_offset, map_offset, data_len, map_len = struct.unpack_from('>4L', from_resfile)

    typelist_offset, namelist_offset, numtypes = struct.unpack_from('>24xHHH', from_resfile, map_offset)
    typelist_offset += map_offset # something is definitely fishy here
    namelist_offset += map_offset

    if numtypes == 0xFFFF: return
    numtypes += 1

    typelist = []
    for i in range(numtypes):
        rtype, rtypen, reflist_offset = struct.unpack_from('>4sHH', from_resfile, typelist_offset + 2 + 8*i)
        rtypen += 1
        reflist_offset += typelist_offset
        typelist.append((rtype, rtypen, reflist_offset))

    for rtype, rtypen, reflist_offset in typelist:
        for i in range(rtypen):
            rid, name_offset, mixedfield = struct.unpack_from('>hHL', from_resfile, reflist_offset + 12*i)
            rdata_offset = mixedfield & 0xFFFFFF
            rattribs = mixedfield >> 24

            rdata_offset += data_offset

            rdata_len, = struct.unpack_from('>L', from_resfile, rdata_offset)
            rdata = from_resfile[rdata_offset+4:rdata_offset+4+rdata_len]

            if name_offset == 0xFFFF:
                name = None
            else:
                name_offset += namelist_offset
                name_len = from_resfile[name_offset]
                name = from_resfile[name_offset+1:name_offset+1+name_len].decode('mac_roman')

            yield Resource(type=rtype, id=rid, name=name, attribs=rattribs, data=bytearray(rdata))


def string_surrogate(m):
    m = m.group(0)

    if len(m) == 5: # \0xFF is the most common
        return bytes([int(m[3:], 16)])
    elif m == b'\\"':
        return b'"'
    elif m == b"\\'":
        return b"'"
    elif m == b'\\b':
        return b'\x08' # backspace
    elif m == b'\\t':
        return b'\t'
    elif m == b'\\r':
        return b'\n'
    elif m == b'\\v':
        return b'\x0b' # vertical tab
    elif m == b'\\f':
        return b'\x0c' # form feed
    elif m == b'\\n':
        return b'\r'
    elif m == b'\\?':
        return b'\x7f' # del


def string_literal(string):
    return re.sub(rb'(\\0x..|\\.)', string_surrogate, string[1:-1])


def parse_rez_code(from_rezcode, original_file='<string>'):
    """Get an iterator of Resource objects from code in a subset of the Rez language (bytes or str)."""

    try:
        from_rezcode = from_rezcode.encode('mac_roman')
    except AttributeError:
        pass

    from_rezcode = from_rezcode.replace(b'\r\n', b'\n').replace(b'\r', b'\n')

    # Slightly faster than finditer
    all_tokens = rez_tokenizer.findall(from_rezcode)
    def line_no_for_error(token_idx):
        # Redo all the lexing with finditer, which is slower but
        # gives us Match objects with a byte offset
        work_redoer = rez_tokenizer.finditer(from_rezcode)
        match_obj = next(m for i, m in enumerate(work_redoer) if i == token_idx)
        line_no = from_rezcode[:match_obj.start()].count(ord('\n')) + 1

    allowed_token_kinds = (2,-1)
    for token_idx, token_captures in enumerate(all_tokens):
        # Which single capture is non-empty?
        for token_kind, payload in enumerate(token_captures):
            if payload: break

        # Ignore whitespace
        if not token_kind: continue

        # Unexpected token!
        if token_kind not in allowed_token_kinds:
            raise RezSyntaxError('File %r, line %r' % (original_file, line_no_for_error(token_idx)))

        elif token_kind == 1:
            hex_accum.append(payload)

        elif token_kind == 2:
            res = Resource(b'', 0)
            hex_accum = []

        elif token_kind == 3:
            res.type = string_literal(payload)
            if len(res.type) != 4:
                raise RezSyntaxError('File %r, line %r, type not 4 chars' % (original_file, line_no_for_error(token_idx)))

        elif token_kind == 5:
            res.id = int(payload)
            if not (-65536 <= res.id < 65536):
                raise RezSyntaxError('File %r, line %r, ID out of 16-bit range' % (original_file, line_no_for_error(token_idx)))

        elif token_kind == 6:
            res.name = string_literal(payload).decode('mac_roman')
            if len(res.name) > 255:
                raise RezSyntaxError('File %r, line %r, name > 255 chars' % (original_file, line_no_for_error(token_idx)))

        elif token_kind == 7:
            res.attribs = int(payload, 16)

        elif token_kind == 8:
            if payload == b'sysheap':
                res.attribs |= 0x40
            elif payload == b'purgeable':
                res.attribs |= 0x20
            elif payload == b'locked':
                res.attribs |= 0x10
            elif payload == b'protected':
                res.attribs |= 0x08
            elif payload == b'preload':
                res.attribs |= 0x04

        elif token_kind == 12:
            res[:] = bytes.fromhex(b''.join(hex_accum).decode('ascii'))
            yield res

        allowed_token_kinds = allowed_to_follow_kind[token_kind]

    # Premature EOF
    if -1 not in allowed_token_kinds:
        raise RezSyntaxError('File %r, unexpected end of file' % original_file)


def make_file(from_iter, align=1):
    """Pack an iterator of Resource objects into a binary resource file."""

    class wrap:
        def __init__(self, from_obj):
            self.obj = from_obj

    accum = bytearray(256) # defer header

    data_offset = len(accum)
    bigdict = collections.OrderedDict() # maintain order of types, but manually order IDs
    for r in from_iter:
        wrapped = wrap(r)

        while len(accum) % align:
            accum.extend(b'\x00')

        wrapped.data_offset = len(accum)
        accum.extend(struct.pack('>L', len(r.data)))
        accum.extend(r.data)

        if r.type not in bigdict:
            bigdict[r.type] = []
        bigdict[r.type].append(wrapped)

    map_offset = len(accum)
    accum.extend(bytes(28))

    typelist_offset = len(accum)
    accum.extend(bytes(2 + 8 * len(bigdict)))

    reflist_offset = len(accum)
    resource_count = sum(len(idlist) for idlist in bigdict.values())
    accum.extend(bytes(12 * resource_count))

    namelist_offset = len(accum)
    for rtype, idlist in bigdict.items():
        for res in idlist:
            if res.obj.name is not None:
                res.name_offset = len(accum)
                as_bytes = res.obj.name.encode('mac_roman')
                accum.append(len(as_bytes))
                accum.extend(as_bytes)

    # all right, now populate the reference lists...
    counter = reflist_offset
    for rtype, idlist in bigdict.items():
        for res in idlist:
            res.ref_offset = counter
            if res.obj.name is None:
                this_name_offset = 0xFFFF
            else:
                this_name_offset = res.name_offset - namelist_offset
            attribs = int(res.obj.attribs)
            this_data_offset = res.data_offset - data_offset
            mixedfield = (attribs << 24) | this_data_offset
            struct.pack_into('>hHL', accum, counter, res.obj.id, this_name_offset, mixedfield)

            counter += 12

    # all right, now populate the type list
    struct.pack_into('>H', accum, typelist_offset, (len(bigdict) - 1) & 0xFFFF)
    counter = typelist_offset + 2
    for rtype, idlist in bigdict.items():
        this_type = idlist[0].obj.type
        ref_count = len(idlist)
        firstref_offset = idlist[0].ref_offset - typelist_offset
        struct.pack_into('>4sHH', accum, counter, this_type, ref_count - 1, firstref_offset)

        counter += 8

    # all right, now populate the map
    struct.pack_into('>24xHH', accum, map_offset, typelist_offset - map_offset, namelist_offset - map_offset)

    # all right, now populate the header
    data_len = map_offset - data_offset
    map_len = len(accum) - map_offset
    struct.pack_into('>LLLL', accum, 0, data_offset, map_offset, data_len, map_len)

    return bytes(accum)


def make_rez_code(from_iter, ascii_clean=False):
    """Express an iterator of Resource objects as Rez code (bytes).

    This will match the output of the deprecated Rez utility, unless the
    `ascii_clean` argument is used to get a 7-bit-only code block.
    """

    if ascii_clean:
        themap = CLEANMAP
    else:
        themap = MAP

    lines = []
    for resource in from_iter:
        args = []
        args.append(str(resource.id).encode('ascii'))
        if resource.name is not None:
            args.append(_rez_escape(resource.name.encode('mac_roman'), singlequote=False, ascii_clean=ascii_clean))
        args.extend(x.encode('ascii') for x in attribs_for_derez(resource.attribs))
        args = b', '.join(args)

        fourcc = _rez_escape(resource.type, singlequote=True, ascii_clean=ascii_clean)

        lines.append(b'data %s (%s) {' % (fourcc, args))

        # Create a template bytearray
        numlines = (len(resource) + 15) // 16
        overhang = numlines * 16 - len(resource)
        fulllines = numlines - bool(overhang)
        fl_bytes = fulllines * 78
        guts = numlines * bytearray(b'\t$"                                                    /*                    \n')
        del guts[-1:] # no trailing newline

        # The hex inside the $"" literals
        hex_column = resource.hex().upper().encode('ascii')
        if overhang:
            hex_column += (2 * overhang) * b' '

        # Insert the hex column
        for i in range(8):
            for j in range(4):
                guts[3+i*5+j::78] = hex_column[i*4+j::32]

        # Close the hex literal
        guts[42:fl_bytes:78] = b'"' * fulllines
        if overhang: # slightly hacky -- searches for spaces!
            guts[fl_bytes+guts[fl_bytes:].index(b'  ')] = ord('"')

        # Prevent star-slash from ending the comment column prematurely
        def comment_end_fixer(m):
            start, stop = m.span()
            stop -= 1
            if start & -16 == stop & -16:
                return m.group()[:-1] + b'.'
            else:
                return m.group()
        comment_column = re.sub(rb'\*[\x00-\x1F]{0,14}/', comment_end_fixer, resource)
        comment_column = comment_column.translate(themap)
        if overhang:
            comment_column += overhang * b' '

        # Insert the comment column
        for i in range(16):
            guts[58+i::78] = comment_column[i::16]

        # Close the comment
        guts[75:fl_bytes:78] = b'*' * fulllines
        guts[76:fl_bytes:78] = b'/' * fulllines
        if overhang:
            del guts[-overhang-2:]
            guts.extend(b'*/')

        if guts: lines.append(guts)

        lines.append(b'};')
        lines.append(b'')
    if lines: lines.append(b'') # hack, because all posix lines end with a newline

    return b'\n'.join(lines)