From d082f29238eba5ad1fec9db30e71e3c7eb3ea015 Mon Sep 17 00:00:00 2001 From: dgelessus Date: Tue, 3 Sep 2019 02:10:04 +0200 Subject: [PATCH] Use MacRoman as the encoding for four-char codes and strings Previously all non-ASCII characters were hex-escaped on output. However, many resource files use MacRoman characters in resource names and sometimes in resource types, so it makes sense to use MacRoman in the interest of readability. --- README.rst | 1 + rsrcfork/__main__.py | 32 +++++++++++++++++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/README.rst b/README.rst index d103068..9037d5a 100644 --- a/README.rst +++ b/README.rst @@ -149,6 +149,7 @@ Version 1.2.0 (next version) * The old ``rsrcfork`` parameter has been deprecated and will be removed in the future, but for now it still works as before. * Added an explanatory message when a resource filter on the command line doesn't match any resources in the resource file. Previously there would either be no output or a confusing error, depending on the selected ``--format``. +* Changed resource type codes and names to be displayed in MacRoman instead of escaping all non-ASCII characters. * Cleaned up the resource descriptions in listings and dumps to improve readability. Previously they included some redundant or unnecessary information - for example, each resource with no attributes set would be explicitly marked as "no attributes". * Unified the formats of resource descriptions in listings and dumps, which were previously slightly different from each other. * Improved error messages when attempting to read multiple resources using ``--format=hex`` or ``--format=raw``. diff --git a/rsrcfork/__main__.py b/rsrcfork/__main__.py index 9ecae3b..dbd6f31 100644 --- a/rsrcfork/__main__.py +++ b/rsrcfork/__main__.py @@ -7,6 +7,9 @@ import typing from . import __version__, api, compress +# The encoding to use when rendering bytes as text (in four-char codes, strings, hex dumps, etc.) or reading a quoted byte string (from the command line). +_TEXT_ENCODING = "MacRoman" + # Translation table to replace ASCII non-printable characters with periods. _TRANSLATE_NONPRINTABLES = {k: "." for k in [*range(0x20), 0x7f]} @@ -27,8 +30,16 @@ def _decompose_flags(value: F) -> typing.Sequence[F]: return [bit for bit in type(value) if bit in value] +def _is_printable(char: str) -> bool: + """Determine whether a character is printable for our purposes. + + We mainly use Python's definition of printable (i. e. everything that Unicode does not consider a separator or "other" character). However, we also treat U+F8FF as printable, which is the private use codepoint used for the Apple logo character. + """ + + return char.isprintable() or char == "\uf8ff" + def _bytes_unescape(string: str) -> bytes: - """Convert a string containing ASCII characters and hex escapes to a bytestring. + """Convert a string containing text (in _TEXT_ENCODING) and hex escapes to a bytestring. (We implement our own unescaping mechanism here to not depend on any of Python's string/bytes escape syntax.) """ @@ -49,23 +60,22 @@ def _bytes_unescape(string: str) -> bytes: except StopIteration: raise ValueError("End of string in escape sequence") else: - out.append(ord(char)) + out.extend(char.encode(_TEXT_ENCODING)) return bytes(out) def _bytes_escape(bs: bytes, *, quote: str=None) -> str: - """Convert a bytestring to a string, with non-ASCII bytes hex-escaped. + """Convert a bytestring to a string (using _TEXT_ENCODING), with non-printable characters hex-escaped. (We implement our own escaping mechanism here to not depend on Python's str or bytes repr.) """ out = [] - for byte in bs: - c = chr(byte) - if c in {quote, "\\"}: - out.append(f"\\{c}") - elif 0x20 <= byte < 0x7f: - out.append(c) + for byte, char in zip(bs, bs.decode(_TEXT_ENCODING)): + if char in {quote, "\\"}: + out.append(f"\\{char}") + elif _is_printable(char): + out.append(char) else: out.append(f"\\x{byte:02x}") @@ -148,7 +158,7 @@ def _hexdump(data: bytes): for i in range(0, len(data), 16): line = data[i:i + 16] line_hex = " ".join(f"{byte:02x}" for byte in line) - line_char = line.decode("MacRoman").translate(_TRANSLATE_NONPRINTABLES) + line_char = line.decode(_TEXT_ENCODING).translate(_TRANSLATE_NONPRINTABLES) print(f"{i:08x} {line_hex:<{16*2+15}} |{line_char}|") if data: @@ -335,7 +345,7 @@ def main(): groups.append(f"{data[i+j]:02X}{data[i+j+1]:02X}") s = f'$"{" ".join(groups)}"' - comment = "/* " + data[i:i + 16].decode("MacRoman").translate(_TRANSLATE_NONPRINTABLES) + " */" + comment = "/* " + data[i:i + 16].decode(_TEXT_ENCODING).translate(_TRANSLATE_NONPRINTABLES) + " */" print(f"\t{s:<54s}{comment}") print("};")