From 5c3bc5d7e501d9d78a42090153e65fee67c7b2ea Mon Sep 17 00:00:00 2001
From: dgelessus
Date: Sun, 1 Nov 2020 19:28:25 +0100
Subject: [PATCH] Remove custom stream types and read all resource data upfront again

The custom stream types were almost always slower than just reading the
entire data into memory, and there's no reason not to do that -
resources are small enough that memory usage and disk IO speed aren't a
concern (at least not for any machine that's modern enough to run
Python 3...).

Perhaps the only performance advantage was when reading a small amount
of data from the start of a compressed resource. In that case the custom
stream could incrementally decompress only the part of the data that's
actually needed, which was a bit faster than decompressing the entire
resource and then throwing away most of the data. But this situation is
rare enough that it's not worth handling in the rsrcfork library. If
this is a real performance issue for someone, they can manually call the
incremental decompression functions from rsrcfork.compress where needed.
--- rsrcfork/_io_utils.py | 74 ------------------------------ rsrcfork/api.py | 18 +++----- rsrcfork/compress/__init__.py | 85 ----------------------------------- 3 files changed, 7 insertions(+), 170 deletions(-) diff --git a/rsrcfork/_io_utils.py b/rsrcfork/_io_utils.py index 482c416..d44c43d 100644 --- a/rsrcfork/_io_utils.py +++ b/rsrcfork/_io_utils.py @@ -19,80 +19,6 @@ def read_exact(stream: typing.BinaryIO, byte_count: int) -> bytes: return data -class _SubStream(io.RawIOBase): - """A read-only stream that provides a view over a range of data from another stream.""" - - _outer_stream: typing.BinaryIO - _start_offset: int - _length: int - _seek_position: int - - def __init__(self, stream: typing.BinaryIO, start_offset: int, length: int) -> None: - super().__init__() - - self._outer_stream = stream - self._start_offset = start_offset - self._length = length - self._seek_position = 0 - - outer_stream_length = self._outer_stream.seek(0, io.SEEK_END) - if self._start_offset + self._length > outer_stream_length: - raise ValueError(f"start_offset ({self._start_offset}) or length ({self._length}) too high: outer stream must be at least {self._start_offset + self._length} bytes long, but is only {outer_stream_length} bytes") - - def seekable(self) -> bool: - return True - - def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: - if whence == io.SEEK_SET: - if offset < 0: - raise ValueError(f"Negative seek offset not allowed with SEEK_SET: {offset}") - - self._seek_position = offset - elif whence == io.SEEK_CUR: - self._seek_position += offset - elif whence == io.SEEK_END: - self._seek_position = self._length - offset - else: - raise ValueError(f"Invalid whence value: {whence}") - - self._seek_position = max(0, min(self._length, self._seek_position)) - - return self._seek_position - - def tell(self) -> int: - return self._seek_position - - def readable(self) -> bool: - return True - - def readall(self) -> bytes: - self._outer_stream.seek(self._start_offset + 
self._seek_position) - return self._outer_stream.read(self._length - self._seek_position) - - def readinto(self, buffer: bytearray) -> typing.Optional[int]: - size = min(len(buffer), self._length - self._seek_position) - self._outer_stream.seek(self._start_offset + self._seek_position) - data = self._outer_stream.read(size) - self._seek_position += len(data) - buffer[:len(data)] = data - return len(data) - - -def make_substream(stream: typing.BinaryIO, start_offset: int, length: int) -> typing.BinaryIO: - """Create a read-only stream that exposes the specified range of data from ``stream``. - - :param stream: The underlying binary stream from which to read the data. - The stream must be readable and seekable and contain at least ``start_offset + length`` bytes of data. - :param start_offset: The absolute offset in the parent stream at which the data to expose starts. - This offset will correspond to offset 0 in the returned stream. - :param length: The length of the data to expose. - This is the highest valid offset in the returned stream. - """ - - # For some reason, mypy thinks that io.BufferedReader is not a typing.BinaryIO. - return typing.cast(typing.BinaryIO, io.BufferedReader(_SubStream(stream, start_offset, length))) - - if typing.TYPE_CHECKING: class PeekableIO(typing.Protocol): """Minimal protocol for binary IO streams that support the peek method. 
diff --git a/rsrcfork/api.py b/rsrcfork/api.py index f30034b..de0b67c 100644 --- a/rsrcfork/api.py +++ b/rsrcfork/api.py @@ -179,8 +179,8 @@ class Resource(object): try: return self._data_raw except AttributeError: - with self.open_raw() as f: - self._data_raw = f.read() + self._resfile._stream.seek(self._resfile.data_offset + self.data_raw_offset + STRUCT_RESOURCE_DATA_HEADER.size) + self._data_raw = _io_utils.read_exact(self._resfile._stream, self.length_raw) return self._data_raw def open_raw(self) -> typing.BinaryIO: @@ -196,7 +196,7 @@ class Resource(object): because the stream API does not require the entire resource data to be read in advance. """ - return _io_utils.make_substream(self._resfile._stream, self._resfile.data_offset + self.data_raw_offset + STRUCT_RESOURCE_DATA_HEADER.size, self.length_raw) + return io.BytesIO(self.data_raw) @property def compressed_info(self) -> typing.Optional[compress.common.CompressedHeaderInfo]: @@ -252,8 +252,9 @@ class Resource(object): try: return self._data_decompressed except AttributeError: - with self.open() as f: - self._data_decompressed = f.read() + with self.open_raw() as compressed_f: + compressed_f.seek(self.compressed_info.header_length) + self._data_decompressed = b"".join(compress.decompress_stream_parsed(self.compressed_info, compressed_f)) return self._data_decompressed else: return self.data_raw @@ -271,12 +272,7 @@ class Resource(object): because the stream API does not require the entire resource data to be read (and possibly decompressed) in advance. 
""" - if self.compressed_info is None: - return self.open_raw() - else: - f = self.open_raw() - f.seek(self.compressed_info.header_length) - return compress.DecompressingStream(f, self.compressed_info, close_stream=True) + return io.BytesIO(self.data) class _LazyResourceMap(typing.Mapping[int, Resource]): diff --git a/rsrcfork/compress/__init__.py b/rsrcfork/compress/__init__.py index 80f340f..b6a23eb 100644 --- a/rsrcfork/compress/__init__.py +++ b/rsrcfork/compress/__init__.py @@ -66,88 +66,3 @@ def decompress(data: bytes, *, debug: bool = False) -> bytes: """Decompress the given compressed resource data.""" return b"".join(decompress_stream(io.BytesIO(data), debug=debug)) - - -class DecompressingStream(io.BufferedIOBase, typing.BinaryIO): - _compressed_stream: typing.BinaryIO - _close_stream: bool - _header_info: CompressedHeaderInfo - _decompress_iter: typing.Iterator[bytes] - _decompressed_stream: typing.BinaryIO - _seek_position: int - - def __init__(self, compressed_stream: typing.BinaryIO, header_info: typing.Optional[CompressedHeaderInfo], *, close_stream: bool = False) -> None: - super().__init__() - - self._compressed_stream = compressed_stream - self._close_stream = close_stream - - if header_info is not None: - self._header_info = header_info - else: - self._header_info = CompressedHeaderInfo.parse_stream(self._compressed_stream) - - self._decompress_iter = decompress_stream_parsed(self._header_info, self._compressed_stream) - self._decompressed_stream = io.BytesIO() - self._seek_position = 0 - - # This override does nothing, - # but is needed to make mypy happy, - # otherwise it complains (apparently incorrectly) about the __enter__ definitions from IOBase and BinaryIO being incompatible with each other. 
- def __enter__(self: "DecompressingStream") -> "DecompressingStream": - return super().__enter__() - - def close(self) -> None: - super().close() - if self._close_stream: - self._compressed_stream.close() - del self._decompress_iter - self._decompressed_stream.close() - - def seekable(self) -> bool: - return True - - def tell(self) -> int: - return self._seek_position - - def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: - if whence == io.SEEK_SET: - if offset < 0: - raise ValueError(f"Negative seek offset not allowed with SEEK_SET: {offset}") - - self._seek_position = offset - elif whence == io.SEEK_CUR: - self._seek_position += offset - elif whence == io.SEEK_END: - self._seek_position = self._header_info.decompressed_length - offset - else: - raise ValueError(f"Invalid whence value: {whence}") - - self._seek_position = max(0, min(self._header_info.decompressed_length, self._seek_position)) - - return self._seek_position - - def readable(self) -> bool: - return True - - def read(self, size: typing.Optional[int] = -1) -> bytes: - if size is None: - size = -1 - - self._decompressed_stream.seek(0, io.SEEK_END) - - if size < 0: - for chunk in self._decompress_iter: - self._decompressed_stream.write(chunk) - else: - if self._decompressed_stream.tell() - self._seek_position < size: - for chunk in self._decompress_iter: - self._decompressed_stream.write(chunk) - - if self._decompressed_stream.tell() - self._seek_position >= size: - break - - self._decompressed_stream.seek(self._seek_position) - ret = self._decompressed_stream.read(size) - self._seek_position += len(ret) - return ret