From 126795239c85d1ef5ac25220ffcce26e541c4c77 Mon Sep 17 00:00:00 2001
From: dgelessus
Date: Thu, 23 Jul 2020 02:42:32 +0200
Subject: [PATCH] Reimplement Resource.data_raw using a custom stream type
 (SubStream)

This way all reads performed on a resource data stream are forwarded to
the underlying resource file stream, with the read offsets and lengths
adjusted appropriately.
---
 README.rst            |  7 -----
 rsrcfork/_io_utils.py | 66 +++++++++++++++++++++++++++++++++++++++++++
 rsrcfork/api.py       | 15 ++++++----
 3 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/README.rst b/README.rst
index 21acd73..cf6edff 100644
--- a/README.rst
+++ b/README.rst
@@ -119,13 +119,6 @@ Version 1.8.1 (next version)
 
 * Added ``open`` and ``open_raw`` methods to ``Resource`` objects,
   for stream-based access to resource data.
-  * These methods are currently implemented using simple ``io.BytesIO`` wrappers around the resource data,
-    so there is currently no performance difference between ``open``/``open_raw`` and ``data``/``data_raw``.
-    In the future,
-    the stream-based API implementations will be optimized
-    to allow efficient access to parts of the resource data
-    without having to read the entire data in advance.
-
 Version 1.8.0
 ^^^^^^^^^^^^^
 
diff --git a/rsrcfork/_io_utils.py b/rsrcfork/_io_utils.py
index d44c43d..f267936 100644
--- a/rsrcfork/_io_utils.py
+++ b/rsrcfork/_io_utils.py
@@ -19,6 +19,72 @@ def read_exact(stream: typing.BinaryIO, byte_count: int) -> bytes:
 	return data
 
 
+class SubStream(io.BufferedIOBase, typing.BinaryIO):
+	"""A read-only stream that provides a view over a range of data from another stream."""
+	
+	_outer_stream: typing.BinaryIO
+	_start_offset: int
+	_length: int
+	_seek_position: int
+	
+	def __init__(self, stream: typing.BinaryIO, start_offset: int, length: int) -> None:
+		"""Create a new stream that exposes the specified range of data from ``stream``.
+		
+		:param stream: The underlying binary stream from which to read the data.
+			The stream must be readable and seekable and contain at least ``start_offset + length`` bytes of data.
+		:param start_offset: The absolute offset in the parent stream at which the data to expose starts.
+			This offset will correspond to offset 0 in the new :class:`SubStream`.
+		:param length: The length of the data to expose.
+			This is the highest valid offset in the new :class:`SubStream`.
+		"""
+		
+		super().__init__()
+		
+		self._outer_stream = stream
+		self._start_offset = start_offset
+		self._length = length
+		self._seek_position = 0
+		
+		outer_stream_length = self._outer_stream.seek(0, io.SEEK_END)
+		if self._start_offset + self._length > outer_stream_length:
+			raise ValueError(f"start_offset ({self._start_offset}) or length ({self._length}) too high: outer stream must be at least {self._start_offset + self._length} bytes long, but is only {outer_stream_length} bytes")
+	
+	def seekable(self) -> bool:
+		return True
+	
+	def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
+		if whence == io.SEEK_SET:
+			if offset < 0:
+				raise ValueError(f"Negative seek offset not allowed with SEEK_SET: {offset}")
+			
+			self._seek_position = offset
+		elif whence == io.SEEK_CUR:
+			self._seek_position += offset
+		elif whence == io.SEEK_END:
+			self._seek_position = self._length + offset  # offset is relative to the end of the view (usually <= 0), so it is added
+		else:
+			raise ValueError(f"Invalid whence value: {whence}")
+		
+		self._seek_position = max(0, min(self._length, self._seek_position))
+		
+		return self._seek_position
+	
+	def tell(self) -> int:
+		return self._seek_position
+	
+	def readable(self) -> bool:
+		return True
+	
+	def read(self, size: typing.Optional[int] = -1) -> bytes:
+		if size is None or size < 0 or size > self._length - self._seek_position:
+			size = self._length - self._seek_position
+		
+		self._outer_stream.seek(self._start_offset + self._seek_position)
+		res = self._outer_stream.read(size)
+		self._seek_position += len(res)
+		return res
+
+
 if typing.TYPE_CHECKING:
 	class PeekableIO(typing.Protocol):
 		"""Minimal protocol for binary IO streams that support the peek method.
diff --git a/rsrcfork/api.py b/rsrcfork/api.py
index 4bf9fa8..17592c0 100644
--- a/rsrcfork/api.py
+++ b/rsrcfork/api.py
@@ -109,6 +109,7 @@ class Resource(object):
 	_name: typing.Optional[bytes]
 	attributes: ResourceAttrs
 	data_raw_offset: int
+	_length_raw: int
 	_data_raw: bytes
 	_compressed_info: compress.common.CompressedHeaderInfo
 	_data_decompressed: bytes
@@ -178,9 +179,8 @@ class Resource(object):
 		try:
 			return self._data_raw
 		except AttributeError:
-			self._resfile._stream.seek(self._resfile.data_offset + self.data_raw_offset)
-			(data_raw_length,) = self._resfile._stream_unpack(STRUCT_RESOURCE_DATA_HEADER)
-			self._data_raw = self._resfile._read_exact(data_raw_length)
+			with self.open_raw() as f:
+				self._data_raw = f.read()
 			return self._data_raw
 	
 	def open_raw(self) -> typing.BinaryIO:
@@ -196,7 +196,7 @@ class Resource(object):
 		because the stream API does not require the entire resource data to be read in advance.
 		"""
 		
-		return io.BytesIO(self.data_raw)
+		return _io_utils.SubStream(self._resfile._stream, self._resfile.data_offset + self.data_raw_offset + STRUCT_RESOURCE_DATA_HEADER.size, self.length_raw)
 	
 	@property
 	def compressed_info(self) -> typing.Optional[compress.common.CompressedHeaderInfo]:
@@ -222,7 +222,12 @@ class Resource(object):
 		Accessing this attribute may be faster than computing len(self.data_raw) manually.
 		"""
 		
-		return len(self.data_raw)
+		try:
+			return self._length_raw
+		except AttributeError:
+			self._resfile._stream.seek(self._resfile.data_offset + self.data_raw_offset)
+			(self._length_raw,) = self._resfile._stream_unpack(STRUCT_RESOURCE_DATA_HEADER)
+			return self._length_raw
 	
 	@property
 	def length(self) -> int: