Remove custom stream types and read all resource data upfront again

The custom stream types were almost always slower than just reading the
entire data into memory, and there's no reason not to do that -
resources are small enough that memory usage and disk IO speed aren't a
concern (at least not for any machine that's modern enough to run
Python 3...).

Perhaps the only performance advantage was when reading a small amount
of data from the start of a compressed resource. In that case the
custom stream could incrementally decompress only the part of the data
that's actually needed, which was a bit faster than decompressing the
entire resource and then throwing away most of the data. But this
situation is rare enough that it's not worth handling in the rsrcfork
library. If this is a real performance issue for someone, they can
manually call the incremental decompression functions from
rsrcfork.compress where needed.
This commit is contained in:
dgelessus 2020-11-01 19:28:25 +01:00
parent d74dbc41ba
commit 5c3bc5d7e5
3 changed files with 7 additions and 170 deletions

View File

@ -19,80 +19,6 @@ def read_exact(stream: typing.BinaryIO, byte_count: int) -> bytes:
return data
class _SubStream(io.RawIOBase):
	"""A read-only stream that provides a view over a range of data from another stream.
	
	The view covers ``length`` bytes of the outer stream starting at
	``start_offset``; offset 0 in this stream corresponds to
	``start_offset`` in the outer stream. The outer stream must be
	readable and seekable. Reads re-seek the outer stream on every call,
	so interleaved use of the outer stream by other code is tolerated.
	"""
	
	# Underlying stream from which the viewed data is read.
	_outer_stream: typing.BinaryIO
	# Absolute offset in the outer stream where the view begins.
	_start_offset: int
	# Number of bytes exposed by the view.
	_length: int
	# Current position within the view (0 .. _length).
	_seek_position: int
	
	def __init__(self, stream: typing.BinaryIO, start_offset: int, length: int) -> None:
		"""Create a view over ``length`` bytes of ``stream`` starting at ``start_offset``.
		
		:raises ValueError: If the outer stream is too short to contain the requested range.
		"""
		
		super().__init__()
		
		self._outer_stream = stream
		self._start_offset = start_offset
		self._length = length
		self._seek_position = 0
		
		# Validate the range up front so that out-of-bounds views fail
		# immediately rather than on a later read.
		outer_stream_length = self._outer_stream.seek(0, io.SEEK_END)
		if self._start_offset + self._length > outer_stream_length:
			raise ValueError(f"start_offset ({self._start_offset}) or length ({self._length}) too high: outer stream must be at least {self._start_offset + self._length} bytes long, but is only {outer_stream_length} bytes")
	
	def seekable(self) -> bool:
		return True
	
	def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
		if whence == io.SEEK_SET:
			if offset < 0:
				raise ValueError(f"Negative seek offset not allowed with SEEK_SET: {offset}")
			
			self._seek_position = offset
		elif whence == io.SEEK_CUR:
			self._seek_position += offset
		elif whence == io.SEEK_END:
			# Standard io semantics: the offset is *added* to the end of
			# the stream (and is usually zero or negative). The previous
			# code subtracted it, which sent negative offsets past EOF.
			self._seek_position = self._length + offset
		else:
			raise ValueError(f"Invalid whence value: {whence}")
		
		# Clamp the position into the valid range [0, _length] instead of
		# raising, matching the forgiving behavior of io.BytesIO.
		self._seek_position = max(0, min(self._length, self._seek_position))
		
		return self._seek_position
	
	def tell(self) -> int:
		return self._seek_position
	
	def readable(self) -> bool:
		return True
	
	def readall(self) -> bytes:
		# Read everything from the current position up to the end of the view.
		self._outer_stream.seek(self._start_offset + self._seek_position)
		return self._outer_stream.read(self._length - self._seek_position)
	
	def readinto(self, buffer: bytearray) -> typing.Optional[int]:
		# Never read past the end of the view, even if the buffer is larger.
		size = min(len(buffer), self._length - self._seek_position)
		self._outer_stream.seek(self._start_offset + self._seek_position)
		data = self._outer_stream.read(size)
		self._seek_position += len(data)
		buffer[:len(data)] = data
		return len(data)
def make_substream(stream: typing.BinaryIO, start_offset: int, length: int) -> typing.BinaryIO:
	"""Create a read-only stream that exposes the specified range of data from ``stream``.
	
	:param stream: The underlying binary stream from which to read the data.
		The stream must be readable and seekable and contain at least ``start_offset + length`` bytes of data.
	:param start_offset: The absolute offset in the parent stream at which the data to expose starts.
		This offset will correspond to offset 0 in the returned stream.
	:param length: The length of the data to expose.
		This is the highest valid offset in the returned stream.
	"""
	
	raw_view = _SubStream(stream, start_offset, length)
	buffered_view = io.BufferedReader(raw_view)
	# mypy doesn't consider io.BufferedReader to be a typing.BinaryIO,
	# so an explicit cast is needed to keep the annotated return type.
	return typing.cast(typing.BinaryIO, buffered_view)
if typing.TYPE_CHECKING:
class PeekableIO(typing.Protocol):
"""Minimal protocol for binary IO streams that support the peek method.

View File

@ -179,8 +179,8 @@ class Resource(object):
try:
return self._data_raw
except AttributeError:
with self.open_raw() as f:
self._data_raw = f.read()
self._resfile._stream.seek(self._resfile.data_offset + self.data_raw_offset + STRUCT_RESOURCE_DATA_HEADER.size)
self._data_raw = _io_utils.read_exact(self._resfile._stream, self.length_raw)
return self._data_raw
def open_raw(self) -> typing.BinaryIO:
@ -196,7 +196,7 @@ class Resource(object):
because the stream API does not require the entire resource data to be read in advance.
"""
return _io_utils.make_substream(self._resfile._stream, self._resfile.data_offset + self.data_raw_offset + STRUCT_RESOURCE_DATA_HEADER.size, self.length_raw)
return io.BytesIO(self.data_raw)
@property
def compressed_info(self) -> typing.Optional[compress.common.CompressedHeaderInfo]:
@ -252,8 +252,9 @@ class Resource(object):
try:
return self._data_decompressed
except AttributeError:
with self.open() as f:
self._data_decompressed = f.read()
with self.open_raw() as compressed_f:
compressed_f.seek(self.compressed_info.header_length)
self._data_decompressed = b"".join(compress.decompress_stream_parsed(self.compressed_info, compressed_f))
return self._data_decompressed
else:
return self.data_raw
@ -271,12 +272,7 @@ class Resource(object):
because the stream API does not require the entire resource data to be read (and possibly decompressed) in advance.
"""
if self.compressed_info is None:
return self.open_raw()
else:
f = self.open_raw()
f.seek(self.compressed_info.header_length)
return compress.DecompressingStream(f, self.compressed_info, close_stream=True)
return io.BytesIO(self.data)
class _LazyResourceMap(typing.Mapping[int, Resource]):

View File

@ -66,88 +66,3 @@ def decompress(data: bytes, *, debug: bool = False) -> bytes:
"""Decompress the given compressed resource data."""
return b"".join(decompress_stream(io.BytesIO(data), debug=debug))
class DecompressingStream(io.BufferedIOBase, typing.BinaryIO):
	"""A read-only, seekable stream that incrementally decompresses resource data.
	
	Data from ``compressed_stream`` is decompressed lazily as it is read.
	All data decompressed so far is kept in an in-memory buffer so that
	seeking backwards never requires re-decompressing.
	"""
	
	# Stream containing the compressed resource data (positioned past the header).
	_compressed_stream: typing.BinaryIO
	# Whether to close _compressed_stream when this stream is closed.
	_close_stream: bool
	# Parsed compression header (provides decompressed_length).
	_header_info: CompressedHeaderInfo
	# Iterator yielding successive chunks of decompressed data.
	_decompress_iter: typing.Iterator[bytes]
	# Buffer holding all data decompressed so far.
	_decompressed_stream: typing.BinaryIO
	# Current logical position in the decompressed data.
	_seek_position: int
	
	def __init__(self, compressed_stream: typing.BinaryIO, header_info: typing.Optional[CompressedHeaderInfo], *, close_stream: bool = False) -> None:
		"""Wrap ``compressed_stream`` in a lazily decompressing stream.
		
		:param compressed_stream: Stream containing the compressed data.
		:param header_info: Pre-parsed header info, or ``None`` to parse the
			header from ``compressed_stream`` at the current position.
		:param close_stream: Whether to close ``compressed_stream`` when this
			stream is closed.
		"""
		
		super().__init__()
		
		self._compressed_stream = compressed_stream
		self._close_stream = close_stream
		
		if header_info is not None:
			self._header_info = header_info
		else:
			self._header_info = CompressedHeaderInfo.parse_stream(self._compressed_stream)
		
		self._decompress_iter = decompress_stream_parsed(self._header_info, self._compressed_stream)
		self._decompressed_stream = io.BytesIO()
		self._seek_position = 0
	
	# This override does nothing,
	# but is needed to make mypy happy,
	# otherwise it complains (apparently incorrectly) about the __enter__ definitions from IOBase and BinaryIO being incompatible with each other.
	def __enter__(self: "DecompressingStream") -> "DecompressingStream":
		return super().__enter__()
	
	def close(self) -> None:
		super().close()
		if self._close_stream:
			self._compressed_stream.close()
		# Drop the iterator to release its reference to the compressed stream.
		del self._decompress_iter
		self._decompressed_stream.close()
	
	def seekable(self) -> bool:
		return True
	
	def tell(self) -> int:
		return self._seek_position
	
	def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
		if whence == io.SEEK_SET:
			if offset < 0:
				raise ValueError(f"Negative seek offset not allowed with SEEK_SET: {offset}")
			
			self._seek_position = offset
		elif whence == io.SEEK_CUR:
			self._seek_position += offset
		elif whence == io.SEEK_END:
			# Standard io semantics: the offset is *added* to the end of
			# the stream (and is usually zero or negative). The previous
			# code subtracted it, which sent negative offsets past EOF.
			self._seek_position = self._header_info.decompressed_length + offset
		else:
			raise ValueError(f"Invalid whence value: {whence}")
		
		# Clamp into [0, decompressed_length] instead of raising,
		# matching the forgiving behavior of io.BytesIO.
		self._seek_position = max(0, min(self._header_info.decompressed_length, self._seek_position))
		
		# Seeking never decompresses anything eagerly - decompression only
		# happens once data is actually read.
		
		return self._seek_position
	
	def readable(self) -> bool:
		return True
	
	def read(self, size: typing.Optional[int] = -1) -> bytes:
		if size is None:
			size = -1
		
		# Append newly decompressed chunks at the end of the buffer.
		self._decompressed_stream.seek(0, io.SEEK_END)
		if size < 0:
			# Unbounded read: decompress everything that remains.
			for chunk in self._decompress_iter:
				self._decompressed_stream.write(chunk)
		else:
			# Bounded read: decompress only until enough data is buffered
			# past the current seek position to satisfy the request.
			if self._decompressed_stream.tell() - self._seek_position < size:
				for chunk in self._decompress_iter:
					self._decompressed_stream.write(chunk)
					if self._decompressed_stream.tell() - self._seek_position >= size:
						break
		
		self._decompressed_stream.seek(self._seek_position)
		ret = self._decompressed_stream.read(size)
		self._seek_position += len(ret)
		return ret