Optimize make_data_tables and use numpy.savez_compressed instead of pickling. The

file sizes are a bit larger but it unblocks updating to python 3.8.
This commit is contained in:
kris 2023-01-18 00:01:34 +00:00
parent 89633aa845
commit 1d5bcfd74e
3 changed files with 39 additions and 43 deletions

View File

@ -50,8 +50,7 @@ TODO: link video once it is available.
## Installation
This currently requires python3.7 because some dependencies (e.g. weighted-levenshtein) don't compile with 3.9+, and 3.8
has a [bug](https://bugs.python.org/issue44439) in object pickling.
This currently requires python3.7 because some dependencies (e.g. weighted-levenshtein) don't compile with 3.9+.
```
python3.7 -m venv venv
@ -59,23 +58,21 @@ source venv/bin/activate
pip install -r requirements.txt
```
To generate the data files required by the transcoder:
Before you can run the transcoder, you need to generate the data files it requires:
```
% python transcoder/make_data_tables.py
```
This takes about 3 hours on my machine.
TODO: download instructions
This is a one-time setup. It takes about 90 minutes on my machine.
## Release Notes
### v0.3 (17 Jan 2023)
- Fixed an image quality bug in the transcoder
- Quality of life improvements to installation process
- Stop using LFS to store the generated data files in git, they're using up my quota
- Documentation/quality of life improvements to installation process
- Stop using LFS to store the generated data files in git; they're using up all my quota
### v0.2 (19 July 2019)

View File

@ -113,7 +113,7 @@ def compute_edit_distance(
edp: EditDistanceParams,
bitmap_cls: Type[screen.Bitmap],
nominal_colours: Type[colours.NominalColours]
):
) -> np.ndarray:
"""Computes edit distance matrix between all pairs of pixel strings.
Enumerates all possible values of the masked bit representation from
@ -131,44 +131,45 @@ def compute_edit_distance(
bitrange = np.uint64(2 ** bits)
edit = []
for _ in range(len(bitmap_cls.BYTE_MASKS)):
edit.append(
np.zeros(shape=np.uint64(bitrange * bitrange), dtype=np.uint16))
edit = np.zeros(
shape=(len(bitmap_cls.BYTE_MASKS), np.uint64(bitrange * bitrange)),
dtype=np.uint16)
# Matrix is symmetrical with zero diagonal so only need to compute upper
# triangle
bar = ProgressBar((bitrange * (bitrange - 1)) / 2, max_width=80)
bar = ProgressBar(
bitrange * (bitrange - 1) / 2 * len(bitmap_cls.PHASES), max_width=80)
num_dots = bitmap_cls.MASKED_DOTS
cnt = 0
for i in range(np.uint64(bitrange)):
for j in range(i):
cnt += 1
pair_base = np.uint64(i) << bits
for o, ph in enumerate(bitmap_cls.PHASES):
# Compute this in the outer loop since it's invariant under j
first_dots = bitmap_cls.to_dots(i, byte_offset=o)
first_pixels = pixel_string(
colours.dots_to_nominal_colour_pixel_values(
num_dots, first_dots, nominal_colours,
init_phase=ph)
)
if cnt % 10000 == 0:
bar.numerator = cnt
print(bar, end='\r')
sys.stdout.flush()
# Matrix is symmetrical with zero diagonal so only need to compute
# upper triangle
for j in range(i):
cnt += 1
if cnt % 100000 == 0:
bar.numerator = cnt
print(bar, end='\r')
sys.stdout.flush()
pair = (np.uint64(i) << bits) + np.uint64(j)
pair = pair_base + np.uint64(j)
for o, ph in enumerate(bitmap_cls.PHASES):
first_dots = bitmap_cls.to_dots(i, byte_offset=o)
second_dots = bitmap_cls.to_dots(j, byte_offset=o)
first_pixels = pixel_string(
colours.dots_to_nominal_colour_pixel_values(
num_dots, first_dots, nominal_colours,
init_phase=ph)
)
second_pixels = pixel_string(
colours.dots_to_nominal_colour_pixel_values(
num_dots, second_dots, nominal_colours,
init_phase=ph)
)
edit[o][pair] = edit_distance(
edit[o, pair] = edit_distance(
edp, first_pixels, second_pixels, error=False)
return edit
@ -183,10 +184,9 @@ def make_edit_distance(
"""Write file containing (D)HGR edit distance matrix for a palette."""
dist = compute_edit_distance(edp, bitmap_cls, nominal_colours)
data = "transcoder/data/%s_palette_%d_edit_distance.pickle.bz2" % (
data = "transcoder/data/%s_palette_%d_edit_distance.npz" % (
bitmap_cls.NAME, pal.ID.value)
with bz2.open(data, "wb", compresslevel=9) as out:
pickle.dump(dist, out, protocol=pickle.HIGHEST_PROTOCOL)
np.savez_compressed(data, edit_distance=dist)
def main():

View File

@ -342,15 +342,13 @@ class Bitmap:
@classmethod
@functools.lru_cache(None)
def edit_distances(cls, palette_id: pal.Palette) -> List[np.ndarray]:
def edit_distances(cls, palette_id: pal.Palette) -> np.ndarray:
"""Load edit distance matrices for masked, shifted byte values."""
data = "transcoder/data/%s_palette_%d_edit_distance.pickle.bz2" % (
cls.NAME,
palette_id.value
data = "transcoder/data/%s_palette_%d_edit_distance.npz" % (
cls.NAME, palette_id.value
)
with bz2.open(data, "rb") as ed:
dist = pickle.load(ed) # type: List[np.ndarray]
dist = np.load(data)['edit_distance']
# dist is an upper-triangular matrix of edit_distance(a, b)
# encoded as dist[(a << N) + b] = edit_distance(a, b)
@ -363,8 +361,8 @@ class Bitmap:
(identity & np.uint64(2 ** cls.MASKED_BITS - 1)) <<
cls.MASKED_BITS)
for i in range(len(dist)):
dist[i][transpose] += dist[i][identity]
for i in range(dist.shape[0]):
dist[i, transpose] += dist[i, identity]
return dist
@ -741,6 +739,7 @@ class HGRBitmap(Bitmap):
return double
@classmethod
@functools.lru_cache(None)
def to_dots(cls, masked_val: int, byte_offset: int) -> int:
"""Convert masked representation to bit sequence of display dots.