Optimize make_data_tables and use numpy.save instead of pickling.

The file sizes are a bit larger but it unblocks updating to python 3.8.
2024-06-26 00:29:29 +00:00 · 2023-01-18 00:01:34 +00:00 · 2023-01-18 00:01:34 +00:00 · 1d5bcfd74e
commit 1d5bcfd74e
parent 89633aa845
3 changed files with 39 additions and 43 deletions
--- a/README.md
+++ b/README.md
@@ -50,8 +50,7 @@ TODO: link video once it is available.
 ## Installation
-This currently requires python3.7 because some dependencies (e.g. weighted-levenshtein) don't compile with 3.9+, and 3.8
+This currently requires python3.7 because some dependencies (e.g. weighted-levenshtein) don't compile with 3.9+.
-has a [bug](https://bugs.python.org/issue44439) in object pickling.  
 ```
 python3.7 -m venv venv
@@ -59,23 +58,21 @@ source venv/bin/activate
 pip install -r requirements.txt
 ```
-To generate the data files required by the transcoder:
+Before you can run the transcoder you need to generate the data files it requires:
 ```
 % python transcoder/make_data_tables.py
 ```
-This takes about 3 hours on my machine.
+This is a one-time setup.  It takes about 90 minutes on my machine.
 TODO: download instructions
 ## Release Notes
 ### v0.3 (17 Jan 2023)
 - Fixed an image quality bug in the transcoder
-- Quality of life improvements to installation process
+- Documentation/quality of life improvements to installation process
-- Stop using LFS to store the generated data files in git, they're using up my quota
+- Stop using LFS to store the generated data files in git, they're using up all my quota
 ### v0.2 (19 July 2019)
--- a/transcoder/make_data_tables.py
+++ b/transcoder/make_data_tables.py
@@ -113,7 +113,7 @@ def compute_edit_distance(
        edp: EditDistanceParams,
        bitmap_cls: Type[screen.Bitmap],
        nominal_colours: Type[colours.NominalColours]
-):
+) -> np.ndarray:
    """Computes edit distance matrix between all pairs of pixel strings.
    Enumerates all possible values of the masked bit representation from
@@ -131,44 +131,45 @@ def compute_edit_distance(
    bitrange = np.uint64(2 ** bits)
-    edit = []
+    edit = np.zeros(
-    for _ in range(len(bitmap_cls.BYTE_MASKS)):
+        shape=(len(bitmap_cls.BYTE_MASKS), np.uint64(bitrange * bitrange)),
-        edit.append(
+        dtype=np.uint16)
            np.zeros(shape=np.uint64(bitrange * bitrange), dtype=np.uint16))
-    # Matrix is symmetrical with zero diagonal so only need to compute upper
+    bar = ProgressBar(
-    # triangle
+        bitrange * (bitrange - 1) / 2 * len(bitmap_cls.PHASES), max_width=80)
    bar = ProgressBar((bitrange * (bitrange - 1)) / 2, max_width=80)
    num_dots = bitmap_cls.MASKED_DOTS
    cnt = 0
    for i in range(np.uint64(bitrange)):
-        for j in range(i):
+        pair_base = np.uint64(i) << bits
-            cnt += 1
+        for o, ph in enumerate(bitmap_cls.PHASES):
            # Compute this in the outer loop since it's invariant under j
            first_dots = bitmap_cls.to_dots(i, byte_offset=o)
            first_pixels = pixel_string(
                colours.dots_to_nominal_colour_pixel_values(
                    num_dots, first_dots, nominal_colours,
                    init_phase=ph)
            )
-            if cnt % 10000 == 0:
+            # Matrix is symmetrical with zero diagonal so only need to compute
-                bar.numerator = cnt
+            # upper triangle
-                print(bar, end='\r')
+            for j in range(i):
-                sys.stdout.flush()
+                cnt += 1
                if cnt % 100000 == 0:
                    bar.numerator = cnt
                    print(bar, end='\r')
                    sys.stdout.flush()
-            pair = (np.uint64(i) << bits) + np.uint64(j)
+                pair = pair_base + np.uint64(j)
            for o, ph in enumerate(bitmap_cls.PHASES):
                first_dots = bitmap_cls.to_dots(i, byte_offset=o)
                second_dots = bitmap_cls.to_dots(j, byte_offset=o)
                first_pixels = pixel_string(
                    colours.dots_to_nominal_colour_pixel_values(
                        num_dots, first_dots, nominal_colours,
                        init_phase=ph)
                )
                second_pixels = pixel_string(
                    colours.dots_to_nominal_colour_pixel_values(
                        num_dots, second_dots, nominal_colours,
                        init_phase=ph)
                )
-                edit[o][pair] = edit_distance(
+                edit[o, pair] = edit_distance(
                    edp, first_pixels, second_pixels, error=False)
    return edit
@@ -183,10 +184,9 @@ def make_edit_distance(
    """Write file containing (D)HGR edit distance matrix for a palette."""
    dist = compute_edit_distance(edp, bitmap_cls, nominal_colours)
-    data = "transcoder/data/%s_palette_%d_edit_distance.pickle.bz2" % (
+    data = "transcoder/data/%s_palette_%d_edit_distance.npz" % (
        bitmap_cls.NAME, pal.ID.value)
-    with bz2.open(data, "wb", compresslevel=9) as out:
+    np.savez_compressed(data, edit_distance=dist)
        pickle.dump(dist, out, protocol=pickle.HIGHEST_PROTOCOL)
 def main():
--- a/transcoder/screen.py
+++ b/transcoder/screen.py
@@ -342,15 +342,13 @@ class Bitmap:
    @classmethod
    @functools.lru_cache(None)
-    def edit_distances(cls, palette_id: pal.Palette) -> List[np.ndarray]:
+    def edit_distances(cls, palette_id: pal.Palette) -> np.ndarray:
        """Load edit distance matrices for masked, shifted byte values."""
-        data = "transcoder/data/%s_palette_%d_edit_distance.pickle.bz2" % (
+        data = "transcoder/data/%s_palette_%d_edit_distance.npz" % (
-            cls.NAME,
+            cls.NAME, palette_id.value
            palette_id.value
        )
-        with bz2.open(data, "rb") as ed:
+        dist = np.load(data)['edit_distance']
            dist = pickle.load(ed)  # type: List[np.ndarray]
        # dist is an upper-triangular matrix of edit_distance(a, b)
        # encoded as dist[(a << N) + b] = edit_distance(a, b)
@@ -363,8 +361,8 @@ class Bitmap:
                (identity & np.uint64(2 ** cls.MASKED_BITS - 1)) <<
                cls.MASKED_BITS)
-        for i in range(len(dist)):
+        for i in range(dist.shape[0]):
-            dist[i][transpose] += dist[i][identity]
+            dist[i, transpose] += dist[i, identity]
        return dist
@@ -741,6 +739,7 @@ class HGRBitmap(Bitmap):
        return double
    @classmethod
    @functools.lru_cache(None)
    def to_dots(cls, masked_val: int, byte_offset: int) -> int:
        """Convert masked representation to bit sequence of display dots.