Move edit distance functions into separate module and clean up

partially. Slight optimization to not heappush() many times; instead build a regular list and then heapify.
2025-02-20 17:29:17 +00:00 · 2019-03-14 22:32:52 +00:00 · 2019-03-14 22:32:52 +00:00 · 01ffd034eb
commit 01ffd034eb
parent e0ac37fe4a
2 changed files with 233 additions and 401 deletions
--- a/edit_distance.py
+++ b/edit_distance.py
@ -0,0 +1,163 @@
+import functools
+
+import numpy as np
+import weighted_levenshtein
+
+
+@functools.lru_cache(None)
+def byte_to_colour_string(b: int, is_odd_offset: bool) -> str:
+    pixels = []
+
+    idx = 0
+    if is_odd_offset:
+        pixels.append("01"[b & 0x01])
+        idx += 1
+
+    # K = black
+    # G = green
+    # V = violet
+    # W = white
+    palettes = (
+        (
+            "K",  # 0x00
+            "V",  # 0x01
+            "G",  # 0x10
+            "W"  # 0x11
+        ), (
+            "K",  # 0x00
+            "B",  # 0x01
+            "O",  # 0x10
+            "W"  # 0x11
+        )
+    )
+    palette = palettes[(b & 0x80) != 0]
+
+    for _ in range(3):
+        pixel = palette[(b >> idx) & 0b11]
+        pixels.append(pixel)
+        idx += 2
+
+    if not is_odd_offset:
+        pixels.append("01"[b & 0x40 != 0])
+        idx += 1
+
+    return "".join(pixels)
+
+
+# TODO: what about increasing transposition cost?  Might be better to have
+# any pixel at the right place even if the wrong colour?
+
+substitute_costs = np.ones((128, 128), dtype=np.float64)
+error_substitute_costs = np.ones((128, 128), dtype=np.float64)
+
+# Penalty for turning on/off a black bit
+for c in "01GVWOB":
+    substitute_costs[(ord('K'), ord(c))] = 1
+    substitute_costs[(ord(c), ord('K'))] = 1
+    error_substitute_costs[(ord('K'), ord(c))] = 5
+    error_substitute_costs[(ord(c), ord('K'))] = 5
+
+# Penalty for changing colour
+for c in "01GVWOB":
+    for d in "01GVWOB":
+        substitute_costs[(ord(c), ord(d))] = 1
+        substitute_costs[(ord(d), ord(c))] = 1
+        error_substitute_costs[(ord(c), ord(d))] = 5
+        error_substitute_costs[(ord(d), ord(c))] = 5
+
+insert_costs = np.ones(128, dtype=np.float64) * 1000
+delete_costs = np.ones(128, dtype=np.float64) * 1000
+
+
+def _edit_weight(a: int, b: int, is_odd_offset: bool, error: bool):
+    a_pixels = byte_to_colour_string(a, is_odd_offset)
+    b_pixels = byte_to_colour_string(b, is_odd_offset)
+
+    dist = weighted_levenshtein.dam_lev(
+        a_pixels, b_pixels,
+        insert_costs=insert_costs,
+        delete_costs=delete_costs,
+        substitute_costs=error_substitute_costs if error else substitute_costs,
+    )
+    return np.int64(dist)
+
+
+def edit_weight_matrixes(error: bool) -> np.array:
+    ewm = np.zeros(shape=(256, 256, 2), dtype=np.int64)
+    for a in range(256):
+        for b in range(256):
+            for is_odd_offset in (False, True):
+                ewm[a, b, int(is_odd_offset)] = _edit_weight(
+                    a, b, is_odd_offset, error)
+
+    return ewm
+
+
+_ewm = edit_weight_matrixes(False)
+_error_ewm = edit_weight_matrixes(True)
+
+
+@functools.lru_cache(None)
+def edit_weight(a: int, b: int, is_odd_offset: bool, error: bool):
+    e = _error_ewm if error else _ewm
+    return e[a, b, int(is_odd_offset)]
+
+_even_ewm = {}
+_odd_ewm = {}
+_even_error_ewm = {}
+_odd_error_ewm = {}
+for a in range(256):
+    for b in range(256):
+        _even_ewm[(a << 8) + b] = edit_weight(a, b, False, False)
+        _odd_ewm[(a << 8) + b] = edit_weight(a, b, True, False)
+
+        _even_error_ewm[(a << 8) + b] = edit_weight(a, b, False, True)
+        _odd_error_ewm[(a << 8) + b] = edit_weight(a, b, True, True)
+
+
+@functools.lru_cache(None)
+def _content_a_array(content: int, shape) -> np.array:
+    return (np.ones(shape, dtype=np.uint16) * content) << 8
+
+
+def content_edit_weight(content: int, b: np.array) -> np.array:
+    assert b.shape == (32, 256), b.shape
+
+    # Extract even and odd column offsets (32, 128)
+    even_b = b[:, ::2]
+    odd_b = b[:, 1::2]
+
+    a = _content_a_array(content, even_b.shape)
+
+    even = a + even_b
+    odd = a + odd_b
+
+    even_weights = np.vectorize(_even_error_ewm.__getitem__)(even)
+    odd_weights = np.vectorize(_odd_error_ewm.__getitem__)(odd)
+
+    res = np.ndarray(shape=b.shape, dtype=np.int64)
+    res[:, ::2] = even_weights
+    res[:, 1::2] = odd_weights
+
+    return res
+
+
+def array_edit_weight(a: np.array, b: np.array) -> np.array:
+    # Extract even and odd column offsets (32, 128)
+    even_a = a[:, ::2]
+    odd_a = a[:, 1::2]
+
+    even_b = b[:, ::2]
+    odd_b = b[:, 1::2]
+
+    even = (even_a.astype(np.uint16) << 8) + even_b
+    odd = (odd_a.astype(np.uint16) << 8) + odd_b
+
+    even_weights = np.vectorize(_even_ewm.__getitem__)(even)
+    odd_weights = np.vectorize(_odd_ewm.__getitem__)(odd)
+
+    res = np.ndarray(shape=a.shape, dtype=np.int64)
+    res[:, ::2] = even_weights
+    res[:, 1::2] = odd_weights
+
+    return res
--- a/video.py
+++ b/video.py
@ -1,4 +1,3 @@
-import functools
 import heapq
 import random
 import os
@ -11,251 +10,18 @@ from typing import List, Iterator, Tuple
 from PIL import Image
 import numpy as np
 import skvideo.io
-import weighted_levenshtein

+import edit_distance
 import opcodes
 import screen


-def hamming_weight(n):
-    """Compute hamming weight of 8-bit int"""
-    n = (n & 0x55) + ((n & 0xAA) >> 1)
-    n = (n & 0x33) + ((n & 0xCC) >> 2)
-    n = (n & 0x0F) + ((n & 0xF0) >> 4)
-    return n
-
-# TODO: what about increasing transposition cost?  Might be better to have
-# any pixel at the right place even if the wrong colour?
-
-substitute_costs = np.ones((128, 128), dtype=np.float64)
-error_substitute_costs = np.ones((128, 128), dtype=np.float64)
-
-# Penalty for turning on/off a black bit
-for c in "01GVWOB":
-    substitute_costs[(ord('K'), ord(c))] = 1
-    substitute_costs[(ord(c), ord('K'))] = 1
-    error_substitute_costs[(ord('K'), ord(c))] = 5
-    error_substitute_costs[(ord(c), ord('K'))] = 5
-
-# Penalty for changing colour
-for c in "01GVWOB":
-    for d in "01GVWOB":
-        substitute_costs[(ord(c), ord(d))] = 1
-        substitute_costs[(ord(d), ord(c))] = 1
-        error_substitute_costs[(ord(c), ord(d))] = 5
-        error_substitute_costs[(ord(d), ord(c))] = 5
-
-insert_costs = np.ones(128, dtype=np.float64) * 1000
-delete_costs = np.ones(128, dtype=np.float64) * 1000
-
-
-def _edit_weight(a: int, b: int, is_odd_offset: bool, error: bool):
-    a_pixels = byte_to_colour_string(a, is_odd_offset)
-    b_pixels = byte_to_colour_string(b, is_odd_offset)
-
-    dist = weighted_levenshtein.dam_lev(
-        a_pixels, b_pixels,
-        insert_costs=insert_costs,
-        delete_costs=delete_costs,
-        substitute_costs=error_substitute_costs if error else substitute_costs,
-    )
-    return np.int64(dist)
-
-
-def edit_weight_matrixes(error: bool) -> np.array:
-    ewm = np.zeros(shape=(256, 256, 2), dtype=np.int64)
-    for a in range(256):
-        for b in range(256):
-            for is_odd_offset in (False, True):
-                ewm[a, b, int(is_odd_offset)] = _edit_weight(
-                    a, b, is_odd_offset, error)
-
-    return ewm
-
-
-_ewm = edit_weight_matrixes(False)
-_error_ewm = edit_weight_matrixes(True)
-
-
-@functools.lru_cache(None)
-def edit_weight(a: int, b: int, is_odd_offset: bool, error: bool):
-    e = _error_ewm if error else _ewm
-    return e[a, b, int(is_odd_offset)]
-
-
-#
-# @functools.lru_cache(None)
-# def edit_weight_old(a: int, b: int, is_odd_offset: bool):
-#     a_pixels = byte_to_colour_string(a, is_odd_offset)
-#     b_pixels = byte_to_colour_string(b, is_odd_offset)
-#
-#     dist = weighted_levenshtein.dam_lev(
-#         a_pixels, b_pixels,
-#         insert_costs=insert_costs,
-#         delete_costs=delete_costs,
-#         substitute_costs=substitute_costs,
-#     )
-#     assert dist == edit_weight_new(a, b, is_odd_offset), (dist, a, b,
-#                                                           is_odd_offset)
-#     return np.int64(dist)
-
-_even_ewm = {}
-_odd_ewm = {}
-_even_error_ewm = {}
-_odd_error_ewm = {}
-for a in range(256):
-    for b in range(256):
-        _even_ewm[(a << 8) + b] = edit_weight(a, b, False, False)
-        _odd_ewm[(a << 8) + b] = edit_weight(a, b, True, False)
-
-        _even_error_ewm[(a << 8) + b] = edit_weight(a, b, False, True)
-        _odd_error_ewm[(a << 8) + b] = edit_weight(a, b, True, True)
-
-
-#
-# for a in range(256):
-#     for b in range(256):
-#         assert edit_weight(a, b, True) == edit_weight(b, a, True)
-#         assert edit_weight(a, b, False) == edit_weight(b, a, False)
-
-
-# def array_edit_weight2(content: int, b: np.array) -> np.array:
-#     assert b.shape == (256,), b.shape
-#
-#     # Extract even and off column offsets (128,)
-#     even_b = b[::2]
-#     odd_b = b[1::2]
-#
-#     a = np.ones(even_b.shape, dtype=np.int64) * content
-#
-#     even = (a << 8) + even_b
-#     odd = (a << 8) + odd_b
-#
-#     even_weights = npi.remap(
-#         even, _ewm_keys, _even_ewm_values, missing="raise")
-#     odd_weights = npi.remap(
-#         odd, _ewm_keys, _odd_ewm_values, missing="raise")
-#
-#     res = np.ndarray(shape=(256,), dtype=np.int64)
-#     res[::2] = even_weights
-#     res[1::2] = odd_weights
-#
-#     return res
-
-
-@functools.lru_cache(None)
-def _content_a_array(content: int, shape) -> np.array:
-    return (np.ones(shape, dtype=np.uint16) * content) << 8
-
-
-def content_edit_weight(content: int, b: np.array) -> np.array:
-    assert b.shape == (32, 256), b.shape
-
-    # Extract even and off column offsets (128,)
-    even_b = b[:, ::2]
-    odd_b = b[:, 1::2]
-
-    a = _content_a_array(content, even_b.shape)
-
-    even = a + even_b
-    odd = a + odd_b
-
-    even_weights = np.vectorize(_even_error_ewm.__getitem__)(even)
-    odd_weights = np.vectorize(_odd_error_ewm.__getitem__)(odd)
-
-    res = np.ndarray(shape=b.shape, dtype=np.int64)
-    res[:, ::2] = even_weights
-    res[:, 1::2] = odd_weights
-
-    return res
-
-
-def array_edit_weight(a: np.array, b: np.array) -> np.array:
-    # assert a.shape == b.shape == (32, 256), (a.shape, b.shape)
-
-    # Extract even and off column offsets (32, 128)
-    even_a = a[:, ::2]
-    odd_a = a[:, 1::2]
-
-    even_b = b[:, ::2]
-    odd_b = b[:, 1::2]
-
-    even = (even_a.astype(np.uint16) << 8) + even_b
-    odd = (odd_a.astype(np.uint16) << 8) + odd_b
-    #
-    # print("XXX")
-    # print(a)
-    # print(b)
-    # print(even_a)
-    # print(even_b)
-    # print(even)
-
-    even_weights = np.vectorize(_even_ewm.__getitem__)(even)
-    odd_weights = np.vectorize(_odd_ewm.__getitem__)(odd)
-
-    #
-    # print(even_weights)
-    # print(odd_weights)
-
-    res = np.ndarray(shape=a.shape, dtype=np.int64)
-    res[:, ::2] = even_weights
-    res[:, 1::2] = odd_weights
-
-    return res
-
-
-# _x = np.ndarray((4, 4), dtype=np.uint8)
-# print(array_edit_weight(_x, _x))
-# assert np.array_equal(array_edit_weight(_x, _x), np.zeros((32, 256)))
-
-@functools.lru_cache(None)
-def byte_to_colour_string(b: int, is_odd_offset: bool) -> str:
-    pixels = []
-
-    idx = 0
-    if is_odd_offset:
-        pixels.append("01"[b & 0x01])
-        idx += 1
-
-    # K = black
-    # G = green
-    # V = violet
-    # W = white
-    palettes = (
-        (
-            "K",  # 0x00
-            "V",  # 0x01
-            "G",  # 0x10
-            "W"  # 0x11
-        ), (
-            "K",  # 0x00
-            "B",  # 0x01
-            "O",  # 0x10
-            "W"  # 0x11
-        )
-    )
-    palette = palettes[(b & 0x80) != 0]
-
-    for _ in range(3):
-        pixel = palette[(b >> idx) & 0b11]
-        pixels.append(pixel)
-        idx += 2
-
-    if not is_odd_offset:
-        pixels.append("01"[b & 0x40 != 0])
-        idx += 1
-
-    return "".join(pixels)
-
-
 class Video:
    """Apple II screen memory map encoding a bitmapped frame."""

    CLOCK_SPEED = 1024 * 1024

-    def __init__(
-            self,
-            filename: str):
+    def __init__(self, filename: str):
        self.filename = filename  # type: str

        self._reader = skvideo.io.FFmpegReader(filename)
@ -276,8 +42,6 @@ class Video:
        self.update_priority = np.zeros((32, 256), dtype=np.int64)

    def tick(self, cycles) -> bool:
-        # print(cycles, self.cycles_per_frame, self.cycles_per_frame *
-        #       self.frame_number)
        if cycles > (self.cycles_per_frame * self.frame_number):
            self.frame_number += 1
            return True
@ -344,171 +108,16 @@ class Video:
        print("Similarity %f" % (self.update_priority.mean()))
        yield from self._index_changes(self.memory_map, target)

-    # def _diff_weights(
-    #         self,
-    #         source: screen.MemoryMap,
-    #         target: screen.MemoryMap
-    # ):
-    #     diff_weights = np.zeros((32, 256), dtype=np.int64)
-    #
-    #     it = np.nditer(
-    #         source.page_offset ^ target.page_offset, flags=['multi_index'])
-    #     while not it.finished:
-    #         # If no diff, don't need to bother
-    #         if not it[0]:
-    #             it.iternext()
-    #             continue
-    #
-    #         diff_weights[it.multi_index] = edit_weight(
-    #             source.page_offset[it.multi_index],
-    #             target.page_offset[it.multi_index],
-    #             it.multi_index[1] % 2 == 1
-    #         )
-    #         it.iternext()
-
-    # aew = array_edit_weight(source.page_offset,
-    #                         target.page_offset)
-    # if not np.array_equal(
-    #     diff_weights, aew
-    # ):
-    #     it = np.nditer(
-    #         diff_weights - aew, flags=['multi_index'])
-    #     while not it.finished:
-    #         # If no diff, don't need to bother
-    #         if it[0]:
-    #             print(
-    #                 source.page_offset[it.multi_index],
-    #                 target.page_offset[it.multi_index],
-    #                 diff_weights[it.multi_index],
-    #                 aew[it.multi_index], it.multi_index)
-    #         it.iternext()
-    #     assert False
-
-    # return diff_weights
-
-    @staticmethod
-    def _diff_weights_new(
-            source: screen.MemoryMap,
-            target: screen.MemoryMap
-    ):
-        return array_edit_weight(
-            source.page_offset, target.page_offset)
-
-    def _heapify_priorities(self) -> List:
-        priorities = []
-        it = np.nditer(self.update_priority, flags=['multi_index'])
-        while not it.finished:
-            priority = it[0]
-            if not priority:
-                it.iternext()
-                continue
-
-            page, offset = it.multi_index
-            # Don't use deterministic order for page, offset
-            nonce = random.random()
-            heapq.heappush(priorities, (-priority, nonce, page, offset))
-            it.iternext()
-
-        return priorities
-
-    @staticmethod
-    def _compute_delta(content, target, old):
-        return content_edit_weight(content, target) - old
-
-    # XXX 0WKK -> 1KKV (3)
-    #     1VVV -> 1KKV (2) is closer to target but a big
-    # visual difference
-
-    # 0WKK -> 1KKV = 2 transpose + 2 flip = 12, or 3 flip = 15
-    # 1VVV -> 1KKV = 2 flip = 10, delta = -2
-    # @functools.lru_cache(None)
-    # def _compute_delta_old(self, content, target, is_odd, old):
-    #     return edit_weight(content, target, is_odd)  # - old
-
-    _OFFSETS = np.arange(256)
-
-    def _compute_error(self, page, content, target, old_error, content_deltas):
-        offsets = []
-
-        delta_screen = content_deltas.get(content)
-        if delta_screen is None:
-            delta_screen = self._compute_delta(
-                content, target.page_offset, old_error)
-            content_deltas[content] = delta_screen
-        delta_page = delta_screen[page]
-
-        # old_error_page = old_error[page]
-        # tpo = target.page_offset[page]
-        #
-        # # If we store content at this offset, what is the difference
-        # # between this edit distance and the ideal target edit distance?
-        # delta_page = self._compute_delta(
-        #     content, tpo, old_error_page)
-        # # print(delta_page)
-        cond = delta_page < 0
-
-        candidate_offsets = self._OFFSETS[cond]
-        priorities = self.update_priority[page][cond]
-        # deltas = delta_page[cond]
-
-        # assert len(priorities) == len(candidate_offsets) == len(deltas) ==
-        # sum(cond)
-
-        l = [
-            (-priorities[i], random.random(), candidate_offsets[i])
-            for i in range(len(candidate_offsets))
-        ]
-        # offsets = [o for _, _, o in heapq.nsmallest(3, l)]
-        heapq.heapify(l)
-
-        while l:
-            _, _, o = heapq.heappop(l)
-            offsets.append(o)
-            if len(offsets) == 3:
-                break
-        #
-        # page_priorities = [(-p, random.random(), o) for o, p in enumerate(
-        #     self.update_priority[page]) if p]
-        # heapq.heapify(page_priorities)
-        #
-        # # Iterate in descending priority order and take first 3 offsets with
-        # # negative delta
-        # while page_priorities:
-        #     _, _, o = heapq.heappop(page_priorities)
-        #
-        #     # If we store content at this offset, what is the difference
-        #     # between this edit distance and the ideal target edit distance?
-        #     delta = self._compute_delta_old(
-        #         content, tpo[o], o % 2 == 1, old_error_page[o])
-        #
-        #     # Getting further away from goal, no thanks!
-        #     if delta >= 0:
-        #         continue
-        #     #
-        #     # # print("Offset %d prio %d: %d -> %d = %d" % (
-        #     # #   o, p, content,
-        #     # #   target.page_offset[page, o],
-        #     # #   delta
-        #     # # ))
-        #     offsets.append(o)
-        #     if len(offsets) == 3:
-        #         break
-
-        return offsets
-
    def _index_changes(
            self,
            source: screen.MemoryMap,
            target: screen.MemoryMap
-    ) -> Iterator[Tuple[int, int, int, int, int]]:
-        """Transform encoded screen to sequence of change tuples.
+    ) -> Iterator[Tuple[int, int, List[int]]]:
+        """Transform encoded screen to sequence of change tuples."""

-        Change tuple is (update_priority, page, offset, content, run_length)
-        """
+        diff_weights = self._diff_weights(source, target)

-        diff_weights = self._diff_weights_new(source, target)
-
-        # Clear any update priority entries that have resolved themselves 
+        # Clear any update priority entries that have resolved themselves
        # with new frame
        self.update_priority[diff_weights == 0] = 0

@ -532,8 +141,6 @@ class Video:

            offsets = [offset]
            content = target.page_offset[page, offset]
-            # print("Priority %d: page %d offset %d content %d" % (
-            #    priority, page, offset, content))

            # Clear priority for the offset we're emitting
            self.update_priority[page, offset] = 0
@ -556,11 +163,73 @@ class Video:
            for _ in range(len(offsets), 4):
                offsets.append(offsets[0])

-            # print("Page %d, content %d: offsets %s" % (page+32, content,
-            #                                           offsets))
            yield (page + 32, content, offsets)

        # If we run out of things to do, pad forever
        content = target.page_offset[(0, 0)]
        while True:
            yield (32, content, [0, 0, 0, 0])
+
+    @staticmethod
+    def _diff_weights(
+            source: screen.MemoryMap,
+            target: screen.MemoryMap
+    ):
+        return edit_distance.array_edit_weight(
+            source.page_offset, target.page_offset)
+
+    def _heapify_priorities(self) -> List:
+        priorities = []
+        it = np.nditer(self.update_priority, flags=['multi_index'])
+        while not it.finished:
+            priority = it[0]
+            if not priority:
+                it.iternext()
+                continue
+
+            page, offset = it.multi_index
+
+            # Don't use deterministic order for page, offset
+            nonce = random.random()
+            priorities.append((-priority, nonce, page, offset))
+            it.iternext()
+
+        heapq.heapify(priorities)
+        return priorities
+
+    @staticmethod
+    def _compute_delta(content, target, old):
+        """
+        This function is the critical path for the video encoding.
+        """
+        return edit_distance.content_edit_weight(content, target) - old
+
+    _OFFSETS = np.arange(256)
+
+    def _compute_error(self, page, content, target, old_error, content_deltas):
+        offsets = []
+
+        delta_screen = content_deltas.get(content)
+        if delta_screen is None:
+            delta_screen = self._compute_delta(
+                content, target.page_offset, old_error)
+            content_deltas[content] = delta_screen
+
+        delta_page = delta_screen[page]
+        cond = delta_page < 0
+        candidate_offsets = self._OFFSETS[cond]
+        priorities = self.update_priority[page][cond]
+
+        l = [
+            (-priorities[i], random.random(), candidate_offsets[i])
+            for i in range(len(candidate_offsets))
+        ]
+        heapq.heapify(l)
+
+        while l:
+            _, _, o = heapq.heappop(l)
+            offsets.append(o)
+            if len(offsets) == 3:
+                break
+
+        return offsets