From 1cd20a76156f9ea25f3c12004b4b4a767427bcb4 Mon Sep 17 00:00:00 2001
From: kris
Date: Sun, 14 Jul 2019 22:09:40 +0100
Subject: [PATCH] WIP - optimal encoding by computing all possible byte stores
 at all possible (page, offset) and then choosing top 4 offsets for all
 content bytes.

i.e. this should do better than the previous "greedy" approach, in which we
picked a single high-priority offset to resolve and then tried to find 3 more
that didn't introduce too much additional error.

Major outstanding issue is that we also need to evaluate whether it's worth
storing only 1, 2 or 3 offsets, since sometimes this will have lower total
error.
---
 transcoder/screen.py      | 132 +++++-----
 transcoder/screen_test.py |  78 ++++--
 transcoder/video.py       | 530 +++++++++++++++++++++++++++-----------
 3 files changed, 508 insertions(+), 232 deletions(-)

diff --git a/transcoder/screen.py b/transcoder/screen.py
index 5029363..ca409a5 100644
--- a/transcoder/screen.py
+++ b/transcoder/screen.py
@@ -210,14 +210,14 @@ class Bitmap:
         body = self._body()
 
         # Prepend last 3 bits of previous odd byte so we can correctly
-        # decode the effective colours at the beginning of the 22-bit tuple
+        # decode the effective colours at the beginning of the body tuple
         prev_col = np.roll(body, 1, axis=1).astype(np.uint64)
         header = self._make_header(prev_col)
         # Don't leak header across page boundaries
         header[:, 0] = 0
 
         # Append first 3 bits of next even byte so we can correctly
-        # decode the effective colours at the end of the 22-bit tuple
+        # decode the effective colours at the end of the body tuple
         next_col = np.roll(body, -1, axis=1).astype(np.uint64)
         footer = self._make_footer(next_col)
         # Don't leak footer across page boundaries
@@ -314,26 +314,26 @@ class Bitmap:
 
         return column_right
 
-    def _fix_array_neighbours(
-            self,
-            ary: np.ndarray,
-            byte_offset: int
-    ) -> None:
-        """Fix up column headers/footers for all array entries."""
-
-        # TODO: don't leak header/footer across page boundaries
-
-        # Propagate new value into neighbouring byte headers/footers if
-        # necessary
-        if byte_offset == 0:
-            # Need to also update the footer of the preceding column
-            shifted_left = np.roll(ary, -1, axis=1)
-            self._fix_column_left(ary, shifted_left)
-
-        elif byte_offset == (self.SCREEN_BYTES - 1):
-            # Need to also update the header of the next column
-            shifted_right = np.roll(ary, 1, axis=1)
-            self._fix_column_right(ary, shifted_right)
+    # def _fix_array_neighbours(
+    #         self,
+    #         ary: np.ndarray,
+    #         byte_offset: int
+    # ) -> None:
+    #     """Fix up column headers/footers for all array entries."""
+    #
+    #     # TODO: don't leak header/footer across page boundaries
+    #
+    #     # Propagate new value into neighbouring byte headers/footers if
+    #     # necessary
+    #     if byte_offset == 0:
+    #         # Need to also update the footer of the preceding column
+    #         shifted_left = np.roll(ary, -1, axis=1)
+    #         self._fix_column_left(ary, shifted_left)
+    #
+    #     elif byte_offset == (self.SCREEN_BYTES - 1):
+    #         # Need to also update the header of the next column
+    #         shifted_right = np.roll(ary, 1, axis=1)
+    #         self._fix_column_right(ary, shifted_right)
 
     @classmethod
     @functools.lru_cache(None)
@@ -400,37 +400,14 @@ class Bitmap:
             is_aux: bool
     ) -> np.ndarray:
         """Compute edit distance matrix from source bitmap."""
-        return self._diff_weights(source.packed, is_aux)
-
-    # TODO: unit test
-    def _diff_weights(
-            self,
-            source_packed: np.ndarray,
-            is_aux: bool,
-            content: np.uint8 = None
-    ) -> np.ndarray:
-        """Computes edit distance matrix from source_packed to self.packed
-
-        If content is set, the distance will be computed as if this value
-        was stored into each offset position of source_packed, i.e. to
-        allow evaluating which offsets (if any) should be chosen for storing
-        this content byte.
-        """
         diff = np.ndarray((32, 256), dtype=np.int)
-
         offsets = self._byte_offsets(is_aux)
 
         dists = []
         for o in offsets:
-            if content is not None:
-                compare_packed = self.masked_update(o, source_packed, content)
-                self._fix_array_neighbours(compare_packed, o)
-            else:
-                compare_packed = source_packed
-
             # Pixels influenced by byte offset o
-            source_pixels = self.mask_and_shift_data(compare_packed, o)
+            source_pixels = self.mask_and_shift_data(source.packed, o)
             target_pixels = self.mask_and_shift_data(self.packed, o)
 
             # Concatenate N-bit source and target into 2N-bit values
@@ -466,34 +443,57 @@ class Bitmap:
                 continue
             ok = False
             print(p, o, bin(self.packed[p, o - 1]),
-                  bin(headers[p, o]),
+                  bin(headers[p, o]),
                   bin(self.packed[p, o]),
                   bin(self.packed[p, o + 1]),
                   bin(footers[p, o]),
                   bin(res[p, o])
                   )
         assert ok
 
+    # Number of candidate content bytes to evaluate; set by subclasses
+    CONTENT_RANGE = None
+
     # TODO: unit tests
-    def compute_delta(
-            self,
-            content: int,
-            diff_weights: np.ndarray,
-            is_aux: bool
-    ) -> np.ndarray:
+    def compute_delta(self, is_aux: bool) -> np.ndarray:
        """Compute which content stores introduce the least additional error.
 
         We compute the effect of storing content at all possible offsets
-        within self.packed, and then subtract the previous diff weights.
-
-        Negative values indicate that the new content value is closer to the
-        target than the current content.
+        within self.packed, in terms of the new edit_distance to the target
+        pixels.
         """
-        # TODO: use error edit distance?
+        # Evaluate all CONTENT_RANGE possible content stores (0x00..0x7f
+        # for DHGR, 0x00..0xff for HGR)
+        diff = np.ndarray((self.CONTENT_RANGE, 32, 256), dtype=np.int)
 
-        new_diff = self._diff_weights(self.packed, is_aux, content)
+        all_content_bytes = np.arange(
+            self.CONTENT_RANGE, dtype=np.uint64).reshape(
+            (self.CONTENT_RANGE, 1))
 
-        # TODO: try different weightings
-        return (new_diff * 5) - diff_weights
+        def _target_masked(content, t, byte_offset):
+            return self.masked_update(byte_offset, t, content)
+
+        offsets = self._byte_offsets(is_aux)
+
+        dists = []
+        for o in offsets:
+            compare_packed = np.apply_along_axis(
+                _target_masked, 1, all_content_bytes, self.packed, o)
+            # self.masked_update(o, self.packed, content)
+            # self._fix_array_neighbours(compare_packed, o)
+
+            # Pixels influenced by byte offset o
+            source_pixels = self.mask_and_shift_data(compare_packed, o)
+            target_pixels = self.mask_and_shift_data(self.packed, o)
+
+            # Concatenate N-bit source and target into 2N-bit values
+            pair = (source_pixels << self.MASKED_BITS) + target_pixels
+            dist = self.edit_distances(self.palette)[o][pair].reshape(
+                pair.shape)
+            dists.append(dist)
+
+        # Interleave even/odd columns
+        diff[:, :, 0::2] = dists[0]
+        diff[:, :, 1::2] = dists[1]
+
+        return diff
 
 
 class HGRBitmap(Bitmap):
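Since compute_delta now shares the pair-lookup idiom with diff_weights, it is worth spelling out: masked source and target pixel strings are concatenated into a single integer which indexes a precomputed distance table in one vectorized gather. A minimal, self-contained sketch of the idiom (toy 2-bit values and a plain Hamming-distance table stand in for the transcoder's real masks and palette distances):

    import numpy as np

    MASKED_BITS = np.uint64(2)  # toy width; the real code masks 8/13 bits

    # Hypothetical precomputed table indexed by pair = (source << 2) + target
    table = np.array([bin(s ^ t).count('1')
                      for s in range(4) for t in range(4)], dtype=np.int64)

    source_pixels = np.array([[0b00, 0b01], [0b10, 0b11]], dtype=np.uint64)
    target_pixels = np.array([[0b00, 0b11], [0b10, 0b01]], dtype=np.uint64)

    # Concatenate N-bit source and target into 2N-bit values, then gather
    # all distances at once, mirroring what diff_weights()/compute_delta()
    # do against edit_distances(palette)[o].
    pair = (source_pixels << MASKED_BITS) + target_pixels
    dist = table[pair].reshape(pair.shape)
    print(dist)  # [[0 1]
                 #  [0 1]]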
@@ -572,7 +572,7 @@ class HGRBitmap(Bitmap):
     #
     # From header: 3 bits (2 HGR pixels but might be shifted right by palette)
     # From body: 7 bits doubled, plus possible shift from palette bit
-    MASKED_DOTS = np.uint64(18)  # 3 + 7 + 7
+    MASKED_DOTS = np.uint64(18)  # 3 + 7 + 7 + 1
 
     # List of bitmasks for extracting the subset of packed data corresponding
     # to bits influencing/influenced by a given byte offset.  These must be
@@ -593,6 +593,9 @@ class HGRBitmap(Bitmap):
     #   odd: 2 (3)
     PHASES = [1, 3]
 
+    # Need to consider all 0x0 .. 0xff content stores
+    CONTENT_RANGE = 256
+
     def __init__(self, palette: pal.Palette, main_memory: MemoryMap):
         super(HGRBitmap, self).__init__(palette, main_memory, None)
 
@@ -850,7 +853,7 @@ class DHGRBitmap(Bitmap):
         np.uint64(0b0000000000000011111111111110000000),  # byte 1 uint13 mask
         np.uint64(0b0000000111111111111100000000000000),  # byte 2 uint13 mask
         np.uint64(0b1111111111111000000000000000000000),  # byte 3 uint13 mask
-    ]
+    ]  # XXX XXX
 
     # How much to right-shift bits after masking, to bring into uint13 range
     BYTE_SHIFTS = [np.uint64(0), np.uint64(7), np.uint64(14), np.uint64(21)]
@@ -866,6 +869,9 @@ class DHGRBitmap(Bitmap):
     #   MAIN 1: 1 (2)
     PHASES = [1, 0, 3, 2]
 
+    # Only need to consider 0x0 .. 0x7f content stores
+    CONTENT_RANGE = 128
+
     @staticmethod
     def _make_header(col: IntOrArray) -> IntOrArray:
         """Extract upper 3 bits of body for header of next column."""
diff --git a/transcoder/screen_test.py b/transcoder/screen_test.py
index 2bcbb5b..7770913 100644
--- a/transcoder/screen_test.py
+++ b/transcoder/screen_test.py
@@ -39,17 +39,16 @@ class TestDHGRBitmap(unittest.TestCase):
     def test_pixel_packing_offset_0(self):
         """Screen byte packing happens correctly at offset 0."""
 
-        # PBBBAAAA
-        self.aux.page_offset[0, 0] = 0b11110101
-        # PDDCCCCB
-        self.main.page_offset[0, 0] = 0b01000011
-        # PFEEEEDD
-        self.aux.page_offset[0, 1] = 0b11110101
-        # PGGGGFFF
-        self.main.page_offset[0, 1] = 0b01000011
-
         dhgr = screen.DHGRBitmap(
             main_memory=self.main, aux_memory=self.aux, palette=Palette.NTSC)
+        # PBBBAAAA
+        dhgr.apply(0, 0, True, np.uint8(0b11110101))
+        # PDDCCCCB
+        dhgr.apply(0, 0, False, np.uint8(0b01000011))
+        # PFEEEEDD
+        dhgr.apply(0, 1, True, np.uint8(0b11110101))
+        # PGGGGFFF
+        dhgr.apply(0, 1, False, np.uint8(0b01000011))
 
         self.assertEqual(
             0b0001000011111010110000111110101000,
@@ -69,17 +68,25 @@ class TestDHGRBitmap(unittest.TestCase):
     def test_pixel_packing_offset_1(self):
         """Screen byte packing happens correctly at offset 1."""
 
-        # PBBBAAAA
-        self.aux.page_offset[0, 2] = 0b11110101
-        # PDDCCCCB
-        self.main.page_offset[0, 2] = 0b01000011
-        # PFEEEEDD
-        self.aux.page_offset[0, 3] = 0b11110101
-        # PGGGGFFF
-        self.main.page_offset[0, 3] = 0b01000011
+        # # PBBBAAAA
+        # self.aux.page_offset[0, 2] = 0b11110101
+        # # PDDCCCCB
+        # self.main.page_offset[0, 2] = 0b01000011
+        # # PFEEEEDD
+        # self.aux.page_offset[0, 3] = 0b11110101
+        # # PGGGGFFF
+        # self.main.page_offset[0, 3] = 0b01000011
 
         dhgr = screen.DHGRBitmap(
             main_memory=self.main, aux_memory=self.aux, palette=Palette.NTSC)
+        # PBBBAAAA
+        dhgr.apply(0, 2, True, np.uint8(0b11110101))
+        # PDDCCCCB
+        dhgr.apply(0, 2, False, np.uint8(0b01000011))
+        # PFEEEEDD
+        dhgr.apply(0, 3, True, np.uint8(0b11110101))
+        # PGGGGFFF
+        dhgr.apply(0, 3, False, np.uint8(0b01000011))
 
         self.assertEqual(
             0b0001000011111010110000111110101000,
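The rewritten packing tests all follow the same pattern: construct the DHGRBitmap first, then store bytes through apply(), which updates the memory map and incrementally repacks dhgr.packed (including the header/footer bits shared with neighbouring columns). A minimal sketch of the pattern, reusing this file's setUp() fixtures (the bit-field assertion assumes the packed layout exercised by these tests):

    # Sketch only; values match the tests above.
    dhgr = screen.DHGRBitmap(
        main_memory=self.main, aux_memory=self.aux, palette=Palette.NTSC)

    # Store via apply() so the packed representation stays in sync;
    # assigning to self.aux.page_offset after construction would not
    # repack the bitmap.
    dhgr.apply(page=0, offset=0, is_aux=True, value=np.uint8(0b11110101))

    # AUX byte 0 lands in bits 3..9 of the packed tuple (above the
    # 3-bit header), with the palette bit stripped: 0b1110101.
    self.assertEqual(
        np.uint64(0b1110101),
        (dhgr.packed[0, 0] >> np.uint64(3)) & np.uint64(0x7f))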
@@ -104,17 +111,17 @@
     def test_pixel_packing_offset_127(self):
         """Screen byte packing happens correctly at offset 127."""
 
-        # PBBBAAAA
-        self.aux.page_offset[0, 254] = 0b11110101
-        # PDDCCCCB
-        self.main.page_offset[0, 254] = 0b01000011
-        # PFEEEEDD
-        self.aux.page_offset[0, 255] = 0b11110101
-        # PGGGGFFF
-        self.main.page_offset[0, 255] = 0b01000011
-
         dhgr = screen.DHGRBitmap(
             main_memory=self.main, aux_memory=self.aux, palette=Palette.NTSC)
+        # PBBBAAAA
+        dhgr.apply(0, 254, True, np.uint8(0b11110101))
+        # PDDCCCCB
+        dhgr.apply(0, 254, False, np.uint8(0b01000011))
+        # PFEEEEDD
+        dhgr.apply(0, 255, True, np.uint8(0b11110101))
+        # PGGGGFFF
+        dhgr.apply(0, 255, False, np.uint8(0b01000011))
 
         self.assertEqual(
             0b0001000011111010110000111110101000,
@@ -277,6 +284,7 @@ class TestDHGRBitmap(unittest.TestCase):
             0b1110000000000000000000000000000000,
             dhgr.packed[12, 17])
 
+        # Update offset 2
         dhgr.apply(page=12, offset=36, is_aux=False, value=np.uint8(0b0001101))
         self.assertEqual(
             0b101,
@@ -315,6 +323,17 @@ class TestDHGRBitmap(unittest.TestCase):
             0b1011010101000000000000000000000000,
             dhgr.packed[12, 17])
 
+        # Now propagate new footer from neighbour onto (12, 18)
+        dhgr.apply(page=12, offset=38, is_aux=True, value=np.uint8(0b1111001))
+        self.assertEqual(
+            0b0011010101111111100011010001101101,
+            dhgr.packed[12, 18]
+        )
+        # Neighbouring header
+        self.assertEqual(
+            0b0000000000000000000000001111001101,
+            dhgr.packed[12, 19])
+
     def test_fix_array_neighbours(self):
         """Test that _fix_array_neighbours DTRT after masked_update."""
 
@@ -351,6 +370,15 @@ class TestDHGRBitmap(unittest.TestCase):
             )
         )
 
+        packed = dhgr.masked_update(0, dhgr.packed, np.uint8(0b101))
+        dhgr._fix_array_neighbours(packed, 0)
+
+        # Should propagate to all footers
+        self.assertEqual(
+            0, np.count_nonzero(
+                packed[packed != 0b1010000000000000000000000000101000]
+            )
+        )
 
 class TestHGRBitmap(unittest.TestCase):
     def setUp(self) -> None:
diff --git a/transcoder/video.py b/transcoder/video.py
index b8ffd12..a181245 100644
--- a/transcoder/video.py
+++ b/transcoder/video.py
@@ -16,8 +16,6 @@ from video_mode import VideoMode
 class Video:
     """Encodes sequence of images into prioritized screen byte changes."""
 
-    CLOCK_SPEED = 1024 * 1024  # type: int
-
     def __init__(
             self,
             frame_grabber: FrameGrabber,
@@ -80,8 +78,9 @@ class Video:
         update_priority = self.update_priority
 
         # Make sure nothing is leaking into screen holes
-        assert np.count_nonzero(
-            memory_map.page_offset[screen.SCREEN_HOLES]) == 0
+        # XXX why is this happening? Maybe because we're not scoring <4 stores
+        if np.count_nonzero(memory_map.page_offset[screen.SCREEN_HOLES]):
+            print("Someone stored in screen holes")
 
         print("Similarity %f" % (update_priority.mean()))
 
@@ -117,181 +116,424 @@ class Video:
         )
 
         diff_weights = target_pixelmap.diff_weights(self.pixelmap, is_aux)
-        # Don't bother storing into screen holes
-        diff_weights[screen.SCREEN_HOLES] = 0
 
         # Clear any update priority entries that have resolved themselves
         # with new frame
         update_priority[diff_weights == 0] = 0
         update_priority += diff_weights
 
-        priorities = self._heapify_priorities(update_priority)
+        # priorities = self._heapify_priorities(update_priority)
 
-        content_deltas = {}
+        content_deltas = 5 * target_pixelmap.compute_delta(is_aux)
+        # print(content_deltas[:, 0, 0])
 
-        while priorities:
-            pri, _, page, offset = heapq.heappop(priorities)
+        # Only want to consider deltas that are < 0
+        # content_deltas[content_deltas >= update_priority] = 0
 
-            assert not screen.SCREEN_HOLES[page, offset], (
-                    "Attempted to store into screen hole at (%d, %d)" % (
-                page, offset))
+        edit_distance = content_deltas - update_priority
 
-            # Check whether we've already cleared this diff while processing
-            # an earlier opcode
-            if update_priority[page, offset] == 0:
+        # print(edit_distance[:, 0, 0])
+        candidates = np.sum(edit_distance < 0)
+        print("Candidates = %d" % candidates)
+
+        # We care about finding the 4 smallest elements for each (
+        # content, page), but not their order.
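+        # Note: np.argpartition with kth=3 guarantees only that the four
+        # smallest distances occupy positions 0..3 along axis 2, in
+        # arbitrary order; that is all we need here, and it costs O(n) per
+        # (content, page) row instead of a full O(n log n) argsort.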
+        smallest_idx = np.argpartition(edit_distance, 3, axis=2)[:, :, :4]
+        # smallest = # np.sort(
+        smallest = np.take_along_axis(edit_distance, smallest_idx,
+                                      axis=2)  # , axis=2)
+        while True:
+            # Score should be sum of first 4 non-zero elements
+
+            # score = np.apply_along_axis(_score, 2, smallest)
+
+            # XXX turn into vector
+            # scores = [
+            #     smallest[:, :, 0],
+            #     np.sum(smallest[:, :, :2], axis=2),
+            #     np.sum(smallest[:, :, :3], axis=2),
+            #     np.sum(smallest, axis=2)
+            # ]
+
+            score = np.sum(smallest, axis=2)
+
+            idx = np.argmin(score, axis=None)
+            # print([s.shape for s in scores])
+
+            # print(score[:, 0])
+            # print(score.shape)
+            # idx = np.array((
+            #     np.argmin(scores[0], axis=None),
+            #     np.argmin(scores[1], axis=None),
+            #     np.argmin(scores[2], axis=None),
+            #     np.argmin(scores[3], axis=None)
+            # ))
+            # contents, pages = np.unravel_index(idx, scores[0].shape)
+            # # print(contents, pages)
+            # best_scores = np.array([scores[i][contents[i], pages[i]]
+            #                         for i in range(4)])
+            # idx_argmin = np.argmin(best_scores)
+            # # print(best_scores)
+            # # print(idx_argmin)
+            # num_offsets = idx_argmin + 1
+            #
+            # sc = best_scores[idx_argmin]
+            # # print(sc)
+            # content, page = contents[idx_argmin], pages[idx_argmin]
+            # print(score.shape)
+            # print("Taking %d args" % num_offsets)
+            # TODO: also consider what happens if we only store 1, 2 or 3
+            #  offsets e.g. might only be a single pixel to fix up, as in
+            #  AppleVision video.
+
+            content, page = np.unravel_index(idx, score.shape)
+            # print(content, page)
+            sc = score[content, page]
+            # print([s[content, page] for s in scores])
+            # print(sc, content, page)
+            if sc == 0:
+                break
+            assert sc < 0
+
+            # May not have 4 valid offsets so have to recompute explicitly
+            # i.e. can't just use smallest_idx[content, page]
+            nonzero_offsets = smallest[content, page] < 0
+            offsets = smallest_idx[
+                content, page, nonzero_offsets].tolist()  # [:num_offsets]
+
+            # print(sc, content, page, offsets)
+
+            # TODO: uncomment once we are saving residual diff
+            if any(diff_weights[page, o] == 0 for o in offsets):
+                print("someone else got here first")
                 continue
 
-            offsets = [offset]
-            content = target.page_offset[page, offset]
-            if self.mode == VideoMode.DHGR:
-                # DHGR palette bit not expected to be set
-                assert content < 0x80
+            for o in offsets:
+                # TODO: uncomment once we are saving residual diff
+                assert edit_distance[content, page, o]
 
-            # Clear priority for the offset we're emitting
-            update_priority[page, offset] = 0
-            diff_weights[page, offset] = 0
+                # TODO: move these all outside the loop & vectorize
 
-            # Update memory maps
-            source.page_offset[page, offset] = content
-            self.pixelmap.apply(page, offset, is_aux, content)
+                # TODO: temporal error diffusion - push residual error into
+                #  next frame
+                update_priority[page, o] = 0  # += content_deltas[content,
+                # page, o]
+                # assert update_priority[page, o] >= 0
+                diff_weights[page, o] = 0
 
-            # Make sure we don't emit this offset as a side-effect of some
-            # other offset later.
-            for cd in content_deltas.values():
-                cd[page, offset] = 0
-            # TODO: what if we add another content_deltas entry later?
-            #  We might clobber it again
+                content_deltas[:, page, o] = 0
 
-            # Need to find 3 more offsets to fill this opcode
-            for err, o in self._compute_error(
-                    page,
-                    content,
-                    target_pixelmap,
-                    diff_weights,
-                    content_deltas,
-                    is_aux
-            ):
-                assert o != offset
-                assert not screen.SCREEN_HOLES[page, o], (
-                        "Attempted to store into screen hole at (%d, %d)" % (
-                    page, o))
-
-                if update_priority[page, o] == 0:
-                    # Someone already resolved this diff.
-                    continue
-
-                # Make sure we don't end up considering this (page, offset)
-                # again until the next image frame.  Even if a better match
-                # comes along, it's probably better to fix up some other byte.
-                # TODO: or should we recompute it with new error?
-                for cd in content_deltas.values():
-                    cd[page, o] = 0
-
-                byte_offset = target_pixelmap.byte_offset(o, is_aux)
-                old_packed = target_pixelmap.packed[page, o // 2]
-
-                p = target_pixelmap.byte_pair_difference(
-                    byte_offset, old_packed, content)
-
-                # Update priority for the offset we're emitting
-                update_priority[page, o] = p
+                edit_distance[:, page, o] = 0
 
+                # Update memory maps
                 source.page_offset[page, o] = content
-                self.pixelmap.apply(page, o, is_aux, content)
+                self.pixelmap.apply(page, o, is_aux, np.uint8(content))
 
-                if p:
-                    # This content byte introduced an error, so put back on the
-                    # heap in case we can get back to fixing it exactly
-                    # during this frame.  Otherwise we'll get to it later.
-                    heapq.heappush(
-                        priorities, (-p, random.getrandbits(8), page, o))
-
-                offsets.append(o)
-                if len(offsets) == 3:
-                    break
-
-            # Pad to 4 if we didn't find enough
+            # Pad to 4 if we didn't find enough
             for _ in range(len(offsets), 4):
                 offsets.append(offsets[0])
+
+            self._repartition(edit_distance, smallest_idx, smallest, page,
+                              offsets)
+
             yield (page + 32, content, offsets)
 
-        # # TODO: there is still a bug causing residual diffs when we have
-        # # apparently run out of work to do
-        if not np.array_equal(source.page_offset, target.page_offset):
-            diffs = np.nonzero(source.page_offset != target.page_offset)
-            for i in range(len(diffs[0])):
-                diff_p = diffs[0][i]
-                diff_o = diffs[1][i]
-
-                # For HGR, 0x00 or 0x7f may be visually equivalent to the same
-                # bytes with high bit set (depending on neighbours), so skip
-                # them
-                if (source.page_offset[diff_p, diff_o] & 0x7f) == 0 and \
-                        (target.page_offset[diff_p, diff_o] & 0x7f) == 0:
-                    continue
-
-                if (source.page_offset[diff_p, diff_o] & 0x7f) == 0x7f and \
-                        (target.page_offset[diff_p, diff_o] & 0x7f) == 0x7f:
-                    continue
-
-                print("Diff at (%d, %d): %d != %d" % (
-                    diff_p, diff_o, source.page_offset[diff_p, diff_o],
-                    target.page_offset[diff_p, diff_o]
-                ))
-            # assert False
+        print("Done")
 
         # If we run out of things to do, pad forever
         content = target.page_offset[0, 0]
         while True:
             yield (32, content, [0, 0, 0, 0])
 
-    @staticmethod
-    def _heapify_priorities(update_priority: np.array) -> List:
-        """Build priority queue of (page, offset) ordered by update priority."""
+    def _repartition(
+            self,
+            edit_distance: np.ndarray,
+            smallest_idx: np.ndarray,
+            smallest: np.ndarray,
+            page: int,
+            offsets: List[int]
+    ):
+        """Refresh cached top-4 entries that referenced just-emitted offsets."""
+        sip = smallest_idx[:, page, :]
+        contents, _ = (
+                (sip == offsets[0]) |
+                (sip == offsets[1]) |
+                (sip == offsets[2]) |
+                (sip == offsets[3])
+        ).nonzero()
+        # print("Repartitioning %d" % len(contents))
+        for content in contents:
+            partition = np.argpartition(
+                edit_distance[content, page], 3)[:4]
+            smallest_idx[content, page] = partition
+            smallest[content, page] = np.take(
+                edit_distance[content, page], partition)
+
+        return
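What _repartition maintains deserves a note: smallest_idx/smallest cache the top-4 candidates per (content, page), so once offsets are emitted and their distances zeroed, every cached row that referenced one of them is stale and must be re-partitioned against the updated edit_distance. A standalone toy of that cache-refresh pattern (array names are illustrative, not the transcoder's):

    import numpy as np

    dist = np.array([[-9., -7., -1., -3., -5.]])  # 1 row x 5 offsets
    top4_idx = np.argpartition(dist, 3, axis=1)[:, :4]
    top4 = np.take_along_axis(dist, top4_idx, axis=1)

    # Offset 0 is emitted and its cost cleared, invalidating any cached
    # row whose top 4 referenced it.
    dist[:, 0] = 0
    stale_rows = (top4_idx == 0).any(axis=1).nonzero()[0]
    for r in stale_rows:
        part = np.argpartition(dist[r], 3)[:4]  # refresh the cached top 4
        top4_idx[r] = part
        top4[r] = np.take(dist[r], part)

    print(top4_idx[0], np.sort(top4[0]))  # offset 0 no longer among the best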
-        # Use numpy vectorization to efficiently compute the list of
-        # (priority, random nonce, page, offset) tuples to be heapified.
-        pages, offsets = update_priority.nonzero()
-        priorities = [tuple(data) for data in np.stack((
-            -update_priority[pages, offsets],
-            # Don't use deterministic order for page, offset
-            np.random.randint(0, 2 ** 8, size=pages.shape[0]),
-            pages,
-            offsets)
-        ).T.tolist()]
-
-        heapq.heapify(priorities)
-        return priorities
 
-    _OFFSETS = np.arange(256)
+    def _compute_delta(
+            self,
+            target: screen.DHGRBitmap,
+            old,
+            is_aux: bool
+    ):
+        # Only need to consider 0x0 .. 0x7f content stores
+        diff = np.ndarray((128, 32, 256), dtype=np.int)
 
-    def _compute_error(self, page, content, target_pixelmap, diff_weights,
-                       content_deltas, is_aux):
-        """Build priority queue of other offsets at which to store content.
+        all_content_bytes = np.arange(128).reshape((128, 1))
 
-        Ordered by offsets which are closest to the target content value.
-        """
-        # TODO: move this up into parent
-        delta_screen = content_deltas.get(content)
-        if delta_screen is None:
-            delta_screen = target_pixelmap.compute_delta(
-                content, diff_weights, is_aux)
-            content_deltas[content] = delta_screen
+        # TODO: use error edit distance
 
-        delta_page = delta_screen[page]
-        cond = delta_page < 0
-        candidate_offsets = self._OFFSETS[cond]
-        priorities = delta_page[cond]
+        # def _shift8(s0, t0):
+        #     return (s0 << 8) + t0
+        #
+        # def _shift12(s0, t0):
+        #     return (s0 << 12) + t0
 
-        deltas = [
-            (priorities[i], random.getrandbits(8), candidate_offsets[i])
-            for i in range(len(candidate_offsets))
-        ]
-        heapq.heapify(deltas)
+        def _target_masked(content, t, byte_offset):
+            return target.masked_update(byte_offset, t, content)
 
-        while deltas:
-            pri, _, o = heapq.heappop(deltas)
-            assert pri < 0
-            assert o <= 255
+        if is_aux:
+            # Pixels influenced by byte offset 0
+            source_pixels0 = target.mask_and_shift_data(
+                np.apply_along_axis(
+                    _target_masked, 1, all_content_bytes, target.packed,
+                    0), 0)
+            target_pixels0 = target.mask_and_shift_data(target.packed, 0)
 
-            yield -pri, o
+            # Concatenate 8-bit source and target into 16-bit values
+            pair0 = (source_pixels0 << 8) + target_pixels0
+            dist0 = target.edit_distances(self.palette)[0][pair0].reshape(
+                pair0.shape)
+
+            # Pixels influenced by byte offset 2
+            source_pixels2 = target.mask_and_shift_data(
+                np.apply_along_axis(
+                    _target_masked, 1, all_content_bytes, target.packed,
+                    2), 2)
+            target_pixels2 = target.mask_and_shift_data(target.packed, 2)
+
+            # Concatenate 12-bit source and target into 24-bit values
+            pair2 = (source_pixels2 << 12) + target_pixels2
+            dist2 = target.edit_distances(self.palette)[2][pair2].reshape(
+                pair2.shape)
+
+            diff[:, :, 0::2] = dist0
+            diff[:, :, 1::2] = dist2
+
+        else:
+            # Pixels influenced by byte offset 1
+            source_pixels1 = target.mask_and_shift_data(
+                np.apply_along_axis(
+                    _target_masked, 1, all_content_bytes, target.packed,
+                    1), 1)
+            target_pixels1 = target.mask_and_shift_data(target.packed, 1)
+
+            pair1 = (source_pixels1 << 12) + target_pixels1
+            dist1 = target.edit_distances(self.palette)[1][pair1].reshape(
+                pair1.shape)
+
+            # Pixels influenced by byte offset 3
+            source_pixels3 = target.mask_and_shift_data(
+                np.apply_along_axis(
+                    _target_masked, 1, all_content_bytes, target.packed,
+                    3), 3)
+            target_pixels3 = target.mask_and_shift_data(target.packed, 3)
+
+            pair3 = (source_pixels3 << 8) + target_pixels3
+            dist3 = target.edit_distances(self.palette)[3][pair3].reshape(
+                pair3.shape)
+
+            diff[:, :, 0::2] = dist1
+            diff[:, :, 1::2] = dist3
+
-        # TODO: try different weightings
+        # 66% of the time this found enough to fill at 3 offsets
+        # 18693 0
+        # 14758 1
+        # 12629 2
+        # / 136804
+        # and only 13% of the time found no candidates
+        return diff
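Both the screen.py compute_delta above and this helper vectorize over candidate content bytes the same way: np.apply_along_axis invokes masked_update once per content value, stacking the results into a (content, page, column) cube that downstream code scores in bulk. A self-contained toy of the shape mechanics (the 2-bit masked_update below is an illustrative stand-in, not the transcoder's):

    import numpy as np

    def masked_update(byte_offset, packed, content):
        # Toy stand-in: overwrite a 2-bit field at byte_offset.
        shift = np.uint64(2 * byte_offset)
        mask = ~(np.uint64(0b11) << shift)
        return (packed & mask) | (np.uint64(content) << shift)

    packed = np.zeros((3, 4), dtype=np.uint64)  # 3 pages x 4 columns
    all_content = np.arange(4, dtype=np.uint64).reshape((4, 1))

    # One masked_update per candidate content byte: the (1,)-slices of
    # all_content are mapped to full (3, 4) planes, giving a
    # (content, page, column) cube.
    cube = np.apply_along_axis(
        lambda c: masked_update(1, packed, c[0]), 1, all_content)
    print(cube.shape)  # (4, 3, 4)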
+
+    #
+    # diff_weights = target_pixelmap.diff_weights(self.pixelmap, is_aux)
+    # # Don't bother storing into screen holes
+    # diff_weights[screen.SCREEN_HOLES] = 0
+    #
+    # # Clear any update priority entries that have resolved themselves
+    # # with new frame
+    # update_priority[diff_weights == 0] = 0
+    # update_priority += diff_weights
+    #
+    # priorities = self._heapify_priorities(update_priority)
+    #
+    # content_deltas = {}
+    #
+    # while priorities:
+    #     pri, _, page, offset = heapq.heappop(priorities)
+    #
+    #     assert not screen.SCREEN_HOLES[page, offset], (
+    #             "Attempted to store into screen hole at (%d, %d)" % (
+    #         page, offset))
+    #
+    #     # Check whether we've already cleared this diff while processing
+    #     # an earlier opcode
+    #     if update_priority[page, offset] == 0:
+    #         continue
+    #
+    #     offsets = [offset]
+    #     content = target.page_offset[page, offset]
+    #     if self.mode == VideoMode.DHGR:
+    #         # DHGR palette bit not expected to be set
+    #         assert content < 0x80
+    #
+    #     # Clear priority for the offset we're emitting
+    #     update_priority[page, offset] = 0
+    #     diff_weights[page, offset] = 0
+    #
+    #     # Update memory maps
+    #     source.page_offset[page, offset] = content
+    #     self.pixelmap.apply(page, offset, is_aux, content)
+    #
+    #     # Make sure we don't emit this offset as a side-effect of some
+    #     # other offset later.
+    #     for cd in content_deltas.values():
+    #         cd[page, offset] = 0
+    #     # TODO: what if we add another content_deltas entry later?
+    #     #  We might clobber it again
+    #
+    #     # Need to find 3 more offsets to fill this opcode
+    #     for err, o in self._compute_error(
+    #             page,
+    #             content,
+    #             target_pixelmap,
+    #             diff_weights,
+    #             content_deltas,
+    #             is_aux
+    #     ):
+    #         assert o != offset
+    #         assert not screen.SCREEN_HOLES[page, o], (
+    #                 "Attempted to store into screen hole at (%d, %d)" % (
+    #             page, o))
+    #
+    #         if update_priority[page, o] == 0:
+    #             # Someone already resolved this diff.
+    #             continue
+    #
+    #         # Make sure we don't end up considering this (page, offset)
+    #         # again until the next image frame.  Even if a better match
+    #         # comes along, it's probably better to fix up some other byte.
+    #         # TODO: or should we recompute it with new error?
+    #         for cd in content_deltas.values():
+    #             cd[page, o] = 0
+    #
+    #         byte_offset = target_pixelmap.byte_offset(o, is_aux)
+    #         old_packed = target_pixelmap.packed[page, o // 2]
+    #
+    #         p = target_pixelmap.byte_pair_difference(
+    #             byte_offset, old_packed, content)
+    #
+    #         # Update priority for the offset we're emitting
+    #         update_priority[page, o] = p
+    #
+    #         source.page_offset[page, o] = content
+    #         self.pixelmap.apply(page, o, is_aux, content)
+    #
+    #         if p:
+    #             # This content byte introduced an error, so put back on the
+    #             # heap in case we can get back to fixing it exactly
+    #             # during this frame.  Otherwise we'll get to it later.
+    #             heapq.heappush(
+    #                 priorities, (-p, random.getrandbits(8), page, o))
+    #
+    #         offsets.append(o)
+    #         if len(offsets) == 3:
+    #             break
+    #
+    #     # Pad to 4 if we didn't find enough
+    #     for _ in range(len(offsets), 4):
+    #         offsets.append(offsets[0])
+    #     yield (page + 32, content, offsets)
+    #
+    # # # TODO: there is still a bug causing residual diffs when we have
+    # # # apparently run out of work to do
+    # if not np.array_equal(source.page_offset, target.page_offset):
+    #     diffs = np.nonzero(source.page_offset != target.page_offset)
+    #     for i in range(len(diffs[0])):
+    #         diff_p = diffs[0][i]
+    #         diff_o = diffs[1][i]
+    #
+    #         # For HGR, 0x00 or 0x7f may be visually equivalent to the same
+    #         # bytes with high bit set (depending on neighbours), so skip
+    #         # them
+    #         if (source.page_offset[diff_p, diff_o] & 0x7f) == 0 and \
+    #                 (target.page_offset[diff_p, diff_o] & 0x7f) == 0:
+    #             continue
+    #
+    #         if (source.page_offset[diff_p, diff_o] & 0x7f) == 0x7f and \
+    #                 (target.page_offset[diff_p, diff_o] & 0x7f) == 0x7f:
+    #             continue
+    #
+    #         print("Diff at (%d, %d): %d != %d" % (
+    #             diff_p, diff_o, source.page_offset[diff_p, diff_o],
+    #             target.page_offset[diff_p, diff_o]
+    #         ))
+    #     # assert False
+    #
+    # # If we run out of things to do, pad forever
+    # content = target.page_offset[0, 0]
+    # while True:
+    #     yield (32, content, [0, 0, 0, 0])
+    #
+    # @staticmethod
+    # def _heapify_priorities(update_priority: np.array) -> List:
+    #     """Build priority queue of (page, offset) ordered by update priority."""
+    #
+    #     # Use numpy vectorization to efficiently compute the list of
+    #     # (priority, random nonce, page, offset) tuples to be heapified.
+    #     pages, offsets = update_priority.nonzero()
+    #     priorities = [tuple(data) for data in np.stack((
+    #         -update_priority[pages, offsets],
+    #         # Don't use deterministic order for page, offset
+    #         np.random.randint(0, 2 ** 8, size=pages.shape[0]),
+    #         pages,
+    #         offsets)
+    #     ).T.tolist()]
+    #
+    #     heapq.heapify(priorities)
+    #     return priorities
+    #
+    # _OFFSETS = np.arange(256)
+    #
+    # def _compute_error(self, page, content, target_pixelmap, diff_weights,
+    #                    content_deltas, is_aux):
+    #     """Build priority queue of other offsets at which to store content.
+    #
+    #     Ordered by offsets which are closest to the target content value.
+    #     """
+    #     # TODO: move this up into parent
+    #     delta_screen = content_deltas.get(content)
+    #     if delta_screen is None:
+    #         delta_screen = target_pixelmap.compute_delta(
+    #             content, diff_weights, is_aux)
+    #         content_deltas[content] = delta_screen
+    #
+    #     delta_page = delta_screen[page]
+    #     cond = delta_page < 0
+    #     candidate_offsets = self._OFFSETS[cond]
+    #     priorities = delta_page[cond]
+    #
+    #     deltas = [
+    #         (priorities[i], random.getrandbits(8), candidate_offsets[i])
+    #         for i in range(len(candidate_offsets))
+    #     ]
+    #     heapq.heapify(deltas)
+    #
+    #     while deltas:
+    #         pri, _, o = heapq.heappop(deltas)
+    #         assert pri < 0
+    #         assert o <= 255
+    #
+    #         yield -pri, o
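The outstanding issue from the commit message (scoring 1-, 2- and 3-offset stores, not just 4) can be handled on top of the cached top-4 candidates: sort each cached row and compare the prefix sums, then pad any unused opcode slots with a repeated offset as _index_changes already does, since re-storing the same byte is error-neutral. A self-contained sketch of that selection rule (toy data; illustrative names, not code from this patch):

    import numpy as np

    # edit_distance[content, page, offset]: negative = improves the frame
    edit_distance = np.array([[[-6., -1., 3., -4., 2.]]])

    top4_idx = np.argpartition(edit_distance, 3, axis=2)[:, :, :4]
    top4 = np.take_along_axis(edit_distance, top4_idx, axis=2)

    # Sort the 4 candidates so prefix sums score storing k = 1, 2, 3 or 4
    order = np.argsort(top4, axis=2)
    top4_sorted = np.take_along_axis(top4, order, axis=2)
    prefix_scores = np.cumsum(top4_sorted, axis=2)

    best_k = np.argmin(prefix_scores, axis=2) + 1
    print(prefix_scores[0, 0])  # [ -6. -10. -11.  -9.]
    print(best_k[0, 0])         # 3: a 4th distinct store (+2) would add error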