ii-vision/edit_distance.py
kris 01ffd034eb Move edit distance functions into separate module and clean up
partially.

Slight optimization to not heappush() many times, instead build a
regular list and then heapify.
2019-03-14 22:32:52 +00:00

164 lines
4.3 KiB
Python

import functools
import numpy as np
import weighted_levenshtein
@functools.lru_cache(maxsize=None)
def byte_to_colour_string(b: int, is_odd_offset: bool) -> str:
    """Encode byte ``b`` as a four-character pixel string.

    The string holds three colour letters plus one raw-bit character
    ("0" or "1"):

    * even offset: colours come from bit pairs (0,1), (2,3), (4,5) and the
      trailing character reflects bit 6;
    * odd offset: the leading character reflects bit 0 and colours come
      from bit pairs (1,2), (3,4), (5,6).

    Bit 7 selects which of the two four-letter palettes is used.  Letters:
    K = black, V = violet, G = green, W = white; B and O are presumably
    blue and orange (not stated in this file).

    Results are memoized, so each (b, is_odd_offset) pair is computed once.
    """
    # Each palette is indexed by a two-bit value: 0b00, 0b01, 0b10, 0b11.
    colours = "KBOW" if b & 0x80 else "KVGW"

    # Odd offsets consume bit 0 on its own, shifting every pair up by one.
    first_pair_bit = 1 if is_odd_offset else 0
    triple = "".join(
        colours[(b >> shift) & 0b11]
        for shift in range(first_pair_bit, first_pair_bit + 6, 2)
    )

    if is_odd_offset:
        return "01"[b & 0x01] + triple
    return triple + "01"[(b >> 6) & 0x01]
# TODO: what about increasing transposition cost?  Might be better to have
# any pixel at the right place even if the wrong colour?

# Substitution cost tables indexed by (ord(from_char), ord(to_char)).  Both
# start out as all-ones; the loops below are the tuning knobs for the
# characters produced by byte_to_colour_string().
substitute_costs = np.ones((128, 128), dtype=np.float64)
error_substitute_costs = np.ones((128, 128), dtype=np.float64)

# Penalty for turning on/off a black bit.  Each statement writes the
# (K, c) and (c, K) entries together via paired row/column index lists.
for c in "01GVWOB":
    substitute_costs[[ord('K'), ord(c)], [ord(c), ord('K')]] = 1
    error_substitute_costs[[ord('K'), ord(c)], [ord(c), ord('K')]] = 5

# Penalty for changing colour (every ordered pair of non-black characters,
# including a character paired with itself).
for c in "01GVWOB":
    for d in "01GVWOB":
        substitute_costs[[ord(c), ord(d)], [ord(d), ord(c)]] = 1
        error_substitute_costs[[ord(c), ord(d)], [ord(d), ord(c)]] = 5

# Insertions and deletions are priced at 1000 for every character code.
insert_costs = 1000 * np.ones(128, dtype=np.float64)
delete_costs = 1000 * np.ones(128, dtype=np.float64)
def _edit_weight(a: int, b: int, is_odd_offset: bool, error: bool):
    """Compute the weighted edit distance between two bytes' pixel strings.

    Both bytes are rendered with byte_to_colour_string() for the given
    column parity and compared with weighted_levenshtein.dam_lev().  The
    module-level insert/delete cost tables are always used; ``error``
    selects error_substitute_costs instead of substitute_costs.  Transpose
    costs are not passed, so the library default applies.

    Returns the distance converted to np.int64.
    """
    costs = error_substitute_costs if error else substitute_costs
    return np.int64(
        weighted_levenshtein.dam_lev(
            byte_to_colour_string(a, is_odd_offset),
            byte_to_colour_string(b, is_odd_offset),
            insert_costs=insert_costs,
            delete_costs=delete_costs,
            substitute_costs=costs,
        )
    )
def edit_weight_matrixes(error: bool) -> np.array:
    """Build the full table of edit weights for every byte pair.

    Returns an int64 array of shape (256, 256, 2) where entry [a, b, p]
    is _edit_weight(a, b, bool(p), error); p is 0 for even column offsets
    and 1 for odd ones.
    """
    table = np.zeros(shape=(256, 256, 2), dtype=np.int64)
    for a in range(256):
        row = table[a]  # view into the table; writes go straight through
        for b in range(256):
            row[b, 0] = _edit_weight(a, b, False, error)
            row[b, 1] = _edit_weight(a, b, True, error)
    return table
# Build both (256, 256, 2) weight tables once when the module is loaded:
# one with the regular substitution costs, one with the "error" costs.
_ewm = edit_weight_matrixes(error=False)
_error_ewm = edit_weight_matrixes(error=True)
@functools.lru_cache(maxsize=None)
def edit_weight(a: int, b: int, is_odd_offset: bool, error: bool):
    """Return the precomputed edit weight between bytes ``a`` and ``b``.

    ``error`` chooses the error-cost table over the regular one, and
    ``is_odd_offset`` chooses the odd-column (index 1) or even-column
    (index 0) plane of that table.  Lookups are memoized.
    """
    if error:
        table = _error_ewm
    else:
        table = _ewm
    return table[a, b, int(is_odd_offset)]
# Flat dict copies of the precomputed weights, one per combination of
# column parity (even/odd) and cost table (regular/error).  Each key packs
# the byte pair into a single integer: a in the high byte, b in the low.
_even_ewm, _odd_ewm = {}, {}
_even_error_ewm, _odd_error_ewm = {}, {}

for a in range(256):
    for b in range(256):
        _even_ewm[(a << 8) | b] = edit_weight(a, b, False, False)
        _odd_ewm[(a << 8) | b] = edit_weight(a, b, True, False)
        _even_error_ewm[(a << 8) | b] = edit_weight(a, b, False, True)
        _odd_error_ewm[(a << 8) | b] = edit_weight(a, b, True, True)
@functools.lru_cache(maxsize=None)
def _content_a_array(content: int, shape) -> np.array:
    """Return an array of ``shape`` with every element equal to content << 8.

    The array starts as uint16 ones scaled by ``content``.  The result is
    memoized per (content, shape), so repeated calls hand back the same
    array object; callers should treat it as read-only.
    """
    filled = content * np.ones(shape, dtype=np.uint16)
    return filled << 8
def content_edit_weight(content: int, b: np.ndarray) -> np.ndarray:
    """Return error-cost edit weights from one byte value to every byte of b.

    Args:
        content: source byte value (0..255), used as the first index of
            the precomputed weight table.
        b: array of shape (32, 256) holding target byte values.

    Returns:
        int64 array with the same shape as ``b``.  Element [i, j] is the
        weight from the error-cost table between ``content`` and
        ``b[i, j]``, taken from the even-offset plane when j is even and
        the odd-offset plane when j is odd.
    """
    assert b.shape == (32, 256), b.shape

    # Index the (256, 256, 2) error table directly instead of running a
    # Python-level dict lookup per element via np.vectorize.  The values
    # are identical: _even_error_ewm / _odd_error_ewm are flat copies of
    # the two planes of _error_ewm.
    targets = b.astype(np.intp, copy=False)

    res = np.empty(b.shape, dtype=np.int64)
    # Even and odd column offsets use different planes of the table.
    res[:, ::2] = _error_ewm[content, targets[:, ::2], 0]
    res[:, 1::2] = _error_ewm[content, targets[:, 1::2], 1]
    return res
def array_edit_weight(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Return element-wise edit weights between two arrays of byte values.

    Args:
        a: 2-D array of source byte values (0..255).
        b: 2-D array of target byte values, matching ``a`` in shape.

    Returns:
        int64 array with the shape of ``a``.  Element [i, j] is the weight
        from the regular (non-error) table between ``a[i, j]`` and
        ``b[i, j]``, taken from the even-offset plane when j is even and
        the odd-offset plane when j is odd.
    """
    # Index the (256, 256, 2) table directly instead of a per-element dict
    # lookup through np.vectorize.  The values are identical (_even_ewm /
    # _odd_ewm are flat copies of the two planes of _ewm), and unlike
    # np.vectorize this also works when a column slice is empty, e.g. for
    # single-column or zero-size inputs.
    sources = a.astype(np.intp, copy=False)
    targets = b.astype(np.intp, copy=False)

    res = np.empty(a.shape, dtype=np.int64)
    # Even and odd column offsets use different planes of the table.
    res[:, ::2] = _ewm[sources[:, ::2], targets[:, ::2], 0]
    res[:, 1::2] = _ewm[sources[:, 1::2], targets[:, 1::2], 1]
    return res