From 468d47966d7387a148b4aabc28ac8985c3fa0546 Mon Sep 17 00:00:00 2001
From: kris
Date: Thu, 3 Dec 2020 14:02:36 +0000
Subject: [PATCH] Optimize encoder by memoizing better, and expanding the
 recurrence relation for the difference equation to use a closed-form
 solution that can be vectorized better. This is now about 3x faster.

---
 encode_audio.py | 98 +++++++++++++++++++++++++++++++------------------
 opcodes.py      | 70 ++++++-----------------------------
 2 files changed, 75 insertions(+), 93 deletions(-)

diff --git a/encode_audio.py b/encode_audio.py
index ca47ba9..839885e 100755
--- a/encode_audio.py
+++ b/encode_audio.py
@@ -24,16 +24,25 @@
 # compensate for this "dead" period by pre-positioning.
 
 import collections
+import functools
 import sys
 import librosa
 import numpy
 from eta import ETA
+from typing import Tuple
 
 import opcodes
 
+
 # TODO: add flags to parametrize options
 
 
+@functools.lru_cache(None)
+def _delta_powers(shape, step_size: int) -> Tuple[float, numpy.ndarray]:
+    delta = (1 - 1 / step_size)
+    return delta, numpy.cumprod(numpy.full(shape, delta), axis=-1)
+
+
 def lookahead(step_size: int, initial_position: float, data: numpy.ndarray,
               offset: int, voltages: numpy.ndarray):
     """Evaluate effects of multiple potential opcode sequences and pick best.
@@ -46,53 +55,76 @@ def lookahead(step_size: int, initial_position: float, data: numpy.ndarray,
     performance with more opcode choices, although also has a larger fixed
     overhead.
     """
-    positions = numpy.empty((voltages.shape[0], voltages.shape[1] + 1),
-                            dtype=numpy.float32)
-    positions[:, 0] = initial_position
+    # The speaker position p_i evolves according to
+    # p_{i+1} = p_i + (v_i - p_i) / s
+    # where v_i is the i'th applied voltage, s is the speaker step size
+    #
+    # Rearranging, we get p_{i+1} = v_i / s + (1-1/s) p_i
+    # and if we expand the recurrence relation
+    # p_{i+1} = Sum_{j=0}^i (1-1/s)^(i-j) v_j / s + (1-1/s)^(i+1) p_0
+    # = (1-1/s)^(i+1)(1/s * Sum_{j=0}^i v_j / (1-1/s)^(j+1) + p0)
+    delta, delta_powers = _delta_powers(voltages.shape, step_size)
 
-    target_val = data[offset:offset + voltages.shape[1]]
-    scaled_voltages = voltages / step_size
-
-    for i in range(0, voltages.shape[1]):
-        positions[:, i + 1] = (
-                scaled_voltages[:, i] + positions[:, i] * (1 - 1 / step_size))
-    err = positions[:, 1:] - target_val
-    total_error = numpy.sum(numpy.power(err, 2), axis=1)
+    positions = delta_powers * (
+            numpy.cumsum(voltages / delta_powers, axis=1) / step_size +
+            initial_position)
+    total_error = numpy.sum(
+        numpy.square(positions - data[offset:offset + voltages.shape[1]]),
+        axis=1)
 
     best = numpy.argmin(total_error)
     return best
 
 
-# TODO: share implementation with lookahead
+# TODO: Merge with lookahead
 def evolve(opcode: opcodes.Opcode, starting_position, starting_voltage,
            step_size, data, starting_idx):
     """Apply the effects of playing a single opcode to completion.
 
     Returns new state.
     """
-
     opcode_length = opcodes.cycle_length(opcode)
     voltages = starting_voltage * opcodes.VOLTAGE_SCHEDULE[opcode]
-    position = starting_position
-    total_err = 0.0
-    v = starting_voltage
-    for i, v in enumerate(voltages):
-        position += (v - position) / step_size
-        err = position - data[starting_idx + i]
-        total_err += err ** 2
-    return position, v, total_err, starting_idx + opcode_length
+    delta, delta_powers = _delta_powers(opcode_length, step_size)
+
+    positions = delta_powers * (
+            numpy.cumsum(voltages / delta_powers) / step_size +
+            starting_position)
+
+    # TODO: compute error once at the end?
+    total_err = numpy.sum(numpy.square(
+        positions - data[starting_idx:starting_idx + opcode_length]))
+    return positions[-1], voltages[-1], total_err, starting_idx + opcode_length
+
+
+@functools.lru_cache(None)
+def frame_horizon(frame_offset: int, lookahead_steps: int):
+    """Optimize frame_offset when we're not within lookahead_steps of slowpath.
+
+    When computing candidate opcodes, all frame offsets are the same until the
+    end-of-frame slowpath comes within our lookahead horizon.
+    """
+    if frame_offset < (2047 - lookahead_steps):
+        return 0
+    return frame_offset
 
 
 def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int):
     """Computes optimal sequence of player opcodes to reproduce audio data."""
 
     dlen = len(data)
-    data = numpy.concatenate([data, numpy.zeros(lookahead_steps)]).astype(
-        numpy.float32)
+    # TODO: avoid temporarily doubling memory footprint to concatenate
+    data = numpy.concatenate(
+        [data, numpy.zeros(lookahead_steps, dtype=numpy.float32)])
 
     voltage = -1.0
     position = -1.0
 
+    # Pre-warm cache so we don't skew ETA during encoding
+    for i in range(2048):
+        _, _ = opcodes.candidate_opcodes(frame_horizon(i, lookahead_steps),
+                                         lookahead_steps)
+
     total_err = 0.0
     frame_offset = 0
     eta = ETA(total=1000)
@@ -105,17 +137,13 @@ def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int):
             eta.print_status()
             last_updated = i
 
-        candidate_opcodes = opcodes.opcode_lookahead(
-            frame_offset, lookahead_steps)
-        pruned_opcodes, voltages = opcodes.prune_opcodes(
-            candidate_opcodes, lookahead_steps)
-
+        candidate_opcodes, voltages = opcodes.candidate_opcodes(
+            frame_horizon(frame_offset, lookahead_steps), lookahead_steps)
         opcode_idx = lookahead(step, position, data, i, voltage * voltages)
-        opcode = pruned_opcodes[opcode_idx].opcodes[0]
+        opcode = candidate_opcodes[opcode_idx][0]
         opcode_counts[opcode] += 1
         yield opcode
 
-        # TODO: round position and memoize, and use in lookahead too
         position, voltage, new_error, i = evolve(
             opcode, position, voltage, step, data, i)
 
@@ -135,13 +163,13 @@ def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int):
 
 
 def preprocess(
-        filename: str, target_sample_rate: int,
-        normalize: float = 0.5) -> numpy.ndarray:
+        filename: str, target_sample_rate: int, normalize: float = 0.5,
+        normalization_percentile: int = 100) -> numpy.ndarray:
     """Upscale input audio to target sample rate and normalize signal."""
 
     data, _ = librosa.load(filename, sr=target_sample_rate, mono=True)
 
-    max_value = numpy.percentile(data, 100)
+    max_value = numpy.percentile(data, normalization_percentile)
     data /= max_value
     data *= normalize
 
@@ -160,10 +188,10 @@ def main(argv):
 
     # TODO: PAL Apple ][ clock rate is slightly different
     sample_rate = int(1024. * 1000)
-
     data = preprocess(serve_file, sample_rate)
     with open(out, "wb+") as f:
-        for opcode in audio_bytestream(data, step, lookahead_steps):
+        for opcode in audio_bytestream(
+                preprocess(serve_file, sample_rate), step, lookahead_steps):
             f.write(bytes([opcode.value]))
 
 
diff --git a/opcodes.py b/opcodes.py
index 83852c9..9a8126f 100644
--- a/opcodes.py
+++ b/opcodes.py
@@ -66,27 +66,6 @@ def cycle_length(op: Opcode) -> int:
     return len(VOLTAGE_SCHEDULE[op])
 
 
-class _Opcodes:
-    """Container for immutable Iterable[Opcode], to improve hash performance."""
-
-    def __init__(self, opcodes: Iterable[Opcode]):
-        self.opcodes = tuple(opcodes)
-        self._hash = hash(self.opcodes)
-
-    def __hash__(self):
-        return self._hash
-
-
-# Guarantees each Tuple[Opcode] has a unique _Opcodes representation
-_OPCODES_SINGLETON = {}
-
-
-@functools.lru_cache(None)
-def Opcodes(opcodes: Tuple[Opcode]):
-    """Returns unique _Opcodes representation for Tuple[Opcode]."""
-    return _OPCODES_SINGLETON.setdefault(opcodes, _Opcodes(opcodes))
-
-
 @functools.lru_cache(None)
 def opcode_choices(frame_offset: int) -> List[Opcode]:
     """Returns sorted list of valid opcodes for given frame offset.
@@ -104,16 +83,6 @@ def opcode_choices(frame_offset: int) -> List[Opcode]:
 
 @functools.lru_cache(None)
 def opcode_lookahead(
-        frame_offset: int,
-        lookahead_cycles: int) -> Tuple[_Opcodes]:
-    """Computes all valid sequences of opcodes spanning lookahead_cycles."""
-
-    return tuple(Opcodes(ops) for ops in
-                 _opcode_lookahead(frame_offset, lookahead_cycles))
-
-
-@functools.lru_cache(None)
-def _opcode_lookahead(
         frame_offset: int,
         lookahead_cycles: int) -> Tuple[Tuple[Opcode]]:
     """Recursively enumerates all valid opcode sequences."""
@@ -124,53 +93,38 @@ def _opcode_lookahead(
         if cycle_length(op) >= lookahead_cycles:
             ops.append((op,))
         else:
-            for res in _opcode_lookahead((frame_offset + 1) % 2048,
-                                         lookahead_cycles - cycle_length(op)):
+            for res in opcode_lookahead((frame_offset + 1) % 2048,
+                                        lookahead_cycles - cycle_length(op)):
                 ops.append((op,) + res)
     return tuple(ops)  # TODO: fix return type
 
 
-class Cycles:
-    """Container for immutable Tuple[float], to improve hash performance."""
-
-    def __init__(self, cycles: Tuple[float]):
-        self.cycles = cycles
-        self._hash = hash(cycles)
-
-    def __hash__(self):
-        return self._hash
-
-
-# Guarantees each Tuple[float] has a unique Cycles representation
-_CYCLES_SINGLETON = {}
-
-
 @functools.lru_cache(None)
 def cycle_lookahead(
-        opcodes: _Opcodes,
+        opcodes: Tuple[Opcode],
         lookahead_cycles: int
-) -> Cycles:
+) -> Tuple[float]:
     """Computes the applied voltage effects of a sequence of opcodes.
 
     i.e. produces the sequence of applied voltage changes that will result
     from executing these opcodes, limited to the next lookahead_cycles.
     """
     cycles = []
-    for op in opcodes.opcodes:
+    for op in opcodes:
         cycles.extend(VOLTAGE_SCHEDULE[op])
-    trunc_cycles = tuple(cycles[:lookahead_cycles])
-    return _CYCLES_SINGLETON.setdefault(trunc_cycles, Cycles(trunc_cycles))
+    return tuple(cycles[:lookahead_cycles])
 
 
 @functools.lru_cache(None)
-def prune_opcodes(
-        opcodes: Tuple[_Opcodes], lookahead_cycles: int
-) -> Tuple[List[_Opcodes], numpy.ndarray]:
+def candidate_opcodes(
+        frame_offset: int, lookahead_cycles: int
+) -> Tuple[List[Opcode], numpy.ndarray]:
     """Deduplicate a tuple of opcode sequences that are equivalent.
 
     For each opcode sequence whose effect is the same when truncated to
     lookahead_cycles, retains the first such opcode sequence.
     """
+    opcodes = opcode_lookahead(frame_offset, lookahead_cycles)
     seen_cycles = set()
     pruned_opcodes = []
     pruned_cycles = []
@@ -180,6 +134,6 @@ def prune_opcodes(
             continue
         seen_cycles.add(cycles)
         pruned_opcodes.append(ops)
-        pruned_cycles.append(cycles.cycles)
+        pruned_cycles.append(cycles)
 
-    return pruned_opcodes, numpy.array(pruned_cycles, dtype=numpy.float32)
\ No newline at end of file
+    return pruned_opcodes, numpy.array(pruned_cycles, dtype=numpy.float32)