From f64b1a6e2c34d01e946589db2ea426e774e73d28 Mon Sep 17 00:00:00 2001 From: kris Date: Sat, 9 Jul 2022 11:20:03 +0100 Subject: [PATCH] - parametrize frame size. 4KB has too much buffering though - Correct speaker model to apply coefficients for a square wave impulse - Parametrize speaker scaling factor - Flush wav file output after 1MB --- encode_audio.py | 106 ++++++++++++++++++++++++++++++++++++------------ opcodes.py | 13 +++--- 2 files changed, 86 insertions(+), 33 deletions(-) diff --git a/encode_audio.py b/encode_audio.py index c826710..1a2f3a7 100755 --- a/encode_audio.py +++ b/encode_audio.py @@ -40,9 +40,16 @@ import opcodes_generated def total_error(positions: numpy.ndarray, data: numpy.ndarray) -> numpy.ndarray: """Computes the total squared error for speaker position matrix vs data.""" + # Make sure we handle gracefully when the opcode would take us beyond + # the end of data + # XXX + # min_len = min(len(positions), len(data)) return numpy.sum(numpy.square(positions - data), axis=-1) +FRAME_SIZE = 2048 + + @functools.lru_cache(None) def frame_horizon(frame_offset: int, lookahead_steps: int): """Optimize frame_offset when more than lookahead_steps from end of frame. @@ -52,13 +59,14 @@ def frame_horizon(frame_offset: int, lookahead_steps: int): """ # TODO: This could be made tighter because a step is always at least 5 # cycles towards lookahead_steps. - if frame_offset < (2047 - lookahead_steps): + if frame_offset < (FRAME_SIZE - lookahead_steps): return 0 return frame_offset class Speaker: - def __init__(self, sample_rate: float, freq: float, damping: float): + def __init__(self, sample_rate: float, freq: float, damping: float, + scale: float): self.sample_rate = sample_rate self.freq = freq self.damping = damping @@ -72,19 +80,23 @@ class Speaker: c1 = 2 * e * numpy.cos(w) c2 = e * e - t0 = (1 - 2 * e * numpy.cos(w) + e * e) / (d * d + w * w) - t = d * d + w * w - numpy.pi * numpy.pi - t1 = (1 + 2 * e * numpy.cos(w) + e * e) / numpy.sqrt(t * t + 4 * d * d * - numpy.pi * numpy.pi) - b2 = (t1 - t0) / (t1 + t0) - b1 = b2 * dt * dt * (t0 + t1) / 2 + # t0 = (1 - 2 * e * numpy.cos(w) + e * e) / (d * d + w * w) + # t = d * d + w * w - numpy.pi * numpy.pi + # t1 = (1 + 2 * e * numpy.cos(w) + e * e) / numpy.sqrt(t * t + 4 * d * d * + # numpy.pi * numpy.pi) + # b2 = (t1 - t0) / (t1 + t0) + # b1 = b2 * dt * dt * (t0 + t1) / 2 + + # Square wave impulse + b2 = 0.0 + b1 = 1.0 self.c1 = c1 self.c2 = c2 self.b1 = b1 self.b2 = b2 - self.scale = numpy.float64(1 / 800) # TODO: analytic expression + self.scale = numpy.float64(scale) # TODO: analytic expression def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int, @@ -99,14 +111,25 @@ def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int, [data, numpy.zeros(max(lookahead_steps, opcodes.cycle_length( opcodes_generated.PlayerOps.TICK_00)), dtype=numpy.float32)])) + # At resonance freq the scale is about 22400 but we can only access about 7% + # of it across the frequency range. This is also the equilibrium speaker + # position when voltage is held constant. Normalize to this working + # range for convenience. + inv_scale = 22400 * 0.07759626164027278 # XXX + + # inv_scale = 15000 * 0.1102744481718292 + # + # inv_scale = 115954.98423621713 # Starting speaker applied voltage. voltage1 = voltage2 = 1.0 - # last 2 speaker positions + # last 2 speaker positions. y1 = y2 = 1.0 toggles = 0 - sp = Speaker(sample_rate, freq=3875, damping=-1210) + sp = Speaker(sample_rate, freq=3875, damping=-1210, scale=1 / inv_scale) + # sp = Speaker(sample_rate, freq=3968, damping=-1800, scale=1 / inv_scale) + # sp = Speaker(sample_rate, freq=475, damping=-210, scale=1 / inv_scale) total_err = 0.0 # Total squared error of audio output frame_offset = 0 # Position in 2048-byte TCP frame @@ -118,21 +141,35 @@ def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int, clicks = 0 min_lookahead_steps = lookahead_steps + # next_step = sample_rate + + # data = (numpy.arange(sample_rate) / sample_rate - 0.5).astype( + # numpy.float32) + + # dlen = len(data) while i < dlen // 1: + # if i > next_step: + # next_step += sample_rate + # inv_scale += 100 + # print("XXX scale %d" % inv_scale) + # sp = Speaker(sample_rate, freq=3875, damping=-1210, + # scale=1 / inv_scale) + if i >= next_tick: eta.print_status() next_tick = int(eta.i * dlen / 1000) - if frame_offset >= 2043: # XXX + if frame_offset >= (FRAME_SIZE - 5): # XXX lookahead_steps = min_lookahead_steps + 130 # XXX parametrize else: lookahead_steps = min_lookahead_steps # Compute all possible opcode sequences for this frame offset + last_opcode = opcode if frame_offset == FRAME_SIZE - 1 else None next_candidate_opcodes, voltages, lookahead_steps = \ opcodes.candidate_opcodes( frame_horizon(frame_offset, lookahead_steps), - lookahead_steps, opcode if frame_offset == 2047 else None) + lookahead_steps, last_opcode) opcode_idx = lookahead.evolve_return_best( sp, y1, y2, voltage1, voltage2, voltage1 * voltages, data[i:i + lookahead_steps]) @@ -165,14 +202,14 @@ def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int, # print(frame_offset, i / sample_rate, opcode) # for v in all_positions[0]: - # yield - # # print(v * sp.scale) + # print(v * sp.scale) # if frame_offset == 2047: # print(opcode) - yield opcode, (all_positions * sp.scale).astype(numpy.float32) + yield opcode, numpy.array( + all_positions * sp.scale, dtype=numpy.float32).reshape(-1) i += opcode_length - frame_offset = (frame_offset + 1) % 2048 + frame_offset = (frame_offset + 1) % FRAME_SIZE # Make sure we have at least 2k left in stream so player will do a # complete read. @@ -197,6 +234,14 @@ def preprocess( data, _ = librosa.load(filename, sr=target_sample_rate, mono=True) + # data = [] + # freq = 926 + # data.extend(numpy.sin(numpy.arange(target_sample_rate * 1) * ( + # 2 * numpy.pi / (target_sample_rate / freq))).astype( + # numpy.float32)) + # # freq *= 1.05 + # data = numpy.array(data, dtype=numpy.float32) + max_value = numpy.percentile(data, normalization_percentile) data /= max_value data *= normalize @@ -206,10 +251,15 @@ def preprocess( def resample_output(output_buffer, input_audio, sample_rate, output_rate, noise_output=False): - resampled_output = librosa.resample( - numpy.array(output_buffer, dtype=numpy.float32), - orig_sr=sample_rate, - target_sr=output_rate) + try: + resampled_output = librosa.resample( + numpy.array(output_buffer, dtype=numpy.float32), + orig_sr=sample_rate, + target_sr=output_rate) + except: + for i in output_buffer: + print(i) + raise resampled_noise = None if noise_output: @@ -238,7 +288,7 @@ def main(): parser.add_argument("--lookahead_cycles", type=int, help="Number of clock cycles to look ahead in audio " "stream.") - parser.add_argument("--normalization", default=1.0, type=float, + parser.add_argument("--normalization", default=0.8, type=float, help="Overall multiplier to rescale input audio " "values.") parser.add_argument("--norm_percentile", default=100, @@ -279,7 +329,7 @@ def main(): # We're not creating a file but still need a context noise_context = contextlib.nullcontext - with wav_context as wav_f, noise_context as noise_f, opcode_context\ + with wav_context as wav_f, noise_context as noise_f, opcode_context \ as opcode_f: for idx, sample_data in enumerate(audio_bytestream( input_audio, args.step_size, args.lookahead_cycles, @@ -293,12 +343,12 @@ def main(): # TODO: don't bother computing if we're not writing wavs - # Keep accumulating as long as we have <10MB in the buffer, or are - # within 10MB from the end. This ensures we have enough samples to + # Keep accumulating as long as we have <1MB in the buffer, or are + # within 1MB from the end. This ensures we have enough samples to # resample including the last (partial) buffer - if len(output_buffer) < 10 * 1024 * 1024: + if len(output_buffer) < 1 * 1024 * 1024: continue - if (len(input_audio) - input_offset) < 10 * 1024 * 1024: + if (len(input_audio) - input_offset) < 1 * 1024 * 1024: continue resampled_output_buffer, resampled_noise_buffer = resample_output( output_buffer, input_audio[input_offset - len(output_buffer):], @@ -306,8 +356,10 @@ def main(): ) if args.wav_output: wav_f.write(resampled_output_buffer) + wav_f.flush() if args.noise_output: noise_f.write(resampled_noise_buffer) + noise_f.flush() output_buffer = [] diff --git a/opcodes.py b/opcodes.py index 9df6a73..6c97133 100644 --- a/opcodes.py +++ b/opcodes.py @@ -19,6 +19,8 @@ def voltage_schedule(op: player_op.PlayerOp) -> numpy.ndarray: """Returns the 65C02 applied voltage schedule of a player opcode.""" return op.toggles +FRAME_SIZE = 2048 + #@functools.lru_cache(None) def opcode_choices( @@ -30,15 +32,14 @@ def opcode_choices( good results, we'll pick the one with the longest cycle count to reduce the stream bitrate. """ - if frame_offset == 2046: + if frame_offset == FRAME_SIZE - 2: return opcodes_generated.EOF_STAGE_1_OPS - if frame_offset == 2047: + if frame_offset == FRAME_SIZE - 1: return opcodes_generated.EOF_STAGE_2_3_OPS[eof_stage_1_op] return sorted( list(opcodes_generated.AUDIO_OPS), key=cycle_length, reverse=True) - #@functools.lru_cache(None) def opcode_lookahead( frame_offset: int, @@ -56,13 +57,13 @@ def opcode_lookahead( ops.append((op,)) else: # XXX check this - if frame_offset == 2046 and eof_stage_1_op is None: + if frame_offset == FRAME_SIZE - 2 and eof_stage_1_op is None: temp_op = op else: temp_op = eof_stage_1_op for res in opcode_lookahead( - (frame_offset + 1) % 2048, + (frame_offset + 1) % FRAME_SIZE, lookahead_cycles - cycle_length(op), temp_op): ops.append((op,) + res) return tuple(ops) # TODO: fix return type @@ -110,7 +111,7 @@ def candidate_opcodes( pruned_cycles = [] for ops in opcodes: cycles = cycle_lookahead(ops, lookahead_cycles) - if frame_offset == 2046 and cycles in seen_cycles: + if cycles in seen_cycles: # print("Dropping", ops, cycles, seen_cycles[cycles]) continue seen_cycles[cycles] = ops