ii-sound/encode_audio.py

#!/usr/bin/env python3
# Delta modulation audio encoder for playback via Uthernet II streaming.
#
# Simulates the Apple II speaker at 1MHz (i.e. cycle-level) resolution,
# by modeling it as a damped harmonic oscillator.
#
# On the Apple II side we use an audio player that is able to toggle the
# speaker with 1MHz precision, i.e. on any CPU clock cycle, although with a
# lower limit of 10 cycles between toggles (i.e. 102KHz maximum
# frequency).
#
# In order to reproduce a target audio waveform, we upscale it to 1MHz sample
# rate, i.e. to determine the desired speaker position at every CPU clock
# cycle, and compute the sequence of player operations on the Apple II side to
# best reproduce this waveform.
#
# This means that we are able to control the Apple II speaker with cycle-level
# precision, which results in high audio fidelity with low noise.
#
# To further optimize the audio quality we look ahead some defined number of
# cycles and choose a speaker trajectory that minimizes errors over this range.
# e.g. this allows us to anticipate large amplitude changes by pre-moving
# the speaker to better approximate them.
#
# This also needs to take into account scheduling the "end of frame" opcode
# every 2048 output bytes, where the Apple II will manage the TCP socket buffer
# while ticking the speaker at a regular (a, b) cadence to attempt to
# continue tracking the waveform as best we can.  Since we are stepping away
# from cycle-level management of the speaker during this period, it does
# introduce some quality degradation (manifesting as a slight background
# "crackle" to the audio)

import argparse
import collections
import contextlib
import functools
import librosa
import numpy
import soundfile as sf
from eta import ETA
from typing import Tuple

import lookahead
import opcodes
import opcodes_generated

# How many bytes to use per frame in the audio stream.  At the end of each
# frame we need to switch to special end-of-frame operations to cause the
# Apple II to manage the TCP socket buffers (ACK data received so far, and
# check we have at least another frame of data available)
#
# With an 8KB socket buffer this seems to be about the maximum we can get away
# with - it has to be page aligned, and 4KB causes stuttering even from a
# local playback source.
FRAME_SIZE = 2048


class Speaker:
    """Simulates the response of the Apple II speaker."""

    # TODO: move lookahead.evolve into Speaker method

    def __init__(self, sample_rate: float, freq: float, damping: float,
                 scale: float):
        """Initialize the Speaker object

        :arg sample_rate The sample rate of the simulated speaker (Hz)
        :arg freq The resonant frequency of the speaker
        :arg damping The exponential decay factor of the speaker response
        :arg scale Scale factor to normalize speaker position to desired range
        """
        self.sample_rate = sample_rate
        self.freq = freq
        self.damping = damping
        self.scale = numpy.float64(scale)  # TODO: analytic expression

        # See _Signal Processing in C_, C. Reid, T. Passin
        # https://archive.org/details/signalprocessing0000reid/
        dt = numpy.float64(1 / sample_rate)
        w = numpy.float64(freq * 2 * numpy.pi * dt)

        d = damping * dt
        e = numpy.exp(d)
        c1 = 2 * e * numpy.cos(w)
        c2 = e * e

        # Square wave impulse response parameters
        b2 = 0.0
        b1 = 1.0

        self.c1 = c1
        self.c2 = c2
        self.b1 = b1
        self.b2 = b2

def total_error(positions: numpy.ndarray, data: numpy.ndarray) -> numpy.ndarray:
    """Computes the total squared error for speaker position matrix vs data."""

    # Deal gracefully with the case where our speaker operation would slightly
    # run past the end of data
    min_len = min(len(positions), len(data))
    return numpy.sum(numpy.square(positions[:min_len] - data[:min_len]),
                     axis=-1)


@functools.lru_cache(None)
def frame_horizon(frame_offset: int, lookahead_steps: int):
    """Optimize frame_offset when more than lookahead_steps from end of frame.

    Candidate opcodes for all values of frame_offset are equal, until the
    end-of-frame opcode comes within our lookahead horizon.  This avoids
    needing to recompute many copies of the same candidate opcodes.

    TODO: this is a bit of a hack and we should be able to make the candidate
      opcode selection itself smarter about avoiding duplicate work
    """
    # TODO: This could be made tighter because a step is always at least 5
    #  cycles towards lookahead_steps.
    if frame_offset < (FRAME_SIZE - lookahead_steps):
        return 0
    return frame_offset


def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int,
                     sample_rate: int):
    """Computes optimal sequence of player opcodes to reproduce audio data."""
    # At resonance freq the scale is about 22400 but we can only access about 7%
    # of it across the frequency range.  This is also the equilibrium speaker
    # position when voltage is held constant. Normalize to this working
    # range for convenience.
    inv_scale = 22400 * 0.07759626164027278  # XXX

    # Speaker response parameters can be fitted in several ways.  First, by
    # recording audio of
    # - a single speaker click (impulse response)
    # - a frequency sweep over the entire audible spectrum
    #
    # Resonant frequency can be read off from the frequency spectrum.  For my
    # speaker there were two primary frequencies, at ~3875 and ~480Hz.
    # Looking at the click waveform the higher frequency mode dominates at
    # short time scales, and the lower frequency mode dominates at late
    # times.  Since we're interested in short timescale speaker control we
    # can ignore the latter.
    #
    # Damping factor can be estimated by fitting against the speaker click
    # waveform or the spectrum, e.g. by simulating a speaker click and then
    # computing its spectrum.
    #
    # TODO: other Apple II speakers almost certainly have different response
    #  characteristics, but hopefully not too widely varying.
    sp = Speaker(sample_rate, freq=3875, damping=-1210, scale=1 / inv_scale)

    # Starting speaker applied voltage.
    voltage1 = voltage2 = 1.0

    # last 2 speaker positions.
    # XXX 0.0?
    y1 = y2 = 1.0

    # Leave enough padding at the end to look ahead from the last data value,
    # and in case we schedule an end-of-frame opcode towards the end.
    # TODO: avoid temporarily doubling memory footprint to concatenate
    data = numpy.ascontiguousarray(numpy.concatenate(
        [data, numpy.zeros(max(lookahead_steps, opcodes.cycle_length(
            opcodes_generated.PlayerOps.TICK_00)), dtype=numpy.float32)]))

    # index within input audio data
    i = 0
    # Position in 2048-byte TCP frame
    frame_offset = 0

    # Total squared error of audio output
    total_err = 0.0

    # Keep track of how many large deviations we see from the target waveform
    # as another measure of audio quality
    clicks = 0
    # total squared error threshold to consider an operation as producing a
    # click
    click_threshold = 0.3

    # Keep track of how many opcodes we schedule, so we can print summary
    # statistics at the end
    opcode_counts = collections.defaultdict(int)

    # Always look ahead at least this many cycles.  We pick a higher value
    # when approaching end of frame, so we can maximize the
    min_lookahead_steps = lookahead_steps

    # Progress tracker
    # TODO: find a more adaptive ETA tracker, this one doesn't estimate well
    #   if the processing rate changes (which it does for us, since first few
    #   steps do extra work to precompute values that are cached for later
    #   steps)
    eta = ETA(total=1000, min_ms_between_updates=0)
    # Value of i at which we should next update eta
    next_eta_tick = 0

    while i < len(data):
        if i >= next_eta_tick:
            eta.print_status()
            next_eta_tick = int(eta.i * len(data) / 1000)

        if frame_offset >= (FRAME_SIZE - 5):  # XXX
            lookahead_steps = min_lookahead_steps + 130  # XXX parametrize
        else:
            lookahead_steps = min_lookahead_steps

        # The EOF opcodes need to act as a matched pair, so if we're emitting
        # the second one we need to pass in its predecessor
        last_opcode = opcode if frame_offset == FRAME_SIZE - 1 else None

        # Compute all possible opcode sequences we could emit starting at this
        # frame offset
        next_candidate_opcodes, voltages, lookahead_steps = \
            opcodes.candidate_opcodes(
                frame_horizon(frame_offset, lookahead_steps),
                lookahead_steps, last_opcode)

        # Simulate speaker trajectory for all of these  candidate opcode
        # sequences and pick the one that minimizes total error
        opcode_idx = lookahead.evolve_return_best(
            sp, y1, y2, voltage1, voltage2, voltage1 * voltages,
            data[i:i + lookahead_steps])
        opcode = next_candidate_opcodes[opcode_idx]
        opcode_length = opcodes.cycle_length(opcode)
        opcode_counts[opcode] += 1

        # Apply this opcode to evolve the speaker position
        opcode_voltages = (voltage1 * opcode.voltages).reshape((1, -1))
        all_positions = lookahead.evolve(
            sp, y1, y2, voltage1, voltage2, opcode_voltages)

        assert all_positions.shape[0] == 1
        assert all_positions.shape[1] == opcode_length

        # Update to new speaker state
        voltage1 = opcode_voltages[0, -1]
        voltage2 = opcode_voltages[0, -2]
        y1 = all_positions[0, -1]
        y2 = all_positions[0, -2]

        # Track accumulated error between desired and actual speaker trajectory
        new_error = total_error(
            all_positions[0] * sp.scale, data[i:i + opcode_length]).item()
        total_err += new_error
        if new_error > click_threshold:
            clicks += 1
            # XXX
            print(frame_offset, i / sample_rate, opcode, new_error,
                  numpy.mean(data[i:i + opcode_length]))

        # Emit chosen operation and simulated audio samples for recording
        yield opcode, numpy.array(
            all_positions * sp.scale, dtype=numpy.float32).reshape(-1)

        # Update input and output stream positions
        i += opcode_length
        frame_offset = (frame_offset + 1) % FRAME_SIZE

    # Make sure we have at least 2k left in stream so the player will do a
    # complete read of the last frame.
    # for _ in range(frame_offset % 2048, 2048):
    #     yield opcodes.Opcode.EXIT
    eta.done()

    # Print summary statistics
    print("Total error %f" % total_err)
    print("%d clicks" % clicks)
    print("Opcodes used:")
    for v, k in sorted(list(opcode_counts.items()), key=lambda kv: kv[1],
                       reverse=True):
        print("%s: %d" % (v, k))


def preprocess_audio(
        filename: str, target_sample_rate: int, normalize: float,
        normalization_percentile: int) -> numpy.ndarray:
    """Upscale input audio to target sample rate and normalize signal."""

    data, _ = librosa.load(filename, sr=target_sample_rate, mono=True)

    max_value = numpy.percentile(data, normalization_percentile)
    data /= max_value
    data *= normalize

    return data


def downsample_audio(simulated_audio, original_audio, input_rate, output_rate,
                     noise_output=False):
    """Downscale the 1MHz simulated audio output suitable for writing as .wav

    :arg simulated_audio The simulated audio data to downsample
    :arg original_audio The original audio data that was simulated
    :arg input_rate Sample rate of input audio
    :arg output_rate Desired sample rate of output audio
    :arg noise_output Whether to also produce a noise waveform, i.e. difference
      between input and output audio

    :returns Tuple of downsampled audio and noise data (or None
    if noise_output==False)
    """
    downsampled_output = librosa.resample(
        numpy.array(simulated_audio, dtype=numpy.float32),
        orig_sr=input_rate,
        target_sr=output_rate)

    downsampled_noise = None
    if noise_output:
        noise_len = min(len(simulated_audio), len(original_audio))
        downsampled_noise = librosa.resample(
            numpy.array(
                simulated_audio[:noise_len] - original_audio[:noise_len]),
            orig_sr=input_rate,
            target_sr=output_rate)

    return downsampled_output, downsampled_noise


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--clock", choices=['pal', 'ntsc'],
                        help="Whether target machine clock speed is PAL ("
                             "1015657Hz) or NTSC (1020484)",
                        required=True)
    # TODO: implement 6502 - JMP indirect takes 5 cycles instead of 6
    parser.add_argument("--step_size", type=int,
                        help="Delta encoding step size")
    # TODO: if we're not looking ahead beyond the longest (non-end-of-frame)
    #  opcode then this will reduce quality, e.g. two opcodes may truncate to
    #  the same prefix, but have different results when we apply them
    #  fully.
    parser.add_argument("--lookahead_cycles", type=int,
                        help="Number of clock cycles to look ahead in audio "
                             "stream.")
    parser.add_argument("--normalization", default=0.8, type=float,
                        help="Overall multiplier to rescale input audio "
                             "values.")
    parser.add_argument("--norm_percentile", default=100,
                        help="Normalize to specified percentile value of input "
                             "audio")
    parser.add_argument("--wav_output", type=str, help="output audio file")
    parser.add_argument("--noise_output", type=str, help="output audio file")
    parser.add_argument("input", type=str, help="input audio file to convert")
    parser.add_argument("output", type=str, help="output audio file")
    args = parser.parse_args()

    # Effective clock rate, including every-65 cycle "long cycle" that takes
    # 16/14 as long.
    cpu_clock_rate = 1015657 if args.clock == 'pal' else 1020484  # NTSC

    input_audio = preprocess_audio(
        args.input, cpu_clock_rate, args.normalization, args.norm_percentile)
    print("Done preprocessing audio")

    # Sample rate for output .wav files
    # TODO: flag
    output_rate = 44100

    # Buffers simulated audio output so we can downsample it in suitably
    # large chunks for writing to the output .wav file
    output_buffer = []

    # Python contexts for writing output files if requested
    opcode_context = open(args.output, "wb+")
    if args.wav_output:
        wav_context = sf.SoundFile(
            args.wav_output, "w", output_rate, channels=1, format='WAV')
    else:
        # We're not creating a file but still need a context
        # XXX does this work?
        wav_context = contextlib.nullcontext
    if args.noise_output:
        noise_context = sf.SoundFile(
            args.noise_output, "w", output_rate, channels=1,
            format='WAV')
    else:
        # We're not creating a file but still need a context
        noise_context = contextlib.nullcontext

    with wav_context as wav_f, noise_context as noise_f, opcode_context \
            as opcode_f:
        # Tracks current position in input audio waveform
        input_offset = 0

        # Process input audio, writing output to ][-Sound audio file
        # and (if requested) .wav files of simulated speaker audio and
        # noise (difference between original and simulated audio)
        for idx, sample_data in enumerate(audio_bytestream(
                input_audio, args.step_size, args.lookahead_cycles,
                cpu_clock_rate)):
            opcode, samples = sample_data
            opcode_f.write(bytes([opcode.byte]))

            output_buffer.extend(samples)
            input_offset += len(samples)
            # Keep accumulating as long as we have <1MB in the buffer, or are
            # within 1MB from the end.  This ensures we have enough samples to
            # downsample, including the last (partial) buffer.
            if (
                    len(output_buffer) < 1 * 1024 * 1024 or (
                    len(input_audio) - input_offset) < 1 * 1024 * 1024
            ):
                continue

            # TODO: don't bother computing if we're not writing wavs
            downsampled_audio, downsampled_noise = downsample_audio(
                output_buffer, input_audio[input_offset - len(output_buffer):],
                cpu_clock_rate, output_rate, bool(args.noise_output)
            )
            if args.wav_output:
                wav_f.write(downsampled_audio)
                wav_f.flush()
            if args.noise_output:
                noise_f.write(downsampled_noise)
                noise_f.flush()

            output_buffer = []

        # TODO: handle last buffer more cleanly than duplicating this code
        if output_buffer:
            downsampled_audio, downsampled_noise = downsample_audio(
                output_buffer, input_audio[input_offset - len(output_buffer):],
                cpu_clock_rate, output_rate, bool(args.noise_output)
            )
            if args.wav_output:
                wav_f.write(downsampled_audio)
            if args.noise_output:
                noise_f.write(downsampled_noise)


if __name__ == "__main__":
    main()