#!/usr/bin/env python3
# Delta modulation audio encoder for playback via Uthernet II streaming.
#
# Simulates the Apple II speaker at 1MHz (i.e. cycle-level) resolution by
# modeling it as a damped harmonic oscillator.
#
# On the Apple II side we use an audio player that is able to toggle the
# speaker with 1MHz precision, i.e. on any CPU clock cycle, although with a
# lower limit of 10 cycles between toggles (i.e. a maximum toggle rate of
# about 102KHz).
#
# To reproduce a target audio waveform, we upsample it to a 1MHz sample rate,
# i.e. we determine the desired speaker position at every CPU clock cycle,
# and then compute the sequence of player operations on the Apple II side
# that best reproduces this waveform.
#
# This means that we are able to control the Apple II speaker with cycle-level
# precision, which results in high audio fidelity with low noise.
#
# To further optimize the audio quality we look ahead some configured number
# of cycles and choose the speaker trajectory that minimizes error over this
# range. For example, this allows us to anticipate large amplitude changes by
# pre-moving the speaker to better approximate them.
#
# The encoder also needs to schedule an "end of frame" opcode every 2048
# output bytes. During this opcode the Apple II manages the TCP socket buffer
# while ticking the speaker at a regular (a, b) cadence, attempting to keep
# tracking the waveform as best it can. Since we step away from cycle-level
# management of the speaker during this period, it introduces some quality
# degradation (manifesting as a slight background "crackle" in the audio).
import argparse
import collections
import contextlib
import functools

import librosa
import numpy
import soundfile as sf
from eta import ETA

import lookahead
import opcodes
import opcodes_generated

# How many bytes to use per frame in the audio stream. At the end of each
# frame we need to switch to special end-of-frame operations to cause the
# Apple II to manage the TCP socket buffers (ACK data received so far, and
# check we have at least another frame of data available)
#
# With an 8KB socket buffer this seems to be about the maximum we can get away
# with - it has to be page aligned, and 4KB causes stuttering even from a
# local playback source.
FRAME_SIZE = 2048


class Speaker:
"""Simulates the response of the Apple II speaker."""
# TODO: move lookahead.evolve into Speaker method
def __init__(self, sample_rate: float, freq: float, damping: float,
scale: float):
"""Initialize the Speaker object
:arg sample_rate The sample rate of the simulated speaker (Hz)
:arg freq The resonant frequency of the speaker
:arg damping The exponential decay factor of the speaker response
:arg scale Scale factor to normalize speaker position to desired range
"""
self.sample_rate = sample_rate
self.freq = freq
self.damping = damping
self.scale = numpy.float64(scale) # TODO: analytic expression
# See _Signal Processing in C_, C. Reid, T. Passin
# https://archive.org/details/signalprocessing0000reid/
dt = numpy.float64(1 / sample_rate)
w = numpy.float64(freq * 2 * numpy.pi * dt)
d = damping * dt
e = numpy.exp(d)
c1 = 2 * e * numpy.cos(w)
c2 = e * e
# Square wave impulse response parameters
b2 = 0.0
b1 = 1.0
self.c1 = c1
self.c2 = c2
self.b1 = b1
self.b2 = b2
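
    # A minimal sketch of the single-step update this filter implements, per
    # the TODO above about folding lookahead.evolve into this class. This
    # method is illustrative only (hypothetical; the encoder calls the
    # vectorized lookahead.evolve instead) and assumes the standard two-pole
    # recurrence y[n] = c1*y[n-1] - c2*y[n-2] + b1*x[n-1] - b2*x[n-2], where
    # x is the applied voltage; with b1=1, b2=0 the older voltage term drops
    # out.
    def step(self, y1: float, y2: float, voltage: float) -> float:
        """Sketch: advance the speaker position by one 1MHz sample.

        y1, y2 are the last two (unscaled) speaker positions and voltage is
        the most recently applied speaker voltage (+1.0 or -1.0).
        """
        return self.c1 * y1 - self.c2 * y2 + self.b1 * voltage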


def total_error(positions: numpy.ndarray, data: numpy.ndarray) -> numpy.ndarray:
"""Computes the total squared error for speaker position matrix vs data."""
# Deal gracefully with the case where our speaker operation would slightly
# run past the end of data
min_len = min(len(positions), len(data))
return numpy.sum(numpy.square(positions[:min_len] - data[:min_len]),
axis=-1)


@functools.lru_cache(None)
def frame_horizon(frame_offset: int, lookahead_steps: int):
    """Collapses frame_offset when far enough from the end of the frame.

    Candidate opcodes are identical for all values of frame_offset until the
    end-of-frame opcode comes within our lookahead horizon, so those offsets
    can share a single memoized value. This avoids needing to recompute many
    copies of the same candidate opcodes.

    TODO: this is a bit of a hack and we should be able to make the candidate
    opcode selection itself smarter about avoiding duplicate work.
    """
# TODO: This could be made tighter because a step is always at least 5
# cycles towards lookahead_steps.
if frame_offset < (FRAME_SIZE - lookahead_steps):
return 0
return frame_offset
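
# For example (hypothetical values): with lookahead_steps=20,
# frame_horizon(0, 20) == frame_horizon(1000, 20) == 0, so their memoized
# candidate opcode computations are shared, whereas frame_horizon(2040, 20)
# returns 2040 because the end-of-frame opcode then falls within the horizon.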


def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int,
                     sample_rate: int):
"""Computes optimal sequence of player opcodes to reproduce audio data."""
    # At the resonant frequency the scale factor is about 22400, but we can
    # only access about 7% of it across the frequency range. This is also
    # the equilibrium speaker position when voltage is held constant.
    # Normalize to this working range for convenience.
    inv_scale = 22400 * 0.07759626164027278  # XXX
# Speaker response parameters can be fitted in several ways. First, by
# recording audio of
# - a single speaker click (impulse response)
# - a frequency sweep over the entire audible spectrum
#
# Resonant frequency can be read off from the frequency spectrum. For my
# speaker there were two primary frequencies, at ~3875 and ~480Hz.
# Looking at the click waveform the higher frequency mode dominates at
# short time scales, and the lower frequency mode dominates at late
# times. Since we're interested in short timescale speaker control we
# can ignore the latter.
#
# Damping factor can be estimated by fitting against the speaker click
# waveform or the spectrum, e.g. by simulating a speaker click and then
# computing its spectrum.
#
# TODO: other Apple II speakers almost certainly have different response
# characteristics, but hopefully not too widely varying.
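    #
    # A minimal sketch (offline analysis, not part of this encoder) of how
    # the resonant frequency could be read off a recorded click, assuming a
    # mono recording "click.wav":
    #
    #   click, rate = sf.read("click.wav")
    #   spectrum = numpy.abs(numpy.fft.rfft(click))
    #   freqs = numpy.fft.rfftfreq(len(click), d=1 / rate)
    #   print(freqs[numpy.argmax(spectrum)])  # dominant peak, ~3875Hz here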
sp = Speaker(sample_rate, freq=3875, damping=-1210, scale=1 / inv_scale)
# Starting speaker applied voltage.
voltage1 = voltage2 = 1.0
# last 2 speaker positions.
# XXX 0.0?
y1 = y2 = 1.0
# Leave enough padding at the end to look ahead from the last data value,
# and in case we schedule an end-of-frame opcode towards the end.
# TODO: avoid temporarily doubling memory footprint to concatenate
data = numpy.ascontiguousarray(numpy.concatenate(
[data, numpy.zeros(max(lookahead_steps, opcodes.cycle_length(
opcodes_generated.PlayerOps.TICK_00)), dtype=numpy.float32)]))
# index within input audio data
i = 0
# Position in 2048-byte TCP frame
frame_offset = 0
# Total squared error of audio output
total_err = 0.0
# Keep track of how many large deviations we see from the target waveform
# as another measure of audio quality
clicks = 0
# total squared error threshold to consider an operation as producing a
# click
click_threshold = 0.3
# Keep track of how many opcodes we schedule, so we can print summary
# statistics at the end
opcode_counts = collections.defaultdict(int)
    # Always look ahead at least this many cycles. We pick a higher value
    # when approaching the end of a frame, so that the lookahead horizon
    # also covers the mandatory end-of-frame opcodes.
    min_lookahead_steps = lookahead_steps
# Progress tracker
# TODO: find a more adaptive ETA tracker, this one doesn't estimate well
# if the processing rate changes (which it does for us, since first few
# steps do extra work to precompute values that are cached for later
# steps)
eta = ETA(total=1000, min_ms_between_updates=0)
# Value of i at which we should next update eta
next_eta_tick = 0
while i < len(data):
if i >= next_eta_tick:
eta.print_status()
next_eta_tick = int(eta.i * len(data) / 1000)
if frame_offset >= (FRAME_SIZE - 5): # XXX
lookahead_steps = min_lookahead_steps + 130 # XXX parametrize
else:
lookahead_steps = min_lookahead_steps
# The EOF opcodes need to act as a matched pair, so if we're emitting
# the second one we need to pass in its predecessor
last_opcode = opcode if frame_offset == FRAME_SIZE - 1 else None
# Compute all possible opcode sequences we could emit starting at this
# frame offset
next_candidate_opcodes, voltages, lookahead_steps = \
opcodes.candidate_opcodes(
frame_horizon(frame_offset, lookahead_steps),
lookahead_steps, last_opcode)
# Simulate speaker trajectory for all of these candidate opcode
# sequences and pick the one that minimizes total error
opcode_idx = lookahead.evolve_return_best(
sp, y1, y2, voltage1, voltage2, voltage1 * voltages,
data[i:i + lookahead_steps])
opcode = next_candidate_opcodes[opcode_idx]
opcode_length = opcodes.cycle_length(opcode)
opcode_counts[opcode] += 1
# Apply this opcode to evolve the speaker position
opcode_voltages = (voltage1 * opcode.voltages).reshape((1, -1))
all_positions = lookahead.evolve(
sp, y1, y2, voltage1, voltage2, opcode_voltages)
assert all_positions.shape[0] == 1
assert all_positions.shape[1] == opcode_length
# Update to new speaker state
voltage1 = opcode_voltages[0, -1]
voltage2 = opcode_voltages[0, -2]
y1 = all_positions[0, -1]
y2 = all_positions[0, -2]
# Track accumulated error between desired and actual speaker trajectory
new_error = total_error(
all_positions[0] * sp.scale, data[i:i + opcode_length]).item()
total_err += new_error
if new_error > click_threshold:
clicks += 1
            # Log the click for debugging
            print(frame_offset, i / sample_rate, opcode, new_error,
                  numpy.mean(data[i:i + opcode_length]))
# Emit chosen operation and simulated audio samples for recording
yield opcode, numpy.array(
all_positions * sp.scale, dtype=numpy.float32).reshape(-1)
# Update input and output stream positions
i += opcode_length
frame_offset = (frame_offset + 1) % FRAME_SIZE
# Make sure we have at least 2k left in stream so the player will do a
# complete read of the last frame.
# for _ in range(frame_offset % 2048, 2048):
# yield opcodes.Opcode.EXIT
eta.done()
# Print summary statistics
print("Total error %f" % total_err)
print("%d clicks" % clicks)
print("Opcodes used:")
    for op, count in sorted(opcode_counts.items(), key=lambda kv: kv[1],
                            reverse=True):
        print("%s: %d" % (op, count))


def preprocess_audio(
        filename: str, target_sample_rate: int, normalize: float,
        normalization_percentile: int) -> numpy.ndarray:
"""Upscale input audio to target sample rate and normalize signal."""
data, _ = librosa.load(filename, sr=target_sample_rate, mono=True)
max_value = numpy.percentile(data, normalization_percentile)
data /= max_value
data *= normalize
return data
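
# For example (hypothetical invocation): preprocess_audio("in.wav", 1020484,
# 0.8, 100) resamples the input to the NTSC clock rate and rescales it so the
# 100th-percentile (i.e. largest) sample value maps to 0.8, leaving some
# headroom below full scale.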


def downsample_audio(simulated_audio, original_audio, input_rate, output_rate,
                     noise_output=False):
"""Downscale the 1MHz simulated audio output suitable for writing as .wav
:arg simulated_audio The simulated audio data to downsample
:arg original_audio The original audio data that was simulated
:arg input_rate Sample rate of input audio
:arg output_rate Desired sample rate of output audio
:arg noise_output Whether to also produce a noise waveform, i.e. difference
between input and output audio
:returns Tuple of downsampled audio and noise data (or None
if noise_output==False)
"""
downsampled_output = librosa.resample(
numpy.array(simulated_audio, dtype=numpy.float32),
orig_sr=input_rate,
target_sr=output_rate)
downsampled_noise = None
if noise_output:
noise_len = min(len(simulated_audio), len(original_audio))
downsampled_noise = librosa.resample(
numpy.array(
simulated_audio[:noise_len] - original_audio[:noise_len]),
orig_sr=input_rate,
target_sr=output_rate)
return downsampled_output, downsampled_noise
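
# For example (hypothetical sizes): a ~1M-sample chunk of simulated audio at
# the ~1.02MHz NTSC clock rate downsamples to roughly 43K samples at 44.1kHz.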


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--clock", choices=['pal', 'ntsc'],
help="Whether target machine clock speed is PAL ("
"1015657Hz) or NTSC (1020484)",
required=True)
# TODO: implement 6502 - JMP indirect takes 5 cycles instead of 6
parser.add_argument("--step_size", type=int,
help="Delta encoding step size")
# TODO: if we're not looking ahead beyond the longest (non-end-of-frame)
# opcode then this will reduce quality, e.g. two opcodes may truncate to
# the same prefix, but have different results when we apply them
# fully.
parser.add_argument("--lookahead_cycles", type=int,
help="Number of clock cycles to look ahead in audio "
"stream.")
parser.add_argument("--normalization", default=0.8, type=float,
help="Overall multiplier to rescale input audio "
"values.")
parser.add_argument("--norm_percentile", default=100,
help="Normalize to specified percentile value of input "
"audio")
parser.add_argument("--wav_output", type=str, help="output audio file")
parser.add_argument("--noise_output", type=str, help="output audio file")
parser.add_argument("input", type=str, help="input audio file to convert")
parser.add_argument("output", type=str, help="output audio file")
args = parser.parse_args()
    # Effective clock rate, accounting for the every-65th-cycle "long cycle"
    # that takes 16/14 as long as a regular cycle.
    cpu_clock_rate = 1015657 if args.clock == 'pal' else 1020484  # NTSC
input_audio = preprocess_audio(
args.input, cpu_clock_rate, args.normalization, args.norm_percentile)
print("Done preprocessing audio")
# Sample rate for output .wav files
# TODO: flag
output_rate = 44100
# Buffers simulated audio output so we can downsample it in suitably
# large chunks for writing to the output .wav file
output_buffer = []
    # Python contexts for writing output files if requested
    opcode_context = open(args.output, "wb+")
    if args.wav_output:
        wav_context = sf.SoundFile(
            args.wav_output, "w", output_rate, channels=1, format='WAV')
    else:
        # We're not creating a file but still need a no-op context
        wav_context = contextlib.nullcontext()
    if args.noise_output:
        noise_context = sf.SoundFile(
            args.noise_output, "w", output_rate, channels=1,
            format='WAV')
    else:
        # We're not creating a file but still need a no-op context
        noise_context = contextlib.nullcontext()
with wav_context as wav_f, noise_context as noise_f, opcode_context \
as opcode_f:
# Tracks current position in input audio waveform
input_offset = 0
# Process input audio, writing output to ][-Sound audio file
# and (if requested) .wav files of simulated speaker audio and
# noise (difference between original and simulated audio)
for idx, sample_data in enumerate(audio_bytestream(
input_audio, args.step_size, args.lookahead_cycles,
cpu_clock_rate)):
opcode, samples = sample_data
opcode_f.write(bytes([opcode.byte]))
output_buffer.extend(samples)
input_offset += len(samples)
            # Keep accumulating as long as we have <1M samples in the buffer,
            # or are within 1M samples of the end of the input. This ensures
            # each downsampling chunk is large enough, and that the last
            # (partial) buffer is handled after the loop.
if (
len(output_buffer) < 1 * 1024 * 1024 or (
len(input_audio) - input_offset) < 1 * 1024 * 1024
):
continue
# TODO: don't bother computing if we're not writing wavs
downsampled_audio, downsampled_noise = downsample_audio(
output_buffer, input_audio[input_offset - len(output_buffer):],
cpu_clock_rate, output_rate, bool(args.noise_output)
)
if args.wav_output:
wav_f.write(downsampled_audio)
wav_f.flush()
if args.noise_output:
noise_f.write(downsampled_noise)
noise_f.flush()
output_buffer = []
# TODO: handle last buffer more cleanly than duplicating this code
if output_buffer:
downsampled_audio, downsampled_noise = downsample_audio(
output_buffer, input_audio[input_offset - len(output_buffer):],
cpu_clock_rate, output_rate, bool(args.noise_output)
)
if args.wav_output:
wav_f.write(downsampled_audio)
if args.noise_output:
noise_f.write(downsampled_noise)
if __name__ == "__main__":
main()