From cd17dce267b67987887f951319707ef0a114c39c Mon Sep 17 00:00:00 2001
From: kris <kris.kennaway@gmail.com>
Date: Thu, 14 Mar 2019 21:40:09 +0000
Subject: [PATCH] Normalize audio by sampling the first 10 MB of the audio
 stream and computing the 2.5th and 97.5th percentile values, i.e. so that
 no more than 2.5% of audio samples at either extreme will clip.

---
 audio.py | 42 +++++++++++++++++++++++++++++++++---------
 movie.py |  2 +-
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/audio.py b/audio.py
index bc49381..079f075 100644
--- a/audio.py
+++ b/audio.py
@@ -10,9 +10,8 @@ import video
 
 class Audio:
     def __init__(
-            self, filename: str, normalization: float = 1.0):
+            self, filename: str, normalization: float = None):
         self.filename = filename
-        self.normalization = normalization
 
         # TODO: take into account that the available range is slightly offset
         # as fraction of total cycle count?
@@ -22,16 +21,41 @@ class Audio:
         # TODO: round to divisor of video frame rate
         self.sample_rate = 14340  # int(1024. * 1024 / self.cycles_per_tick)
 
+        self.normalization = normalization or self._normalization()
+        print(self.normalization)
+
+    def _decode(self, f, buf) -> np.ndarray:
+        """Return buf (interleaved int16 PCM read from f) as mono audio
+        resampled from f.samplerate to self.sample_rate."""
+        data = np.frombuffer(buf, dtype='int16').astype(
+            'float32').reshape((f.channels, -1), order='F')
+        # Keyword rates: newer librosa releases reject positional ones.
+        a = librosa.resample(librosa.core.to_mono(data),
+                             orig_sr=f.samplerate, target_sr=self.sample_rate)
+        return a.flatten()
+
+    def _normalization(self, read_bytes=1024*1024*10):
+        """Compute a gain from roughly the first read_bytes of decoded audio.
+
+        Returns 16384 / p, where p is the larger magnitude of the 2.5th and
+        97.5th sample percentiles, so at most 2.5% of samples in each tail
+        exceed the target level.  Raises ValueError if p is not positive."""
+        raw = bytearray()
+        with audioread.audio_open(self.filename) as f:
+            for buf in f.read_data():
+                raw.extend(buf)
+                if len(raw) > read_bytes:
+                    break
+            a = self._decode(f, raw)  # decode while the stream is open
+        norm = np.max(np.abs(np.percentile(a, [2.5, 97.5])))
+        if not norm > 0:  # also rejects NaN; unlike assert, survives -O
+            raise ValueError("cannot normalize: sampled audio is silent")
+        return 16384. / norm
+
     def audio_stream(self):
         with audioread.audio_open(self.filename) as f:
             for buf in f.read_data(128 * 1024):
-
-                data = np.frombuffer(buf, dtype='int16').astype(
-                    'float32').reshape((f.channels, -1), order='F')
-
-                a = librosa.core.to_mono(data)
-                a = librosa.resample(a, f.samplerate,
-                                     self.sample_rate).flatten()
+                a = self._decode(f, buf)
 
                 a /= 16384  # normalize to -1.0 .. 1.0
                 a *= self.normalization
diff --git a/movie.py b/movie.py
index 820be92..0be899c 100644
--- a/movie.py
+++ b/movie.py
@@ -8,7 +8,7 @@ import video
 
 
 class Movie:
-    def __init__(self, filename: str, audio_normalization: float = 1.0):
+    def __init__(self, filename: str, audio_normalization: float = None):
         self.filename = filename  # type: str
         self.audio = audio.Audio(
             filename, normalization=audio_normalization)  # type: audio.Audio