Normalize audio by sampling the first 10 MB of the audio stream and

computing the 2.5th and 97.5th percentile values, i.e. so that only a small fraction of audio samples (at most about 5%: up to 2.5% in each tail) will clip.
2025-03-11 04:37:03 +00:00 · 2019-03-14 21:40:09 +00:00 · 2019-03-14 21:40:09 +00:00 · cd17dce267
commit cd17dce267
parent 2d410a4b13
2 changed files with 34 additions and 10 deletions
--- a/audio.py
+++ b/audio.py
@ -10,9 +10,8 @@ import video

 class Audio:
    def __init__(
-            self, filename: str, normalization: float = 1.0):
+            self, filename: str, normalization: float = None):
        self.filename = filename
-        self.normalization = normalization

        # TODO: take into account that the available range is slightly offset
        # as fraction of total cycle count?
@ -22,16 +21,41 @@ class Audio:
        # TODO: round to divisor of video frame rate
        self.sample_rate = 14340  # int(1024. * 1024 / self.cycles_per_tick)

+        self.normalization = normalization or self._normalization()
+        print(self.normalization)
+
+    def _decode(self, f, buf) -> np.array:
+        data = np.frombuffer(buf, dtype='int16').astype(
+            'float32').reshape((f.channels, -1), order='F')
+
+        a = librosa.core.to_mono(data)
+        a = librosa.resample(a, f.samplerate,
+                             self.sample_rate).flatten()
+
+        return a
+
+    def _normalization(self, read_bytes=1024*1024*10):
+        """Read first read_bytes of audio stream and compute normalization.
+
+        We compute the 2.5th and 97.5th percentiles i.e. only 2.5% of samples
+        will clip.
+        """
+        raw = bytearray()
+        with audioread.audio_open(self.filename) as f:
+            for buf in f.read_data():
+                raw.extend(bytearray(buf))
+                if len(raw) > read_bytes:
+                    break
+        a = self._decode(f, raw)
+        norm = np.max(np.abs(np.percentile(a, [2.5, 97.5])))
+        assert norm
+
+        return 16384. / norm
+
    def audio_stream(self):
        with audioread.audio_open(self.filename) as f:
            for buf in f.read_data(128 * 1024):
-
-                data = np.frombuffer(buf, dtype='int16').astype(
-                    'float32').reshape((f.channels, -1), order='F')
-
-                a = librosa.core.to_mono(data)
-                a = librosa.resample(a, f.samplerate,
-                                     self.sample_rate).flatten()
+                a = self._decode(f, buf)

                a /= 16384  # normalize to -1.0 .. 1.0
                a *= self.normalization
--- a/movie.py
+++ b/movie.py
@ -8,7 +8,7 @@ import video


 class Movie:
-    def __init__(self, filename: str, audio_normalization: float = 1.0):
+    def __init__(self, filename: str, audio_normalization: float = None):
        self.filename = filename  # type: str
        self.audio = audio.Audio(
            filename, normalization=audio_normalization)  # type: audio.Audio