mirror of
https://github.com/KrisKennaway/ii-vision.git
synced 2024-12-21 05:30:20 +00:00
Normalize audio by reading the first 10 MiB of the audio stream and
computing the 2.5th and 97.5th percentile values, i.e. so that at most 2.5% of audio samples will clip at each extreme.
This commit is contained in:
parent
2d410a4b13
commit
cd17dce267
42
audio.py
42
audio.py
@ -10,9 +10,8 @@ import video
|
||||
|
||||
class Audio:
|
||||
def __init__(
|
||||
self, filename: str, normalization: float = 1.0):
|
||||
self, filename: str, normalization: float = None):
|
||||
self.filename = filename
|
||||
self.normalization = normalization
|
||||
|
||||
# TODO: take into account that the available range is slightly offset
|
||||
# as fraction of total cycle count?
|
||||
@ -22,16 +21,41 @@ class Audio:
|
||||
# TODO: round to divisor of video frame rate
|
||||
self.sample_rate = 14340 # int(1024. * 1024 / self.cycles_per_tick)
|
||||
|
||||
self.normalization = normalization or self._normalization()
|
||||
print(self.normalization)
|
||||
|
||||
def _decode(self, f, buf) -> np.array:
|
||||
data = np.frombuffer(buf, dtype='int16').astype(
|
||||
'float32').reshape((f.channels, -1), order='F')
|
||||
|
||||
a = librosa.core.to_mono(data)
|
||||
a = librosa.resample(a, f.samplerate,
|
||||
self.sample_rate).flatten()
|
||||
|
||||
return a
|
||||
|
||||
def _normalization(self, read_bytes=1024*1024*10):
|
||||
"""Read first read_bytes of audio stream and compute normalization.
|
||||
|
||||
We compute the 2.5th and 97.5th percentiles i.e. only 2.5% of samples
|
||||
will clip.
|
||||
"""
|
||||
raw = bytearray()
|
||||
with audioread.audio_open(self.filename) as f:
|
||||
for buf in f.read_data():
|
||||
raw.extend(bytearray(buf))
|
||||
if len(raw) > read_bytes:
|
||||
break
|
||||
a = self._decode(f, raw)
|
||||
norm = np.max(np.abs(np.percentile(a, [2.5, 97.5])))
|
||||
assert norm
|
||||
|
||||
return 16384. / norm
|
||||
|
||||
def audio_stream(self):
|
||||
with audioread.audio_open(self.filename) as f:
|
||||
for buf in f.read_data(128 * 1024):
|
||||
|
||||
data = np.frombuffer(buf, dtype='int16').astype(
|
||||
'float32').reshape((f.channels, -1), order='F')
|
||||
|
||||
a = librosa.core.to_mono(data)
|
||||
a = librosa.resample(a, f.samplerate,
|
||||
self.sample_rate).flatten()
|
||||
a = self._decode(f, buf)
|
||||
|
||||
a /= 16384 # normalize to -1.0 .. 1.0
|
||||
a *= self.normalization
|
||||
|
2
movie.py
2
movie.py
@ -8,7 +8,7 @@ import video
|
||||
|
||||
|
||||
class Movie:
|
||||
def __init__(self, filename: str, audio_normalization: float = 1.0):
|
||||
def __init__(self, filename: str, audio_normalization: float = None):
|
||||
self.filename = filename # type: str
|
||||
self.audio = audio.Audio(
|
||||
filename, normalization=audio_normalization) # type: audio.Audio
|
||||
|
Loading…
Reference in New Issue
Block a user