Introduction
In this tutorial, I will look into how to prepare audio data and mix it with noise. By doing so, we can generate new examples for free and make our training dataset more general. This step is called data augmentation; although it is one of the simplest techniques, it usually works well.
Load Packages
import math
import torch
import torchaudio
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
Steps
To add noise in the background, we first need to load our signals using torchaudio.
# Background-noise recording that can be mixed into the speech signal.
noise, sr = torchaudio.load(r'16000_pcm_speeches\noise\_background_noise_\doing_the_dishes.wav')

# Speech utterance to augment.
# NOTE(review): ``sr`` is rebound here, so the noise file's sample rate is
# discarded; both files are presumably recorded at the same rate -- confirm.
signal, sr = torchaudio.load(r'timit\data\TEST\DR1\FAKS0\SA1.WAV')

# Keep at most as many noise samples as the signal has, so the time axes can
# line up (a shorter noise clip is left as-is by the slice).
noise = noise[:, :signal.size(1)]
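For added diversity, I will also choose a target signal-to-noise ratio (SNR), given in decibels, so that the noise is applied at a different volume relative to the original signal. An SNR of 0 dB means the noise has the same power as the signal; higher values mean quieter noise.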
def add_noise(signal, noise=None, snr_db=0):
    """Mix ``noise`` into ``signal`` at a target signal-to-noise ratio.

    The noise is rescaled independently for every leading index (e.g. per
    channel) so that the ratio of signal power to added-noise power along
    the last dimension equals ``snr_db`` decibels.

    Args:
        signal: Floating-point waveform tensor whose last dimension is time,
            e.g. ``(channels, samples)``.
        noise: Noise tensor broadcastable to ``signal``. When ``None``,
            white Gaussian noise with the same shape, dtype and device as
            ``signal`` is generated.
        snr_db: Desired SNR in decibels (``10 * log10(P_signal / P_noise)``).

    Returns:
        Tensor holding ``signal`` plus the rescaled noise.
    """
    if noise is None:
        # Only the shape of the noise matters here; its level is fixed below.
        noise = torch.randn_like(signal)

    # keepdim=True keeps a trailing axis of size 1 so the per-channel scale
    # broadcasts over time instead of being matched against the time axis.
    signal_norm = torch.norm(signal, p=2, dim=-1, keepdim=True)
    noise_norm = torch.norm(noise, p=2, dim=-1, keepdim=True)

    # L2 norms are amplitude quantities, so the dB value is divided by 20
    # (dividing by 10 would apply to power and double the effective SNR).
    amplitude_ratio = 10 ** (snr_db / 20.0)
    scale = signal_norm / (amplitude_ratio * noise_norm)

    # Silent noise cannot be scaled to any level; add nothing rather than
    # letting inf * 0 produce NaNs.
    scale = torch.where(noise_norm > 0, scale, torch.zeros_like(scale))
    return signal + scale * noise
Next, we plot the waveforms and compare them across different SNR settings.
def plot_waveforms(waveforms, sample_rate, titles=None, figsize=None):
    """Plot the first channel of each waveform in a grid of subplots.

    Args:
        waveforms: Sequence of 2-D tensors; row 0 of each is plotted and the
            second dimension is treated as time.
        sample_rate: Samples per second, used to convert sample indices to
            seconds on the x axis.
        titles: Optional sequence with one title per waveform. When ``None``
            the subplots are left untitled.
        figsize: Optional figure size forwarded to ``plt.figure``.

    The call blocks until the figure window is closed.
    """
    # detach()/cpu() are no-ops for plain CPU tensors but let tensors that
    # track gradients or live on another device be converted as well.
    arrays = [waveform.detach().cpu().numpy() for waveform in waveforms]

    # The first value is passed as the subplot row count, the second as the
    # column count.
    width, height = find_width_height(len(arrays))
    fig = plt.figure(figsize=figsize)

    for idx, samples in enumerate(arrays):
        num_samples = samples.shape[1]
        time_axis = torch.arange(0, num_samples) / sample_rate
        fig.add_subplot(width, height, idx + 1)
        plt.plot(time_axis, samples[0], linewidth=1, color='tab:blue')
        plt.grid(True)
        # ``titles`` defaults to None, so it must not be indexed blindly.
        if titles is not None:
            plt.title(titles[idx])

    plt.tight_layout()
    plt.show(block=True)
def factorisation(num):
    """Return the prime factors of ``num`` in non-decreasing order.

    Repeated factors appear once per multiplicity, e.g. ``12 -> [2, 2, 3]``.
    Values of ``num`` below 2 have no prime factors and yield ``[]``.

    Args:
        num: Integer to factorise.

    Returns:
        List of prime factors whose product is ``num`` (for ``num >= 2``).
    """
    factors = []
    divisor = 2
    # Any composite number has a prime factor no larger than its square
    # root, so the scan can stop there instead of running up to ``num``.
    while divisor * divisor <= num:
        while num % divisor == 0:
            factors.append(divisor)
            # Integer division stays exact for arbitrarily large ints,
            # unlike int(num / divisor) which round-trips through a float.
            num //= divisor
        divisor += 1
    # Whatever remains above 1 is itself prime.
    if num > 1:
        factors.append(num)
    return factors
def find_width_height(num):
    """Choose a ``(width, height)`` grid with ``width * height == num``.

    ``width`` is the prime factor of ``num`` closest to ``sqrt(num)`` and
    ``height`` is the matching cofactor. When ``num`` has no prime factors
    (``num`` is 0 or 1) the width falls back to 1, giving ``(1, num)``.

    Args:
        num: Number of cells the grid must hold (non-negative integer).

    Returns:
        Tuple ``(width, height)``.
    """
    target = math.sqrt(num)
    prime_factors = factorisation(num)
    # default=1 covers the empty factor list; without it min() raises
    # ValueError, e.g. when a single waveform is plotted.
    width = min(prime_factors, key=lambda value: abs(value - target), default=1)
    height = num // width
    return width, height
Plot the waveforms.
# SNR levels (in dB) at which generated noise is mixed into the signal; the
# clean signal is shown first for reference.
snr_levels_db = (-5, 0, 5, 10, 20)

plot_waveforms(
    [signal] + [
        add_noise(signal, noise=None, snr_db=level) for level in snr_levels_db
    ],
    sr,
    titles=['Original waveform'] + [
        f'Noised waveform ({level} dB)' for level in snr_levels_db
    ],
    figsize=(20, 10),
)