import torch
import torch.nn as nn
import torchaudio.transforms as T
import numpy as np
import math
import random
from kymatio.torch import Scattering1D
class AddWhiteNoise(nn.Module):
    """Transformation that adds gaussian white noise to an audio signal.

    The signal-to-noise ratio (in dB) is drawn uniformly among the integers
    of ``[min_snr_db, max_snr_db]`` on every call.

    Args:
        min_snr_db (int, optional): minimum signal to noise ratio in dB used
            by :meth:`forward`. Defaults to 20.
        max_snr_db (int, optional): maximum signal to noise ratio in dB used
            by :meth:`forward`. Defaults to 90.
        STD_n (float, optional): standard deviation of the gaussian
            distribution used to generate the noise in :meth:`forward`.
            Defaults to 0.5.

    Example :
        >>> x = torch.rand(16000)
        >>> transform = AddWhiteNoise()
        >>> x_with_noise = transform(x)
    """

    def __init__(self, min_snr_db=20, max_snr_db=90, STD_n=0.5):
        super().__init__()
        self.min_snr_db = min_snr_db
        self.max_snr_db = max_snr_db
        self.STD_n = STD_n

    def add_white_noise(self, audio_tensor, min_snr_db=20, max_snr_db=90, STD_n=0.5):
        """Adds a random gaussian white noise to the audio_tensor input

        The noise is rescaled so that the ratio between the L2 norm of the
        signal and the L2 norm of the added noise matches the drawn SNR, then
        the mixture is halved before being returned (so the signal component
        of the output has half the amplitude of the input).

        Args:
            audio_tensor (torch.tensor): real-valued pytorch tensor holding the audio signal
            min_snr_db (int, optional): minimum signal to noise ratio in dB. Defaults to 20.
            max_snr_db (int, optional): maximum signal to noise ratio in dB. Defaults to 90.
            STD_n (float, optional): Standard deviation of the gaussian distribution used to generate the noise. Defaults to 0.5.

        Returns:
            torch.tensor: tensor with noise, with the same shape and device as
            the input and the same dtype (float32 for non floating point inputs)

        Raises:
            ValueError: if ``min_snr_db`` is greater than ``max_snr_db``.
        """
        # Norms are only defined for floating point tensors.
        signal = audio_tensor if audio_tensor.is_floating_point() else audio_tensor.float()

        # The noise is still drawn from numpy's global RNG so that existing
        # np.random.seed(...) based reproducibility keeps working, but it is
        # converted to a tensor matching the signal instead of being left as a
        # float64 numpy array mixed with torch tensors.
        noise = torch.as_tensor(
            np.random.normal(0, STD_n, tuple(signal.shape)),
            dtype=signal.dtype,
            device=signal.device,
        )
        noise_norm = noise.norm(p=2)
        signal_norm = signal.norm(p=2)
        snr_db = random.randint(min_snr_db, max_snr_db)

        # The SNR is undefined for a silent signal or a null noise: return the
        # (halved) signal untouched rather than dividing by zero.
        if signal_norm.item() == 0.0 or noise_norm.item() == 0.0:
            return signal / 2

        # L2 norms are amplitudes, so the dB value converts to a linear ratio
        # with 10 ** (dB / 20), not exp(dB / 10).
        amplitude_ratio = 10 ** (snr_db / 20)
        gain = signal_norm / (amplitude_ratio * noise_norm)
        return (noise * gain + signal) / 2

    def forward(self, x):
        """Returns ``x`` with white noise added, using the constructor settings."""
        return self.add_white_noise(
            audio_tensor=x,
            min_snr_db=self.min_snr_db,
            max_snr_db=self.max_snr_db,
            STD_n=self.STD_n,
        )
class SpecAugment(nn.Module):
    """Transformation that applies two time masks then two frequency masks to a spectrogram-like tensor

    The module does not compute any spectral representation itself: the input
    must already be a spectrogram / cepstrum (e.g. MFCC) laid out as expected
    by torchaudio's ``TimeMasking`` and ``FrequencyMasking`` transforms.

    Args:
        time_mask (int, optional): ``time_mask_param`` given to ``TimeMasking``
            in :meth:`forward`. Defaults to 50.
        freq_mask (int, optional): ``freq_mask_param`` given to
            ``FrequencyMasking`` in :meth:`forward`. Defaults to 5.
        prob (float, optional): value given to ``TimeMasking`` as ``p`` in
            :meth:`forward`. Defaults to 0.8.

    Example :
        >>> specgram = torch.rand(1, 40, 101)
        >>> transform = SpecAugment()
        >>> masked = transform(specgram)

    We can visualize the masked cepstrum with matplotlib using the following :
        >>> fig, axs = plt.subplots(1, 1)
        >>> axs.set_title("Mel-frequency cepstrum")
        >>> axs.set_ylabel("coefficient")
        >>> axs.set_xlabel("frame")
        >>> im = axs.imshow(librosa.power_to_db(masked[0]), origin="lower", aspect="auto")
        >>> fig.colorbar(im, ax=axs)
        >>> plt.show(block=False)
    """

    def __init__(self, time_mask=50, freq_mask=5, prob=0.8):
        super().__init__()
        self.time_mask = time_mask
        self.freq_mask = freq_mask
        self.prob = prob

    def spec_aug(self, tensor, time_mask=50, freq_mask=5, prob=0.8):
        """Applies two time masks followed by two frequency masks to ``tensor``

        Args:
            tensor (torch.tensor): spectrogram-like tensor to mask
            time_mask (int, optional): maximum possible length of each time mask. Defaults to 50.
            freq_mask (int, optional): maximum possible length of each frequency mask. Defaults to 5.
            prob (float, optional): forwarded to ``TimeMasking`` as ``p``.
                NOTE(review): in torchaudio ``p`` is the maximum proportion of
                time steps that can be masked, not the probability that a
                mask is applied. Defaults to 0.8.

        Returns:
            torch.tensor: the masked tensor
        """
        time_masking = T.TimeMasking(time_mask_param=time_mask, p=prob)
        freq_masking = T.FrequencyMasking(freq_mask_param=freq_mask)

        # Each transform is applied twice so that up to two independent masks
        # are drawn along each axis.
        masked = tensor
        for masking in (time_masking, time_masking, freq_masking, freq_masking):
            masked = masking(masked)
        return masked

    def forward(self, x):
        """Returns ``x`` masked with the settings given to the constructor."""
        return self.spec_aug(
            tensor=x,
            time_mask=self.time_mask,
            freq_mask=self.freq_mask,
            prob=self.prob,
        )
class Scattering(nn.Module):
    """Wrapper for kymatio's scattering transform. Returns the log-scaled, batch-normalized scattering coefficients of the input.

    For more information about the transform checkout : https://www.kymat.io/

    Args:
        shape (int, optional): length of the input signals, given to
            ``Scattering1D`` as ``shape``. Defaults to 16000.
        J (int, optional): ``J`` hyperparameter of ``Scattering1D``. Defaults to 4.
        Q (int, optional): ``Q`` hyperparameter of ``Scattering1D``. Defaults to 8.
        log_eps (float, optional): constant added to the magnitude of the
            coefficients before taking the logarithm, to avoid ``log(0)``.
            Defaults to 1e-6.
    """

    def __init__(self, shape=16000, J=4, Q=8, log_eps=1e-6):
        super().__init__()
        self.log_eps = log_eps
        # Layers
        self.scattering = Scattering1D(J=J, shape=shape, Q=Q)
        # A single channel: every coefficient shares the same statistics.
        self.batch_norm = nn.BatchNorm2d(1)

    def forward(self, x):
        """Computes the normalized log-scattering coefficients of ``x``

        Args:
            x (torch.tensor): batch of signals; a trailing dimension of size 1
                is dropped before the transform.
                NOTE(review): presumably (batch, shape) or (batch, shape, 1) —
                confirm against the callers.

        Returns:
            torch.tensor: the scattering output with its last two dimensions
            swapped (assuming kymatio returns (batch, coefficients, frames),
            this is (batch, frames, coefficients) — TODO confirm).
        """
        coefficients = self.scattering(x.squeeze(-1))
        log_coefficients = torch.log(torch.abs(coefficients) + self.log_eps)
        # BatchNorm2d needs an explicit channel dimension; it is added here
        # and removed right after the normalization.
        normalized = self.batch_norm(log_coefficients.unsqueeze(1))
        return normalized.squeeze(1).permute(0, 2, 1)