Source code for transforms

import torch
import torch.nn as nn
import torchaudio.transforms as T
import numpy as np
import math
import random
from kymatio.torch import Scattering1D

class AddWhiteNoise(nn.Module):
    """Transformation that adds white noise to the audio signal

    Example :
        >>> x = torch.zeros(16000)
        >>> transform = AddWhiteNoise()
        >>> x_with_noise = transform(x)
    """

    def __init__(self):
        super().__init__()
    def add_white_noise(self, audio_tensor, min_snr_db=20, max_snr_db=90, STD_n=0.5):
        """Adds random Gaussian white noise to the audio_tensor input

        Args:
            audio_tensor (torch.Tensor): 1-dimensional PyTorch tensor
            min_snr_db (int, optional): minimum signal-to-noise ratio in dB. Defaults to 20.
            max_snr_db (int, optional): maximum signal-to-noise ratio in dB. Defaults to 90.
            STD_n (float, optional): standard deviation of the Gaussian distribution
                used to generate the noise. Defaults to 0.5.

        Returns:
            torch.Tensor: tensor with noise added
        """
        # Convert to a torch tensor up front so all arithmetic stays in torch,
        # and match the input dtype
        noise = torch.from_numpy(np.random.normal(0, STD_n, audio_tensor.shape)).to(audio_tensor.dtype)
        noise_power = noise.norm(p=2)
        audio_power = audio_tensor.norm(p=2)
        # Draw a target SNR and convert it from dB to a linear amplitude ratio.
        # dB is defined base 10, so the conversion is 10 ** (snr_db / 20) for
        # amplitudes, not math.exp(snr_db / 10).
        snr_db = random.randint(min_snr_db, max_snr_db)
        snr = 10 ** (snr_db / 20)
        # Rescale the noise so the signal-to-noise amplitude ratio equals snr
        scale = snr * noise_power / audio_power
        return (noise / scale + audio_tensor) / 2
    def forward(self, x):
        return self.add_white_noise(audio_tensor=x)
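
# Example (a minimal usage sketch; the 440 Hz sine below is a hypothetical
# input). At snr_db = 20 the noise is rescaled so the signal-to-noise
# amplitude ratio is 10 ** (20 / 20) = 10; the output keeps the input shape:
#     >>> t = torch.linspace(0, 1, 16000)
#     >>> clean = torch.sin(2 * math.pi * 440 * t)
#     >>> noisy = AddWhiteNoise()(clean)
#     >>> noisy.shape
#     torch.Size([16000])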

class MfccTransform(nn.Module):
    """Transformation that returns the Mel-frequency cepstral coefficients
    (MFCC) of an audio tensor

    Example :
        >>> x = torch.zeros(16000)
        >>> transform = MfccTransform(sample_rate=16000)
        >>> specgram = transform(x)

    We can visualize the generated cepstrum with matplotlib using the following :
        >>> fig, axs = plt.subplots(1, 1)
        >>> axs.set_title("Mel-frequency cepstrum")
        >>> axs.set_ylabel("coefficient")
        >>> axs.set_xlabel("frame")
        >>> im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto")
        >>> fig.colorbar(im, ax=axs)
        >>> plt.show(block=False)
    """

    def __init__(self, sample_rate):
        super().__init__()
        self.sample_rate = sample_rate
    def mfcc_transform(self, audio_tensor, sample_rate, n_fft=512, n_mfcc=64, hop_length=10, mel_scale='htk'):
        """Computes n_mfcc MFCC coefficients from a waveform tensor using
        torchaudio's T.MFCC transform."""
        transform = T.MFCC(
            sample_rate=sample_rate,
            n_mfcc=n_mfcc,
            melkwargs={
                "n_fft": n_fft,
                "n_mels": 64,
                "hop_length": hop_length,
                "mel_scale": mel_scale,
            },
        )
        return transform(audio_tensor.to(dtype=torch.float32))
    def forward(self, x):
        if len(x.shape) > 1:
            # Batched input of shape (batch_size, num_samples): transform each
            # waveform separately, then stack and permute to (batch, frames, n_mfcc)
            batch_size, num_samples = x.shape
            mfcc_features = []
            for i in range(batch_size):
                audio_tensor = x[i]  # Extract each audio tensor from the batch
                mfcc = self.mfcc_transform(audio_tensor=audio_tensor, sample_rate=self.sample_rate)
                mfcc_features.append(mfcc)
            mfcc_features = torch.stack(mfcc_features)
            return mfcc_features.permute(0, 2, 1)
        else:
            return self.mfcc_transform(audio_tensor=x, sample_rate=self.sample_rate)
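
# Example (a minimal usage sketch; the random batch below is a hypothetical
# input). With hop_length=10 and centered frames, a 16000-sample waveform
# yields 16000 // 10 + 1 = 1601 frames, so a batch comes back as (2, 1601, 64):
#     >>> batch = torch.randn(2, 16000)
#     >>> features = MfccTransform(sample_rate=16000)(batch)
#     >>> features.shape
#     torch.Size([2, 1601, 64])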

class SpecAugment(nn.Module):
    """Transformation that returns double time-masked and frequency-masked
    Mel-frequency cepstral coefficients of an audio tensor

    Example :
        >>> x = torch.zeros(16000)
        >>> specgram = MfccTransform(sample_rate=16000)(x)
        >>> transform = SpecAugment()
        >>> masked = transform(specgram)

    We can visualize the modified cepstrum with matplotlib using the following :
        >>> fig, axs = plt.subplots(1, 1)
        >>> axs.set_title("Mel-frequency cepstrum")
        >>> axs.set_ylabel("coefficient")
        >>> axs.set_xlabel("frame")
        >>> im = axs.imshow(librosa.power_to_db(masked), origin="lower", aspect="auto")
        >>> fig.colorbar(im, ax=axs)
        >>> plt.show(block=False)
    """

    def __init__(self):
        super().__init__()
    def spec_aug(self, tensor, time_mask=50, freq_mask=5, prob=0.8):
        """Applies two random time masks followed by two random frequency
        masks to the input spectrogram/cepstrum tensor."""
        # Note: in torchaudio, TimeMasking's `p` caps the maximum proportion of
        # time steps that may be masked (it is not an application probability)
        time_masking = T.TimeMasking(time_mask_param=time_mask, p=prob)
        freq_masking = T.FrequencyMasking(freq_mask_param=freq_mask)
        return freq_masking(freq_masking(time_masking(time_masking(tensor))))
    def forward(self, x):
        return self.spec_aug(tensor=x)
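
# Example (a minimal usage sketch): masking the MFCCs of a hypothetical
# random waveform. Masking zeroes out bands but preserves the shape:
#     >>> mfcc = MfccTransform(sample_rate=16000)(torch.randn(16000))
#     >>> masked = SpecAugment()(mfcc)
#     >>> masked.shape == mfcc.shape
#     True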

class Scattering(nn.Module):
    """Wrapper for Kymatio's scattering transform. Returns the scattering
    coefficients of the input.

    For more information about the transform, check out : https://www.kymat.io/
    """

    def __init__(self):
        super().__init__()
        # Scattering hyperparameters: input support of 16000 samples,
        # averaging scale 2**J, and Q wavelets per octave
        shape = 16000
        J = 4
        Q = 8
        self.log_eps = 1e-6
        # Layers
        self.scattering = Scattering1D(J=J, shape=shape, Q=Q)
        self.batch_norm = nn.BatchNorm2d(1)
    def forward(self, x):
        # (batch, samples) -> (batch, channels, frames) scattering coefficients
        x = self.scattering(x.squeeze(-1))
        # Log-compress the coefficients; log_eps guards against log(0)
        x = torch.log(torch.abs(x) + self.log_eps)
        # Normalize the whole coefficient map with a single-channel BatchNorm2d
        x = self.batch_norm(x.unsqueeze(1))
        return x.squeeze(1).permute(0, 2, 1)
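
# Example (a minimal usage sketch; the random batch below is a hypothetical
# input). With J=4, Kymatio subsamples the time axis by 2**J, so 16000
# samples give 16000 / 16 = 1000 frames, returned channels-last:
#     >>> batch = torch.randn(2, 16000)
#     >>> coeffs = Scattering()(batch)
#     >>> coeffs.shape[0], coeffs.shape[1]
#     (2, 1000)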