Source code for models

import torch
import torch.nn as nn
import torch.nn.functional as F

from avalanche.models.generator import Generator

from matchbox.ConvASRDecoder import ConvASRDecoderClassification
from matchbox.ConvASREncoder import ConvASREncoder

from torchinfo import summary

from kymatio.torch import Scattering1D

class EncDecBaseModel(nn.Module):
    """Encoder-decoder model for MatchboxNet, from the paper:
    http://arxiv.org/abs/2004.08531

    Args:
        num_mels (int): number of mel features in the MFCC preprocessing transform
        final_filter (int): size of the final conv filter in the encoder
        num_classes (int): number of output classes for classification
        input_length (int): length of the input time dimension
    """

    def __init__(self, num_mels, final_filter, num_classes, input_length):
        super(EncDecBaseModel, self).__init__()
        self.input_length = torch.tensor(input_length)
        self.encoder = ConvASREncoder(feat_in=num_mels)
        self.decoder = ConvASRDecoderClassification(
            feat_in=final_filter, num_classes=num_classes, return_logits=True
        )
    def forward(self, input_signal):
        encoded, encoded_len = self.encoder(audio_signal=input_signal, length=self.input_length)
        logits = self.decoder(encoder_output=encoded)
        return logits
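# A minimal usage sketch, assuming the matchbox ConvASREncoder consumes
# (batch, num_mels, time) MFCC features, which is MatchboxNet's convention;
# the helper name and shapes below are illustrative, not part of the module.
def _demo_enc_dec():
    model = EncDecBaseModel(num_mels=64, final_filter=128, num_classes=35, input_length=1601)
    mfcc = torch.rand(4, 64, 1601)  # hypothetical (batch, mels, frames) input
    logits = model(mfcc)
    print(logits.shape)  # expected (4, 35) if the decoder returns per-class logits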
class AudioVAE(nn.Module, Generator):
    r"""A simple, naive attempt at an audio VAE for Generative Replay.

    .. attention::
        This does not work very well, because of the poor generative
        capabilities of this model.
    """

    def __init__(self, imgChannels=1, featureDim=15656, zDim=256):
        super(AudioVAE, self).__init__()
        self.zDim = zDim  # stored so generate() can sample latents of the right size
        # Five convolutional layers and two fully-connected layers for the encoder
        self.encConv1 = nn.Conv2d(in_channels=imgChannels, out_channels=2, kernel_size=(200, 10))
        self.encConv2 = nn.Conv2d(in_channels=2, out_channels=4, kernel_size=(200, 10))
        self.encConv3 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=(200, 10))
        self.encConv4 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=(200, 10))
        self.encConv5 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=(200, 10))
        self.flatten = nn.Flatten()
        self.encFC1 = nn.Linear(featureDim, zDim)  # predicts mu
        self.encFC2 = nn.Linear(featureDim, zDim)  # predicts logVar
        # One fully-connected layer and five transposed-convolution layers for the decoder
        self.decFC1 = nn.Linear(zDim, featureDim)
        self.decConv1 = nn.ConvTranspose2d(4, 4, (200, 10))
        self.decConv2 = nn.ConvTranspose2d(4, 4, (100, 10))
        self.decConv3 = nn.ConvTranspose2d(4, 4, (100, 10))
        self.decConv4 = nn.ConvTranspose2d(4, 2, (100, 10))
        self.decConv5 = nn.ConvTranspose2d(2, imgChannels, (100, 10))
    def encoder(self, x):
        # The input runs through five convolutional layers; the flattened
        # feature map is fed into two fully-connected layers that predict the
        # mean (mu) and log-variance (logVar) used to sample the latent z and
        # to compute the KL-divergence loss.
        x = F.relu(self.encConv1(x))
        x = F.relu(self.encConv2(x))
        x = F.relu(self.encConv3(x))
        x = F.relu(self.encConv4(x))
        x = F.relu(self.encConv5(x))
        x = self.flatten(x)
        mu = self.encFC1(x)
        logVar = self.encFC2(x)
        return mu, logVar
    def reparameterize(self, mu, logVar):
        # The reparameterization trick: sample z = mu + std * eps with
        # eps ~ N(0, I), so gradients can flow through mu and logVar.
        std = torch.exp(logVar / 2)
        eps = torch.randn_like(std)
        return mu + std * eps
    def decoder(self, z):
        # z is fed through a fully-connected layer and then through five
        # transposed-convolution layers; the generated output has the same
        # size as the original input.
        x = F.relu(self.decFC1(z))
        x = x.view(-1, 4, 606, 19)  # infer the batch dimension rather than hardcoding it
        x = F.relu(self.decConv1(x))
        x = F.relu(self.decConv2(x))
        x = F.relu(self.decConv3(x))
        x = F.relu(self.decConv4(x))
        x = torch.sigmoid(self.decConv5(x))
        return x
    def generate(self, batch_size=None, condition=None):
        # Sample latent vectors from the prior and feed them to the decoder.
        n = 1 if batch_size is None else batch_size
        with torch.no_grad():
            z = torch.randn(n, self.zDim, device=self.decFC1.weight.device)
            return self.decoder(z)
    def forward(self, x):
        # The full VAE pipeline: encoder -> reparameterization -> decoder.
        # output, mu, and logVar are all returned for the loss computation.
        mu, logVar = self.encoder(x)
        z = self.reparameterize(mu, logVar)
        out = self.decoder(z)
        return out, mu, logVar
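# A minimal loss sketch for AudioVAE, assuming the standard VAE objective:
# binary cross-entropy reconstruction (the decoder ends in a sigmoid) plus the
# analytic KL term computed from the mu and logVar returned by forward().
# The helper name _vae_loss is illustrative, not part of the original module.
def _vae_loss(model, x):
    out, mu, logVar = model(x)
    recon = F.binary_cross_entropy(out, x, reduction="sum")
    kl = -0.5 * torch.sum(1 + logVar - mu.pow(2) - logVar.exp())
    return recon + kl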
class M5(nn.Module):
    """Basic M5 model from the paper https://arxiv.org/pdf/1610.00087.pdf

    Args:
        n_input (int, optional): Number of input channels. Defaults to 1.
        n_output (int, optional): Number of output classes. Defaults to 35.
        stride (int, optional): Stride of the first convolution. Defaults to 16.
        n_channel (int, optional): Output channels of the convolution layers. Defaults to 32.
    """

    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=160, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])  # global average pooling over time
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        return F.log_softmax(x, dim=2).squeeze()
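# A minimal usage sketch for M5, assuming one-second mono waveforms sampled at
# 16 kHz (the Speech Commands setting suggested by the 35 default classes).
def _demo_m5():
    model = M5()
    waveforms = torch.rand(8, 1, 16000)  # (batch, channels, samples)
    log_probs = model(waveforms)
    print(log_probs.shape)  # torch.Size([8, 35]) after the final squeeze()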
class Scattering_Classifier(nn.Module):
    """An attempt at classifying directly on a wavelet scattering transform,
    using a small fully-connected head instead of a CNN.
    """

    def __init__(self):
        super().__init__()
        # Scattering hyperparameters
        T = 16000  # input length in samples (one second at 16 kHz)
        J = 12     # maximum log-scale of the transform
        Q = 10     # number of wavelets per octave
        self.log_eps = 1e-6
        # Layers
        self.scattering = Scattering1D(J=J, shape=T, Q=Q, T=1000)
        self.batchnorm = nn.BatchNorm1d(673)
        self.fc1 = nn.Linear(673, 300)
        self.fc2 = nn.Linear(300, 90)
        self.fc3 = nn.Linear(90, 35)
    def forward(self, x):
        x = self.scattering(x.squeeze())
        x = x[:, 1:, :]  # drop the zeroth-order coefficient
        x = torch.log(torch.abs(x) + self.log_eps)  # log-scattering features
        x = torch.mean(x, dim=-1)  # average over time
        x = self.batchnorm(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x, dim=-1)
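# A minimal usage sketch for Scattering_Classifier, assuming raw 16 kHz
# waveforms of length 16000; the squeeze() in forward() strips a singleton
# channel dimension before the scattering transform.
def _demo_scattering_classifier():
    model = Scattering_Classifier()
    waveforms = torch.rand(8, 1, 16000)
    log_probs = model(waveforms)
    print(log_probs.shape)  # torch.Size([8, 35])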
if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = EncDecBaseModel(num_mels=64, final_filter=128, num_classes=35, input_length=1601)
    summary(model=model, device=device)
    model.to(device=device)
    test_input = torch.rand([256, 98, 64]).to(device=device)
    test_output = model(test_input)
    print(test_output.size())
class Pool(nn.Module):
    """A simple pooling model to append to the end of a feature extractor."""

    def __init__(self, channel_size):
        """Init function.

        Args:
            channel_size (int): Since the pooling is done in 1D, the channel
                size must be specified to retrieve the original shape.
        """
        super().__init__()
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.channel_size = channel_size
    def forward(self, x):
        x = self.pooling(x).view(-1, self.channel_size)
        return x
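# A minimal usage sketch for Pool, assuming a (batch, channels, time) feature
# map coming from some upstream extractor.
def _demo_pool():
    pool = Pool(channel_size=128)
    features = torch.rand(8, 128, 50)
    print(pool(features).shape)  # torch.Size([8, 128])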
class Circularize(nn.Module):
    r"""An attempt at circularizing the latent space before LDA.

    .. note::
        This scored about 1% lower accuracy than regular LDA.
    """

    def __init__(self):
        super().__init__()
    def forward(self, x):
        return torch.cat((x, torch.square(x)), dim=-1)
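# A minimal usage sketch for Circularize: the feature dimension doubles, so a
# downstream LDA sees both x and x**2 for every latent feature.
def _demo_circularize():
    layer = Circularize()
    x = torch.rand(8, 64)
    print(layer(x).shape)  # torch.Size([8, 128])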