import torch
import torch.nn as nn
import torch.nn.functional as F
from avalanche.models.generator import Generator
from matchbox.ConvASRDecoder import ConvASRDecoderClassification
from matchbox.ConvASREncoder import ConvASREncoder
from torchinfo import summary
from kymatio.torch import Scattering1D
class EncDecBaseModel(nn.Module):
    """Encoder-decoder model for MatchboxNet from the paper: http://arxiv.org/abs/2004.08531

    Args:
        num_mels (int): number of mel features in the MFCC transform preprocessing
        final_filter (int): size of the final conv filter in the encoder
        num_classes (int): number of output classes for classification
        input_length (int): input time-dimension length
    """
    def __init__(self, num_mels,
                 final_filter,
                 num_classes,
                 input_length):
        super(EncDecBaseModel, self).__init__()
        self.input_length = torch.tensor(input_length)
        self.encoder = ConvASREncoder(feat_in=num_mels)
        self.decoder = ConvASRDecoderClassification(feat_in=final_filter,
                                                    num_classes=num_classes,
                                                    return_logits=True)
    def forward(self, input_signal):
        encoded, encoded_len = self.encoder(audio_signal=input_signal, length=self.input_length)
        logits = self.decoder(encoder_output=encoded)
        return logits
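# A minimal preprocessing sketch (an assumption, not part of the original pipeline):
# the docstring above says num_mels is the number of features from the MFCC transform,
# so a torchaudio MFCC transform with n_mfcc=64 could produce the model input. The exact
# settings, and whether the encoder expects (batch, features, time) or (batch, time,
# features), are guesses here; see the __main__ block below for the shapes the author
# actually tested with.
def _matchboxnet_example(waveform_batch):
    """Hypothetical helper: waveform_batch is a (batch, samples) tensor of raw 16 kHz audio."""
    import torchaudio
    mfcc = torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=64)
    feats = mfcc(waveform_batch)                       # (batch, 64, time)
    model = EncDecBaseModel(num_mels=64, final_filter=128,
                            num_classes=35, input_length=feats.shape[-1])
    return model(feats)                                # classification logits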
class AudioVAE(nn.Module, Generator):
    r"""A simple, naive attempt at an audio VAE for Generative Replay.

    .. attention::
        This does not work very well, because of the poor generative capabilities of this model.
    """
    def __init__(self, imgChannels=1, featureDim=15656, zDim=256):
        super(AudioVAE, self).__init__()
        self.zDim = zDim
        # Initializing the five convolutional layers and two fully-connected layers for the encoder
        self.encConv1 = nn.Conv2d(in_channels=imgChannels, out_channels=2, kernel_size=(200, 10))
        self.encConv2 = nn.Conv2d(in_channels=2, out_channels=4, kernel_size=(200, 10))
        self.encConv3 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=(200, 10))
        self.encConv4 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=(200, 10))
        self.encConv5 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=(200, 10))
        self.flatten = nn.Flatten()
        self.encFC1 = nn.Linear(featureDim, zDim)
        self.encFC2 = nn.Linear(featureDim, zDim)

        # Initializing the fully-connected layer and five transposed convolutional layers for the decoder
        self.decFC1 = nn.Linear(zDim, featureDim)
        self.decConv1 = nn.ConvTranspose2d(4, 4, (200, 10))
        self.decConv2 = nn.ConvTranspose2d(4, 4, (100, 10))
        self.decConv3 = nn.ConvTranspose2d(4, 4, (100, 10))
        self.decConv4 = nn.ConvTranspose2d(4, 2, (100, 10))
        self.decConv5 = nn.ConvTranspose2d(2, imgChannels, (100, 10))
    def encoder(self, x):
        # The input is fed through five convolutional layers sequentially.
        # The flattened feature map is fed into two fully-connected layers to predict
        # the mean (mu) and log-variance (logVar).
        # mu and logVar are used to generate the latent representation z and the KL divergence loss.
        x = F.relu(self.encConv1(x))
        x = F.relu(self.encConv2(x))
        x = F.relu(self.encConv3(x))
        x = F.relu(self.encConv4(x))
        x = F.relu(self.encConv5(x))
        x = self.flatten(x)
        mu = self.encFC1(x)
        logVar = self.encFC2(x)
        return mu, logVar
    def reparameterize(self, mu, logVar):
        # The reparameterization trick: sample z = mu + std * eps, with eps ~ N(0, I).
        std = torch.exp(logVar / 2)
        eps = torch.randn_like(std)
        return mu + std * eps
    def decoder(self, z):
        # z is fed through a fully-connected layer and then through five transposed
        # convolutional layers. The generated output has the same size as the original input.
        x = F.relu(self.decFC1(z))
        # Note: the spatial shape below is hard-coded for the author's input size;
        # only the batch dimension is inferred.
        x = x.view(-1, 4, 606, 19)
        x = F.relu(self.decConv1(x))
        x = F.relu(self.decConv2(x))
        x = F.relu(self.decConv3(x))
        x = F.relu(self.decConv4(x))
        x = torch.sigmoid(self.decConv5(x))
        return x
    def generate(self, batch_size=None, condition=None):
        # Sample latent codes from a standard normal and feed them to the decoder.
        n = batch_size if batch_size is not None else 1
        with torch.no_grad():
            z = torch.randn(n, self.zDim, device=self.decFC1.weight.device)
            return self.decoder(z)
    def forward(self, x):
        # The full VAE pipeline: encoder -> reparameterization -> decoder.
        # out, mu, and logVar are returned for loss computation.
        mu, logVar = self.encoder(x)
        z = self.reparameterize(mu, logVar)
        out = self.decoder(z)
        return out, mu, logVar
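# The forward pass above returns (out, mu, logVar) "for loss computation", but the training
# objective itself is not defined in this file. A standard VAE loss (a sketch, not necessarily
# what was used for training) combines a reconstruction term with the KL divergence between
# the approximate posterior and a unit Gaussian:
def vae_loss(recon_x, x, mu, logVar):
    """Hypothetical helper: binary cross-entropy reconstruction + KL divergence."""
    recon = F.binary_cross_entropy(recon_x, x, reduction="sum")
    kl = -0.5 * torch.sum(1 + logVar - mu.pow(2) - logVar.exp())
    return recon + kl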
class M5(nn.Module):
    """Basic M5 model from the paper https://arxiv.org/pdf/1610.00087.pdf

    Args:
        n_input (int, optional): Number of input channels. Defaults to 1.
        n_output (int, optional): Number of output classes. Defaults to 35.
        stride (int, optional): Stride of the first convolution. Defaults to 16.
        n_channel (int, optional): Output channels of the convolution layers. Defaults to 32.
    """
    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=160, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        return F.log_softmax(x, dim=2).squeeze()
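# A minimal usage sketch (an assumption): M5 operates directly on raw waveforms of shape
# (batch, n_input, time). With the defaults below and one second of 16 kHz audio, the
# forward pass returns log-probabilities over 35 classes.
def _m5_example():
    model = M5(n_input=1, n_output=35, stride=16, n_channel=32)
    waveform = torch.rand(8, 1, 16000)   # 8 fake one-second clips
    log_probs = model(waveform)          # shape (8, 35) after the final squeeze
    return log_probs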
class Scattering_Classifier(nn.Module):
    """An attempt at classifying directly on scattering coefficients with a small MLP, without a CNN.
    """
    def __init__(self):
        super().__init__()
        # Scattering hyperparameters
        T = 16000
        J = 12
        Q = 10
        self.log_eps = 1e-6
        # Layers
        self.scattering = Scattering1D(J=J, shape=T, Q=Q, T=1000)
        self.batchnorm = nn.BatchNorm1d(673)
        self.fc1 = nn.Linear(673, 300)
        self.fc2 = nn.Linear(300, 90)
        self.fc3 = nn.Linear(90, 35)
    def forward(self, x):
        x = self.scattering(x.squeeze())
        # Drop the zeroth-order coefficient, then take a log with a small epsilon for stability
        x = x[:, 1:, :]
        x = torch.log(torch.abs(x) + self.log_eps)
        # Average over the time dimension
        x = torch.mean(x, dim=-1)
        x = self.batchnorm(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x, dim=-1)
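# A minimal usage sketch (an assumption): the classifier consumes one-second 16 kHz waveforms
# (T = 16000 samples, matching the `shape` passed to Scattering1D) and returns
# log-probabilities over 35 classes, assuming the scattering settings above yield the
# 673 first- and second-order coefficients expected by the first BatchNorm layer.
def _scattering_example():
    model = Scattering_Classifier()
    waveform = torch.rand(4, 1, 16000)   # squeezed to (4, 16000) inside forward
    return model(waveform)               # shape (4, 35)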
if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = EncDecBaseModel(num_mels=64, final_filter=128, num_classes=35, input_length=1601)
    summary(model=model, device=device)
    model.to(device=device)
    test_input = torch.rand([256, 98, 64])
    test_input = test_input.to(device=device)
    label = torch.Tensor(98)
    label = label.to(device)
    test_output = model(test_input)
    print(test_output.size())
class Pool(nn.Module):
    """A simple pooling module to append at the end of a feature extractor.
    """

    def __init__(self, channel_size):
        """Init function

        Args:
            channel_size (int): Since the pooling is done in 1D, we have to specify the channel size to retrieve the original shape.
        """
        super().__init__()
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.channel_size = channel_size

    def forward(self, x):
        x = self.pooling(x).view(-1, self.channel_size)
        return x
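# A minimal usage sketch (an assumption): Pool collapses the time axis of a 1D feature map
# (batch, channels, time) into a flat (batch, channels) vector, so it can be appended after a
# convolutional feature extractor, e.g. before a linear classifier or an LDA stage.
def _pool_example():
    features = torch.rand(8, 64, 101)                       # (batch, channels, time)
    head = nn.Sequential(Pool(channel_size=64), nn.Linear(64, 35))
    return head(features)                                   # shape (8, 35)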
class Circularize(nn.Module):
    r"""An attempt at circularizing the latent space before LDA by appending the squared features.

    .. note::
        We got 1% less accuracy than regular LDA.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return torch.cat((x, torch.square(x)), dim=-1)
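# A minimal usage sketch (an assumption): Circularize doubles the feature dimension by
# appending the element-wise squares, before handing the features to an external LDA
# classifier (e.g. scikit-learn's LinearDiscriminantAnalysis, which is not used in this file).
def _circularize_example(features):
    """Hypothetical helper: features is a (batch, dim) tensor of extracted embeddings."""
    augmented = Circularize()(features)          # shape (batch, 2 * dim)
    return augmented.cpu().numpy()               # ready for an sklearn-style LDA fit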