# Modified code : original was taken from the nvidia nemo library https://github.com/NVIDIA/NeMo/blob/557c4b7ae766faf050374e6b9a862e2e67385b10/nemo/collections/asr/modules/conv_asr.py#L56
from typing import List, Optional, Tuple
import torch.nn as nn
import torch
from matchbox.ConvBlock import ConvBlock, conv_activations, init_weights
class ConvASREncoder(nn.Module):
    """MatchboxNet convolutional ASR encoder.

    A stack of 1D (separable) convolution blocks in the QuartzNet/MatchboxNet
    topology: one prologue block, six residual blocks, and two epilogue blocks.
    Adapted from NVIDIA NeMo's ``conv_asr.py`` (see module header for the link).
    """

    def __init__(
        self,
        activation: str = "relu",
        feat_in: int = 64,
        normalization_mode: str = "batch",
        residual_mode: str = "add",
        norm_groups: int = -1,
        conv_mask: bool = True,
        frame_splicing: int = 1,
        init_mode: Optional[str] = 'xavier_uniform'
    ):
        """Matchbox net encoder model

        Args:
            activation (str, optional): Activation function for the layers of the network. Defaults to "relu".
            feat_in (int, optional): number of input features. Defaults to 64.
            normalization_mode (str, optional): Normalization mode for the normalization layers in the matchbox submodule. Defaults to "batch".
            residual_mode (str, optional): Residual mode for the residual connections of the network. Defaults to "add".
            norm_groups (int, optional): Number of groups for group normalization. Defaults to -1.
            conv_mask (bool, optional): Whether convolutions use sequence-length masking. Defaults to True.
            frame_splicing (int, optional): Input-feature multiplier (spliced frames). Defaults to 1.
            init_mode (Optional[str], optional): Weight initialisation strategy. Defaults to 'xavier_uniform'.
        """
        super().__init__()
        activation = conv_activations[activation]()
        # If the activation can be executed in place, do so (saves memory).
        if hasattr(activation, 'inplace'):
            activation.inplace = True

        feat_in = feat_in * frame_splicing
        self._feat_in = feat_in
        self.dense_residual = False

        def _make_block(in_filters, out_filters, repeat, kernel_size, dilation, residual):
            # Factory for a ConvBlock with the architecture-wide fixed settings;
            # a fresh residual_panes list is created per call on purpose.
            return ConvBlock(
                in_filters,
                out_filters,
                repeat=repeat,
                kernel_size=kernel_size,
                stride=[1],
                dilation=dilation,
                dropout=0.0,
                residual=residual,
                groups=1,
                separable=True,
                heads=-1,
                residual_mode=residual_mode,
                normalization=normalization_mode,
                norm_groups=norm_groups,
                activation=activation,
                residual_panes=[],
                conv_mask=conv_mask,
                kernel_size_factor=1.0,
                stride_last=False,
            )

        # Prologue: feat_in -> 128 channels.
        self.conv_block_1 = _make_block(feat_in, 128, 1, [11], [1], residual=False)
        # Six residual blocks at 64 channels with growing kernel sizes.
        self.conv_res_block_1 = _make_block(128, 64, 2, [13], [1], residual=True)
        self.conv_res_block_2 = _make_block(64, 64, 2, [15], [1], residual=True)
        self.conv_res_block_3 = _make_block(64, 64, 2, [17], [1], residual=True)
        self.conv_res_block_4 = _make_block(64, 64, 2, [19], [1], residual=True)
        self.conv_res_block_5 = _make_block(64, 64, 2, [21], [1], residual=True)
        self.conv_res_block_6 = _make_block(64, 64, 2, [23], [1], residual=True)
        # Epilogue: dilated wide-kernel block, then a pointwise (k=1) block.
        self.conv_block_2 = _make_block(64, 128, 1, [29], [2], residual=False)
        self.conv_block_3 = _make_block(128, 128, 1, [1], [1], residual=False)

        self._feat_out = 128
        self.encoder = torch.nn.Sequential(
            self.conv_block_1,
            self.conv_res_block_1,
            self.conv_res_block_2,
            self.conv_res_block_3,
            self.conv_res_block_4,
            self.conv_res_block_5,
            self.conv_res_block_6,
            self.conv_block_2,
            self.conv_block_3,
        )
        self.apply(lambda x: init_weights(x, mode=init_mode))

    def forward(self, audio_signal, length=None):
        """Encode a batch of feature sequences.

        Args:
            audio_signal: Tensor of shape ``[batch, time, mfcc_features]``.
            length: Optional per-sample sequence lengths, forwarded to the
                masked convolutions.

        Returns:
            The last encoder output, or ``(output, length)`` when ``length``
            is provided.

        Raises:
            TypeError: If ``audio_signal`` is not a 3-D tensor.
        """
        if len(audio_signal.shape) != 3:
            raise TypeError("Unsupported input type : input must be of shape [batch,time,mfcc_features]")
        # ConvBlock expects channels-first input wrapped in a list: ([B, C, T], length).
        s_input, length = self.encoder(([audio_signal.permute(0, 2, 1)], length))
        if length is None:
            return s_input[-1]
        return s_input[-1], length