Source code for matchbox.ConvASRDecoder

# Modified code : original was taken from the nvidia nemo library https://github.com/NVIDIA/NeMo/blob/557c4b7ae766faf050374e6b9a862e2e67385b10/nemo/collections/asr/modules/conv_asr.py#L56

from matchbox.ConvBlock import init_weights
from typing import Optional
import torch
import torch.nn as nn

[docs]class ConvASRDecoderClassification(nn.Module):
    def __init__(
        self,
        feat_in: int,
        num_classes: int,
        init_mode: Optional[str] = "xavier_uniform",
        return_logits: bool = True,
        pooling_type='avg',
    ):
        """Matchbox net decoder

        Args:
            feat_in (int): number of futures inputed
            num_classes (int): number of classes on the output layer
            init_mode (Optional[str], optional): Weight initialisation strategy. Defaults to "xavier_uniform".
            return_logits (bool, optional): Return logit or softmax for greater flexibility. Defaults to True.
            pooling_type (str, optional): Pooling layer type on the output of the encoder. Defaults to 'avg'.

        Raises:
            ValueError: if pooling type isn't supported
        """
        super(ConvASRDecoderClassification, self).__init__()

        self._feat_in = feat_in
        self._return_logits = return_logits
        self._num_classes = num_classes

        if pooling_type == 'avg':
            self.pooling = torch.nn.AdaptiveAvgPool1d(1)
        elif pooling_type == 'max':
            self.pooling = torch.nn.AdaptiveMaxPool1d(1)
        else:
            raise ValueError('Pooling type chosen is not valid. Must be either `avg` or `max`')

        self.decoder_layers = torch.nn.Sequential(torch.nn.Linear(self._feat_in, self._num_classes, bias=True))
        self.apply(lambda x: init_weights(x, mode=init_mode))

[docs]    def forward(self, encoder_output):
        batch, in_channels, timesteps = encoder_output.size()

        encoder_output = self.pooling(encoder_output).view(batch, in_channels)  # [B, C]
        logits = self.decoder_layers(encoder_output)  # [B, num_classes]

        if self._return_logits:
            return logits

        return torch.nn.functional.softmax(logits, dim=-1)