Source code for dataset

import torch
from torch.utils.data import Dataset
from torchaudio.datasets import SPEECHCOMMANDS
from torchaudio.datasets.utils import _load_waveform

from avalanche.benchmarks.utils import make_classification_dataset
from avalanche.benchmarks.datasets import default_dataset_location

import numpy as np

import os
import h5py
import typing
import logging
from tqdm import tqdm
from pathlib import Path


def speech_commands_collate(batch):
    """Collate function for setting up the dataloader.

    Args:
        batch (list): list of samples, each of the form
            (waveform, label, rate, speaker_id, utterance_number, task_label)

    Returns:
        tuple: batched data in the form (audio_tensor, target, task_label)
    """
    # FIXME: with precached data we seem to always take this first branch
    if len(batch) == 1:
        waveform, label, rate, sid, uid, t_label = batch[0]
        waveform = waveform.squeeze()
        tensor_size = waveform.size(0)
        size = 128
        if tensor_size < size:
            # Pad the single waveform up to the fixed size
            padding_size = size - tensor_size
            padded_tensor = torch.cat((waveform, torch.zeros(padding_size)), dim=0)
            return (
                padded_tensor.unsqueeze(0),
                torch.tensor(label).unsqueeze(0),
                torch.tensor(t_label).unsqueeze(0),
            )
        elif tensor_size > size:
            # Cut the waveform down to the fixed size
            cut_tensor = waveform[:size]
            return (
                cut_tensor.unsqueeze(0),
                torch.tensor(label).unsqueeze(0),
                torch.tensor(t_label).unsqueeze(0),
            )
        else:
            return (
                waveform.unsqueeze(0),
                torch.tensor(label).unsqueeze(0),
                torch.tensor(t_label).unsqueeze(0),
            )
    else:
        tensors, targets, t_labels = [], [], []
        # FIXME: this is only a temporary solution for iCaRL
        for waveform, label, rate, sid, uid, t_label in batch:
            if isinstance(waveform, np.ndarray):
                waveform = torch.from_numpy(waveform)
            elif isinstance(waveform, torch.Tensor):
                pass
            else:
                raise ValueError(
                    "Waveform must be stored as a torch.Tensor or np.ndarray"
                )
            tensor_size = waveform.size(0)
            size = 16000
            if tensor_size < size:
                padding_size = size - tensor_size
                waveform = torch.cat((waveform, torch.zeros(padding_size)), dim=0)
            elif tensor_size > size:
                waveform = waveform[:size]
            tensors += [waveform]
            targets += [torch.tensor(label)]
            t_labels += [torch.tensor(t_label)]
        # .t() is a no-op for 1-D waveforms and transposes 2-D feature tensors
        tensors = [item.t() for item in tensors]
        tensors = torch.nn.utils.rnn.pad_sequence(
            tensors, batch_first=True, padding_value=0.0
        )
        # if len(tensors.size()) == 2:
        #     # add feature dimension
        #     tensors = tensors.unsqueeze(-1)
        targets = torch.stack(targets)
        t_labels = torch.stack(t_labels)
        return tensors, targets, t_labels  # Fix for convolution .permute(0,2,1)
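# Usage sketch (illustrative, not part of the module): plugging the collate
# function into a standard DataLoader. `wrapped_dataset` is a hypothetical
# dataset yielding 6-tuples of
# (waveform, label, rate, speaker_id, utterance_number, task_label), as the
# Avalanche-wrapped datasets below do.
#
#   from torch.utils.data import DataLoader
#
#   loader = DataLoader(
#       wrapped_dataset,
#       batch_size=32,
#       shuffle=True,
#       collate_fn=speech_commands_collate,
#   )
#   waveforms, targets, task_labels = next(iter(loader))
#   # waveforms: (32, T), padded/cut to a common length per batch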
@torch.no_grad()
def preprocess_and_save_dataset(dataset, save_path: str, transformation, output_shape=[]):
    """Preprocess a dataset and save it to an HDF5 file.

    .. important::

        This function only works for the SpeechCommands dataset, or for
        datasets whose items have exactly these entries:
        wave, label, rate, speaker_id, utterance_number.

    Args:
        dataset (torch.utils.data.Dataset): The dataset to be processed.
        save_path (str): Save path for the preprocessed dataset.
        transformation (torch.nn.Module): The transformation that will be
            applied to the data.
        output_shape (list): Output shape of a transformed element.
            Defaults to [].

    Raises:
        AttributeError: If the given output shape is not a list or is an
            empty list.
    """
    if os.path.isfile(save_path):
        # The preprocessed dataset already exists, no need to preprocess again
        logging.info("The preprocessed dataset already exists, using cache")
        return

    if output_shape == [] or not isinstance(output_shape, list):
        raise AttributeError(
            "Specify the shape of the output of the transform using a list"
        )

    # Create a new HDF5 file to store the preprocessed data
    with h5py.File(save_path, 'w') as f:
        # Create HDF5 datasets for the waveform, rate, label, speaker_id,
        # and ut_number
        waveform_dset = f.create_dataset(
            'waveform', shape=(len(dataset), *output_shape),
            dtype=np.dtype('float32'),
        )
        rate_dset = f.create_dataset('rate', shape=(len(dataset),), dtype='int32')
        label_dset = f.create_dataset('label', shape=(len(dataset),), dtype='int32')
        speaker_id_dset = f.create_dataset(
            'speaker_id', shape=(len(dataset),),
            dtype=h5py.special_dtype(vlen=str),
        )
        ut_number_dset = f.create_dataset(
            'ut_number', shape=(len(dataset),), dtype='int32'
        )

        # Preprocess and save each item in the dataset
        with tqdm(total=len(dataset), desc="Preprocessing dataset") as pbar:
            for i, item in enumerate(dataset):
                wave, label, rate, speaker_id, ut_number = item

                # Zero-pad the waveform to one second at 16 kHz, then apply
                # the transformation
                wave = torch.nn.functional.pad(
                    input=wave, pad=[0, 16000 - wave.shape[0]],
                    mode='constant', value=0,
                )
                wave = transformation(wave.unsqueeze(0)).squeeze(0)

                # Add the preprocessed data to the HDF5 datasets
                waveform_dset[i] = wave.numpy()
                rate_dset[i] = rate
                label_dset[i] = label
                speaker_id_dset[i] = speaker_id
                ut_number_dset[i] = ut_number

                # Update the progress bar
                pbar.update(1)
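# Usage sketch (illustrative): caching a mel-spectrogram view of the training
# split. The MelSpectrogram settings and the resulting [64, 81] shape are
# assumptions of this sketch (torchaudio defaults: n_fft=400, hop_length=200
# on 1 s of 16 kHz audio), not values fixed by this module.
#
#   import torchaudio
#
#   transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=64)
#   train_set = SpeechCommandsData(
#       root='../dataset/', url="speech_commands_v0.02",
#       download=True, subset='training',
#   )
#   preprocess_and_save_dataset(
#       dataset=train_set,
#       save_path='../dataset_cache/preprocessed_train.h5',
#       transformation=transform,
#       output_shape=[64, 81],
#   )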
class SpeechCommandsData(SPEECHCOMMANDS):
    """Wrapper for torchaudio's SpeechCommands dataset."""

    def __init__(self, root, url, download, subset):
        super().__init__(root=root, download=download, subset=subset, url=url)
        self.labels_names = [
            "backward", "bed", "bird", "cat", "dog", "down", "eight", "five",
            "follow", "forward", "four", "go", "happy", "house", "learn",
            "left", "marvin", "nine", "no", "off", "on", "one", "right",
            "seven", "sheila", "six", "stop", "three", "tree", "two", "up",
            "visual", "wow", "yes", "zero",
        ]

    def __getitem__(self, item):
        wave, rate, label, speaker_id, ut_number = super().__getitem__(item)
        # Map the string label to its integer index and drop the channel
        # dimension
        label = self.labels_names.index(label)
        wave = wave.squeeze(0)  # (T,)
        return wave, label, rate, speaker_id, ut_number
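# Usage sketch (illustrative): unlike torchaudio's SPEECHCOMMANDS, which
# yields (wave, rate, label, speaker_id, utterance_number) with a string
# label and a (1, T) waveform, this wrapper yields
# (wave, label, rate, speaker_id, utterance_number) with an integer label
# and a 1-D waveform.
#
#   ds = SpeechCommandsData(
#       root='../dataset/', url="speech_commands_v0.02",
#       download=True, subset='testing',
#   )
#   wave, label, rate, speaker_id, ut_number = ds[0]
#   # wave: torch.Tensor of shape (T,); label: index into ds.labels_names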
class MLcommonsData():
    """Wrapper for a subset of the MLCommons `Multilingual Spoken Words
    dataset <https://mlcommons.org/en/multilingual-spoken-words/>`_.
    """

    def __init__(self, root, sub_folder, subset, folder_in_archive="MLCommons"):
        if sub_folder == 'subset1':
            self.labels_names = [
                'about', 'books', 'car', 'county', 'different', 'door',
                'felt', 'game', 'has', 'live', 'man', 'north', 'open',
                'party', 'put', 'run', 'side', 'sun', 'thing', 'trying',
                'who', 'words', 'back', 'boy', 'church', 'day', 'does',
                'end', 'friend', 'general', 'here', 'love', 'need', 'now',
                'our', 'people', 'river', 'service', 'song', 'sure', 'then',
                'treasure', 'why', 'you',
            ]
        elif sub_folder == 'subset2':
            self.labels_names = [
                "tried", "hey", "career", "south", "please", "working",
                "building", "old", "around", "company", "himself",
                "language", "album", "family", "young", "returned",
                "important", "throughout", "understand", "include",
                "business", "daughter", "everything", "englishman",
                "between", "outside",
                'about', 'books', 'car', 'county', 'different', 'door',
                'felt', 'game', 'has', 'live', 'man', 'north', 'open',
                'party', 'put', 'run', 'side', 'sun', 'thing', 'trying',
                'who', 'words', 'back', 'boy', 'church', 'day', 'does',
                'end', 'friend', 'general', 'here', 'love', 'need', 'now',
                'our', 'people', 'river', 'service', 'song', 'sure', 'then',
                'treasure', 'why', 'you',
            ]
        else:
            raise NotImplementedError()

        if subset is not None and subset not in ["training", "validation", "testing"]:
            raise ValueError(
                "When `subset` is not None, it must be one of "
                "['training', 'validation', 'testing']."
            )

        # Get string representation of 'root' in case a Path object is passed
        root = os.fspath(root)
        self._archive = os.path.join(root, folder_in_archive)

        basename = os.path.basename(sub_folder)
        archive = os.path.join(root, basename)

        basename = basename.rsplit(".", 2)[0]
        folder_in_archive = os.path.join(folder_in_archive, basename)

        self._path = os.path.join(root, folder_in_archive)

        if not os.path.exists(self._path):
            raise RuntimeError(
                f"The path {self._path} doesn't exist. "
                "Please check the ``root`` path or set `download=True` to download it"
            )

        if subset == "validation":
            self._walker = self._load_list(self._path, "validation_list.txt")
        elif subset == "testing":
            self._walker = self._load_list(self._path, "testing_list.txt")
        elif subset == "training":
            # Everything not listed for validation or testing belongs to the
            # training split
            excludes = set(
                self._load_list(self._path, "validation_list.txt", "testing_list.txt")
            )
            walker = sorted(str(p) for p in Path(self._path).glob("*/*.wav"))
            self._walker = [
                w for w in walker if os.path.normpath(w) not in excludes
            ]
        else:
            walker = sorted(str(p) for p in Path(self._path).glob("*/*.wav"))
            self._walker = [w for w in walker]

    def _load_list(self, root, *filenames):
        output = []
        for filename in filenames:
            filepath = os.path.join(root, filename)
            with open(filepath) as fileobj:
                output += [
                    os.path.normpath(os.path.join(root, line.strip()))
                    for line in fileobj
                ]
        return output
    def get_metadata(self, n: int) -> typing.Tuple[str, int, str]:
        relpath = os.path.relpath(self._walker[n], self._archive)
        reldir, filename = os.path.split(relpath)
        _, label = os.path.split(reldir)
        return relpath, 16000, label
    def __getitem__(self, n: int) -> typing.Tuple[torch.Tensor, int, int, int, int]:
        metadata = self.get_metadata(n)
        waveform = _load_waveform(self._archive, metadata[0], metadata[1])
        # Match the SpeechCommandsData item layout
        # (wave, label, rate, speaker_id, utterance_number); speaker id and
        # utterance number are not available here, so both are set to 0
        return (
            waveform.squeeze(0),
            self.labels_names.index(metadata[2]),
            metadata[1],
            0,
            0,
        )

    def __len__(self) -> int:
        return len(self._walker)
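# Usage sketch (illustrative): MLcommonsData mimics the SpeechCommandsData
# item layout, so it can be fed to the same collate function. The directory
# layout '../dataset/MLCommons/subset1/<word>/<clip>.wav', with
# validation_list.txt and testing_list.txt present, is an assumption of this
# sketch.
#
#   ml = MLcommonsData(root='../dataset/', sub_folder='subset1', subset='training')
#   wave, label, rate, sid, uid = ml[0]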
class CachedAudio(Dataset):
    """Wrapper for cached `hdf5 <https://www.h5py.org/>`_ audio datasets."""

    def __init__(self, subset, train_cache_path='../dataset_cache/',
                 test_cache_path='../dataset_cache/'):
        self.labels_names = [
            "backward", "bed", "bird", "cat", "dog", "down", "eight", "five",
            "follow", "forward", "four", "go", "happy", "house", "learn",
            "left", "marvin", "nine", "no", "off", "on", "one", "right",
            "seven", "sheila", "six", "stop", "three", "tree", "two", "up",
            "visual", "wow", "yes", "zero",
        ]
        self.subset = subset
        self.preprocessed_train_path = os.path.join(
            train_cache_path, "preprocessed_train.h5"
        )
        self.preprocessed_test_path = os.path.join(
            test_cache_path, "preprocessed_test.h5"
        )

    def __len__(self):
        if self.subset == 'training':
            with h5py.File(self.preprocessed_train_path, 'r') as f:
                return f['waveform'].len()
        elif self.subset == 'testing':
            with h5py.File(self.preprocessed_test_path, 'r') as f:
                return f['waveform'].len()
        else:
            raise ValueError("Unknown data subset. Choose from: training or testing.")

    def __getitem__(self, item):
        if self.subset == 'training':
            path = self.preprocessed_train_path
        elif self.subset == 'testing':
            path = self.preprocessed_test_path
        else:
            raise ValueError("Unknown data subset. Choose from: training or testing.")

        # Load the preprocessed item from the cache
        with h5py.File(path, 'r') as f:
            wave = f['waveform'][item]
            rate = f['rate'][item]
            label = f['label'][item]
            speaker_id = f['speaker_id'][item]
            ut_number = f['ut_number'][item]

        return torch.from_numpy(wave), label, rate, speaker_id, ut_number
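# Usage sketch (illustrative): reading items back from a cache produced by
# preprocess_and_save_dataset(). The default paths assume the training cache
# already exists at '../dataset_cache/preprocessed_train.h5'.
#
#   cached = CachedAudio(subset='training')
#   wave, label, rate, speaker_id, ut_number = cached[0]
#   # wave: torch.Tensor with the transformed shape stored in the cache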
class Audio_Dataset():
    """Avalanche audio datasets wrapper."""

    def __init__(self, train_transformation=None, test_transformation=None):
        self.labels_names = [
            "backward", "bed", "bird", "cat", "dog", "down", "eight", "five",
            "follow", "forward", "four", "go", "happy", "house", "learn",
            "left", "marvin", "nine", "no", "off", "on", "one", "right",
            "seven", "sheila", "six", "stop", "three", "tree", "two", "up",
            "visual", "wow", "yes", "zero",
        ]
        self.commons_names = [
            'about', 'books', 'car', 'county', 'different', 'door', 'felt',
            'game', 'has', 'live', 'man', 'north', 'open', 'party', 'put',
            'run', 'side', 'sun', 'thing', 'trying', 'who', 'words', 'back',
            'boy', 'church', 'day', 'does', 'end', 'friend', 'general',
            'here', 'love', 'need', 'now', 'our', 'people', 'river',
            'service', 'song', 'sure', 'then', 'treasure', 'why', 'you',
        ]
        self.commons_names2 = [
            "tried", "hey", "career", "south", "please", "working",
            "building", "old", "around", "company", "himself", "language",
            "album", "family", "young", "returned", "important",
            "throughout", "understand", "include", "business", "daughter",
            "everything", "englishman", "between", "outside",
            'about', 'books', 'car', 'county', 'different', 'door', 'felt',
            'game', 'has', 'live', 'man', 'north', 'open', 'party', 'put',
            'run', 'side', 'sun', 'thing', 'trying', 'who', 'words', 'back',
            'boy', 'church', 'day', 'does', 'end', 'friend', 'general',
            'here', 'love', 'need', 'now', 'our', 'people', 'river',
            'service', 'song', 'sure', 'then', 'treasure', 'why', 'you',
        ]
        self.train_transformation = train_transformation
        self.test_transformation = test_transformation
        self.transform_groups = {
            'train': (self.train_transformation, None),
            'eval': (self.test_transformation, None),
        }
    def SpeechCommands(
        self,
        root=default_dataset_location("speechcommands"),
        url="speech_commands_v0.02",
        download=True,
        subset=None,
        transforms=None,
        pre_process=True,
        output_shape=[],
    ):
        """SpeechCommands dataset wrapper function for the Avalanche lib.

        Args:
            root (str, optional): dataset root location. Defaults to
                default_dataset_location("speechcommands").
            url (str, optional): version name of the dataset. Defaults to
                "speech_commands_v0.02".
            download (bool, optional): automatically download the dataset,
                if not present. Defaults to True.
            subset (str, optional): one of 'training', 'validation',
                'testing'. Defaults to None.
            transforms (torch.nn.Module, optional): transformations applied
                to the data. Defaults to None.
            pre_process (bool, optional): enable prior preprocessing and
                saving of the dataset. Defaults to True.
            output_shape (list): output shape of a transformed element.

        Raises:
            ValueError: If an unknown subset is chosen.

        Returns:
            ClassificationDataset: Avalanche's classification dataset.
        """
        # NOTE: the ``root`` argument is currently ignored; the dataset is
        # stored under the hard-coded '../dataset/' location
        dataset = SpeechCommandsData(
            root='../dataset/',
            download=download,
            subset=subset,
            url=url,
        )
        if pre_process:
            if subset == 'training':
                self.preprocessed_train_path = os.path.join(
                    '../dataset_cache/', "preprocessed_train.h5"
                )
                preprocess_and_save_dataset(
                    dataset=dataset,
                    save_path=self.preprocessed_train_path,
                    transformation=self.train_transformation,
                    output_shape=output_shape,
                )
                cached_dataset = CachedAudio(subset=subset)
                labels = [datapoint[1] for datapoint in cached_dataset]
            elif subset == 'testing':
                self.preprocessed_test_path = os.path.join(
                    '../dataset_cache/', "preprocessed_test.h5"
                )
                preprocess_and_save_dataset(
                    dataset=dataset,
                    save_path=self.preprocessed_test_path,
                    transformation=self.test_transformation,
                    output_shape=output_shape,
                )
                cached_dataset = CachedAudio(subset=subset)
                labels = [datapoint[1] for datapoint in cached_dataset]
            else:
                raise ValueError(
                    "Unknown data subset. Choose from: training or testing."
                )
            return make_classification_dataset(
                cached_dataset,
                collate_fn=speech_commands_collate,
                targets=labels,
            )
        else:
            labels = [datapoint[1] for datapoint in dataset]
            return make_classification_dataset(
                dataset,
                collate_fn=speech_commands_collate,
                targets=labels,
                transform_groups=self.transform_groups,
            )
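    # Usage sketch (illustrative): building the training set with a
    # precomputed cache. The MFCC settings and the [40, 81] output shape are
    # assumptions of this sketch (torchaudio defaults on 1 s of 16 kHz
    # audio), not values fixed by this module.
    #
    #   import torchaudio
    #
    #   mfcc = torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=40)
    #   data = Audio_Dataset(train_transformation=mfcc, test_transformation=mfcc)
    #   train_set = data.SpeechCommands(
    #       subset='training', pre_process=True, output_shape=[40, 81]
    #   )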
    def MLCommons(
        self,
        root='../dataset/',
        sub_folder="subset2",
        subset="training",
        transforms=None,
    ):
        """MLCommons dataset wrapper function for the Avalanche lib.

        Args:
            root (str, optional): dataset root location. Defaults to
                '../dataset/'.
            sub_folder (str, optional): dataset subset folder. Defaults to
                "subset2".
            subset (str, optional): one of 'training', 'validation',
                'testing'. Defaults to "training".
            transforms (torch.nn.Module, optional): transformations applied
                to the data. Defaults to None.

        Returns:
            ClassificationDataset: Avalanche's classification dataset.
        """
        # Because the MLCommons dataset has the same structure as the
        # SpeechCommands dataset, we can use the same wrapper. The validation
        # and testing lists may be left empty, because this data is only used
        # for pretraining.
        dataset = MLcommonsData(
            root=root,
            sub_folder=sub_folder,
            subset=subset,
        )
        labels = [datapoint[1] for datapoint in dataset]
        return make_classification_dataset(
            dataset,
            collate_fn=speech_commands_collate,
            targets=labels,
            transform_groups=self.transform_groups,
        )
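    # Usage sketch (illustrative): loading the MLCommons pretraining subset.
    # This assumes the data was already extracted under
    # '../dataset/MLCommons/subset2'.
    #
    #   data = Audio_Dataset()
    #   pretrain_set = data.MLCommons(sub_folder="subset2", subset="training")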
    def __call__(self, train, pre_process, output_shape=[]):
        """Function call to Audio_Dataset.

        Args:
            train (bool): True for the training subset, False for testing.
            pre_process (bool): preprocess the whole dataset beforehand
                instead of doing the preprocessing on the fly.
            output_shape (list): output shape of a transformed element.

        Returns:
            ClassificationDataset: Avalanche-compatible SpeechCommands
                dataset.
        """
        if train:
            return self.SpeechCommands(
                subset='training',
                pre_process=pre_process,
                transforms=self.transform_groups,
                output_shape=output_shape,
            )
        else:
            return self.SpeechCommands(
                subset='testing',
                pre_process=pre_process,
                transforms=self.transform_groups,
                output_shape=output_shape,
            )
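# Usage sketch (illustrative): the callable interface mirrors
# SpeechCommands(). With `data` built as in the SpeechCommands sketch above:
#
#   train_set = data(train=True, pre_process=True, output_shape=[40, 81])
#   test_set = data(train=False, pre_process=True, output_shape=[40, 81])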