# Source code for ducho.multimodal.audio.AudioFeatureExtractor

import torchaudio
import torch
import numpy as np
from ducho.internal.father_classes.FeatureExtractorFather import FeatureExtractorFather
from transformers import Wav2Vec2Model


class AudioFeatureExtractor(FeatureExtractorFather):
    """
    This class represents the Audio Feature Extractor utilized for feature extraction.

    Note:
        ``self._device``, ``self._backend_libraries_list``, ``self._output_layer``
        and ``self._model`` are read here but not assigned in ``__init__``;
        presumably they are provided by ``FeatureExtractorFather`` and its
        setters -- confirm against the parent class.
    """

    def __init__(self, gpu='-1'):
        """
        Initialize the Audio Feature Extractor.

        The model itself is configured later through ``set_model``.

        Args:
            gpu: A string indicating the GPU to be used. '-1' specifies the CPU.

        Returns:
            None
        """
        self._model_to_initialize = None
        self._tokenizer = None
        super().__init__(gpu)

    def set_model(self, model):
        """
        Configure the Audio Feature Extractor model using YAML specifications.

        With a 'torch'/'torchaudio' backend the model is looked up by name in
        ``torchaudio.pipelines``; with a 'transformers' backend it is loaded
        through ``Wav2Vec2Model.from_pretrained``. In both cases the model is
        moved to ``self._device`` and switched to evaluation mode. If no known
        backend is configured, nothing is loaded.

        Args:
            model: The row of the YAML file containing the user's
                specifications. Only its 'name' entry is read here.

        Returns:
            None
        """
        model_name = model['name']
        self._model_name = model_name

        backends = self._backend_libraries_list
        if 'torch' in backends or 'torchaudio' in backends:
            pipeline_bundle = getattr(torchaudio.pipelines, model_name)
            self._model = pipeline_bundle.get_model()
        elif 'transformers' in backends:
            self._model = Wav2Vec2Model.from_pretrained(self._model_name)
        else:
            return

        # Both backends get the same placement/mode, so inputs moved to
        # self._device in extract_feature always match the model's device.
        self._model.to(self._device)
        self._model.eval()

    def extract_feature(self, sample_input):
        """
        Extract features from the input data.

        Prior to calling this function, the framework, model, and layer have to
        be configured using their respective set methods.

        Args:
            sample_input: The preprocessed data. Element 0 is the audio tensor
                fed to the model; element 1 is the sample rate, which is not
                used by the extraction itself.

        Returns:
            A numpy array representing the extracted features, which will be
            stored in a .npy file using the appropriate method of the Dataset
            Class. ``None`` is returned when no known backend is configured.
        """
        audio = sample_input[0]
        backends = self._backend_libraries_list

        if 'torch' in backends or 'torchaudio' in backends:
            # Probe for the method instead of wrapping the whole forward pass
            # in ``except AttributeError``: that would also swallow unrelated
            # attribute errors raised while the model is running.
            extract_features = getattr(self._model, 'extract_features', None)

            if extract_features is not None:
                with torch.no_grad():
                    # num_layers is the number of layers to go through
                    features, _ = extract_features(
                        audio.to(self._device),
                        num_layers=self._output_layer
                    )
                # The last returned entry is the output of the requested layer.
                # ``.cpu()`` is required before ``.numpy()`` for CUDA tensors.
                return features[-1].detach().cpu().numpy()

            # Fallback for plain torch modules without ``extract_features``:
            # when the model ends with a Linear head, drop the trailing
            # ``self._output_layer`` children and use the rest as extractor.
            children = list(self._model.children())
            if isinstance(children[-1], torch.nn.Linear):
                feature_model = torch.nn.Sequential(*children[:-self._output_layer])
            else:
                feature_model = self._model
            feature_model.eval()
            with torch.no_grad():
                # Add a leading batch dimension before the forward pass.
                prediction = feature_model(audio[None, ...].to(self._device))
            output = np.squeeze(prediction.detach().cpu().numpy())
            # update the framework list
            self._backend_libraries_list = ['torch']
            return output

        elif 'transformers' in backends:
            # feature extraction
            with torch.no_grad():
                outputs = self._model(
                    audio.to(self._device),
                    output_hidden_states=True
                )
            # layer extraction
            layer_output = outputs.hidden_states[self._output_layer]
            return layer_output.detach().cpu().numpy()