Source code for ducho.multimodal.audio.AudioFeatureExtractor
import torchaudio
import torch
import numpy as np
from ducho.internal.father_classes.FeatureExtractorFather import FeatureExtractorFather
from transformers import Wav2Vec2Model
[docs]
class AudioFeatureExtractor(FeatureExtractorFather):
    """
    Audio Feature Extractor: loads a pretrained audio model and turns preprocessed
    audio into numpy feature arrays.

    Two backends are handled, selected through `self._backend_libraries_list`:
    'torch' / 'torchaudio' (a bundle looked up in `torchaudio.pipelines`) and
    'transformers' (a `Wav2Vec2Model` checkpoint).
    """

    def __init__(self, gpu='-1'):
        """
        Create the extractor. No model is loaded here; call `set_model` afterwards.

        Args:
            gpu: A string indicating the GPU to be used. '-1' specifies the CPU.

        Returns:
            None
        """
        self._model_to_initialize = None
        self._tokenizer = None
        super().__init__(gpu)

    def set_model(self, model):
        """
        Configure the Audio Feature Extractor model from the YAML specifications.

        The model is loaded according to the configured backend, moved to
        `self._device` and switched to evaluation mode. If no supported backend
        is configured, only the model name is recorded.

        Args:
            model: The row of the YAML file containing the user's specifications;
                its 'name' entry identifies the model to load.

        Returns:
            None
        """
        model_name = model['name']
        self._model_name = model_name
        if 'torch' in self._backend_libraries_list or 'torchaudio' in self._backend_libraries_list:
            # each pretrained bundle is an attribute of `torchaudio.pipelines`
            model_to_initialize = getattr(torchaudio.pipelines, model_name)
            self._model = model_to_initialize.get_model()
        elif 'transformers' in self._backend_libraries_list:
            self._model = Wav2Vec2Model.from_pretrained(self._model_name)
        else:
            # unsupported backend: nothing to load
            return
        # same placement and inference mode for both backends, so that
        # `extract_feature` can always send its input to `self._device`
        self._model.to(self._device)
        self._model.eval()

    def extract_feature(self, sample_input):
        """
        Extract features from the input data. Prior to calling this function, the framework,
        model, and layer have to be configured using their respective set methods.

        Args:
            sample_input: The preprocessed data, indexed as (audio, sample_rate).
                Only the audio tensor is used here.

        Returns:
            A numpy array representing the extracted features, which will be stored in a .npy
            file using the appropriate method of the Dataset Class. None is returned when no
            supported backend is configured.
        """
        # sample_input[1] (the sample rate) is not needed by the extraction itself
        audio = sample_input[0]
        if 'torch' in self._backend_libraries_list or 'torchaudio' in self._backend_libraries_list:
            # the model lives on self._device, so the input has to be there as well
            audio = audio.to(self._device)
            extract_features = getattr(self._model, 'extract_features', None)
            if extract_features is not None:
                # num_layers is the number of layers to go through
                with torch.no_grad():
                    features, _ = extract_features(audio, num_layers=self._output_layer)
                # keep the output of the last layer that was traversed;
                # .cpu() is required before .numpy() when running on a GPU
                return features[-1].detach().cpu().numpy()
            # the model has no `extract_features` method: run a plain forward pass,
            # dropping the trailing layers when the model ends with a Linear layer
            children = list(self._model.children())
            if isinstance(children[-1], torch.nn.Linear):
                # NOTE(review): an output layer of 0 makes this slice empty ([:-0]) -- confirm
                # that 0 is never configured for this kind of model
                feature_model = torch.nn.Sequential(*children[:-self._output_layer])
            else:
                feature_model = self._model
            feature_model.eval()
            with torch.no_grad():
                # audio[None, ...] adds a leading dimension before the forward pass
                output = feature_model(audio[None, ...])
            # update the framework list
            self._backend_libraries_list = ['torch']
            return np.squeeze(output.detach().cpu().numpy())
        elif 'transformers' in self._backend_libraries_list:
            # feature extraction
            with torch.no_grad():
                outputs = self._model(audio.to(self._device), output_hidden_states=True)
            # layer extraction
            layer_output = outputs.hidden_states[self._output_layer]
            return layer_output.detach().cpu().numpy()
        return None