Source code for ducho.multimodal.multiple.visual_textual.VisualTextualDataset

import torch
from loguru import logger

from ducho.multimodal.visual.VisualDataset import VisualDataset
from ducho.multimodal.textual.TextualDataset import TextualDataset
import numpy, os


class VisualTextualDataset:
    """
    This class represents the Visual-Textual Dataset used for the data loading process.

    It wraps one VisualDataset and one TextualDataset, serves paired samples, and
    writes the extracted features to disk either separately or fused together.
    """

    def __init__(self, input_directory_path, output_directory_path, columns=None,
                 model_name='openai/clip-vit-base-patch32', reshape=(224, 224)):
        """
        It manages the Visual-Textual Dataset, which consists of input locations and
        output locations for both modalities.

        Args:
            input_directory_path: A mapping with the keys 'visual' and 'textual', holding the
                input location of the images and of the texts respectively.
            output_directory_path: A mapping with the keys 'visual' and 'textual', holding the
                output location for the visual and for the textual features respectively.
            columns: The columns to be considered; forwarded to the textual dataset.
            model_name: A string specifying the model to be used. It is forwarded to the visual
                dataset and used as a folder name for the outputs. This can be reset later.
            reshape: A tuple (int, int) representing the width and height for resizing the input
                images. It is only stored on this instance. This can be reset later.

        Returns:
            None
        """
        self._backend_libraries_list = None
        self._model_name = model_name
        self._reshape = reshape

        self.input_image_path = input_directory_path['visual']
        self.input_text_path = input_directory_path['textual']
        self.output_image_path = output_directory_path['visual']
        self.output_text_path = output_directory_path['textual']

        self._visual_dataset = VisualDataset(self.input_image_path, self.output_image_path, model_name)
        self._textual_dataset = TextualDataset(self.input_text_path, self.output_text_path, columns)

        # These setters are the bound methods of the wrapped visual dataset:
        # calling them on this object configures the visual dataset.
        self.set_framework = self._visual_dataset.set_framework
        self.set_model = self._visual_dataset.set_model
        self.set_preprocessing_flag = self._visual_dataset.set_preprocessing_flag

    def __len__(self):
        """
        Returns:
            The number of samples available in both modalities, i.e. the smaller of the two
            dataset sizes.
        """
        return min(self._visual_dataset._num_samples, self._textual_dataset._num_samples)

    def __getitem__(self, idx):
        """
        Args:
            idx: The index of the sample to load.

        Returns:
            A tuple (visual_input, textual_input) for the given index.
        """
        visual_input = self._visual_dataset[idx]
        # The textual dataset yields a pair; only its first element is used here.
        textual_input, _ = self._textual_dataset[idx]
        return visual_input, textual_input

    def create_output_file(self, index, extracted_data, model_layer, fusion=None):
        """
        This procedure is responsible for generating output files.

        Args:
            index: The index of the file to be processed.
            extracted_data: A tuple containing the extracted features, visual first and
                textual second.
            model_layer: The name of the output layer for the selected model.
            fusion: A string indicating the type of fusion to perform ('concat', 'sum', 'mul'
                or 'mean'). If None (or empty), the procedure generates two separate output
                files. Otherwise, it creates a single output file based on the specified
                fusion type.

        Returns:
            None

        Raises:
            ValueError: If the fusion type is not supported, or if 'sum', 'mul' or 'mean' is
                requested for embeddings whose shapes differ.
        """
        # The output name is the visual file name up to its FIRST dot,
        # e.g. 'a.b.jpg' is stored as 'a.npy'.
        input_file_name = self._visual_dataset._filenames[index].split('.')[0]
        output_file_name = input_file_name + '.npy'
        backend_library = self._visual_dataset._backend_libraries_list[0]

        if not fusion:
            # One file per modality, each below its own output folder.
            visual_dir = self._make_layer_dir(self.output_image_path, backend_library, model_layer)
            numpy.save(os.path.join(visual_dir, output_file_name), extracted_data[0])

            textual_dir = self._make_layer_dir(self.output_text_path, backend_library, model_layer)
            numpy.save(os.path.join(textual_dir, output_file_name), extracted_data[1])
            return

        # Fuse before touching the file system, so that invalid input does not
        # leave an empty directory tree behind.
        fused_data = self._fuse(extracted_data, fusion)

        # The fused folder is a sibling of the visual output folder and is named
        # '<visual folder>_<textual folder>_<fusion>'. The parent is taken with
        # dirname() so that only the last path component is dropped, even when the
        # same folder name also appears earlier in the path.
        image_root = os.path.normpath(self.output_image_path)
        text_root = os.path.normpath(self.output_text_path)
        fused_folder = f'{os.path.basename(image_root)}_{os.path.basename(text_root)}_{fusion}'
        fused_root = os.path.join(os.path.dirname(image_root), fused_folder)

        fused_dir = self._make_layer_dir(fused_root, backend_library, model_layer)
        numpy.save(os.path.join(fused_dir, output_file_name), fused_data)

    def _make_layer_dir(self, base_path, backend_library, model_layer):
        """
        Build '<base_path>/<backend_library>/<model name>/<model_layer>', create it if
        needed and return it.
        """
        layer_dir = os.path.join(base_path, backend_library, self._model_name, str(model_layer))
        os.makedirs(layer_dir, exist_ok=True)
        return layer_dir

    @staticmethod
    def _fuse(extracted_data, fusion):
        """
        Combine the visual and textual embeddings according to the fusion type.
        """
        if fusion == 'concat':
            return numpy.concatenate(extracted_data, axis=1)

        if fusion not in ('sum', 'mul', 'mean'):
            raise ValueError(f"Unsupported fusion type '{fusion}'. Expected one of: concat, sum, mul, mean.")

        # The remaining fusion types are element-wise and need matching shapes.
        if extracted_data[0].shape != extracted_data[1].shape:
            raise ValueError(f'The shapes of visual and textual embeddings should be the same for {fusion} fusion!')

        if fusion == 'sum':
            return numpy.add(extracted_data[0], extracted_data[1])
        if fusion == 'mul':
            return numpy.multiply(extracted_data[0], extracted_data[1])
        return numpy.mean(extracted_data, axis=0)

    def set_reshape(self, reshape):
        """
        Set the reshape variable according to the desired value.

        Args:
            reshape: Tuple (int, int) representing the width and height for resizing the input.

        Returns:
            None
        """
        self._reshape = reshape

    def set_model_name(self, model_name):
        """
        Set the model name for the serialization dir.

        Args:
            model_name: name of the multimodal model

        Returns:
            None
        """
        self._model_name = model_name