Source code for ducho.multimodal.multiple.visual_textual.VisualTextualDataset

import torch
from loguru import logger

from ducho.multimodal.visual.VisualDataset import VisualDataset
from ducho.multimodal.textual.TextualDataset import TextualDataset
import numpy, os



[docs]
class VisualTextualDataset:
    """
    This class represents the Visual-Textual Dataset used for the data loading process.
    """
    def __init__(self,
                 input_directory_path,
                 output_directory_path,
                 columns=None,
                 model_name='openai/clip-vit-base-patch32',
                 reshape=(224, 224)):
        """
        It manages the Visual-Textual Dataset, which consists of a folder containing input data and another folder for output data.
        It handles the preprocessing of input data and manages the output data.

        Args:
            input_directory_path: A string representing the path to the folder containing the input data to be processed.
            output_directory_path: A string representing the path to the folder where the output data will be stored. If the folder does not exist, it will be created.
            columns: A list of columns to be considered.
            model_name: A string specifying the model to be used. This can be reset later.
            reshape: A tuple (int, int) representing the width and height for resizing the input images. This can be reset later.

        Returns:
            None
        """
        self._backend_libraries_list = None
        self._model_name = model_name
        self._reshape = reshape
        self.input_image_path, self.input_text_path = input_directory_path['visual'], input_directory_path['textual']
        self.output_image_path, self.output_text_path = output_directory_path['visual'], output_directory_path['textual']
        self._visual_dataset = VisualDataset(self.input_image_path, self.output_image_path, model_name)
        self._textual_dataset = TextualDataset(self.input_text_path, self.output_text_path, columns)
        self.set_framework = self._visual_dataset.set_framework
        self.set_model = self._visual_dataset.set_model
        self.set_preprocessing_flag = self._visual_dataset.set_preprocessing_flag

    def __len__(self):
        return min(self._visual_dataset._num_samples, self._textual_dataset._num_samples)

    def __getitem__(self, idx):
        visual_input = self._visual_dataset.__getitem__(idx)
        textual_input, _ = self._textual_dataset.__getitem__(idx)
        return visual_input, textual_input


[docs]
    def create_output_file(self, index, extracted_data, model_layer, fusion=None):
        """
        This procedure is responsible for generating output files.

        Args:
            index: The index of the file to be processed.
            extracted_data: A tuple containing the extracted features.
            model_layer: The name of the output layer for the selected model.
            fusion: A string indicating the type of fusion to perform. If None, the procedure generates two separate output files.
                    Otherwise, it creates a single output file based on the specified fusion type.


        Returns:
            None
        """
        # generate file name
        input_file_name = self._visual_dataset._filenames[index].split('.')[0]
        output_file_name = input_file_name + '.npy'

        # generate output path
        backend_library = self._visual_dataset._backend_libraries_list[0]

        if not fusion:
            # visual
            output_image_path = os.path.join(self.output_image_path, backend_library)
            output_image_path = os.path.join(output_image_path, self._model_name)
            output_image_path = os.path.join(output_image_path, str(model_layer))
            if not os.path.exists(output_image_path):
                os.makedirs(output_image_path)
            # create file
            path = os.path.join(output_image_path, output_file_name)
            numpy.save(path, extracted_data[0])

            # textual
            output_text_path = os.path.join(self.output_text_path, backend_library)
            output_text_path = os.path.join(output_text_path, self._model_name)
            output_text_path = os.path.join(output_text_path, str(model_layer))
            if not os.path.exists(output_text_path):
                os.makedirs(output_text_path)
            # create file
            path = os.path.join(output_text_path, output_file_name)
            numpy.save(path, extracted_data[1])
        else:
            last_image_path = os.path.basename(os.path.normpath(self.output_image_path))
            last_text_path = os.path.basename(os.path.normpath(self.output_text_path))
            first_path = self.output_image_path.replace(last_image_path, '')
            output_path = f'{last_image_path}_{last_text_path}_{fusion}'
            output_path = os.path.join(first_path, output_path, backend_library)
            output_path = os.path.join(output_path, self._model_name)
            output_path = os.path.join(output_path, str(model_layer))
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            # create file
            path = os.path.join(output_path, output_file_name)
            # fusion
            if fusion == 'concat':
                extracted_data = numpy.concatenate(extracted_data, axis=1)
            elif fusion == 'sum':
                if extracted_data[0].shape != extracted_data[1].shape:
                    raise ValueError(f'The shapes of visual and textual embeddings should be the same for {fusion} fusion!')
                extracted_data = numpy.add(extracted_data[0], extracted_data[1])
            elif fusion == 'mul':
                if extracted_data[0].shape != extracted_data[1].shape:
                    raise ValueError(f'The shapes of visual and textual embeddings should be the same for {fusion} fusion!')
                extracted_data = numpy.multiply(extracted_data[0], extracted_data[1])
            elif fusion == 'mean':
                if extracted_data[0].shape != extracted_data[1].shape:
                    raise ValueError(f'The shapes of visual and textual embeddings should be the same for {fusion} fusion!')
                extracted_data = numpy.mean(extracted_data, axis=0)

            numpy.save(path, extracted_data)



[docs]
    def set_reshape(self, reshape):
        """
        Set the reshape variable according to the desired value.

        Args:
             reshape: Tuple (int, int) representing the width and height for resizing the input.

        Returns:
            None
        """
        self._reshape = reshape



[docs]
    def set_model_name(self, model_name):
        """
        Set the model name for the serialization dir.

        Args:
             model_name: name of the multimodal model

        Returns:
            None
        """
        self._model_name = model_name