# dataloader.py

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Column-name templates shared by the per-variable feature sets below;
# "{v}" is substituted with the variable letter ("S" or "T").
_PER_VARIABLE_TEMPLATES = (
    "abs_{v}_{v}min",
    "rel_{v}_{v}min_semi_width",
    "rel_{v}_{v}min_full_width",
    "abs_{v}_{v}max",
    "rel_{v}_{v}max_semi_width",
    "rel_{v}_{v}max_full_width",
    "count_anomalies_{v}",
    "ratio_anomalies_{v}",
    "max_variation_{v}",
)

# "S" columns: dropped when only the temperature features are requested.
S_features_filter = [name.format(v="S") for name in _PER_VARIABLE_TEMPLATES]
# "T" columns: dropped when only the salinity features are requested.
T_features_filter = [name.format(v="T") for name in _PER_VARIABLE_TEMPLATES]
# Presumably the columns common to both variables -- TODO confirm; not
# referenced by the code visible in this file.
B_features_filter = ["mean_correlation", "nb_measurements"]


class DataLoader:
    """Loads one pickled dataset variant and exposes train/test batches.

    After construction the instance carries:

    * ``train_df`` / ``test_df``: the (optionally undersampled) 80/20 split,
    * ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``: float32 arrays,
    * ``train_loader`` (shuffled) / ``test_loader`` (not shuffled):
      ``torch.utils.data.DataLoader`` objects over ``(features, label)`` pairs,
    * ``num_features``: 22 for ``"both"``, 11 for a single variable.
    """

    # Accepted values for the ``features`` argument.
    _VALID_FEATURES = ("both", "temperature", "salinity")

    def __init__(
        self,
        variant: str,
        features: str,
        batch_size: int,
        rebalance_train: bool,
        rebalance_test: bool,
        seed: int,
    ) -> None:
        """Creates a DataLoader object with the following attributes:

        Parameters
        ----------
        variant : str
            the variant of the dataset to load either v1,v2,v3,v4
        features : str
            the features to use in the dataset, either temperature, salinity or both
        batch_size : int
            the batch size to use in the DataLoader
        rebalance_train : bool
            whether to rebalance the training dataset
        rebalance_test : bool
            whether to rebalance the test dataset
        seed : int
            the seed to use for the train/test split and for undersampling

        Raises
        ------
        ValueError
            if ``features`` is not one of "both", "temperature" or "salinity"
        """

        self.features = features
        self.batch_size = batch_size
        self.rebalance_train = rebalance_train
        self.rebalance_test = rebalance_test
        self.seed = seed

        # Path is relative to the current working directory.
        pickle_path = f"dataset_pandas/dataset_2_{variant}.pkl"

        self.train_df, self.test_df = self._prepare_test_train_df(pickle_path)

        self.X_train, self.y_train, self.X_test, self.y_test = (
            self._prepare_features_and_labels()
        )

        # Only the training batches are shuffled; test order stays fixed.
        self.train_loader = torch.utils.data.DataLoader(
            list(zip(self.X_train, self.y_train)), batch_size=batch_size, shuffle=True
        )

        self.test_loader = torch.utils.data.DataLoader(
            list(zip(self.X_test, self.y_test)), batch_size=batch_size, shuffle=False
        )

    def _prepare_test_train_df(self, path) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Reads the dataset from a pickle file and prepares the training and testing datasets.
        According to whether the features are temperature, salinity or both.

        A single ``FalseorTrue`` label column is derived from ``FalseorTrue_T``
        and/or ``FalseorTrue_S``; for a single variable the other variable's
        feature columns and both alarm columns are removed and only rows whose
        alarm for that variable equals 1 are kept. ``self.num_features`` is set
        as a side effect.

        Parameters
        ----------
        path : str
            the path to the pickle file

        Returns
        -------
        tuple[pd.DataFrame, pd.DataFrame]
            the training and testing datasets

        Raises
        ------
        ValueError
            if ``self.features`` is not a supported value
        """
        # Fail fast: an unknown value would otherwise skip every branch below,
        # leave ``num_features`` unset and surface later as an unrelated KeyError.
        if self.features not in self._VALID_FEATURES:
            raise ValueError(
                f"features must be one of {self._VALID_FEATURES}, "
                f"got {self.features!r}"
            )

        # NOTE(security): unpickling can execute arbitrary code; only load
        # dataset files that come from a trusted source.
        df = pd.read_pickle(path)

        if self.features == "both":
            # create a single output column that is one if either of the FalseorTrue columns is true
            df["FalseorTrue"] = (
                df[["FalseorTrue_T", "FalseorTrue_S"]].any(axis=1).astype(int)
            )
            # No alarm filtering or alarm-column drop happens in this mode.
            self.num_features = 22
        elif self.features == "temperature":
            df["FalseorTrue"] = df["FalseorTrue_T"].astype(int)
            df = df.drop(columns=S_features_filter)
            df = df[df["alarm_temp"] == 1]
            df = df.drop(columns=["alarm_salinity", "alarm_temp"])
            self.num_features = 11
        else:  # "salinity"
            df["FalseorTrue"] = df["FalseorTrue_S"].astype(int)
            df = df.drop(columns=T_features_filter)
            df = df[df["alarm_salinity"] == 1]
            df = df.drop(columns=["alarm_salinity", "alarm_temp"])
            self.num_features = 11

        # drop the individual FalseorTrue columns
        df = df.drop(columns=["FalseorTrue_T", "FalseorTrue_S"])

        # split into training and testing sets (80% / 20%)
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=self.seed)

        if self.rebalance_train:
            train_df = self._undersample(train_df, "FalseorTrue")

        if self.rebalance_test:
            test_df = self._undersample(test_df, "FalseorTrue")

        return train_df, test_df

    def _prepare_features_and_labels(
        self,
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Prepares the features and labels for the training and testing datasets.

        Every column of ``self.train_df`` / ``self.test_df`` other than
        ``FalseorTrue`` is treated as a feature. All arrays are cast to float32.

        Returns
        -------
        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
            ``(X_train, y_train, X_test, y_test)``
        """
        # Separate train dataset into features and labels
        X_train = self.train_df.drop(columns=["FalseorTrue"]).values
        # The combined label was already computed when the frames were prepared.
        y_train = self.train_df["FalseorTrue"].values

        # Separate test dataset into features and labels
        X_test = self.test_df.drop(columns=["FalseorTrue"]).values
        y_test = self.test_df["FalseorTrue"].values

        X_train = X_train.astype(np.float32)
        y_train = y_train.astype(np.float32)

        X_test = X_test.astype(np.float32)
        y_test = y_test.astype(np.float32)

        return X_train, y_train, X_test, y_test

    def _undersample(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
        """
        Undersamples the majority class to balance the dataset.

        ``column`` is expected to hold binary labels 0 and 1; rows with any
        other value are not part of the result. Whichever label has more rows
        is randomly reduced (seeded with ``self.seed``) to the size of the
        smaller one. A label that is already at the minority size is kept
        as-is, in its original row order. If only one label is present there
        is nothing to balance against and its rows are returned unchanged.

        Parameters
        ----------

        df : pd.DataFrame
            the dataframe to undersample
        column : str
            the column to balance on

        Returns
        -------
        pd.DataFrame
            the undersampled dataframe, label-1 rows first, then label-0 rows
        """
        # calculate the number of samples in the minority class
        minority_count = df[column].value_counts().min()
        print(f"Min FalseorTrue: {minority_count}")

        balanced_parts = []
        for label in (1, 0):
            subset = df[df[column] == label]
            # Only the larger class is sampled; either label may be the majority.
            if len(subset) > minority_count:
                subset = subset.sample(minority_count, random_state=self.seed)
            balanced_parts.append(subset)

        return pd.concat(balanced_parts)