added experiment for data scores in the main

4eca4b1f · MOREAU Yanice, Maurice, Gerard · 77c83668 · 4eca4b1f · 4eca4b1f · 4eca4b1f
Commit 4eca4b1f authored 1 year ago by MOREAU Yanice, Maurice, Gerard
--- a/README.md
+++ b/README.md
 # ML Project - Operator Decision
+
 This repository is an implementation of a Multi Layer Perceptron for the classification of alarms raised by the statistical model implemented by PokaPok association for the monitoring of the state of the ocean.

 To set the environment for training and running the model, install the requirements using [environment.yaml]()

-
 The repository is made of the following folders:

 * [dataset_pandas](): Resulting datasets from feature engineering of the profiler's data
@@ -27,6 +27,7 @@ To **run** the model, the standalone script ``main.py`` is available. In the [ma

 1. ``--run single``: to run a single training of the model. This is useful to investigate the accuracy and loss evolution plots, as well as the confusion matrix for different setups.
 2. ``--run multiple``: to run the model for a fixed model seed and different data splits. This is used to investigate the robustness of the model.
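+3. ``--run generate_scores``: to generate statistics about the data (how many times each profile is misclassified as a false positive or false negative across runs). These are stored as JSON files in the logs and can then be plotted using the file ``bat_profile_plots.py``.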

 Two notebooks are provided to help gain information about the datasets:


--- a/generate_data_scores.py
+++ b/generate_data_scores.py
@@ -440,19 +440,6 @@ def sort_testdata_into_cm(test_df, y_test_pred, y_test_pred_binary):

    return results

-def increment_dict_key(d, key):
-    """
-    Increment the value of a key in a dictionary by one if it exists.
-    Otherwise, create the key with a value of one.
-    
-    Parameters:
-    d (dict): The dictionary to update.
-    key: The key to increment or create.
-    """
-    if key in d.keys():
-        d[key] += 1
-    else:
-        d[key] = 1

 if __name__ == "__main__":
    false_positives = {}
@@ -510,11 +497,3 @@ if __name__ == "__main__":
        json.dump(false_positives, json_file)
    with open('./logs/false_negatives_2_v2.json', 'w') as json_file:
        json.dump(false_negatives, json_file)
\ No newline at end of file
-
-# TEST LOGS
-
-
-## Dataset_2_v1
-# 100 runs in a row and check the FP,FN
-# 55 FN / 11.9 FP  :  avg on the test set 
-# Few regular FN but a lot for FP
\ No newline at end of file
--- a/main.py
+++ b/main.py
@@ -8,6 +8,7 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.metrics import confusion_matrix
 import argparse
+import json

 args = argparse.ArgumentParser()
 args.add_argument("--run", type=str, default="single")
@@ -106,9 +107,69 @@ def multiple_runs():
            CSV_NAME, data_seed, model_seed, accuracy, recall, precision, f1, f2
        )

+def generate_scores():
+    """
+    This function generates a json file counting how many times 
+    each profile was classified wrong over N_RUNS runs
+    """
+
+    # Parameters
+    N_RUNS = 1
+    DATASET_NAME = "2_v2" # For the saved JSON names, you should change the actual dataset in mlp.py
+
+    def increment_dict_key(d, key):
+        if key in d.keys():
+            d[key] += 1
+        else:
+            d[key] = 1
+
+    false_positives = {}
+    false_negatives = {}
+
+    # Trainings loop
+    for seed in range(N_RUNS):
+            
+        # Instantiate the DataLoader with the desired parameters
+        dl = DataLoader(
+            variant="v1",
+            features="both",
+            batch_size=32,
+            seed=seed,
+            rebalance_train=True,
+            rebalance_test=True,  ## Gives you a better idea of the model's performance
+        )
+
+        # Instantiate the MLPWrapper with the desired parameters
+        mlpw = m.MLPWrapper(
+            input_features=dl.num_features,
+            growth_rate=16,
+            train_loader=dl.train_loader,
+            test_loader=dl.test_loader,
+            learning_rate=1e-4,
+            model_seed=seed,
+            device="cpu",
+        )
+        mlpw.train(epochs=20)
+        y_test_pred, y_test_pred_binary = m.get_evaluation(mlpw.model, dl.X_test)
+
+        results = m.sort_testdata_into_cm(dl.test_df, y_test_pred, y_test_pred_binary)
+        for index, row in results.iterrows():
+            if row["CM"] == "FP":
+                increment_dict_key(false_positives, index)
+            if row["CM"] == "FN":
+                increment_dict_key(false_negatives, index)
+
+    # Save results to JSON files
+    with open(f'./logs/false_positives_{DATASET_NAME}.json', 'w') as json_file:
+        json.dump(false_positives, json_file)
+    with open(f'./logs/false_negatives_{DATASET_NAME}.json', 'w') as json_file:
+        json.dump(false_negatives, json_file)
+

 if __name__ == "__main__":
    if args.run == "single":
        run_and_plot_distributions()
    elif args.run == "multiple":
        multiple_runs()
+    elif args.run == "generate_scores":
+        generate_scores()