Commit 998bf353 authored by CHAUDHARY Akshat's avatar CHAUDHARY Akshat

all functions have been added

parent 531494af
Branch: main
from typing import Tuple
from dataset.data import Dataset
from dataset.nfv2 import load_data
import math
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
SEED = 1138
# Load the sampled datasets; load_data returns a (train, test) pair of Dataset objects
cicids: Tuple[Dataset, Dataset] = load_data("data/sampled/cicids_sampled.csv.gz", seed=SEED, n_partition=1, only_benign=False)
botiot: Tuple[Dataset, Dataset] = load_data("data/sampled/botiot_sampled.csv.gz", seed=SEED, n_partition=1, only_benign=False)
nb15: Tuple[Dataset, Dataset] = load_data("data/sampled/nb15_sampled.csv.gz", seed=SEED, n_partition=1, only_benign=False)
toniot: Tuple[Dataset, Dataset] = load_data("data/sampled/toniot_sampled.csv.gz", seed=SEED, n_partition=1, only_benign=False)
cicids_train, cicids_test = cicids
botiot_train, botiot_test = botiot
nb15_train, nb15_test = nb15
toniot_train, toniot_test = toniot
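# The partition functions below expect a list of training sets, where each element
# unpacks as (features, binary label, metadata) and the metadata has an "Attack"
# column (this matches how the functions index dataset[0], dataset[1] and
# dataset[2]["Attack"]). The list name is only a convention used in the usage
# sketches that follow, not part of the loader API.
datasets_train = [cicids_train, botiot_train, nb15_train, toniot_train]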
import random
@@ -116,7 +104,7 @@ def partition_3(dataset_train):
    return partitions
# Partition 4 : One client per dataset : All labels
def partition_4(datasets):
    partitions = []
@@ -142,6 +130,172 @@ def partition_4(datasets)
    return partitions
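# Usage sketch (assumes the datasets_train list defined above): one client per
# source dataset, each client keeping every label of its dataset.
partitions_4 = partition_4(datasets_train)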
# Partition 5 : One client per dataset : Some labels : NUM_REMOVED labels randomly removed from each dataset
# By default NUM_REMOVED = 2, but it must not exceed the number of classes in any dataset!
def partition_5(datasets, NUM_REMOVED=2):
    partitions = []
    for i, dataset in enumerate(datasets):
        X = dataset[0]
        y = dataset[1]
        m = dataset[2]["Attack"]
        # Randomly select NUM_REMOVED classes to remove
        classes_to_remove = random.sample(m.unique().tolist(), NUM_REMOVED)
        # Filter the dataset to drop the selected classes
        mask = ~m.isin(classes_to_remove)
        X_filtered = X[mask]
        y_filtered = y[mask]
        m_filtered = m[mask]
        partition = (X_filtered, y_filtered, m_filtered)
        partitions.append(partition)
        # Plot the label distribution of the partition
        label_counts = m_filtered.value_counts()
        plt.figure(figsize=(6, 4))
        plt.bar(label_counts.index, label_counts.values)
        plt.title(f"Partition {i+1}")
        plt.xlabel("Attack Label")
        plt.ylabel("Count")
        plt.xticks(rotation=90)
        plt.show()
    return partitions
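# Usage sketch: drop NUM_REMOVED randomly chosen attack classes from each dataset.
# Seeding the random module first is an optional assumption made here so the
# removed classes are reproducible across runs.
random.seed(SEED)
partitions_5 = partition_5(datasets_train, NUM_REMOVED=2)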
# Partition 6 : Multiple clients per dataset : One label per client within each dataset
def partition_6(datasets):
    partitions = []
    unique_labels = set()
    # Collect unique labels from all datasets
    for dataset in datasets:
        m = dataset[2]["Attack"]
        unique_labels.update(m.unique())
    # Create partitions with unique labels
    partition_number = 1
    for label in unique_labels:
        for i, dataset in enumerate(datasets):
            X = dataset[0]
            y = dataset[1]
            m = dataset[2]["Attack"]
            mask = m == label
            if mask.sum() > 0:
                X_filtered = X[mask]
                y_filtered = y[mask]
                m_filtered = m[mask]
                partition = (X_filtered, y_filtered, m_filtered)
                partitions.append(partition)
                # Plot the label distribution of the partition
                label_counts = m_filtered.value_counts()
                plt.figure(figsize=(6, 4))
                plt.bar(label_counts.index, label_counts.values)
                plt.title(f"Partition {partition_number}: {label}")
                plt.xlabel("Attack Label")
                plt.ylabel("Count")
                plt.xticks(rotation=90)
                plt.show()
                partition_number += 1
    return partitions
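# Usage sketch: one client per (label, dataset) pair. The number of partitions is
# at most len(unique labels across all datasets) * len(datasets_train), minus the
# pairs where a label does not occur in a given dataset.
partitions_6 = partition_6(datasets_train)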
# Partition 7 : Multiple clients per dataset : Each client has all the labels of its dataset
def partition_7(datasets, NUM_CLIENTS=10):
    partitions = []
    for dataset_idx, dataset in enumerate(datasets):
        X = dataset[0]
        y = dataset[1]
        m = dataset[2]["Attack"]
        # Split the dataset into NUM_CLIENTS parts
        partition_size = math.floor(len(X) / NUM_CLIENTS)
        dataset_partitions = []
        for i in range(NUM_CLIENTS):
            idx_from, idx_to = i * partition_size, (i + 1) * partition_size
            X_part = X[idx_from:idx_to]
            y_part = y[idx_from:idx_to]
            m_part = m[idx_from:idx_to]
            dataset_partitions.append((X_part, y_part, m_part))
        partitions.extend(dataset_partitions)
        # Plot the distribution of dataset partitions
        fig, axes = plt.subplots(1, NUM_CLIENTS, figsize=(15, 5))
        for i, ax in enumerate(axes):
            ax.set_title(f"Dataset {dataset_idx+1}, Partition {i+1}")
            ax.set_ylabel("Count")
            ax.set_xlabel("Attack Label")
            ax.tick_params(axis='x', rotation=90)
            ax.bar(dataset_partitions[i][2].value_counts().index, dataset_partitions[i][2].value_counts())
        plt.show()
    return partitions
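# Usage sketch: NUM_CLIENTS equally sized clients per dataset, each keeping the
# full label mix of its source dataset, so len(partitions_7) == 10 * len(datasets_train).
partitions_7 = partition_7(datasets_train, NUM_CLIENTS=10)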
# Partition 8 : Multiple clients per dataset : Each client keeps (total - NUM_REMOVED_CLASSES) labels of its dataset
def partition_8(dataset_train, NUM_REMOVED_CLASSES, NUM_CLIENTS=10):
    partitions = []
    for dataset_idx, dataset in enumerate(dataset_train):
        X = dataset[0]
        y = dataset[1]
        m = dataset[2]["Attack"]
        # Shuffle the dataset
        X, y, m = shuffle(X, y, m)
        # Split the data into NUM_CLIENTS parts
        partition_size = math.floor(len(X) / NUM_CLIENTS)
        dataset_partitions = []
        for i in range(NUM_CLIENTS):
            idx_from, idx_to = i * partition_size, (i + 1) * partition_size
            X_part = X[idx_from:idx_to]
            y_part = y[idx_from:idx_to]
            m_part = m[idx_from:idx_to]
            # Randomly remove NUM_REMOVED_CLASSES from each partition
            unique_labels = m_part.unique()
            classes_to_remove = np.random.choice(unique_labels, NUM_REMOVED_CLASSES, replace=False)
            mask = ~m_part.isin(classes_to_remove)
            X_part = X_part[mask]
            y_part = y_part[mask]
            m_part = m_part[mask]
            dataset_partitions.append((X_part, y_part, m_part))
        partitions.extend(dataset_partitions)
        # Plot the distribution of dataset partitions
        fig, axes = plt.subplots(1, NUM_CLIENTS, figsize=(15, 5))
        for i, ax in enumerate(axes):
            ax.set_title(f"Dataset {dataset_idx+1}, Partition {i+1}")
            ax.set_ylabel("Count")
            ax.set_xlabel("Attack Label")
            ax.tick_params(axis='x', rotation=90)
            ax.bar(dataset_partitions[i][2].value_counts().index, dataset_partitions[i][2].value_counts())
        plt.show()
    return partitions
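# Usage sketch: NUM_CLIENTS clients per dataset, each with NUM_REMOVED_CLASSES classes
# dropped from its shard. NUM_REMOVED_CLASSES must not exceed the number of classes
# present in a client's shard, otherwise np.random.choice raises a ValueError.
partitions_8 = partition_8(datasets_train, NUM_REMOVED_CLASSES=2, NUM_CLIENTS=10)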