From 82161d31a3a5603dc709c1f8f2a3d68f94a4ec38 Mon Sep 17 00:00:00 2001
From: awenjb <126257927+awenjb@users.noreply.github.com>
Date: Fri, 4 Apr 2025 16:27:08 +0200
Subject: [PATCH] Data preprocess

- clean up data
- fill missing data
---
 julia/data_preprocess.jl | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/julia/data_preprocess.jl b/julia/data_preprocess.jl
index 27036f2..36d727d 100644
--- a/julia/data_preprocess.jl
+++ b/julia/data_preprocess.jl
@@ -1,5 +1,4 @@
-using CSV
-using DataFrames
+using CSV, DataFrames, StatsBase, Random, Distributions
 
 
 input = "/home/a24jacqb/Documents/Code/pdptw-main/julia/LCN_01.csv"
@@ -35,10 +34,42 @@ function count_is_missing(column)
     return count(x -> ismissing(x), column)
 end
 
+
 # remove doublon
+df = unique(df)
 
-# remove empty order
+# drop rows whose "# order" value is missing
+dropmissing!(df, ["# order"])
 
 # fill missing data (weight & revenue) with a normal distribution
 
+# Return `value` unchanged unless it is missing on a "PICKUP" row; then draw a random weight.
+function fill_missing(value, type)
+    # rows that already carry a weight, and non-PICKUP rows, pass straight through
+    if !ismissing(value) || type != "PICKUP"
+        return value
+    end
+    # mu, sigma and maxi are script-level globals assigned before the first call
+    sampled = round(rand(Normal(mu, sigma)), digits=2)
+    # the 1.0 floor is applied first, then the cap at maxi
+    return min(max(sampled, 1.0), maxi)
+end
+# convert every raw weight entry with parse_weight (defined outside this hunk)
+df.weight = map(parse_weight, df.weight)
+# max / mean / std over the non-missing weights; fill_missing reads them as globals
+maxi = maximum(skipmissing(df.weight))
+mu = mean(skipmissing(df.weight))
+sigma = std(skipmissing(df.weight))
+# element-wise pass over (weight, type): only missing PICKUP weights are replaced
+df.weight = map(fill_missing, df.weight, df.type)
+# DROPOFF weight = -(first PICKUP weight of its order); orphan DROPOFFs are reported, not fatal
+for row in eachrow(df)
+    row.type == "DROPOFF" || continue
+    # findfirst returns nothing when no PICKUP matches, where `[...][1]` raised BoundsError
+    src = findfirst((df."# order" .== row."# order") .& (df.type .== "PICKUP"))
+    isnothing(src) && (@warn("DROPOFF has no PICKUP", order = row."# order"); continue)
+    row.weight = -df.weight[src]
+end
+# quick visual check: print the first 15 rows of df before it is written out
+println(first(df, 15))
+
 CSV.write("clean_LCN_01.csv", df)
\ No newline at end of file
-- 
GitLab