15 changes: 15 additions & 0 deletions .gitignore
@@ -1,6 +1,21 @@
# Ignore everything in the datasets folder except .py files
/datasets/*
!/datasets/*.py

# Ignore Jupyter notebook checkpoints
.ipynb_checkpoints

# Ignore JetBrains IDE settings
.idea/

# Ignore Python cache files
__pycache__

# Ignore everything in the training folder except .py files
training/*
!training/*.py

# Ignore the _nohup file in the scripts folder
scripts/_nohup

# Ignore rsync filter file
.rsync-filter

# Ignore the ts2vec_env directory (virtual environment)
ts2vec_env/
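To sanity-check rules like these, `git check-ignore -v` reports which pattern (if any) matches a given path. A minimal sketch, assuming the repository root as working directory; the candidate paths are hypothetical:

import subprocess

# Ask git which ignore rule, if any, matches each candidate path.
for path in ["datasets/data.csv", "datasets/loader.py", "training/checkpoint.bin"]:
    result = subprocess.run(
        ["git", "check-ignore", "-v", path],
        capture_output=True, text=True
    )
    print(path, "->", result.stdout.strip() or "not ignored")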
10 changes: 10 additions & 0 deletions .idea/misc.xml

7 changes: 7 additions & 0 deletions .idea/ts2vec.iml

6 changes: 6 additions & 0 deletions .idea/vcs.xml

138 changes: 138 additions & 0 deletions .idea/workspace.xml

80 changes: 80 additions & 0 deletions datasets/stage1_online_retail_pre_processing.py
@@ -0,0 +1,80 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: Load and Combine Data from Excel Sheets
def load_and_combine_sheets(file_path, sheets):
    """
    Load data from multiple Excel sheets and combine into a single DataFrame.
    """
    combined_data = pd.DataFrame()
    for sheet in sheets:
        logging.info(f"Loading data from sheet: {sheet}")
        sheet_data = pd.read_excel(file_path, sheet_name=sheet)
        combined_data = pd.concat([combined_data, sheet_data], ignore_index=True)
    logging.info("Data successfully loaded and combined.")
    return combined_data
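As an aside, `pd.read_excel` can also load several sheets in one call: passing a list to `sheet_name` returns a dict mapping sheet names to DataFrames. A roughly equivalent sketch of the function above:

# Roughly equivalent one-shot load (a sketch): sheet_name as a list
# returns {sheet_name: DataFrame}, which pd.concat can stitch together.
def load_and_combine_sheets_oneshot(file_path, sheets):
    frames = pd.read_excel(file_path, sheet_name=sheets)
    return pd.concat(frames.values(), ignore_index=True)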

# Step 2: Preprocess Data
def preprocess_data(data):
    """
    Preprocess data by converting dates, normalizing numeric columns, and handling missing values.
    """
    # Convert 'InvoiceDate' to datetime and ensure numerical consistency in key columns
    logging.info("Preprocessing data: converting dates and normalizing numeric columns.")
    data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
    data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce')
    data['Price'] = pd.to_numeric(data['Price'], errors='coerce')
    data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce')

    # Remove rows where the Invoice starts with 'C' (canceled orders)
    data = data[~data['Invoice'].astype(str).str.startswith('C')]

    # Drop rows with missing critical data
    data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price'])

    # Normalize 'Quantity' and 'Price' with Min-Max scaling to map values into the non-negative [0, 1] range
    scaler = MinMaxScaler()
    data[['Quantity', 'Price']] = scaler.fit_transform(data[['Quantity', 'Price']])
    logging.info("Data normalized and missing values handled.")

    return data
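For reference, MinMaxScaler applies x' = (x - min) / (max - min) per column, so each scaled column spans exactly [0, 1]. A quick self-contained check of that behavior on toy data:

# Minimal check of the scaling behavior (illustrative only).
import numpy as np
from sklearn.preprocessing import MinMaxScaler

toy = np.array([[1.0, 10.0], [2.0, 20.0], [4.0, 40.0]])
scaled = MinMaxScaler().fit_transform(toy)
print(scaled.min(axis=0), scaled.max(axis=0))  # [0. 0.] [1. 1.]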

# Step 3: Aggregate Data
def aggregate_data(data):
    """
    Aggregate data by summing 'Quantity' and averaging 'Price' daily.
    """
    logging.info("Aggregating data by Date.")
    # Group by day, aggregating Quantity and Price
    data_agg = data.groupby(pd.Grouper(key='InvoiceDate', freq='D')).agg({
        'Quantity': 'sum',
        'Price': 'mean'
    }).reset_index()

    logging.info("Data aggregation complete.")
    return data_agg
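Note that `pd.Grouper(key='InvoiceDate', freq='D')` emits a bin for every calendar day in the range, so days with no invoices still appear, with Quantity summed to 0 and Price as NaN. If the downstream model needs a gap-free series, one option (an assumption about the intended handling, not part of this PR) is to forward-fill the price after aggregation:

# Hypothetical gap handling: carry the last observed mean price forward
# across days with no transactions (Quantity is already 0 on those days).
data_agg['Price'] = data_agg['Price'].ffill()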

# Main Function to Run All Steps
def main():
    # File path and sheets to load
    file_path = 'online_retail_II.xlsx'
    sheets = ['Year 2009-2010', 'Year 2010-2011']

    # Load and preprocess the data
    combined_data = load_and_combine_sheets(file_path, sheets)
    cleaned_data = preprocess_data(combined_data)

    # Aggregate the data
    aggregated_data = aggregate_data(cleaned_data)

    # Save the final reshaped and adjusted data to CSV
    aggregated_data.to_csv('ts2vec_online_retail_II_data.csv', index=False)
    logging.info("Final data saved successfully.")

if __name__ == "__main__":
    main()
67 changes: 67 additions & 0 deletions datasets/stage2_online_retail_pre_processing_.py
@@ -0,0 +1,67 @@
import pandas as pd
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: Load the original Online Retail II dataset
def load_data(file_path):
    logging.info(f"Loading data from: {file_path}")
    # Unlike stage 1, this reads only the first sheet (pandas' default)
    data = pd.read_excel(file_path)
    logging.info(f"Data successfully loaded with {len(data)} records.")
    return data

# Step 2: Clean and preprocess the dataset
def preprocess_data(data):
    logging.info("Preprocessing data: cleaning and handling missing values.")
    # Convert 'InvoiceDate' to datetime and ensure numerical consistency
    data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
    data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce')
    data['Price'] = pd.to_numeric(data['Price'], errors='coerce')
    data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce')

    # Remove cancelled orders (invoices starting with 'C')
    data = data[~data['Invoice'].str.startswith('C', na=False)]

    # Drop rows with missing values in key columns
    data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price'])

    logging.info(f"Data cleaned. Remaining records: {len(data)}.")
    return data
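The cancellation filter here uses `na=False` where stage 1 used `astype(str)`; on the mixed-type 'Invoice' column of this dataset the two idioms keep the same rows, since `.str` methods return NaN for non-string entries and `na=False` treats those rows as non-matches. A quick check, assuming pandas as imported above:

# Illustration on a mixed-type column: both idioms flag only true 'C' invoices.
mixed = pd.Series(['C537226', 537227, None])
print(mixed.str.startswith('C', na=False).tolist())    # [True, False, False]
print(mixed.astype(str).str.startswith('C').tolist())  # [True, False, False]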

# Step 3: Group by Customer ID and Invoice
def group_by_customer_invoice(data):
    logging.info("Grouping by Customer ID and Invoice Number.")
    # Group by 'Customer ID' and 'Invoice' so each invoice becomes one time-series record
    grouped = data.groupby(['Customer ID', 'Invoice']).agg({
        'InvoiceDate': 'first',  # First date of the invoice
        'Quantity': 'sum',       # Sum of quantities in the invoice
        'Price': 'mean'          # Average price in the invoice
    }).reset_index()

    logging.info(f"Grouped data created with {len(grouped)} records.")
    return grouped
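The same grouping can be written with pandas named aggregation, which makes the intended output columns explicit; an equivalent sketch:

# Equivalent named-aggregation form (same output columns, same semantics).
grouped = data.groupby(['Customer ID', 'Invoice']).agg(
    InvoiceDate=('InvoiceDate', 'first'),
    Quantity=('Quantity', 'sum'),
    Price=('Price', 'mean'),
).reset_index()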

# Step 4: Save the restructured dataset
def save_data(grouped_data, output_file):
    logging.info(f"Saving restructured data to {output_file}.")
    grouped_data.to_csv(output_file, index=False)
    logging.info("Data successfully saved.")

# Main function to run the entire preprocessing pipeline
def main():
    file_path = 'online_retail_II.xlsx'
    output_file = 'restructured_ts2vec_online_retail.csv'

    # Load and preprocess the data
    data = load_data(file_path)
    cleaned_data = preprocess_data(data)

    # Group data by Customer ID and Invoice
    grouped_data = group_by_customer_invoice(cleaned_data)

    # Save the restructured dataset
    save_data(grouped_data, output_file)

if __name__ == "__main__":
    main()
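When the restructured CSV is read back for training, pandas will not infer datetimes automatically; something like the following (a usage sketch, not part of this PR) restores the `InvoiceDate` dtype on load:

# Hypothetical downstream load: re-parse the datetime column on read.
df = pd.read_csv('restructured_ts2vec_online_retail.csv',
                 parse_dates=['InvoiceDate'])
print(df.dtypes)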