15 changes: 15 additions & 0 deletions .gitignore
@@ -1,6 +1,21 @@
# Ignore everything in the datasets folder except .py files
/datasets/*
!/datasets/*.py

# Ignore Jupyter notebook checkpoints
.ipynb_checkpoints

# Ignore JetBrains IDE settings
.idea/

# Ignore Python cache files
__pycache__

# Ignore everything in the training folder except .py files
training/*
!training/*.py

# Ignore the _nohup file in the scripts folder
scripts/_nohup

# Ignore rsync filter file
.rsync-filter

# Ignore the ts2vec_env directory (virtual environment)
ts2vec_env/
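To sanity-check rules like these, `git check-ignore -v` reports which pattern (if any) matches a given path. A minimal sketch, assuming the repository root as working directory; the candidate paths are hypothetical:

import subprocess

# Ask git which ignore rule, if any, matches each candidate path.
for path in ["datasets/data.csv", "datasets/loader.py", "training/checkpoint.bin"]:
    result = subprocess.run(
        ["git", "check-ignore", "-v", path],
        capture_output=True, text=True
    )
    print(path, "->", result.stdout.strip() or "not ignored")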
10 changes: 10 additions & 0 deletions .idea/misc.xml

7 changes: 7 additions & 0 deletions .idea/ts2vec.iml

6 changes: 6 additions & 0 deletions .idea/vcs.xml

138 changes: 138 additions & 0 deletions .idea/workspace.xml

80 changes: 80 additions & 0 deletions datasets/stage1_online_retail_pre_processing.py
@@ -0,0 +1,80 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: Load and Combine Data from Excel Sheets
def load_and_combine_sheets(file_path, sheets):
    """
    Load data from multiple Excel sheets and combine into a single DataFrame.
    """
    combined_data = pd.DataFrame()
    for sheet in sheets:
        logging.info(f"Loading data from sheet: {sheet}")
        sheet_data = pd.read_excel(file_path, sheet_name=sheet)
        combined_data = pd.concat([combined_data, sheet_data], ignore_index=True)
    logging.info("Data successfully loaded and combined.")
    return combined_data
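As an aside, `pd.read_excel` can also load several sheets in one call: passing a list to `sheet_name` returns a dict mapping sheet names to DataFrames. A roughly equivalent sketch of the function above:

# Roughly equivalent one-shot load (a sketch): sheet_name as a list
# returns {sheet_name: DataFrame}, which pd.concat can stitch together.
def load_and_combine_sheets_oneshot(file_path, sheets):
    frames = pd.read_excel(file_path, sheet_name=sheets)
    return pd.concat(frames.values(), ignore_index=True)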

# Step 2: Preprocess Data
def preprocess_data(data):
    """
    Preprocess data by converting dates, normalizing numeric columns, and handling missing values.
    """
    # Convert 'InvoiceDate' to datetime and ensure numerical consistency in key columns
    logging.info("Preprocessing data: converting dates and normalizing numeric columns.")
    data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
    data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce')
    data['Price'] = pd.to_numeric(data['Price'], errors='coerce')
    data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce')

    # Remove rows where the Invoice starts with 'C' (canceled orders)
    data = data[~data['Invoice'].astype(str).str.startswith('C')]

    # Drop rows with missing critical data
    data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price'])

    # Normalize 'Quantity' and 'Price' with Min-Max scaling to map values into the non-negative [0, 1] range
    scaler = MinMaxScaler()
    data[['Quantity', 'Price']] = scaler.fit_transform(data[['Quantity', 'Price']])
    logging.info("Data normalized and missing values handled.")

    return data
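For reference, MinMaxScaler applies x' = (x - min) / (max - min) per column, so each scaled column spans exactly [0, 1]. A quick self-contained check of that behavior on toy data:

# Minimal check of the scaling behavior (illustrative only).
import numpy as np
from sklearn.preprocessing import MinMaxScaler

toy = np.array([[1.0, 10.0], [2.0, 20.0], [4.0, 40.0]])
scaled = MinMaxScaler().fit_transform(toy)
print(scaled.min(axis=0), scaled.max(axis=0))  # [0. 0.] [1. 1.]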

# Step 3: Aggregate Data
def aggregate_data(data):
    """
    Aggregate data by summing 'Quantity' and averaging 'Price' daily.
    """
    logging.info("Aggregating data by Date.")
    # Group by day, aggregating Quantity and Price
    data_agg = data.groupby(pd.Grouper(key='InvoiceDate', freq='D')).agg({
        'Quantity': 'sum',
        'Price': 'mean'
    }).reset_index()

    logging.info("Data aggregation complete.")
    return data_agg
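Note that `pd.Grouper(key='InvoiceDate', freq='D')` emits a bin for every calendar day in the range, so days with no invoices still appear, with Quantity summed to 0 and Price as NaN. If the downstream model needs a gap-free series, one option (an assumption about the intended handling, not part of this PR) is to forward-fill the price after aggregation:

# Hypothetical gap handling: carry the last observed mean price forward
# across days with no transactions (Quantity is already 0 on those days).
data_agg['Price'] = data_agg['Price'].ffill()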

# Main Function to Run All Steps
def main():
    # File path and sheets to load
    file_path = 'online_retail_II.xlsx'
    sheets = ['Year 2009-2010', 'Year 2010-2011']

    # Load and preprocess the data
    combined_data = load_and_combine_sheets(file_path, sheets)
    cleaned_data = preprocess_data(combined_data)

    # Aggregate the data
    aggregated_data = aggregate_data(cleaned_data)

    # Save the final reshaped and adjusted data to CSV
    aggregated_data.to_csv('ts2vec_online_retail_II_data.csv', index=False)
    logging.info("Final data saved successfully.")

if __name__ == "__main__":
    main()
67 changes: 67 additions & 0 deletions datasets/stage2_online_retail_pre_processing_.py
@@ -0,0 +1,67 @@
import pandas as pd
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: Load the original Online Retail II dataset
def load_data(file_path):
    logging.info(f"Loading data from: {file_path}")
    # Unlike stage 1, this reads only the first sheet (pandas' default)
    data = pd.read_excel(file_path)
    logging.info(f"Data successfully loaded with {len(data)} records.")
    return data

# Step 2: Clean and preprocess the dataset
def preprocess_data(data):
    logging.info("Preprocessing data: cleaning and handling missing values.")
    # Convert 'InvoiceDate' to datetime and ensure numerical consistency
    data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
    data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce')
    data['Price'] = pd.to_numeric(data['Price'], errors='coerce')
    data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce')

    # Remove cancelled orders (invoices starting with 'C')
    data = data[~data['Invoice'].str.startswith('C', na=False)]

    # Drop rows with missing values in key columns
    data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price'])

    logging.info(f"Data cleaned. Remaining records: {len(data)}.")
    return data
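The cancellation filter here uses `na=False` where stage 1 used `astype(str)`; on the mixed-type 'Invoice' column of this dataset the two idioms keep the same rows, since `.str` methods return NaN for non-string entries and `na=False` treats those rows as non-matches. A quick check, assuming pandas as imported above:

# Illustration on a mixed-type column: both idioms flag only true 'C' invoices.
mixed = pd.Series(['C537226', 537227, None])
print(mixed.str.startswith('C', na=False).tolist())    # [True, False, False]
print(mixed.astype(str).str.startswith('C').tolist())  # [True, False, False]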

# Step 3: Group by Customer ID and Invoice
def group_by_customer_invoice(data):
    logging.info("Grouping by Customer ID and Invoice Number.")
    # Group by 'Customer ID' and 'Invoice' so each invoice becomes one time-series record
    grouped = data.groupby(['Customer ID', 'Invoice']).agg({
        'InvoiceDate': 'first',  # First date of the invoice
        'Quantity': 'sum',       # Sum of quantities in the invoice
        'Price': 'mean'          # Average price in the invoice
    }).reset_index()

    logging.info(f"Grouped data created with {len(grouped)} records.")
    return grouped
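The same grouping can be written with pandas named aggregation, which makes the intended output columns explicit; an equivalent sketch:

# Equivalent named-aggregation form (same output columns, same semantics).
grouped = data.groupby(['Customer ID', 'Invoice']).agg(
    InvoiceDate=('InvoiceDate', 'first'),
    Quantity=('Quantity', 'sum'),
    Price=('Price', 'mean'),
).reset_index()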

# Step 4: Save the restructured dataset
def save_data(grouped_data, output_file):
    logging.info(f"Saving restructured data to {output_file}.")
    grouped_data.to_csv(output_file, index=False)
    logging.info("Data successfully saved.")

# Main function to run the entire preprocessing pipeline
def main():
    file_path = 'online_retail_II.xlsx'
    output_file = 'restructured_ts2vec_online_retail.csv'

    # Load and preprocess the data
    data = load_data(file_path)
    cleaned_data = preprocess_data(data)

    # Group data by Customer ID and Invoice
    grouped_data = group_by_customer_invoice(cleaned_data)

    # Save the restructured dataset
    save_data(grouped_data, output_file)

if __name__ == "__main__":
    main()
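When the restructured CSV is read back for training, pandas will not infer datetimes automatically; something like the following (a usage sketch, not part of this PR) restores the `InvoiceDate` dtype on load:

# Hypothetical downstream load: re-parse the datetime column on read.
df = pd.read_csv('restructured_ts2vec_online_retail.csv',
                 parse_dates=['InvoiceDate'])
print(df.dtypes)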