diff --git a/.gitignore b/.gitignore
index 78a0cc4e..bb10ee7d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,21 @@
+# Ignore dataset files that are not Python
/datasets/*[!.py]
+
+# Ignore Jupyter notebook checkpoints
.ipynb_checkpoints
+*.idea/
+# Ignore Python cache files
__pycache__
+
+# Ignore everything in the training folder except .py files
training/*
+!training/*.py
+
+# Ignore specific nohup files in the scripts folder
scripts/_nohup
+
+# Ignore rsync filter file
.rsync-filter
+
+# Ignore the ts2vec_env directory (virtual environment)
+ts2vec_env/
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 00000000..feccba82
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/ts2vec.iml b/.idea/ts2vec.iml
new file mode 100644
index 00000000..ec63674c
--- /dev/null
+++ b/.idea/ts2vec.iml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 00000000..35eb1ddf
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 00000000..ef09f3fa
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,138 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {
+ "lastFilter": {
+ "state": "OPEN",
+ "assignee": "jumairamiller"
+ }
+}
+ {
+ "selectedUrlAndAccountId": {
+ "url": "https://github.com/jumairamiller/ts2vec.git",
+ "accountId": "1c182f61-b702-458b-8f9a-03f9dbad0d94"
+ },
+ "recentNewPullRequestHead": {
+ "server": {
+ "useHttp": false,
+ "host": "github.com",
+ "port": null,
+ "suffix": null
+ },
+ "owner": "jumairamiller",
+ "repository": "ts2vec"
+ }
+}
+
+
+
+ {
+ "associatedIndex": 3
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/datasets/stage1_online_retail_pre_processing.py b/datasets/stage1_online_retail_pre_processing.py
new file mode 100644
index 00000000..eee5d36d
--- /dev/null
+++ b/datasets/stage1_online_retail_pre_processing.py
@@ -0,0 +1,80 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# Step 1: Load and Combine Data from Excel Sheets
+def load_and_combine_sheets(file_path, sheets):
+ """
+ Load data from multiple Excel sheets and combine into a single DataFrame.
+ """
+ combined_data = pd.DataFrame()
+ for sheet in sheets:
+ logging.info(f"Loading data from sheet: {sheet}")
+ sheet_data = pd.read_excel(file_path, sheet_name=sheet)
+ combined_data = pd.concat([combined_data, sheet_data], ignore_index=True)
+ logging.info("Data successfully loaded and combined.")
+ return combined_data
+
+# Step 2: Preprocess Data
+def preprocess_data(data):
+ """
+ Preprocess data by converting dates, normalizing numeric columns, and handling missing values.
+ """
+ # Convert 'InvoiceDate' to datetime and ensure numerical consistency in key columns
+ logging.info("Preprocessing data: converting dates and normalizing numeric columns.")
+ data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
+ data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce')
+ data['Price'] = pd.to_numeric(data['Price'], errors='coerce')
+ data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce')
+
+ # Remove rows where the Invoice starts with 'C' (canceled orders)
+ data = data[~data['Invoice'].astype(str).str.startswith('C')]
+
+ # Drop rows with missing critical data
+ data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price'])
+
+    # Normalize 'Quantity' and 'Price' using Min-Max scaling to ensure values are non-negative
+ scaler = MinMaxScaler()
+ data[['Quantity', 'Price']] = scaler.fit_transform(data[['Quantity', 'Price']])
+ logging.info("Data normalized and missing values handled.")
+
+ return data
+
+# Step 3: Aggregate Data
+def aggregate_data(data):
+ """
+ Aggregate data by summing 'Quantity' and averaging 'Price' daily.
+ """
+ logging.info("Aggregating data by Date.")
+ # Group by Date, aggregating Quantity and Price
+ data_agg = data.groupby(pd.Grouper(key='InvoiceDate', freq='D')).agg({
+ 'Quantity': 'sum',
+ 'Price': 'mean'
+ }).reset_index()
+
+ logging.info("Data aggregation complete.")
+ return data_agg
+
+# Main Function to Run All Steps
+def main():
+ # File path and sheets to load
+ file_path = 'online_retail_II.xlsx'
+ sheets = ['Year 2009-2010', 'Year 2010-2011']
+
+ # Load and preprocess the data
+ combined_data = load_and_combine_sheets(file_path, sheets)
+ cleaned_data = preprocess_data(combined_data)
+
+ # Aggregate the data
+ aggregated_data = aggregate_data(cleaned_data)
+
+ # Save the final reshaped and adjusted data to CSV
+ aggregated_data.to_csv('ts2vec_online_retail_II_data.csv', index=False)
+ logging.info("Final data saved successfully.")
+
+if __name__ == "__main__":
+ main()
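# --- Illustrative sketch (reviewer note, not part of the patch) ---------------
# A minimal, self-contained run of the daily aggregation used in aggregate_data()
# above, so the pd.Grouper behaviour is easy to verify. The rows below are
# invented; only the grouping logic mirrors the script.
import pandas as pd

toy = pd.DataFrame({
    'InvoiceDate': pd.to_datetime(['2010-12-01 08:26', '2010-12-01 09:41', '2010-12-02 10:03']),
    'Quantity': [6, 2, 12],
    'Price': [2.55, 3.39, 1.25],
})

# Group by calendar day: Quantity is summed, Price is averaged, matching the
# aggregation dictionary in aggregate_data().
daily = toy.groupby(pd.Grouper(key='InvoiceDate', freq='D')).agg({
    'Quantity': 'sum',
    'Price': 'mean',
}).reset_index()

print(daily)  # 2010-12-01 -> Quantity 8, Price 2.97; 2010-12-02 -> Quantity 12, Price 1.25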
diff --git a/datasets/stage2_online_retail_pre_processing_.py b/datasets/stage2_online_retail_pre_processing_.py
new file mode 100644
index 00000000..f33962ef
--- /dev/null
+++ b/datasets/stage2_online_retail_pre_processing_.py
@@ -0,0 +1,67 @@
+import pandas as pd
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# Step 1: Load the original Online Retail II dataset
+def load_data(file_path):
+ logging.info(f"Loading data from: {file_path}")
+ data = pd.read_excel(file_path)
+ logging.info(f"Data successfully loaded with {len(data)} records.")
+ return data
+
+# Step 2: Clean and preprocess the dataset
+def preprocess_data(data):
+ logging.info("Preprocessing data: cleaning and handling missing values.")
+ # Convert 'InvoiceDate' to datetime and ensure numerical consistency
+ data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
+ data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce')
+ data['Price'] = pd.to_numeric(data['Price'], errors='coerce')
+ data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce')
+
+ # Remove cancelled orders (invoices starting with 'C')
+ data = data[~data['Invoice'].str.startswith('C', na=False)]
+
+ # Drop rows with missing values in key columns
+ data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price'])
+
+ logging.info(f"Data cleaned. Remaining records: {len(data)}.")
+ return data
+
+# Step 3: Group by CustomerID and InvoiceNo
+def group_by_customer_invoice(data):
+ logging.info("Grouping by Customer ID and Invoice Number.")
+ # Group by CustomerID and InvoiceNo to represent each invoice as a time series record
+ grouped = data.groupby(['Customer ID', 'Invoice']).agg({
+ 'InvoiceDate': 'first', # First date of the invoice
+ 'Quantity': 'sum', # Sum of quantities in the invoice
+ 'Price': 'mean' # Average price in the invoice
+ }).reset_index()
+
+ logging.info(f"Grouped data created with {len(grouped)} records.")
+ return grouped
+
+# Step 4: Save the restructured dataset
+def save_data(grouped_data, output_file):
+ logging.info(f"Saving restructured data to {output_file}.")
+ grouped_data.to_csv(output_file, index=False)
+ logging.info("Data successfully saved.")
+
+# Main function to run the entire preprocessing pipeline
+def main():
+ file_path = 'online_retail_II.xlsx'
+    output_file = 'restructured_ts2vec_online_retail.csv'
+
+ # Load and preprocess the data
+ data = load_data(file_path)
+ cleaned_data = preprocess_data(data)
+
+ # Group data by CustomerID and InvoiceNo
+ grouped_data = group_by_customer_invoice(cleaned_data)
+
+ # Save the restructured dataset
+ save_data(grouped_data, output_file)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
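# --- Illustrative sketch (reviewer note, not part of the patch) ---------------
# The cancellation filter and the per-invoice grouping from preprocess_data()
# and group_by_customer_invoice() above, run on invented rows so the intent is
# easy to check: invoices starting with 'C' are credit notes and are dropped,
# then each (Customer ID, Invoice) pair becomes one record.
import pandas as pd

toy = pd.DataFrame({
    'Invoice': ['536365', 'C536379', '536365', '536412'],
    'Customer ID': [17850.0, 14527.0, 17850.0, 12583.0],
    'InvoiceDate': pd.to_datetime(['2010-12-01 08:26', '2010-12-01 09:41',
                                   '2010-12-01 08:26', '2010-12-01 11:45']),
    'Quantity': [6, -1, 8, 24],
    'Price': [2.55, 27.50, 4.25, 3.75],
})

toy = toy[~toy['Invoice'].str.startswith('C', na=False)]  # drop cancelled orders

grouped = toy.groupby(['Customer ID', 'Invoice']).agg({
    'InvoiceDate': 'first',   # first date of the invoice
    'Quantity': 'sum',        # total quantity in the invoice
    'Price': 'mean',          # average price in the invoice
}).reset_index()

print(grouped)  # invoice 536365 for customer 17850 collapses to Quantity 14, Price 3.40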
diff --git a/datautils.py b/datautils.py
index f8cfddd7..a31ec3d5 100644
--- a/datautils.py
+++ b/datautils.py
@@ -1,3 +1,4 @@
+import logging
import os
import numpy as np
import pandas as pd
@@ -5,6 +6,9 @@
import random
from datetime import datetime
import pickle
+
+import torch
+
from utils import pkl_load, pad_nan_to_target
from scipy.io.arff import loadarff
from sklearn.preprocessing import StandardScaler, MinMaxScaler
@@ -68,7 +72,7 @@ def load_UCR(dataset):
'UMD'
]:
return train[..., np.newaxis], train_labels, test[..., np.newaxis], test_labels
-
+
mean = np.nanmean(train)
std = np.nanstd(train)
train = (train - mean) / std
@@ -79,7 +83,7 @@ def load_UCR(dataset):
def load_UEA(dataset):
train_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TRAIN.arff')[0]
test_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TEST.arff')[0]
-
+
def extract_data(data):
res_data = []
res_labels = []
@@ -89,31 +93,31 @@ def extract_data(data):
res_data.append(t_data)
res_labels.append(t_label)
return np.array(res_data).swapaxes(1, 2), np.array(res_labels)
-
+
train_X, train_y = extract_data(train_data)
test_X, test_y = extract_data(test_data)
-
+
scaler = StandardScaler()
scaler.fit(train_X.reshape(-1, train_X.shape[-1]))
train_X = scaler.transform(train_X.reshape(-1, train_X.shape[-1])).reshape(train_X.shape)
test_X = scaler.transform(test_X.reshape(-1, test_X.shape[-1])).reshape(test_X.shape)
-
+
labels = np.unique(train_y)
transform = { k : i for i, k in enumerate(labels)}
train_y = np.vectorize(transform.get)(train_y)
test_y = np.vectorize(transform.get)(test_y)
return train_X, train_y, test_X, test_y
-
-
+
+
def load_forecast_npy(name, univar=False):
- data = np.load(f'datasets/{name}.npy')
+ data = np.load(f'datasets/{name}.npy')
if univar:
data = data[: -1:]
-
+
train_slice = slice(None, int(0.6 * len(data)))
valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data)))
test_slice = slice(int(0.8 * len(data)), None)
-
+
scaler = StandardScaler().fit(data[train_slice])
data = scaler.transform(data)
data = np.expand_dims(data, 0)
@@ -135,10 +139,32 @@ def _get_time_features(dt):
def load_forecast_csv(name, univar=False):
- data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True)
+ """
+ Loads and processes time series data for forecasting tasks, supporting both the pre-processed
+ Online Retail II dataset and existing datasets like ETTh1, ETTm1, and electricity.
+
+ Parameters:
+ name (str): The name of the dataset file (without the .csv extension).
+ univar (bool): Whether to load the univariate version of the data.
+
+ Returns:
+ tuple: data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols
+ """
+    # Try loading with 'date' as the index column; if that fails, fall back to 'InvoiceDate' for the Online Retail data
+ try:
+ data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True)
+ except ValueError:
+ data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True)
+
+ # Ensure index is parsed as datetime
+ if not pd.api.types.is_datetime64_any_dtype(data.index):
+ data.index = pd.to_datetime(data.index)
+
+ # Extract time features for the date/index column
dt_embed = _get_time_features(data.index)
n_covariate_cols = dt_embed.shape[-1]
-
+
+ # Handle univariate or multivariate cases
if univar:
if name in ('ETTh1', 'ETTh2', 'ETTm1', 'ETTm2'):
data = data[['OT']]
@@ -146,8 +172,11 @@ def load_forecast_csv(name, univar=False):
data = data[['MT_001']]
else:
data = data.iloc[:, -1:]
-
+
+ # Convert data to numpy array
data = data.to_numpy()
+
+ # Define train, validation, and test splits based on dataset
if name == 'ETTh1' or name == 'ETTh2':
train_slice = slice(None, 12*30*24)
valid_slice = slice(12*30*24, 16*30*24)
@@ -157,35 +186,54 @@ def load_forecast_csv(name, univar=False):
valid_slice = slice(12*30*24*4, 16*30*24*4)
test_slice = slice(16*30*24*4, 20*30*24*4)
else:
- train_slice = slice(None, int(0.6 * len(data)))
+ # Default case for other or custom datasets
+ train_slice = slice(0, int(0.6 * len(data)))
valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data)))
- test_slice = slice(int(0.8 * len(data)), None)
-
- scaler = StandardScaler().fit(data[train_slice])
+ test_slice = slice(int(0.8 * len(data)), len(data))
+
+    # Normalise the data. The retail datasets are MinMax-scaled so values stay
+    # non-negative; the original datasets keep StandardScaler.
+    if name in ('ts2vec_online_retail_II_data', 'restructured_ts2vec_online_retail'):
+        scaler = MinMaxScaler().fit(data[train_slice])
+    else:
+        scaler = StandardScaler().fit(data[train_slice])
+
data = scaler.transform(data)
- if name in ('electricity'):
+
+ # Reshape data based on dataset structure
+    if name == 'electricity':
data = np.expand_dims(data.T, -1) # Each variable is an instance rather than a feature
else:
- data = np.expand_dims(data, 0)
-
+ data = np.expand_dims(data, 0) # Single instance case
+
if n_covariate_cols > 0:
- dt_scaler = StandardScaler().fit(dt_embed[train_slice])
+        if name in ('ts2vec_online_retail_II_data', 'restructured_ts2vec_online_retail'):
+ dt_scaler = MinMaxScaler().fit(dt_embed[train_slice])
+ else:
+ dt_scaler = StandardScaler().fit(dt_embed[train_slice])
dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0)
+        # Concatenate the time embeddings to the data.
+        # NOTE: np.repeat(dt_embed, data.shape[0], axis=0) repeats the time
+        # embeddings across instances; this only has an effect for the
+        # 'electricity' dataset, where each variable is its own instance, and
+        # it keeps the time embeddings aligned with the data instances.
data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1)
-
+
if name in ('ETTh1', 'ETTh2', 'electricity'):
pred_lens = [24, 48, 168, 336, 720]
+ elif name == 'ts2vec_online_retail_II_data':
+ pred_lens = [1, 2, 3, 4, 5]
else:
pred_lens = [24, 48, 96, 288, 672]
-
+
return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols
def load_anomaly(name):
res = pkl_load(f'datasets/{name}.pkl')
return res['all_train_data'], res['all_train_labels'], res['all_train_timestamps'], \
- res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \
- res['delay']
+ res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \
+ res['delay']
def gen_ano_train_data(all_train_data):
@@ -196,3 +244,134 @@ def gen_ano_train_data(all_train_data):
pretrain_data.append(train_data)
pretrain_data = np.expand_dims(np.stack(pretrain_data), 2)
return pretrain_data
+
+#-----------------------------------------------------------------------------
+def _get_time_features(dt):  # NOTE: redefines the _get_time_features() declared earlier in this file
+    return np.stack([
+        dt.dayofweek.to_numpy(),  # Day of the week
+        dt.day.to_numpy(),  # Day of the month
+        dt.dayofyear.to_numpy(),  # Day of the year
+        dt.month.to_numpy(),  # Month
+        dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(),  # ISO week of the year
+    ], axis=1).astype(float)  # np.float is deprecated; plain float is equivalent
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+def load_online_retail(name):
+    """
+    Loads and preprocesses the pre-processed Online Retail II dataset for forecasting tasks,
+    keeping Quantity and Price as the value columns and adding calendar features plus a fixed
+    customer-ID embedding as covariates.
+
+    Returns:
+        data: Array of shape (1, T, n_covariate_cols + 2) with covariates prepended to Quantity/Price.
+        train_slice, valid_slice, test_slice: Chronological 60/20/20 splits.
+        scaler: MinMaxScaler fitted on the training split of Quantity/Price.
+        pred_lens: Forecast horizons to evaluate.
+        n_covariate_cols: Number of covariate columns (time features + customer embedding dims).
+    """
+
+ # Load data
+ data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True)
+
+ # Convert 'InvoiceDate' to datetime if not already done
+ if not pd.api.types.is_datetime64_any_dtype(data.index):
+ data.index = pd.to_datetime(data.index)
+
+ # Remove rows with any None or NaN values
+ data.dropna(inplace=True)
+
+ # Remove rows with negative 'Quantity' or 'Price' values
+ data = data[(data['Quantity'] > 0) & (data['Price'] > 0)]
+ logging.info(f"Data shape after removing negative values: {data.shape}")
+
+ # Remove Invoice column
+ data.drop(columns=['Invoice'], inplace=True)
+
+ # Rename 'Customer ID' to 'CustomerID'
+ if 'Customer ID' in data.columns:
+ data.rename(columns={'Customer ID': 'CustomerID'}, inplace=True)
+
+ if 'CustomerID' not in data.columns:
+ raise KeyError("The 'CustomerID' column is missing from the dataset.")
+ logging.info(f"Data columns and shape: {data.columns, data.shape}")
+
+ # Extract customerIDs and create numerical mapping
+ customer_ids = data['CustomerID'].unique()
+ logging.info(f"Unique customer ID count: {customer_ids.shape[0]}")
+ customer_id_to_index = {cid: idx for idx, cid in enumerate(customer_ids)}
+ logging.info(f"Unique customer ID to index count: {len(customer_id_to_index)}")
+
+ # Create fixed embeddings for each unique customer ID
+ customer_embeddings = torch.nn.Embedding(len(customer_ids), 4)
+ customer_embeddings.weight.requires_grad = False # Fixed embeddings
+
+ # Map customer IDs to their embeddings
+ customer_embeddings_tensor = customer_embeddings(torch.tensor(data['CustomerID'].map(customer_id_to_index).values))
+ logging.info(f"Customer embeddings shape: {customer_embeddings_tensor.shape}")
+ logging.info(f"Customer embeddings: {customer_embeddings_tensor}")
+
+ # Replace 'CustomerID' column with the corresponding customer embedding
+ customer_embeddings_expanded = customer_embeddings_tensor.detach().numpy()
+ logging.info(f"Customer embeddings expanded shape: {customer_embeddings_expanded.shape}")
+
+ # Drop the original 'CustomerID' column
+ data = data.drop(columns=['CustomerID'])
+
+ # Extract time features for the date/index column
+ dt_embed = _get_time_features(data.index)
+ n_covariate_cols = dt_embed.shape[-1] + customer_embeddings_expanded.shape[-1]
+ logging.info(f"Number of covariate columns: {n_covariate_cols}")
+ logging.info(f"Time features shape: {dt_embed.shape}")
+
+ # Convert the DataFrame to a NumPy array
+ data = data.to_numpy()
+
+ train_slice = slice(0, int(0.6 * len(data)))
+ valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data)))
+ test_slice = slice(int(0.8 * len(data)), len(data))
+
+ # Normalise data
+ scaler = MinMaxScaler().fit(data[train_slice])
+ data = scaler.transform(data)
+
+ # Reshape data based on dataset structure
+ data = np.expand_dims(data, 0) # Single instance case
+
+ if n_covariate_cols > 0:
+ dt_scaler = MinMaxScaler().fit(dt_embed[train_slice])
+ dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0)
+ # Concatenating the time embeddings to the data
+ data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1)
+
+ cid_scaler = MinMaxScaler().fit(customer_embeddings_expanded[train_slice])
+ cid_embed = np.expand_dims(cid_scaler.transform(customer_embeddings_expanded), 0)
+ # Concatenating the customer ID embeddings to the data
+ data = np.concatenate([np.repeat(cid_embed, data.shape[0], axis=0), data], axis=-1)
+
+ pred_lens = [1, 2, 3, 4, 5]
+
+ return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols
+
+
+
+ # # Filter customerIDs with at least 5 records
+ # customer_counts = data['CustomerID'].value_counts()
+ # valid_customers = customer_counts[customer_counts >= 5].index
+ # data = data[data['CustomerID'].isin(valid_customers)]
+ #
+ # # Group by CustomerID and sort by InvoiceDate
+ # grouped = data.groupby('CustomerID').apply(lambda x: x.sort_values('InvoiceDate')).reset_index(drop=True)
+ #
+ # # Create time series data for each CustomerID
+ # customer_data = {}
+ # for customer_id, group in grouped.groupby('CustomerID'):
+ # group.set_index('InvoiceDate', inplace=True)
+ # customer_data[customer_id] = group[['CustomerID', 'Quantity', 'Price']]
+
+ # Set other forecasting parameters
+ # scaler = MinMaxScaler() # alternative to StandardScaler which avoid negative values
+ # pred_lens = [1, 2, 3]
+ # n_covariate_cols = 2
+ #
+ # return customer_data
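# --- Illustrative sketch (reviewer note, not part of the patch) ---------------
# Shape bookkeeping behind load_online_retail() above: a toy series of T=6
# steps with 2 value columns (Quantity, Price) is combined with 5 time features
# and a fixed 4-dimensional customer embedding, giving a (1, T, 5 + 4 + 2)
# array. All values and the toy customer indices are invented; only the
# expand_dims / repeat / concatenate steps mirror the loader.
import numpy as np
import torch

T = 6
values = np.random.rand(T, 2)                 # (T, 2)  Quantity, Price
dt_embed = np.random.rand(T, 5)               # (T, 5)  calendar features
cid_layer = torch.nn.Embedding(3, 4)          # 3 toy customers, embedding dim 4
cid_layer.weight.requires_grad = False        # fixed embeddings, as in the loader
cust_idx = torch.tensor([0, 0, 1, 1, 2, 2])   # customer index per time step
cust_feat = cid_layer(cust_idx).detach().numpy()  # (T, 4)

data = np.expand_dims(values, 0)              # (1, T, 2) single-instance case
dt_embed = np.expand_dims(dt_embed, 0)        # (1, T, 5)
cust_feat = np.expand_dims(cust_feat, 0)      # (1, T, 4)

# Covariates are prepended along the feature axis, mirroring the loader:
# first the time features, then the customer embedding.
data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1)
data = np.concatenate([np.repeat(cust_feat, data.shape[0], axis=0), data], axis=-1)
print(data.shape)  # (1, 6, 11) -> n_covariate_cols = 9 plus the 2 value columns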
diff --git a/models/encoder.py b/models/encoder.py
index cedb13b0..c4aad5d5 100644
--- a/models/encoder.py
+++ b/models/encoder.py
@@ -41,7 +41,7 @@ def __init__(self, input_dims, output_dims, hidden_dims=64, depth=10, mask_mode=
def forward(self, x, mask=None): # x: B x T x input_dims
nan_mask = ~x.isnan().any(axis=-1)
x[~nan_mask] = 0
- x = self.input_fc(x) # B x T x Ch
+        x = self.input_fc(x)  # B x T x Ch, where Ch = hidden_dims (input_fc is nn.Linear(input_dims, hidden_dims))
# generate & apply mask
if mask is None:
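# --- Illustrative sketch (reviewer note, not part of the patch) ---------------
# Backing up the corrected comment above: in TSEncoder, input_fc is
# nn.Linear(input_dims, hidden_dims), so the channel dimension after the
# projection is hidden_dims (64 by default), not output_dims. The sizes below
# are invented for the example.
import torch
from torch import nn

B, T, input_dims, hidden_dims = 8, 50, 11, 64
input_fc = nn.Linear(input_dims, hidden_dims)
x = torch.randn(B, T, input_dims)
print(input_fc(x).shape)  # torch.Size([8, 50, 64]) -> B x T x hidden_dims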
diff --git a/requirements.txt b/requirements.txt
index 9cc27049..0173de5d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,6 @@ numpy==1.19.2
statsmodels==0.12.2
pandas==1.0.1
scikit_learn==0.24.2
+xlrd==1.2.0
+
+matplotlib==3.3.4
\ No newline at end of file
diff --git a/tasks/__init__.py b/tasks/__init__.py
index 5e38a07c..3a09b083 100644
--- a/tasks/__init__.py
+++ b/tasks/__init__.py
@@ -1,3 +1,3 @@
from .classification import eval_classification
-from .forecasting import eval_forecasting
+from .forecasting import eval_forecasting, eval_forecasting_customer_embed
from .anomaly_detection import eval_anomaly_detection, eval_anomaly_detection_coldstart
diff --git a/tasks/_eval_protocols.py b/tasks/_eval_protocols.py
index e413bcf4..d0e9af8b 100644
--- a/tasks/_eval_protocols.py
+++ b/tasks/_eval_protocols.py
@@ -79,6 +79,18 @@ def fit_knn(features, y):
return pipe
def fit_ridge(train_features, train_y, valid_features, valid_y, MAX_SAMPLES=100000):
+ # Replace NaN and infinite values with 0
+ train_features = np.nan_to_num(train_features, nan=0.0, posinf=0.0, neginf=0.0)
+ train_y = np.nan_to_num(train_y, nan=0.0, posinf=0.0, neginf=0.0)
+ valid_features = np.nan_to_num(valid_features, nan=0.0, posinf=0.0, neginf=0.0)
+ valid_y = np.nan_to_num(valid_y, nan=0.0, posinf=0.0, neginf=0.0)
+
+ # Ensure values are within the range of float64
+ train_features = np.clip(train_features, -1e308, 1e308)
+ train_y = np.clip(train_y, -1e308, 1e308)
+ valid_features = np.clip(valid_features, -1e308, 1e308)
+ valid_y = np.clip(valid_y, -1e308, 1e308)
+
# If the training set is too large, subsample MAX_SAMPLES examples
if train_features.shape[0] > MAX_SAMPLES:
split = train_test_split(
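# --- Illustrative sketch (reviewer note, not part of the patch) ---------------
# What the sanitisation added to fit_ridge() does before the ridge regression
# is fitted: NaN and +/-inf entries become 0, and everything is clipped to the
# float64 range. The values below are invented for the example.
import numpy as np

features = np.array([[1.0, np.nan],
                     [np.inf, -np.inf],
                     [2.0, 3.0]])

features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
features = np.clip(features, -1e308, 1e308)

print(features)
# [[1. 0.]
#  [0. 0.]
#  [2. 3.]]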
diff --git a/tasks/forecasting.py b/tasks/forecasting.py
index b3493d83..c5b0276c 100644
--- a/tasks/forecasting.py
+++ b/tasks/forecasting.py
@@ -9,17 +9,17 @@ def generate_pred_samples(features, data, pred_len, drop=0):
features = features[:, drop:]
labels = labels[:, drop:]
return features.reshape(-1, features.shape[-1]), \
- labels.reshape(-1, labels.shape[2]*labels.shape[3])
+ labels.reshape(-1, labels.shape[2]*labels.shape[3])
def cal_metrics(pred, target):
return {
'MSE': ((pred - target) ** 2).mean(),
'MAE': np.abs(pred - target).mean()
}
-
-def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols):
+
+def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, dataset_name=None):
padding = 200
-
+
t = time.time()
all_repr = model.encode(
data,
@@ -29,15 +29,15 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler,
batch_size=256
)
ts2vec_infer_time = time.time() - t
-
+
train_repr = all_repr[:, train_slice]
valid_repr = all_repr[:, valid_slice]
test_repr = all_repr[:, test_slice]
-
+
train_data = data[:, train_slice, n_covariate_cols:]
valid_data = data[:, valid_slice, n_covariate_cols:]
test_data = data[:, test_slice, n_covariate_cols:]
-
+
ours_result = {}
lr_train_time = {}
lr_infer_time = {}
@@ -46,26 +46,48 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler,
train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding)
valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len)
test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len)
-
+
t = time.time()
lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels)
lr_train_time[pred_len] = time.time() - t
-
+
t = time.time()
test_pred = lr.predict(test_features)
lr_infer_time[pred_len] = time.time() - t
- ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2]
- test_pred = test_pred.reshape(ori_shape)
- test_labels = test_labels.reshape(ori_shape)
-
+ if dataset_name == 'ts2vec_online_retail_II_data':
+ test_pred = test_pred.reshape(-1, 2)
+ test_labels = test_labels.reshape(-1, 2)
+ elif dataset_name == 'restructured_ts2vec_online_retail':
+ test_pred = test_pred.reshape(-1, 4)
+ test_labels = test_labels.reshape(-1, 4)
+ else:
+ ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2]
+ test_pred = test_pred.reshape(ori_shape)
+ test_labels = test_labels.reshape(ori_shape)
+
if test_data.shape[0] > 1:
test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3)
test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3)
+        elif dataset_name in ['ts2vec_online_retail_II_data', 'restructured_ts2vec_online_retail']:
+ if dataset_name == 'ts2vec_online_retail_II_data':
+ test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape)
+ test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape)
+ else: # restructured_ts2vec_online_retail
+ test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 4)).reshape(test_pred.shape)
+ test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 4)).reshape(test_labels.shape)
+
+ # Remove NaN values from all datasets
+ valid_indices = ~np.isnan(test_pred).any(axis=1) & ~np.isnan(test_pred_inv).any(axis=1) & ~np.isnan(test_labels).any(axis=1) & ~np.isnan(test_labels_inv).any(axis=1)
+ test_pred = test_pred[valid_indices]
+ test_pred_inv = test_pred_inv[valid_indices]
+ test_labels = test_labels[valid_indices]
+ test_labels_inv = test_labels_inv[valid_indices]
+
else:
test_pred_inv = scaler.inverse_transform(test_pred)
test_labels_inv = scaler.inverse_transform(test_labels)
-
+
out_log[pred_len] = {
'norm': test_pred,
'raw': test_pred_inv,
@@ -76,11 +98,80 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler,
'norm': cal_metrics(test_pred, test_labels),
'raw': cal_metrics(test_pred_inv, test_labels_inv)
}
-
- eval_res = {
- 'ours': ours_result,
- 'ts2vec_infer_time': ts2vec_infer_time,
- 'lr_train_time': lr_train_time,
- 'lr_infer_time': lr_infer_time
- }
+
+ eval_res = {
+ 'ours': ours_result,
+ 'ts2vec_infer_time': ts2vec_infer_time,
+ 'lr_train_time': lr_train_time,
+ 'lr_infer_time': lr_infer_time
+ }
return out_log, eval_res
+
+def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols):
+ padding = 200
+
+ t = time.time()
+ all_repr = model.encode(
+ data,
+ causal=True,
+ sliding_length=1,
+ sliding_padding=padding,
+ batch_size=256
+ )
+ ts2vec_infer_time = time.time() - t
+
+ # Extract representations (embeddings) for each time slice
+ train_repr = all_repr[:, train_slice]
+ valid_repr = all_repr[:, valid_slice]
+ test_repr = all_repr[:, test_slice]
+
+ # Extract data for each time slice
+ train_data = data[:, train_slice, n_covariate_cols:]
+ valid_data = data[:, valid_slice, n_covariate_cols:]
+ test_data = data[:, test_slice, n_covariate_cols:]
+
+ ours_result = {}
+ lr_train_time = {}
+ lr_infer_time = {}
+ out_log = {}
+ for pred_len in pred_lens:
+ train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding)
+ valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len)
+ test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len)
+
+ t = time.time()
+ lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels)
+ lr_train_time[pred_len] = time.time() - t
+
+ t = time.time()
+ test_pred = lr.predict(test_features)
+ lr_infer_time[pred_len] = time.time() - t
+
+ test_pred = test_pred.reshape(-1, 2)
+ test_labels = test_labels.reshape(-1, 2)
+
+ test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape)
+ test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape)
+
+ # NOTE: You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization)
+
+ out_log[pred_len] = {
+ 'norm': test_pred,
+ 'raw': test_pred_inv,
+ 'norm_gt': test_labels,
+ 'raw_gt': test_labels_inv
+ }
+ ours_result[pred_len] = {
+ 'norm': cal_metrics(test_pred, test_labels),
+ 'raw': cal_metrics(test_pred_inv, test_labels_inv)
+ }
+
+ eval_res = {
+ 'ours': ours_result,
+ 'ts2vec_infer_time': ts2vec_infer_time,
+ 'lr_train_time': lr_train_time,
+ 'lr_infer_time': lr_infer_time
+ }
+
+ return out_log, eval_res
+
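# --- Illustrative sketch (reviewer note, not part of the patch) ---------------
# The retail branches above reshape predictions to (-1, 2) (Quantity, Price)
# so the MinMaxScaler fitted in datautils can map them back to the original
# scale, after which cal_metrics() reports MSE/MAE on both scales. The scaler
# and values here are toy stand-ins.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

train_block = np.array([[0.0, 1.0], [10.0, 5.0], [20.0, 9.0]])  # toy Quantity/Price
scaler = MinMaxScaler().fit(train_block)

test_pred = np.array([[0.5, 0.25], [0.9, 0.75]])    # normalised predictions
test_labels = np.array([[0.4, 0.25], [1.0, 0.50]])  # normalised ground truth

test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape)
test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape)

def cal_metrics(pred, target):
    return {'MSE': ((pred - target) ** 2).mean(), 'MAE': np.abs(pred - target).mean()}

print(cal_metrics(test_pred, test_labels))          # metrics on the normalised scale
print(cal_metrics(test_pred_inv, test_labels_inv))  # metrics on the original scale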
diff --git a/train.py b/train.py
index d0ade884..fb8346c2 100644
--- a/train.py
+++ b/train.py
@@ -1,18 +1,25 @@
+import pandas as pd
import torch
+from matplotlib import pyplot as plt
+from scipy.ndimage import label
+from torch import nn
import numpy as np
import argparse
import os
import sys
import time
import datetime
+
+from sklearn.preprocessing import MinMaxScaler
+
from ts2vec import TS2Vec
import tasks
import datautils
from utils import init_dl_program, name_with_datetime, pkl_save, data_dropout
def save_checkpoint_callback(
- save_every=1,
- unit='epoch'
+ save_every=1,
+ unit='epoch'
):
assert unit in ('epoch', 'iter')
def callback(model, loss):
@@ -32,62 +41,72 @@ def callback(model, loss):
parser.add_argument('--repr-dims', type=int, default=320, help='The representation dimension (defaults to 320)')
parser.add_argument('--max-train-length', type=int, default=3000, help='For sequence with a length greater than , it would be cropped into some sequences, each of which has a length less than (defaults to 3000)')
parser.add_argument('--iters', type=int, default=None, help='The number of iterations')
- parser.add_argument('--epochs', type=int, default=None, help='The number of epochs')
+ parser.add_argument('--epochs', type=int, default=8, help='The number of epochs')
parser.add_argument('--save-every', type=int, default=None, help='Save the checkpoint every iterations/epochs')
parser.add_argument('--seed', type=int, default=None, help='The random seed')
parser.add_argument('--max-threads', type=int, default=None, help='The maximum allowed number of threads used by this process')
parser.add_argument('--eval', action="store_true", help='Whether to perform evaluation after training')
parser.add_argument('--irregular', type=float, default=0, help='The ratio of missing observations (defaults to 0)')
args = parser.parse_args()
-
+
print("Dataset:", args.dataset)
print("Arguments:", str(args))
-
+
device = init_dl_program(args.gpu, seed=args.seed, max_threads=args.max_threads)
-
+
+ # Define variables outside the conditional block for online retail dataset
+ customer_embedding_layer = None
+ customer_id_to_index = None
+
+
print('Loading data... ', end='')
if args.loader == 'UCR':
task_type = 'classification'
train_data, train_labels, test_data, test_labels = datautils.load_UCR(args.dataset)
-
+
elif args.loader == 'UEA':
task_type = 'classification'
train_data, train_labels, test_data, test_labels = datautils.load_UEA(args.dataset)
-
+
elif args.loader == 'forecast_csv':
task_type = 'forecasting'
data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset)
train_data = data[:, train_slice]
-
+
elif args.loader == 'forecast_csv_univar':
task_type = 'forecasting'
data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset, univar=True)
train_data = data[:, train_slice]
-
+
elif args.loader == 'forecast_npy':
task_type = 'forecasting'
data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset)
train_data = data[:, train_slice]
-
+
elif args.loader == 'forecast_npy_univar':
task_type = 'forecasting'
data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset, univar=True)
train_data = data[:, train_slice]
-
+
elif args.loader == 'anomaly':
task_type = 'anomaly_detection'
all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset)
train_data = datautils.gen_ano_train_data(all_train_data)
-
+
elif args.loader == 'anomaly_coldstart':
task_type = 'anomaly_detection_coldstart'
all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset)
train_data, _, _, _ = datautils.load_UCR('FordA')
-
+
+ elif args.loader == 'retail':
+ task_type = 'forecasting'
+ # Load the data
+ data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_online_retail(args.dataset)
+ train_data = data[:, train_slice]
+
else:
raise ValueError(f"Unknown loader {args.loader}.")
-
-
+
if args.irregular > 0:
if task_type == 'classification':
train_data = data_dropout(train_data, args.irregular)
@@ -95,52 +114,96 @@ def callback(model, loss):
else:
raise ValueError(f"Task type {task_type} is not supported when irregular>0.")
print('done')
-
+
config = dict(
batch_size=args.batch_size,
lr=args.lr,
output_dims=args.repr_dims,
max_train_length=args.max_train_length
)
-
+
if args.save_every is not None:
unit = 'epoch' if args.epochs is not None else 'iter'
config[f'after_{unit}_callback'] = save_checkpoint_callback(args.save_every, unit)
run_dir = 'training/' + args.dataset + '__' + name_with_datetime(args.run_name)
os.makedirs(run_dir, exist_ok=True)
-
+
t = time.time()
-
+
+ # Define the TS2Vec model
+
+
model = TS2Vec(
input_dims=train_data.shape[-1],
device=device,
**config
)
+ # Training code
loss_log = model.fit(
train_data,
n_epochs=args.epochs,
n_iters=args.iters,
verbose=True
)
+
model.save(f'{run_dir}/model.pkl')
+    # Plot the training loss log
+    plt.plot(loss_log, label='Training Loss')
+    plt.xlabel('Epoch')
+    plt.ylabel('Loss')
+    plt.title('Training Loss Over Epochs')
+    plt.grid(True)
+    plt.savefig(f'{run_dir}/loss_plot.png')  # loss curve without the trend line
+    # Fit and draw a straight line to show the general trend
+    z = np.polyfit(range(len(loss_log)), loss_log, 1)
+    p = np.poly1d(z)
+    plt.plot(range(len(loss_log)), p(range(len(loss_log))), "r--", label='Training Loss Trend')
+    plt.legend()
+    # Save the plot again, now including the trend line
+    plt.savefig(f'{run_dir}/train_loss_plot.png')
+    plt.show()
+
t = time.time() - t
print(f"\nTraining time: {datetime.timedelta(seconds=t)}\n")
if args.eval:
+ out = None # Initialise 'out' as None to avoid the NameError in the case of unsupervised tasks
+
if task_type == 'classification':
out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm')
elif task_type == 'forecasting':
- out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols)
+            # Special case: evaluation on the pre-processed Online Retail datasets
+            if args.dataset in ('ts2vec_online_retail_II_data', 'restructured_ts2vec_online_retail'):
+ # print data shapes and check for NaN or infinite values
+ print("Data shape:", data.shape)
+ print("Train slice:", train_slice)
+ print("Valid slice:", valid_slice)
+ print("Test slice:", test_slice)
+ print("Scaler:", scaler)
+ print("Prediction lengths:", pred_lens)
+ print("Number of covariate columns:", n_covariate_cols)
+ if args.loader == 'retail':
+ out, eval_res = tasks.eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols)
+ else:
+ out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, args.dataset)
+
+ else:
+ out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols)
elif task_type == 'anomaly_detection':
out, eval_res = tasks.eval_anomaly_detection(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay)
elif task_type == 'anomaly_detection_coldstart':
out, eval_res = tasks.eval_anomaly_detection_coldstart(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay)
else:
assert False
- pkl_save(f'{run_dir}/out.pkl', out)
+
+ # Save only eval_res when 'out' is None (i.e., for unsupervised tasks)
+ if out is not None:
+ pkl_save(f'{run_dir}/out.pkl', out)
+
pkl_save(f'{run_dir}/eval_res.pkl', eval_res)
print('Evaluation result:', eval_res)
- print("Finished.")
+ print("Finished.")
+
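# --- Illustrative sketch (reviewer note, not part of the patch) ---------------
# The trend line added to the loss plot in train.py: np.polyfit with degree 1
# returns (slope, intercept) and np.poly1d turns them into a callable, so the
# dashed red line is just a least-squares linear fit over the per-epoch losses.
# The loss values below are invented.
import numpy as np

loss_log = [2.1, 1.7, 1.4, 1.3, 1.25, 1.2, 1.18, 1.15]  # toy per-epoch losses

z = np.polyfit(range(len(loss_log)), loss_log, 1)  # [slope, intercept]
p = np.poly1d(z)

print("slope per epoch:", z[0])                  # negative -> loss is decreasing
print("trend values:", p(range(len(loss_log))))  # points on the fitted line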
diff --git a/ts2vec.py b/ts2vec.py
index de909bd5..76dc227d 100644
--- a/ts2vec.py
+++ b/ts2vec.py
@@ -5,7 +5,6 @@
from models import TSEncoder
from models.losses import hierarchical_contrastive_loss
from utils import take_per_row, split_with_nan, centerize_vary_length_series, torch_pad_nan
-import math
class TS2Vec:
'''The TS2Vec model'''