diff --git a/.gitignore b/.gitignore index 78a0cc4e..bb10ee7d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,21 @@ +# Ignore dataset files that are not Python /datasets/*[!.py] + +# Ignore Jupyter notebook checkpoints .ipynb_checkpoints +*.idea/ +# Ignore Python cache files __pycache__ + +# Ignore everything in the training folder except .py files training/* +!training/*.py + +# Ignore specific nohup files in the scripts folder scripts/_nohup + +# Ignore rsync filter file .rsync-filter + +# Ignore the ts2vec_env directory (virtual environment) +ts2vec_env/ \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..feccba82 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,10 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/ts2vec.iml b/.idea/ts2vec.iml new file mode 100644 index 00000000..ec63674c --- /dev/null +++ b/.idea/ts2vec.iml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..35eb1ddf --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 00000000..ef09f3fa --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,138 @@ + + + + + + + + + + + + + + + + { + "lastFilter": { + "state": "OPEN", + "assignee": "jumairamiller" + } +} + { + "selectedUrlAndAccountId": { + "url": "https://github.com/jumairamiller/ts2vec.git", + "accountId": "1c182f61-b702-458b-8f9a-03f9dbad0d94" + }, + "recentNewPullRequestHead": { + "server": { + "useHttp": false, + "host": "github.com", + "port": null, + "suffix": null + }, + "owner": "jumairamiller", + "repository": "ts2vec" + } +} + + + { + "associatedIndex": 3 +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/datasets/stage1_online_retail_pre_processing.py b/datasets/stage1_online_retail_pre_processing.py new file mode 100644 index 00000000..eee5d36d --- /dev/null +++ b/datasets/stage1_online_retail_pre_processing.py @@ -0,0 +1,80 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import MinMaxScaler +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Step 1: Load and Combine Data from Excel Sheets +def load_and_combine_sheets(file_path, sheets): + """ + Load data from multiple Excel sheets and combine into a single DataFrame. + """ + combined_data = pd.DataFrame() + for sheet in sheets: + logging.info(f"Loading data from sheet: {sheet}") + sheet_data = pd.read_excel(file_path, sheet_name=sheet) + combined_data = pd.concat([combined_data, sheet_data], ignore_index=True) + logging.info("Data successfully loaded and combined.") + return combined_data + +# Step 2: Preprocess Data +def preprocess_data(data): + """ + Preprocess data by converting dates, normalizing numeric columns, and handling missing values. 
+ """ + # Convert 'InvoiceDate' to datetime and ensure numerical consistency in key columns + logging.info("Preprocessing data: converting dates and normalizing numeric columns.") + data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce') + data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce') + data['Price'] = pd.to_numeric(data['Price'], errors='coerce') + data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce') + + # Remove rows where the Invoice starts with 'C' (canceled orders) + data = data[~data['Invoice'].astype(str).str.startswith('C')] + + # Drop rows with missing critical data + data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price']) + + # Normalize 'Quantity' and 'Price' using Min-Max scaling to insure values are positive + scaler = MinMaxScaler() + data[['Quantity', 'Price']] = scaler.fit_transform(data[['Quantity', 'Price']]) + logging.info("Data normalized and missing values handled.") + + return data + +# Step 3: Aggregate Data +def aggregate_data(data): + """ + Aggregate data by summing 'Quantity' and averaging 'Price' daily. + """ + logging.info("Aggregating data by Date.") + # Group by Date, aggregating Quantity and Price + data_agg = data.groupby(pd.Grouper(key='InvoiceDate', freq='D')).agg({ + 'Quantity': 'sum', + 'Price': 'mean' + }).reset_index() + + logging.info("Data aggregation complete.") + return data_agg + +# Main Function to Run All Steps +def main(): + # File path and sheets to load + file_path = 'online_retail_II.xlsx' + sheets = ['Year 2009-2010', 'Year 2010-2011'] + + # Load and preprocess the data + combined_data = load_and_combine_sheets(file_path, sheets) + cleaned_data = preprocess_data(combined_data) + + # Aggregate the data + aggregated_data = aggregate_data(cleaned_data) + + # Save the final reshaped and adjusted data to CSV + aggregated_data.to_csv('ts2vec_online_retail_II_data.csv', index=False) + logging.info("Final data saved successfully.") + +if __name__ == "__main__": + main() diff --git a/datasets/stage2_online_retail_pre_processing_.py b/datasets/stage2_online_retail_pre_processing_.py new file mode 100644 index 00000000..f33962ef --- /dev/null +++ b/datasets/stage2_online_retail_pre_processing_.py @@ -0,0 +1,67 @@ +import pandas as pd +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Step 1: Load the original Online Retail II dataset +def load_data(file_path): + logging.info(f"Loading data from: {file_path}") + data = pd.read_excel(file_path) + logging.info(f"Data successfully loaded with {len(data)} records.") + return data + +# Step 2: Clean and preprocess the dataset +def preprocess_data(data): + logging.info("Preprocessing data: cleaning and handling missing values.") + # Convert 'InvoiceDate' to datetime and ensure numerical consistency + data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce') + data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce') + data['Price'] = pd.to_numeric(data['Price'], errors='coerce') + data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce') + + # Remove cancelled orders (invoices starting with 'C') + data = data[~data['Invoice'].str.startswith('C', na=False)] + + # Drop rows with missing values in key columns + data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price']) + + logging.info(f"Data cleaned. 
Remaining records: {len(data)}.") + return data + +# Step 3: Group by CustomerID and InvoiceNo +def group_by_customer_invoice(data): + logging.info("Grouping by Customer ID and Invoice Number.") + # Group by CustomerID and InvoiceNo to represent each invoice as a time series record + grouped = data.groupby(['Customer ID', 'Invoice']).agg({ + 'InvoiceDate': 'first', # First date of the invoice + 'Quantity': 'sum', # Sum of quantities in the invoice + 'Price': 'mean' # Average price in the invoice + }).reset_index() + + logging.info(f"Grouped data created with {len(grouped)} records.") + return grouped + +# Step 4: Save the restructured dataset +def save_data(grouped_data, output_file): + logging.info(f"Saving restructured data to {output_file}.") + grouped_data.to_csv(output_file, index=False) + logging.info("Data successfully saved.") + +# Main function to run the entire preprocessing pipeline +def main(): + file_path = 'online_retail_II.xlsx' + output_file = ('restructured_ts2vec_online_retail.csv') + + # Load and preprocess the data + data = load_data(file_path) + cleaned_data = preprocess_data(data) + + # Group data by CustomerID and InvoiceNo + grouped_data = group_by_customer_invoice(cleaned_data) + + # Save the restructured dataset + save_data(grouped_data, output_file) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/datautils.py b/datautils.py index f8cfddd7..a31ec3d5 100644 --- a/datautils.py +++ b/datautils.py @@ -1,3 +1,4 @@ +import logging import os import numpy as np import pandas as pd @@ -5,6 +6,9 @@ import random from datetime import datetime import pickle + +import torch + from utils import pkl_load, pad_nan_to_target from scipy.io.arff import loadarff from sklearn.preprocessing import StandardScaler, MinMaxScaler @@ -68,7 +72,7 @@ def load_UCR(dataset): 'UMD' ]: return train[..., np.newaxis], train_labels, test[..., np.newaxis], test_labels - + mean = np.nanmean(train) std = np.nanstd(train) train = (train - mean) / std @@ -79,7 +83,7 @@ def load_UCR(dataset): def load_UEA(dataset): train_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TRAIN.arff')[0] test_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TEST.arff')[0] - + def extract_data(data): res_data = [] res_labels = [] @@ -89,31 +93,31 @@ def extract_data(data): res_data.append(t_data) res_labels.append(t_label) return np.array(res_data).swapaxes(1, 2), np.array(res_labels) - + train_X, train_y = extract_data(train_data) test_X, test_y = extract_data(test_data) - + scaler = StandardScaler() scaler.fit(train_X.reshape(-1, train_X.shape[-1])) train_X = scaler.transform(train_X.reshape(-1, train_X.shape[-1])).reshape(train_X.shape) test_X = scaler.transform(test_X.reshape(-1, test_X.shape[-1])).reshape(test_X.shape) - + labels = np.unique(train_y) transform = { k : i for i, k in enumerate(labels)} train_y = np.vectorize(transform.get)(train_y) test_y = np.vectorize(transform.get)(test_y) return train_X, train_y, test_X, test_y - - + + def load_forecast_npy(name, univar=False): - data = np.load(f'datasets/{name}.npy') + data = np.load(f'datasets/{name}.npy') if univar: data = data[: -1:] - + train_slice = slice(None, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) test_slice = slice(int(0.8 * len(data)), None) - + scaler = StandardScaler().fit(data[train_slice]) data = scaler.transform(data) data = np.expand_dims(data, 0) @@ -135,10 +139,32 @@ def _get_time_features(dt): def load_forecast_csv(name, univar=False): - data = 
pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + """ + Loads and processes time series data for forecasting tasks, supporting both the pre-processed + Online Retail II dataset and existing datasets like ETTh1, ETTm1, and electricity. + + Parameters: + name (str): The name of the dataset file (without the .csv extension). + univar (bool): Whether to load the univariate version of the data. + + Returns: + tuple: data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols + """ + # Try loading with 'date' as index, if it fails, try with 'InvoiceDate' to load online retail data + try: + data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + except ValueError: + data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True) + + # Ensure index is parsed as datetime + if not pd.api.types.is_datetime64_any_dtype(data.index): + data.index = pd.to_datetime(data.index) + + # Extract time features for the date/index column dt_embed = _get_time_features(data.index) n_covariate_cols = dt_embed.shape[-1] - + + # Handle univariate or multivariate cases if univar: if name in ('ETTh1', 'ETTh2', 'ETTm1', 'ETTm2'): data = data[['OT']] @@ -146,8 +172,11 @@ def load_forecast_csv(name, univar=False): data = data[['MT_001']] else: data = data.iloc[:, -1:] - + + # Convert data to numpy array data = data.to_numpy() + + # Define train, validation, and test splits based on dataset if name == 'ETTh1' or name == 'ETTh2': train_slice = slice(None, 12*30*24) valid_slice = slice(12*30*24, 16*30*24) @@ -157,35 +186,54 @@ def load_forecast_csv(name, univar=False): valid_slice = slice(12*30*24*4, 16*30*24*4) test_slice = slice(16*30*24*4, 20*30*24*4) else: - train_slice = slice(None, int(0.6 * len(data))) + # Default case for other or custom datasets + train_slice = slice(0, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) - test_slice = slice(int(0.8 * len(data)), None) - - scaler = StandardScaler().fit(data[train_slice]) + test_slice = slice(int(0.8 * len(data)), len(data)) + + # Normalise data + scaler = None + if name == 'ts2vec_online_retail_II_data' or name == 'restructured_ts2vec_online_retail': + scaler = MinMaxScaler().fit(data[train_slice]) + else: + scaler = StandardScaler().fit(data[train_slice]) + data = scaler.transform(data) - if name in ('electricity'): + + # Reshape data based on dataset structure + if name in 'electricity': data = np.expand_dims(data.T, -1) # Each variable is an instance rather than a feature else: - data = np.expand_dims(data, 0) - + data = np.expand_dims(data, 0) # Single instance case + if n_covariate_cols > 0: - dt_scaler = StandardScaler().fit(dt_embed[train_slice]) + if name == 'ts2vec_online_retail_II_data' or name == 'restructured_ts2vec_online_retail': + dt_scaler = MinMaxScaler().fit(dt_embed[train_slice]) + else: + dt_scaler = StandardScaler().fit(dt_embed[train_slice]) dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) + # Concatenating the time embeddings to the data + ''' NOTE: + The np.repeat(dt_embed, data.shape[0], axis=0) function is used to repeat the time embeddings + for each instance only in the case of the 'electricity' dataset. 
This ensures that the time + embeddings are correctly aligned with the data instances''' data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) - + if name in ('ETTh1', 'ETTh2', 'electricity'): pred_lens = [24, 48, 168, 336, 720] + elif name == 'ts2vec_online_retail_II_data': + pred_lens = [1, 2, 3, 4, 5] else: pred_lens = [24, 48, 96, 288, 672] - + return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols def load_anomaly(name): res = pkl_load(f'datasets/{name}.pkl') return res['all_train_data'], res['all_train_labels'], res['all_train_timestamps'], \ - res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ - res['delay'] + res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ + res['delay'] def gen_ano_train_data(all_train_data): @@ -196,3 +244,134 @@ def gen_ano_train_data(all_train_data): pretrain_data.append(train_data) pretrain_data = np.expand_dims(np.stack(pretrain_data), 2) return pretrain_data + +#----------------------------------------------------------------------------- +def _get_time_features(dt): + return np.stack([ + dt.dayofweek.to_numpy(), # Day of the week + dt.day.to_numpy(), # Day of the month + dt.dayofyear.to_numpy(), # Day of the year + dt.month.to_numpy(), # Month + dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(), # Week of the year + ], axis=1).astype(np.float) + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +def load_online_retail(name): +# """ +# Loads and preprocesses the Online Retail dataset for forecasting tasks. +# Ensures both Price, Quantity, and customer embeddings are included throughout. """ +# +# Returns: +# train_data: Dictionary mapping customer_id to their training data (DataFrame). +# valid_data: Dictionary mapping customer_id to their validation data (DataFrame). +# test_data: Dictionary mapping customer_id to their test data (DataFrame). +# customer_embeddings: Fixed embeddings for each customer ID. +# customer_id_to_index: Mapping from customer IDs to embedding indices. +# scaler: Fitted scaler for data normalization. 
+# """ + + # Load data + data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True) + + # Convert 'InvoiceDate' to datetime if not already done + if not pd.api.types.is_datetime64_any_dtype(data.index): + data.index = pd.to_datetime(data.index) + + # Remove rows with any None or NaN values + data.dropna(inplace=True) + + # Remove rows with negative 'Quantity' or 'Price' values + data = data[(data['Quantity'] > 0) & (data['Price'] > 0)] + logging.info(f"Data shape after removing negative values: {data.shape}") + + # Remove Invoice column + data.drop(columns=['Invoice'], inplace=True) + + # Rename 'Customer ID' to 'CustomerID' + if 'Customer ID' in data.columns: + data.rename(columns={'Customer ID': 'CustomerID'}, inplace=True) + + if 'CustomerID' not in data.columns: + raise KeyError("The 'CustomerID' column is missing from the dataset.") + logging.info(f"Data columns and shape: {data.columns, data.shape}") + + # Extract customerIDs and create numerical mapping + customer_ids = data['CustomerID'].unique() + logging.info(f"Unique customer ID count: {customer_ids.shape[0]}") + customer_id_to_index = {cid: idx for idx, cid in enumerate(customer_ids)} + logging.info(f"Unique customer ID to index count: {len(customer_id_to_index)}") + + # Create fixed embeddings for each unique customer ID + customer_embeddings = torch.nn.Embedding(len(customer_ids), 4) + customer_embeddings.weight.requires_grad = False # Fixed embeddings + + # Map customer IDs to their embeddings + customer_embeddings_tensor = customer_embeddings(torch.tensor(data['CustomerID'].map(customer_id_to_index).values)) + logging.info(f"Customer embeddings shape: {customer_embeddings_tensor.shape}") + logging.info(f"Customer embeddings: {customer_embeddings_tensor}") + + # Replace 'CustomerID' column with the corresponding customer embedding + customer_embeddings_expanded = customer_embeddings_tensor.detach().numpy() + logging.info(f"Customer embeddings expanded shape: {customer_embeddings_expanded.shape}") + + # Drop the original 'CustomerID' column + data = data.drop(columns=['CustomerID']) + + # Extract time features for the date/index column + dt_embed = _get_time_features(data.index) + n_covariate_cols = dt_embed.shape[-1] + customer_embeddings_expanded.shape[-1] + logging.info(f"Number of covariate columns: {n_covariate_cols}") + logging.info(f"Time features shape: {dt_embed.shape}") + + # Convert the DataFrame to a NumPy array + data = data.to_numpy() + + train_slice = slice(0, int(0.6 * len(data))) + valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) + test_slice = slice(int(0.8 * len(data)), len(data)) + + # Normalise data + scaler = MinMaxScaler().fit(data[train_slice]) + data = scaler.transform(data) + + # Reshape data based on dataset structure + data = np.expand_dims(data, 0) # Single instance case + + if n_covariate_cols > 0: + dt_scaler = MinMaxScaler().fit(dt_embed[train_slice]) + dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) + # Concatenating the time embeddings to the data + data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) + + cid_scaler = MinMaxScaler().fit(customer_embeddings_expanded[train_slice]) + cid_embed = np.expand_dims(cid_scaler.transform(customer_embeddings_expanded), 0) + # Concatenating the customer ID embeddings to the data + data = np.concatenate([np.repeat(cid_embed, data.shape[0], axis=0), data], axis=-1) + + pred_lens = [1, 2, 3, 4, 5] + + return data, train_slice, valid_slice, test_slice, scaler, pred_lens, 
n_covariate_cols + + + + # # Filter customerIDs with at least 5 records + # customer_counts = data['CustomerID'].value_counts() + # valid_customers = customer_counts[customer_counts >= 5].index + # data = data[data['CustomerID'].isin(valid_customers)] + # + # # Group by CustomerID and sort by InvoiceDate + # grouped = data.groupby('CustomerID').apply(lambda x: x.sort_values('InvoiceDate')).reset_index(drop=True) + # + # # Create time series data for each CustomerID + # customer_data = {} + # for customer_id, group in grouped.groupby('CustomerID'): + # group.set_index('InvoiceDate', inplace=True) + # customer_data[customer_id] = group[['CustomerID', 'Quantity', 'Price']] + + # Set other forecasting parameters + # scaler = MinMaxScaler() # alternative to StandardScaler which avoid negative values + # pred_lens = [1, 2, 3] + # n_covariate_cols = 2 + # + # return customer_data diff --git a/models/encoder.py b/models/encoder.py index cedb13b0..c4aad5d5 100644 --- a/models/encoder.py +++ b/models/encoder.py @@ -41,7 +41,7 @@ def __init__(self, input_dims, output_dims, hidden_dims=64, depth=10, mask_mode= def forward(self, x, mask=None): # x: B x T x input_dims nan_mask = ~x.isnan().any(axis=-1) x[~nan_mask] = 0 - x = self.input_fc(x) # B x T x Ch + x = self.input_fc(x) # B x T x Ch NOTE: Ch = output_dims (I think...) # generate & apply mask if mask is None: diff --git a/requirements.txt b/requirements.txt index 9cc27049..0173de5d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,7 @@ numpy==1.19.2 statsmodels==0.12.2 pandas==1.0.1 scikit_learn==0.24.2 +xlrd==1.2.0 + +matplotlib==3.3.4 +scikit-learn~=0.24.2 \ No newline at end of file diff --git a/tasks/__init__.py b/tasks/__init__.py index 5e38a07c..3a09b083 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -1,3 +1,3 @@ from .classification import eval_classification -from .forecasting import eval_forecasting +from .forecasting import eval_forecasting, eval_forecasting_customer_embed from .anomaly_detection import eval_anomaly_detection, eval_anomaly_detection_coldstart diff --git a/tasks/_eval_protocols.py b/tasks/_eval_protocols.py index e413bcf4..d0e9af8b 100644 --- a/tasks/_eval_protocols.py +++ b/tasks/_eval_protocols.py @@ -79,6 +79,18 @@ def fit_knn(features, y): return pipe def fit_ridge(train_features, train_y, valid_features, valid_y, MAX_SAMPLES=100000): + # Replace NaN and infinite values with 0 + train_features = np.nan_to_num(train_features, nan=0.0, posinf=0.0, neginf=0.0) + train_y = np.nan_to_num(train_y, nan=0.0, posinf=0.0, neginf=0.0) + valid_features = np.nan_to_num(valid_features, nan=0.0, posinf=0.0, neginf=0.0) + valid_y = np.nan_to_num(valid_y, nan=0.0, posinf=0.0, neginf=0.0) + + # Ensure values are within the range of float64 + train_features = np.clip(train_features, -1e308, 1e308) + train_y = np.clip(train_y, -1e308, 1e308) + valid_features = np.clip(valid_features, -1e308, 1e308) + valid_y = np.clip(valid_y, -1e308, 1e308) + # If the training set is too large, subsample MAX_SAMPLES examples if train_features.shape[0] > MAX_SAMPLES: split = train_test_split( diff --git a/tasks/forecasting.py b/tasks/forecasting.py index b3493d83..c5b0276c 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -9,17 +9,17 @@ def generate_pred_samples(features, data, pred_len, drop=0): features = features[:, drop:] labels = labels[:, drop:] return features.reshape(-1, features.shape[-1]), \ - labels.reshape(-1, labels.shape[2]*labels.shape[3]) + labels.reshape(-1, labels.shape[2]*labels.shape[3]) def 
cal_metrics(pred, target): return { 'MSE': ((pred - target) ** 2).mean(), 'MAE': np.abs(pred - target).mean() } - -def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): + +def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, dataset_name = None): padding = 200 - + t = time.time() all_repr = model.encode( data, @@ -29,15 +29,15 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, batch_size=256 ) ts2vec_infer_time = time.time() - t - + train_repr = all_repr[:, train_slice] valid_repr = all_repr[:, valid_slice] test_repr = all_repr[:, test_slice] - + train_data = data[:, train_slice, n_covariate_cols:] valid_data = data[:, valid_slice, n_covariate_cols:] test_data = data[:, test_slice, n_covariate_cols:] - + ours_result = {} lr_train_time = {} lr_infer_time = {} @@ -46,26 +46,48 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding) valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len) test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len) - + t = time.time() lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels) lr_train_time[pred_len] = time.time() - t - + t = time.time() test_pred = lr.predict(test_features) lr_infer_time[pred_len] = time.time() - t - ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] - test_pred = test_pred.reshape(ori_shape) - test_labels = test_labels.reshape(ori_shape) - + if dataset_name == 'ts2vec_online_retail_II_data': + test_pred = test_pred.reshape(-1, 2) + test_labels = test_labels.reshape(-1, 2) + elif dataset_name == 'restructured_ts2vec_online_retail': + test_pred = test_pred.reshape(-1, 4) + test_labels = test_labels.reshape(-1, 4) + else: + ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] + test_pred = test_pred.reshape(ori_shape) + test_labels = test_labels.reshape(ori_shape) + if test_data.shape[0] > 1: test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3) test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3) + elif dataset_name in ['ts2vec_online_retail_II_data','restructured_ts2vec_online_retail']: + if dataset_name == 'ts2vec_online_retail_II_data': + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) + test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + else: # restructured_ts2vec_online_retail + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 4)).reshape(test_pred.shape) + test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 4)).reshape(test_labels.shape) + + # Remove NaN values from all datasets + valid_indices = ~np.isnan(test_pred).any(axis=1) & ~np.isnan(test_pred_inv).any(axis=1) & ~np.isnan(test_labels).any(axis=1) & ~np.isnan(test_labels_inv).any(axis=1) + test_pred = test_pred[valid_indices] + test_pred_inv = test_pred_inv[valid_indices] + test_labels = test_labels[valid_indices] + test_labels_inv = test_labels_inv[valid_indices] + else: test_pred_inv = scaler.inverse_transform(test_pred) test_labels_inv = scaler.inverse_transform(test_labels) - + out_log[pred_len] = { 'norm': test_pred, 'raw': test_pred_inv, @@ -76,11 +98,80 @@ def eval_forecasting(model, data, train_slice, 
valid_slice, test_slice, scaler, 'norm': cal_metrics(test_pred, test_labels), 'raw': cal_metrics(test_pred_inv, test_labels_inv) } - - eval_res = { - 'ours': ours_result, - 'ts2vec_infer_time': ts2vec_infer_time, - 'lr_train_time': lr_train_time, - 'lr_infer_time': lr_infer_time - } + + eval_res = { + 'ours': ours_result, + 'ts2vec_infer_time': ts2vec_infer_time, + 'lr_train_time': lr_train_time, + 'lr_infer_time': lr_infer_time + } return out_log, eval_res + +def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): + padding = 200 + + t = time.time() + all_repr = model.encode( + data, + causal=True, + sliding_length=1, + sliding_padding=padding, + batch_size=256 + ) + ts2vec_infer_time = time.time() - t + + # Extract representations (embeddings) for each time slice + train_repr = all_repr[:, train_slice] + valid_repr = all_repr[:, valid_slice] + test_repr = all_repr[:, test_slice] + + # Extract data for each time slice + train_data = data[:, train_slice, n_covariate_cols:] + valid_data = data[:, valid_slice, n_covariate_cols:] + test_data = data[:, test_slice, n_covariate_cols:] + + ours_result = {} + lr_train_time = {} + lr_infer_time = {} + out_log = {} + for pred_len in pred_lens: + train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding) + valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len) + test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len) + + t = time.time() + lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels) + lr_train_time[pred_len] = time.time() - t + + t = time.time() + test_pred = lr.predict(test_features) + lr_infer_time[pred_len] = time.time() - t + + test_pred = test_pred.reshape(-1, 2) + test_labels = test_labels.reshape(-1, 2) + + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) + test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + + # NOTE: You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization) + + out_log[pred_len] = { + 'norm': test_pred, + 'raw': test_pred_inv, + 'norm_gt': test_labels, + 'raw_gt': test_labels_inv + } + ours_result[pred_len] = { + 'norm': cal_metrics(test_pred, test_labels), + 'raw': cal_metrics(test_pred_inv, test_labels_inv) + } + + eval_res = { + 'ours': ours_result, + 'ts2vec_infer_time': ts2vec_infer_time, + 'lr_train_time': lr_train_time, + 'lr_infer_time': lr_infer_time + } + + return out_log, eval_res + diff --git a/train.py b/train.py index d0ade884..fb8346c2 100644 --- a/train.py +++ b/train.py @@ -1,18 +1,27 @@ +import config +import device +import pandas as pd import torch +from matplotlib import pyplot as plt +from scipy.ndimage import label +from torch import nn import numpy as np import argparse import os import sys import time import datetime + +from sklearn.preprocessing import MinMaxScaler + from ts2vec import TS2Vec import tasks import datautils from utils import init_dl_program, name_with_datetime, pkl_save, data_dropout def save_checkpoint_callback( - save_every=1, - unit='epoch' + save_every=1, + unit='epoch' ): assert unit in ('epoch', 'iter') def callback(model, loss): @@ -32,62 +41,72 @@ def callback(model, loss): parser.add_argument('--repr-dims', type=int, default=320, help='The representation dimension (defaults to 320)') parser.add_argument('--max-train-length', 
type=int, default=3000, help='For sequence with a length greater than , it would be cropped into some sequences, each of which has a length less than (defaults to 3000)') parser.add_argument('--iters', type=int, default=None, help='The number of iterations') - parser.add_argument('--epochs', type=int, default=None, help='The number of epochs') + parser.add_argument('--epochs', type=int, default=8, help='The number of epochs') parser.add_argument('--save-every', type=int, default=None, help='Save the checkpoint every iterations/epochs') parser.add_argument('--seed', type=int, default=None, help='The random seed') parser.add_argument('--max-threads', type=int, default=None, help='The maximum allowed number of threads used by this process') parser.add_argument('--eval', action="store_true", help='Whether to perform evaluation after training') parser.add_argument('--irregular', type=float, default=0, help='The ratio of missing observations (defaults to 0)') args = parser.parse_args() - + print("Dataset:", args.dataset) print("Arguments:", str(args)) - + device = init_dl_program(args.gpu, seed=args.seed, max_threads=args.max_threads) - + + # Define variables outside the conditional block for online retail dataset + customer_embedding_layer = None + customer_id_to_index = None + + print('Loading data... ', end='') if args.loader == 'UCR': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UCR(args.dataset) - + elif args.loader == 'UEA': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UEA(args.dataset) - + elif args.loader == 'forecast_csv': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_csv_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'anomaly': task_type = 'anomaly_detection' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data = datautils.gen_ano_train_data(all_train_data) - + elif args.loader == 'anomaly_coldstart': task_type = 'anomaly_detection_coldstart' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data, _, _, _ = datautils.load_UCR('FordA') - + + elif args.loader == 'retail': + task_type = 'forecasting' + # Load the data + data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_online_retail(args.dataset) + train_data = data[:, train_slice] + else: raise ValueError(f"Unknown loader {args.loader}.") - - + if args.irregular > 0: if task_type == 'classification': train_data = data_dropout(train_data, args.irregular) @@ -95,52 
+114,96 @@ def callback(model, loss): else: raise ValueError(f"Task type {task_type} is not supported when irregular>0.") print('done') - + config = dict( batch_size=args.batch_size, lr=args.lr, output_dims=args.repr_dims, max_train_length=args.max_train_length ) - + if args.save_every is not None: unit = 'epoch' if args.epochs is not None else 'iter' config[f'after_{unit}_callback'] = save_checkpoint_callback(args.save_every, unit) run_dir = 'training/' + args.dataset + '__' + name_with_datetime(args.run_name) os.makedirs(run_dir, exist_ok=True) - + t = time.time() - + + # Define the TS2Vec model + + model = TS2Vec( input_dims=train_data.shape[-1], device=device, **config ) + # Training code loss_log = model.fit( train_data, n_epochs=args.epochs, n_iters=args.iters, verbose=True ) + model.save(f'{run_dir}/model.pkl') + # Plot the training loss log + plt.plot(loss_log, label='Training Loss') + plt.xlabel('Epoch') + plt.ylabel('Loss') + plt.title('Training Loss Over Epochs') + plt.grid(True) + plt.savefig(f'{run_dir}/loss_plot.png') + # Draw a straight line to show the general trend + z = np.polyfit(range(len(loss_log)), loss_log, 1) + p = np.poly1d(z) + plt.plot(range(len(loss_log)), p(range(len(loss_log))), "r--", label='Training Loss Trend') + plt.legend() + # Save the plot + plt.savefig(f'{run_dir}/train_loss_plot.png') + plt.show() + t = time.time() - t print(f"\nTraining time: {datetime.timedelta(seconds=t)}\n") if args.eval: + out = None # Initialise 'out' as None to avoid the NameError in the case of unsupervised tasks + if task_type == 'classification': out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm') elif task_type == 'forecasting': - out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) + # add case for unsupervised evaluation on Online Retail dataset + if args.dataset in 'ts2vec_online_retail_II_data' or 'restructured_ts2vec_online_retail': + # print data shapes and check for NaN or infinite values + print("Data shape:", data.shape) + print("Train slice:", train_slice) + print("Valid slice:", valid_slice) + print("Test slice:", test_slice) + print("Scaler:", scaler) + print("Prediction lengths:", pred_lens) + print("Number of covariate columns:", n_covariate_cols) + if args.loader == 'retail': + out, eval_res = tasks.eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) + else: + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, args.dataset) + + else: + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) elif task_type == 'anomaly_detection': out, eval_res = tasks.eval_anomaly_detection(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) elif task_type == 'anomaly_detection_coldstart': out, eval_res = tasks.eval_anomaly_detection_coldstart(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) else: assert False - pkl_save(f'{run_dir}/out.pkl', out) + + # Save only eval_res when 'out' is None (i.e., for unsupervised tasks) + if out is not None: + pkl_save(f'{run_dir}/out.pkl', out) + pkl_save(f'{run_dir}/eval_res.pkl', eval_res) print('Evaluation result:', eval_res) - print("Finished.") 
+ print("Finished.") + diff --git a/ts2vec.py b/ts2vec.py index de909bd5..76dc227d 100644 --- a/ts2vec.py +++ b/ts2vec.py @@ -5,7 +5,6 @@ from models import TSEncoder from models.losses import hierarchical_contrastive_loss from utils import take_per_row, split_with_nan, centerize_vary_length_series, torch_pad_nan -import math class TS2Vec: '''The TS2Vec model'''