From 297e087a9e7132b3b83adbfb887089230b2658c4 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Wed, 28 Aug 2024 17:11:14 +0100 Subject: [PATCH 01/10] . --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index d0ade884..ecc30a60 100644 --- a/train.py +++ b/train.py @@ -32,7 +32,7 @@ def callback(model, loss): parser.add_argument('--repr-dims', type=int, default=320, help='The representation dimension (defaults to 320)') parser.add_argument('--max-train-length', type=int, default=3000, help='For sequence with a length greater than , it would be cropped into some sequences, each of which has a length less than (defaults to 3000)') parser.add_argument('--iters', type=int, default=None, help='The number of iterations') - parser.add_argument('--epochs', type=int, default=None, help='The number of epochs') + parser.add_argument('--epochs', type=int, default=8, help='The number of epochs') parser.add_argument('--save-every', type=int, default=None, help='Save the checkpoint every iterations/epochs') parser.add_argument('--seed', type=int, default=None, help='The random seed') parser.add_argument('--max-threads', type=int, default=None, help='The maximum allowed number of threads used by this process') From cda6a296578190fa350ac18450c5f0833699753e Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 9 Sep 2024 00:42:52 +0100 Subject: [PATCH 02/10] TS2Vec is now able to process the Online Retail II dataset using an editing version of load_forecasting_csv. It can also evaluate this dataset with a new function eval_forecasting_unsupervised. Also includes additional utility functions and configuration --- .gitignore | 15 ++++ datautils.py | 202 ++++++++++++++++++++++++++++++++++++++----- tasks/__init__.py | 2 +- tasks/forecasting.py | 52 ++++++++--- train.py | 53 +++++++----- 5 files changed, 270 insertions(+), 54 deletions(-) diff --git a/.gitignore b/.gitignore index 78a0cc4e..9d59144c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,21 @@ +# Ignore dataset files that are not Python /datasets/*[!.py] + +# Ignore Jupyter notebook checkpoints .ipynb_checkpoints + +# Ignore Python cache files __pycache__ + +# Ignore everything in the training folder except .py files training/* +!training/*.py + +# Ignore specific nohup files in the scripts folder scripts/_nohup + +# Ignore rsync filter file .rsync-filter + +# Ignore the ts2vec_env directory (virtual environment) +ts2vec_env/ \ No newline at end of file diff --git a/datautils.py b/datautils.py index f8cfddd7..c618fb66 100644 --- a/datautils.py +++ b/datautils.py @@ -68,7 +68,7 @@ def load_UCR(dataset): 'UMD' ]: return train[..., np.newaxis], train_labels, test[..., np.newaxis], test_labels - + mean = np.nanmean(train) std = np.nanstd(train) train = (train - mean) / std @@ -79,7 +79,7 @@ def load_UCR(dataset): def load_UEA(dataset): train_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TRAIN.arff')[0] test_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TEST.arff')[0] - + def extract_data(data): res_data = [] res_labels = [] @@ -89,31 +89,31 @@ def extract_data(data): res_data.append(t_data) res_labels.append(t_label) return np.array(res_data).swapaxes(1, 2), np.array(res_labels) - + train_X, train_y = extract_data(train_data) test_X, test_y = extract_data(test_data) - + scaler = StandardScaler() scaler.fit(train_X.reshape(-1, train_X.shape[-1])) train_X = scaler.transform(train_X.reshape(-1, train_X.shape[-1])).reshape(train_X.shape) test_X = scaler.transform(test_X.reshape(-1, 
test_X.shape[-1])).reshape(test_X.shape) - + labels = np.unique(train_y) transform = { k : i for i, k in enumerate(labels)} train_y = np.vectorize(transform.get)(train_y) test_y = np.vectorize(transform.get)(test_y) return train_X, train_y, test_X, test_y - - + + def load_forecast_npy(name, univar=False): - data = np.load(f'datasets/{name}.npy') + data = np.load(f'datasets/{name}.npy') if univar: data = data[: -1:] - + train_slice = slice(None, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) test_slice = slice(int(0.8 * len(data)), None) - + scaler = StandardScaler().fit(data[train_slice]) data = scaler.transform(data) data = np.expand_dims(data, 0) @@ -135,10 +135,32 @@ def _get_time_features(dt): def load_forecast_csv(name, univar=False): - data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + """ + Loads and processes time series data for forecasting tasks, supporting both the pre-processed + Online Retail II dataset and existing datasets like ETTh1, ETTm1, and electricity. + + Parameters: + name (str): The name of the dataset file (without the .csv extension). + univar (bool): Whether to load the univariate version of the data. + + Returns: + tuple: data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols + """ + # Try loading with 'date' as index, if it fails, try with 'InvoiceDate' + try: + data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + except ValueError: + data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True) + + # Ensure index is parsed as datetime + if not pd.api.types.is_datetime64_any_dtype(data.index): + data.index = pd.to_datetime(data.index) + + # Extract time features for the date/index column dt_embed = _get_time_features(data.index) n_covariate_cols = dt_embed.shape[-1] - + + # Handle univariate or multivariate cases if univar: if name in ('ETTh1', 'ETTh2', 'ETTm1', 'ETTm2'): data = data[['OT']] @@ -146,8 +168,15 @@ def load_forecast_csv(name, univar=False): data = data[['MT_001']] else: data = data.iloc[:, -1:] - + else: + # Online Retail II case + if name == 'ts2vec_online_retail_II_data': + data = data[['Quantity', 'Price']] + + # Convert data to numpy array data = data.to_numpy() + + # Define train, validation, and test splits based on dataset if name == 'ETTh1' or name == 'ETTh2': train_slice = slice(None, 12*30*24) valid_slice = slice(12*30*24, 16*30*24) @@ -157,35 +186,39 @@ def load_forecast_csv(name, univar=False): valid_slice = slice(12*30*24*4, 16*30*24*4) test_slice = slice(16*30*24*4, 20*30*24*4) else: + # Default case for custom datasets train_slice = slice(None, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) test_slice = slice(int(0.8 * len(data)), None) - + + # Normalise data scaler = StandardScaler().fit(data[train_slice]) data = scaler.transform(data) - if name in ('electricity'): + + # Reshape data based on dataset structure + if name in 'electricity': data = np.expand_dims(data.T, -1) # Each variable is an instance rather than a feature else: - data = np.expand_dims(data, 0) - + data = np.expand_dims(data, 0) # Single instance case + if n_covariate_cols > 0: dt_scaler = StandardScaler().fit(dt_embed[train_slice]) dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) - + if name in ('ETTh1', 'ETTh2', 'electricity'): pred_lens = [24, 48, 168, 336, 720] else: pred_lens = [24, 48, 
96, 288, 672] - + return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols def load_anomaly(name): res = pkl_load(f'datasets/{name}.pkl') return res['all_train_data'], res['all_train_labels'], res['all_train_timestamps'], \ - res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ - res['delay'] + res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ + res['delay'] def gen_ano_train_data(all_train_data): @@ -196,3 +229,130 @@ def gen_ano_train_data(all_train_data): pretrain_data.append(train_data) pretrain_data = np.expand_dims(np.stack(pretrain_data), 2) return pretrain_data + +#----------------------------------------------------------------------------- +def _get_time_features(dt): + return np.stack([ + dt.dayofweek.to_numpy(), # Day of the week + dt.day.to_numpy(), # Day of the month + dt.dayofyear.to_numpy(), # Day of the year + dt.month.to_numpy(), # Month + dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(), # Week of the year + ], axis=1).astype(np.float) + +# +# def load_online_retail(name='Cleaned_Online_Retail', agg_freq='D'): +# """ +# Loads and preprocesses the Online Retail dataset for forecasting tasks. +# +# Parameters: +# name (str): Name of the dataset file (without extension). +# agg_freq (str): Aggregation frequency (e.g., 'D' for daily, 'W' for weekly, '2W' for bi-weekly, 'M' for monthly). +# +# Returns: +# data (np.ndarray): Preprocessed time series data. +# train_slice (slice): Slice object for training data. +# valid_slice (slice): Slice object for validation data. +# test_slice (slice): Slice object for testing data. +# scaler (StandardScaler): Fitted scaler object. +# pred_lens (list): List of prediction lengths. +# n_covariate_cols (int): Number of covariate columns. +# """ +# +# # Load data +# file_path = f'datasets/UEA/{name}.csv' +# df = pd.read_csv(file_path, parse_dates=['InvoiceDate']) +# +# # Convert 'InvoiceDate' to datetime if not already done +# if not pd.api.types.is_datetime64_any_dtype(df['InvoiceDate']): +# df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']) +# +# # Set 'InvoiceDate' as index +# df.set_index('InvoiceDate', inplace=True) +# +# # Handle different aggregation frequencies +# if agg_freq == 'D': +# freq = 'D' +# elif agg_freq == 'W': +# freq = 'W' +# elif agg_freq == '2W': +# freq = '2W' +# elif agg_freq == 'M': +# freq = 'M' +# else: +# raise ValueError("Invalid agg_freq value. 
Use 'D', 'W', '2W', or 'M'.") +# +# # Aggregate quantity sold per specified frequency +# df_agg = df.resample(freq).sum()['Quantity'] +# +# # Handle missing values by filling with zeros +# df_agg = df_agg.fillna(0) +# +# # Generate time features +# time_features = _get_time_features(df_agg.index) +# n_covariate_cols = time_features.shape[1] +# +# # Convert to numpy array +# data = df_agg.values +# data = data.reshape(-1, 1) # Reshape to (timesteps, features) +# +# # Combine data with time features +# data = np.concatenate([time_features, data], axis=1) +# +# # Train/Validation/Test split +# total_length = len(data) +# train_size = int(total_length * 0.6) +# valid_size = int(total_length * 0.2) +# +# train_slice = slice(None, train_size) +# valid_slice = slice(train_size, train_size + valid_size) +# test_slice = slice(train_size + valid_size, None) +# +# # Scaling +# scaler = StandardScaler() +# data[train_slice] = scaler.fit_transform(data[train_slice]) +# data[valid_slice] = scaler.transform(data[valid_slice]) +# data[test_slice] = scaler.transform(data[test_slice]) +# +# # Reshape to (1, timesteps, features) as expected by TS2Vec +# data = data[np.newaxis, ...] +# +# # Define prediction lengths based on aggregation frequency +# if agg_freq == 'D': +# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week +# elif agg_freq == 'W': +# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) +# elif agg_freq == '2W': +# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) +# elif agg_freq == 'M': +# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) +# else: +# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") +# +# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols +# train_slice = slice(None, train_size) +# valid_slice = slice(train_size, train_size + valid_size) +# test_slice = slice(train_size + valid_size, None) +# +# # Scaling +# scaler = StandardScaler() +# data[train_slice] = scaler.fit_transform(data[train_slice]) +# data[valid_slice] = scaler.transform(data[valid_slice]) +# data[test_slice] = scaler.transform(data[test_slice]) +# +# # Reshape to (1, timesteps, features) as expected by TS2Vec +# data = data[np.newaxis, ...] +# +# # Define prediction lengths based on aggregation frequency +# if agg_freq == 'D': +# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week +# elif agg_freq == 'W': +# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) +# elif agg_freq == '2W': +# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) +# elif agg_freq == 'M': +# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) +# else: +# raise ValueError("Invalid agg_freq value. 
Use 'D', 'W', '2W', or 'M'.") +# +# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols diff --git a/tasks/__init__.py b/tasks/__init__.py index 5e38a07c..c1e1208e 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -1,3 +1,3 @@ from .classification import eval_classification -from .forecasting import eval_forecasting +from .forecasting import eval_forecasting, eval_forecasting_unsupervised from .anomaly_detection import eval_anomaly_detection, eval_anomaly_detection_coldstart diff --git a/tasks/forecasting.py b/tasks/forecasting.py index b3493d83..d2139cce 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -9,17 +9,17 @@ def generate_pred_samples(features, data, pred_len, drop=0): features = features[:, drop:] labels = labels[:, drop:] return features.reshape(-1, features.shape[-1]), \ - labels.reshape(-1, labels.shape[2]*labels.shape[3]) + labels.reshape(-1, labels.shape[2]*labels.shape[3]) def cal_metrics(pred, target): return { 'MSE': ((pred - target) ** 2).mean(), 'MAE': np.abs(pred - target).mean() } - + def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): padding = 200 - + t = time.time() all_repr = model.encode( data, @@ -29,15 +29,15 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, batch_size=256 ) ts2vec_infer_time = time.time() - t - + train_repr = all_repr[:, train_slice] valid_repr = all_repr[:, valid_slice] test_repr = all_repr[:, test_slice] - + train_data = data[:, train_slice, n_covariate_cols:] valid_data = data[:, valid_slice, n_covariate_cols:] test_data = data[:, test_slice, n_covariate_cols:] - + ours_result = {} lr_train_time = {} lr_infer_time = {} @@ -46,11 +46,11 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding) valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len) test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len) - + t = time.time() lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels) lr_train_time[pred_len] = time.time() - t - + t = time.time() test_pred = lr.predict(test_features) lr_infer_time[pred_len] = time.time() - t @@ -58,14 +58,14 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] test_pred = test_pred.reshape(ori_shape) test_labels = test_labels.reshape(ori_shape) - + if test_data.shape[0] > 1: test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3) test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3) else: test_pred_inv = scaler.inverse_transform(test_pred) test_labels_inv = scaler.inverse_transform(test_labels) - + out_log[pred_len] = { 'norm': test_pred, 'raw': test_pred_inv, @@ -76,7 +76,7 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, 'norm': cal_metrics(test_pred, test_labels), 'raw': cal_metrics(test_pred_inv, test_labels_inv) } - + eval_res = { 'ours': ours_result, 'ts2vec_infer_time': ts2vec_infer_time, @@ -84,3 +84,33 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, 'lr_infer_time': lr_infer_time } return out_log, eval_res + +def eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): + padding 
= 200 + + t = time.time() + all_repr = model.encode( + data, + causal=True, + sliding_length=1, + sliding_padding=padding, + batch_size=256 + ) + ts2vec_infer_time = time.time() - t + + # Extract representations (embeddings) for each time slice + train_repr = all_repr[:, train_slice] + valid_repr = all_repr[:, valid_slice] + test_repr = all_repr[:, test_slice] + + # You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization) + + eval_res = { + 'ts2vec_infer_time': ts2vec_infer_time, + 'train_repr_shape': train_repr.shape, + 'valid_repr_shape': valid_repr.shape, + 'test_repr_shape': test_repr.shape + } + + return eval_res + diff --git a/train.py b/train.py index ecc30a60..916f2a3e 100644 --- a/train.py +++ b/train.py @@ -11,8 +11,8 @@ from utils import init_dl_program, name_with_datetime, pkl_save, data_dropout def save_checkpoint_callback( - save_every=1, - unit='epoch' + save_every=1, + unit='epoch' ): assert unit in ('epoch', 'iter') def callback(model, loss): @@ -39,55 +39,55 @@ def callback(model, loss): parser.add_argument('--eval', action="store_true", help='Whether to perform evaluation after training') parser.add_argument('--irregular', type=float, default=0, help='The ratio of missing observations (defaults to 0)') args = parser.parse_args() - + print("Dataset:", args.dataset) print("Arguments:", str(args)) - + device = init_dl_program(args.gpu, seed=args.seed, max_threads=args.max_threads) - + print('Loading data... ', end='') if args.loader == 'UCR': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UCR(args.dataset) - + elif args.loader == 'UEA': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UEA(args.dataset) - + elif args.loader == 'forecast_csv': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_csv_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'anomaly': task_type = 'anomaly_detection' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data = datautils.gen_ano_train_data(all_train_data) - + elif args.loader == 'anomaly_coldstart': task_type = 'anomaly_detection_coldstart' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data, _, _, _ = datautils.load_UCR('FordA') - + else: raise ValueError(f"Unknown loader {args.loader}.") - - + + if args.irregular > 0: if task_type == 'classification': train_data = data_dropout(train_data, args.irregular) @@ -95,23 +95,23 @@ def callback(model, loss): else: raise 
ValueError(f"Task type {task_type} is not supported when irregular>0.") print('done') - + config = dict( batch_size=args.batch_size, lr=args.lr, output_dims=args.repr_dims, max_train_length=args.max_train_length ) - + if args.save_every is not None: unit = 'epoch' if args.epochs is not None else 'iter' config[f'after_{unit}_callback'] = save_checkpoint_callback(args.save_every, unit) run_dir = 'training/' + args.dataset + '__' + name_with_datetime(args.run_name) os.makedirs(run_dir, exist_ok=True) - + t = time.time() - + model = TS2Vec( input_dims=train_data.shape[-1], device=device, @@ -129,18 +129,29 @@ def callback(model, loss): print(f"\nTraining time: {datetime.timedelta(seconds=t)}\n") if args.eval: + out = None # Initialise out as None to avoid the NameError in the case of unsupervised tasks + if task_type == 'classification': out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm') elif task_type == 'forecasting': - out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) + # add case for unsupervised evaluation on Online Retail dataset + if args.dataset == 'ts2vec_online_retail_II_data': + eval_res = tasks.eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols) + else: + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) elif task_type == 'anomaly_detection': out, eval_res = tasks.eval_anomaly_detection(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) elif task_type == 'anomaly_detection_coldstart': out, eval_res = tasks.eval_anomaly_detection_coldstart(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) else: assert False - pkl_save(f'{run_dir}/out.pkl', out) + + # Save only eval_res when 'out' is None (i.e., for unsupervised tasks) + if out is not None: + pkl_save(f'{run_dir}/out.pkl', out) + pkl_save(f'{run_dir}/eval_res.pkl', eval_res) print('Evaluation result:', eval_res) print("Finished.") + From 8852d6861e4a493ad6a15ebb85f8c0b6c492e150 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 9 Sep 2024 00:42:52 +0100 Subject: [PATCH 03/10] TS2Vec is now able to process the Online Retail II dataset using an editing version of load_forecasting_csv. It can also evaluate this dataset with a new function eval_forecasting_unsupervised. 
Also includes additional utility functions and configuration --- .gitignore | 15 ++++ .idea/misc.xml | 10 +++ .idea/ts2vec.iml | 7 ++ .idea/vcs.xml | 6 ++ .idea/workspace.xml | 138 +++++++++++++++++++++++++++++ datautils.py | 202 ++++++++++++++++++++++++++++++++++++++----- requirements.txt | 1 + tasks/__init__.py | 2 +- tasks/forecasting.py | 52 ++++++++--- train.py | 53 +++++++----- 10 files changed, 432 insertions(+), 54 deletions(-) create mode 100644 .idea/misc.xml create mode 100644 .idea/ts2vec.iml create mode 100644 .idea/vcs.xml create mode 100644 .idea/workspace.xml diff --git a/.gitignore b/.gitignore index 78a0cc4e..9d59144c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,21 @@ +# Ignore dataset files that are not Python /datasets/*[!.py] + +# Ignore Jupyter notebook checkpoints .ipynb_checkpoints + +# Ignore Python cache files __pycache__ + +# Ignore everything in the training folder except .py files training/* +!training/*.py + +# Ignore specific nohup files in the scripts folder scripts/_nohup + +# Ignore rsync filter file .rsync-filter + +# Ignore the ts2vec_env directory (virtual environment) +ts2vec_env/ \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..feccba82 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,10 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/ts2vec.iml b/.idea/ts2vec.iml new file mode 100644 index 00000000..ec63674c --- /dev/null +++ b/.idea/ts2vec.iml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..35eb1ddf --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 00000000..ef09f3fa --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,138 @@ + + + + + + + + + + + + + + + + { + "lastFilter": { + "state": "OPEN", + "assignee": "jumairamiller" + } +} + { + "selectedUrlAndAccountId": { + "url": "https://github.com/jumairamiller/ts2vec.git", + "accountId": "1c182f61-b702-458b-8f9a-03f9dbad0d94" + }, + "recentNewPullRequestHead": { + "server": { + "useHttp": false, + "host": "github.com", + "port": null, + "suffix": null + }, + "owner": "jumairamiller", + "repository": "ts2vec" + } +} + + + { + "associatedIndex": 3 +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/datautils.py b/datautils.py index f8cfddd7..c618fb66 100644 --- a/datautils.py +++ b/datautils.py @@ -68,7 +68,7 @@ def load_UCR(dataset): 'UMD' ]: return train[..., np.newaxis], train_labels, test[..., np.newaxis], test_labels - + mean = np.nanmean(train) std = np.nanstd(train) train = (train - mean) / std @@ -79,7 +79,7 @@ def load_UCR(dataset): def load_UEA(dataset): train_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TRAIN.arff')[0] test_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TEST.arff')[0] - + def extract_data(data): res_data = [] res_labels = [] @@ -89,31 +89,31 @@ def extract_data(data): res_data.append(t_data) res_labels.append(t_label) return np.array(res_data).swapaxes(1, 2), np.array(res_labels) - + train_X, train_y = extract_data(train_data) test_X, test_y = extract_data(test_data) - + scaler = StandardScaler() scaler.fit(train_X.reshape(-1, train_X.shape[-1])) train_X = scaler.transform(train_X.reshape(-1, train_X.shape[-1])).reshape(train_X.shape) test_X = scaler.transform(test_X.reshape(-1, test_X.shape[-1])).reshape(test_X.shape) - + labels = 
np.unique(train_y) transform = { k : i for i, k in enumerate(labels)} train_y = np.vectorize(transform.get)(train_y) test_y = np.vectorize(transform.get)(test_y) return train_X, train_y, test_X, test_y - - + + def load_forecast_npy(name, univar=False): - data = np.load(f'datasets/{name}.npy') + data = np.load(f'datasets/{name}.npy') if univar: data = data[: -1:] - + train_slice = slice(None, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) test_slice = slice(int(0.8 * len(data)), None) - + scaler = StandardScaler().fit(data[train_slice]) data = scaler.transform(data) data = np.expand_dims(data, 0) @@ -135,10 +135,32 @@ def _get_time_features(dt): def load_forecast_csv(name, univar=False): - data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + """ + Loads and processes time series data for forecasting tasks, supporting both the pre-processed + Online Retail II dataset and existing datasets like ETTh1, ETTm1, and electricity. + + Parameters: + name (str): The name of the dataset file (without the .csv extension). + univar (bool): Whether to load the univariate version of the data. + + Returns: + tuple: data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols + """ + # Try loading with 'date' as index, if it fails, try with 'InvoiceDate' + try: + data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + except ValueError: + data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True) + + # Ensure index is parsed as datetime + if not pd.api.types.is_datetime64_any_dtype(data.index): + data.index = pd.to_datetime(data.index) + + # Extract time features for the date/index column dt_embed = _get_time_features(data.index) n_covariate_cols = dt_embed.shape[-1] - + + # Handle univariate or multivariate cases if univar: if name in ('ETTh1', 'ETTh2', 'ETTm1', 'ETTm2'): data = data[['OT']] @@ -146,8 +168,15 @@ def load_forecast_csv(name, univar=False): data = data[['MT_001']] else: data = data.iloc[:, -1:] - + else: + # Online Retail II case + if name == 'ts2vec_online_retail_II_data': + data = data[['Quantity', 'Price']] + + # Convert data to numpy array data = data.to_numpy() + + # Define train, validation, and test splits based on dataset if name == 'ETTh1' or name == 'ETTh2': train_slice = slice(None, 12*30*24) valid_slice = slice(12*30*24, 16*30*24) @@ -157,35 +186,39 @@ def load_forecast_csv(name, univar=False): valid_slice = slice(12*30*24*4, 16*30*24*4) test_slice = slice(16*30*24*4, 20*30*24*4) else: + # Default case for custom datasets train_slice = slice(None, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) test_slice = slice(int(0.8 * len(data)), None) - + + # Normalise data scaler = StandardScaler().fit(data[train_slice]) data = scaler.transform(data) - if name in ('electricity'): + + # Reshape data based on dataset structure + if name in 'electricity': data = np.expand_dims(data.T, -1) # Each variable is an instance rather than a feature else: - data = np.expand_dims(data, 0) - + data = np.expand_dims(data, 0) # Single instance case + if n_covariate_cols > 0: dt_scaler = StandardScaler().fit(dt_embed[train_slice]) dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) - + if name in ('ETTh1', 'ETTh2', 'electricity'): pred_lens = [24, 48, 168, 336, 720] else: pred_lens = [24, 48, 96, 288, 672] - + return data, train_slice, 
valid_slice, test_slice, scaler, pred_lens, n_covariate_cols def load_anomaly(name): res = pkl_load(f'datasets/{name}.pkl') return res['all_train_data'], res['all_train_labels'], res['all_train_timestamps'], \ - res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ - res['delay'] + res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ + res['delay'] def gen_ano_train_data(all_train_data): @@ -196,3 +229,130 @@ def gen_ano_train_data(all_train_data): pretrain_data.append(train_data) pretrain_data = np.expand_dims(np.stack(pretrain_data), 2) return pretrain_data + +#----------------------------------------------------------------------------- +def _get_time_features(dt): + return np.stack([ + dt.dayofweek.to_numpy(), # Day of the week + dt.day.to_numpy(), # Day of the month + dt.dayofyear.to_numpy(), # Day of the year + dt.month.to_numpy(), # Month + dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(), # Week of the year + ], axis=1).astype(np.float) + +# +# def load_online_retail(name='Cleaned_Online_Retail', agg_freq='D'): +# """ +# Loads and preprocesses the Online Retail dataset for forecasting tasks. +# +# Parameters: +# name (str): Name of the dataset file (without extension). +# agg_freq (str): Aggregation frequency (e.g., 'D' for daily, 'W' for weekly, '2W' for bi-weekly, 'M' for monthly). +# +# Returns: +# data (np.ndarray): Preprocessed time series data. +# train_slice (slice): Slice object for training data. +# valid_slice (slice): Slice object for validation data. +# test_slice (slice): Slice object for testing data. +# scaler (StandardScaler): Fitted scaler object. +# pred_lens (list): List of prediction lengths. +# n_covariate_cols (int): Number of covariate columns. +# """ +# +# # Load data +# file_path = f'datasets/UEA/{name}.csv' +# df = pd.read_csv(file_path, parse_dates=['InvoiceDate']) +# +# # Convert 'InvoiceDate' to datetime if not already done +# if not pd.api.types.is_datetime64_any_dtype(df['InvoiceDate']): +# df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']) +# +# # Set 'InvoiceDate' as index +# df.set_index('InvoiceDate', inplace=True) +# +# # Handle different aggregation frequencies +# if agg_freq == 'D': +# freq = 'D' +# elif agg_freq == 'W': +# freq = 'W' +# elif agg_freq == '2W': +# freq = '2W' +# elif agg_freq == 'M': +# freq = 'M' +# else: +# raise ValueError("Invalid agg_freq value. 
Use 'D', 'W', '2W', or 'M'.") +# +# # Aggregate quantity sold per specified frequency +# df_agg = df.resample(freq).sum()['Quantity'] +# +# # Handle missing values by filling with zeros +# df_agg = df_agg.fillna(0) +# +# # Generate time features +# time_features = _get_time_features(df_agg.index) +# n_covariate_cols = time_features.shape[1] +# +# # Convert to numpy array +# data = df_agg.values +# data = data.reshape(-1, 1) # Reshape to (timesteps, features) +# +# # Combine data with time features +# data = np.concatenate([time_features, data], axis=1) +# +# # Train/Validation/Test split +# total_length = len(data) +# train_size = int(total_length * 0.6) +# valid_size = int(total_length * 0.2) +# +# train_slice = slice(None, train_size) +# valid_slice = slice(train_size, train_size + valid_size) +# test_slice = slice(train_size + valid_size, None) +# +# # Scaling +# scaler = StandardScaler() +# data[train_slice] = scaler.fit_transform(data[train_slice]) +# data[valid_slice] = scaler.transform(data[valid_slice]) +# data[test_slice] = scaler.transform(data[test_slice]) +# +# # Reshape to (1, timesteps, features) as expected by TS2Vec +# data = data[np.newaxis, ...] +# +# # Define prediction lengths based on aggregation frequency +# if agg_freq == 'D': +# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week +# elif agg_freq == 'W': +# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) +# elif agg_freq == '2W': +# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) +# elif agg_freq == 'M': +# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) +# else: +# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") +# +# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols +# train_slice = slice(None, train_size) +# valid_slice = slice(train_size, train_size + valid_size) +# test_slice = slice(train_size + valid_size, None) +# +# # Scaling +# scaler = StandardScaler() +# data[train_slice] = scaler.fit_transform(data[train_slice]) +# data[valid_slice] = scaler.transform(data[valid_slice]) +# data[test_slice] = scaler.transform(data[test_slice]) +# +# # Reshape to (1, timesteps, features) as expected by TS2Vec +# data = data[np.newaxis, ...] +# +# # Define prediction lengths based on aggregation frequency +# if agg_freq == 'D': +# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week +# elif agg_freq == 'W': +# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) +# elif agg_freq == '2W': +# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) +# elif agg_freq == 'M': +# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) +# else: +# raise ValueError("Invalid agg_freq value. 
Use 'D', 'W', '2W', or 'M'.") +# +# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols diff --git a/requirements.txt b/requirements.txt index 9cc27049..d5ee03d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ numpy==1.19.2 statsmodels==0.12.2 pandas==1.0.1 scikit_learn==0.24.2 +xlrd==1.2.0 diff --git a/tasks/__init__.py b/tasks/__init__.py index 5e38a07c..c1e1208e 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -1,3 +1,3 @@ from .classification import eval_classification -from .forecasting import eval_forecasting +from .forecasting import eval_forecasting, eval_forecasting_unsupervised from .anomaly_detection import eval_anomaly_detection, eval_anomaly_detection_coldstart diff --git a/tasks/forecasting.py b/tasks/forecasting.py index b3493d83..d2139cce 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -9,17 +9,17 @@ def generate_pred_samples(features, data, pred_len, drop=0): features = features[:, drop:] labels = labels[:, drop:] return features.reshape(-1, features.shape[-1]), \ - labels.reshape(-1, labels.shape[2]*labels.shape[3]) + labels.reshape(-1, labels.shape[2]*labels.shape[3]) def cal_metrics(pred, target): return { 'MSE': ((pred - target) ** 2).mean(), 'MAE': np.abs(pred - target).mean() } - + def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): padding = 200 - + t = time.time() all_repr = model.encode( data, @@ -29,15 +29,15 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, batch_size=256 ) ts2vec_infer_time = time.time() - t - + train_repr = all_repr[:, train_slice] valid_repr = all_repr[:, valid_slice] test_repr = all_repr[:, test_slice] - + train_data = data[:, train_slice, n_covariate_cols:] valid_data = data[:, valid_slice, n_covariate_cols:] test_data = data[:, test_slice, n_covariate_cols:] - + ours_result = {} lr_train_time = {} lr_infer_time = {} @@ -46,11 +46,11 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding) valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len) test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len) - + t = time.time() lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels) lr_train_time[pred_len] = time.time() - t - + t = time.time() test_pred = lr.predict(test_features) lr_infer_time[pred_len] = time.time() - t @@ -58,14 +58,14 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] test_pred = test_pred.reshape(ori_shape) test_labels = test_labels.reshape(ori_shape) - + if test_data.shape[0] > 1: test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3) test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3) else: test_pred_inv = scaler.inverse_transform(test_pred) test_labels_inv = scaler.inverse_transform(test_labels) - + out_log[pred_len] = { 'norm': test_pred, 'raw': test_pred_inv, @@ -76,7 +76,7 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, 'norm': cal_metrics(test_pred, test_labels), 'raw': cal_metrics(test_pred_inv, test_labels_inv) } - + eval_res = { 'ours': ours_result, 'ts2vec_infer_time': ts2vec_infer_time, @@ -84,3 +84,33 @@ def eval_forecasting(model, data, 
train_slice, valid_slice, test_slice, scaler, 'lr_infer_time': lr_infer_time } return out_log, eval_res + +def eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): + padding = 200 + + t = time.time() + all_repr = model.encode( + data, + causal=True, + sliding_length=1, + sliding_padding=padding, + batch_size=256 + ) + ts2vec_infer_time = time.time() - t + + # Extract representations (embeddings) for each time slice + train_repr = all_repr[:, train_slice] + valid_repr = all_repr[:, valid_slice] + test_repr = all_repr[:, test_slice] + + # You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization) + + eval_res = { + 'ts2vec_infer_time': ts2vec_infer_time, + 'train_repr_shape': train_repr.shape, + 'valid_repr_shape': valid_repr.shape, + 'test_repr_shape': test_repr.shape + } + + return eval_res + diff --git a/train.py b/train.py index ecc30a60..916f2a3e 100644 --- a/train.py +++ b/train.py @@ -11,8 +11,8 @@ from utils import init_dl_program, name_with_datetime, pkl_save, data_dropout def save_checkpoint_callback( - save_every=1, - unit='epoch' + save_every=1, + unit='epoch' ): assert unit in ('epoch', 'iter') def callback(model, loss): @@ -39,55 +39,55 @@ def callback(model, loss): parser.add_argument('--eval', action="store_true", help='Whether to perform evaluation after training') parser.add_argument('--irregular', type=float, default=0, help='The ratio of missing observations (defaults to 0)') args = parser.parse_args() - + print("Dataset:", args.dataset) print("Arguments:", str(args)) - + device = init_dl_program(args.gpu, seed=args.seed, max_threads=args.max_threads) - + print('Loading data... ', end='') if args.loader == 'UCR': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UCR(args.dataset) - + elif args.loader == 'UEA': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UEA(args.dataset) - + elif args.loader == 'forecast_csv': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_csv_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'anomaly': task_type = 'anomaly_detection' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data = datautils.gen_ano_train_data(all_train_data) - + elif args.loader == 'anomaly_coldstart': task_type = 'anomaly_detection_coldstart' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data, _, _, _ = datautils.load_UCR('FordA') - + else: raise 
ValueError(f"Unknown loader {args.loader}.") - - + + if args.irregular > 0: if task_type == 'classification': train_data = data_dropout(train_data, args.irregular) @@ -95,23 +95,23 @@ def callback(model, loss): else: raise ValueError(f"Task type {task_type} is not supported when irregular>0.") print('done') - + config = dict( batch_size=args.batch_size, lr=args.lr, output_dims=args.repr_dims, max_train_length=args.max_train_length ) - + if args.save_every is not None: unit = 'epoch' if args.epochs is not None else 'iter' config[f'after_{unit}_callback'] = save_checkpoint_callback(args.save_every, unit) run_dir = 'training/' + args.dataset + '__' + name_with_datetime(args.run_name) os.makedirs(run_dir, exist_ok=True) - + t = time.time() - + model = TS2Vec( input_dims=train_data.shape[-1], device=device, @@ -129,18 +129,29 @@ def callback(model, loss): print(f"\nTraining time: {datetime.timedelta(seconds=t)}\n") if args.eval: + out = None # Initialise out as None to avoid the NameError in the case of unsupervised tasks + if task_type == 'classification': out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm') elif task_type == 'forecasting': - out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) + # add case for unsupervised evaluation on Online Retail dataset + if args.dataset == 'ts2vec_online_retail_II_data': + eval_res = tasks.eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols) + else: + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) elif task_type == 'anomaly_detection': out, eval_res = tasks.eval_anomaly_detection(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) elif task_type == 'anomaly_detection_coldstart': out, eval_res = tasks.eval_anomaly_detection_coldstart(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) else: assert False - pkl_save(f'{run_dir}/out.pkl', out) + + # Save only eval_res when 'out' is None (i.e., for unsupervised tasks) + if out is not None: + pkl_save(f'{run_dir}/out.pkl', out) + pkl_save(f'{run_dir}/eval_res.pkl', eval_res) print('Evaluation result:', eval_res) print("Finished.") + From ac644d4d5ef3239a2dc70c3ab803f2e6bad24808 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Sun, 22 Sep 2024 22:49:56 +0100 Subject: [PATCH 04/10] ts2vec now run with ts2vec_online_retail_II_data using forecast_csv as loader, whilst also generating MSE and MAE (metrics were previously not being calculated correctly becasue of incompatible data reshaping) --- datautils.py | 226 ++++++++++++++++++--------------------- models/encoder.py | 2 +- requirements.txt | 3 + tasks/_eval_protocols.py | 12 +++ tasks/forecasting.py | 23 +++- train.py | 115 +++++++++++++++++--- 6 files changed, 240 insertions(+), 141 deletions(-) diff --git a/datautils.py b/datautils.py index c618fb66..2c12e3ae 100644 --- a/datautils.py +++ b/datautils.py @@ -5,6 +5,9 @@ import random from datetime import datetime import pickle + +import torch + from utils import pkl_load, pad_nan_to_target from scipy.io.arff import loadarff from sklearn.preprocessing import StandardScaler, MinMaxScaler @@ -146,7 +149,7 @@ def load_forecast_csv(name, univar=False): Returns: tuple: data, train_slice, valid_slice, 
test_slice, scaler, pred_lens, n_covariate_cols """ - # Try loading with 'date' as index, if it fails, try with 'InvoiceDate' + # Try loading with 'date' as index, if it fails, try with 'InvoiceDate' to load online retail data try: data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) except ValueError: @@ -168,10 +171,6 @@ def load_forecast_csv(name, univar=False): data = data[['MT_001']] else: data = data.iloc[:, -1:] - else: - # Online Retail II case - if name == 'ts2vec_online_retail_II_data': - data = data[['Quantity', 'Price']] # Convert data to numpy array data = data.to_numpy() @@ -186,13 +185,18 @@ def load_forecast_csv(name, univar=False): valid_slice = slice(12*30*24*4, 16*30*24*4) test_slice = slice(16*30*24*4, 20*30*24*4) else: - # Default case for custom datasets - train_slice = slice(None, int(0.6 * len(data))) + # Default case for other or custom datasets + train_slice = slice(0, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) - test_slice = slice(int(0.8 * len(data)), None) + test_slice = slice(int(0.8 * len(data)), len(data)) # Normalise data - scaler = StandardScaler().fit(data[train_slice]) + scaler = None + if name == 'ts2vec_online_retail_II_data': + scaler = MinMaxScaler().fit(data[train_slice]) + else: + scaler = StandardScaler().fit(data[train_slice]) + data = scaler.transform(data) # Reshape data based on dataset structure @@ -202,12 +206,22 @@ def load_forecast_csv(name, univar=False): data = np.expand_dims(data, 0) # Single instance case if n_covariate_cols > 0: - dt_scaler = StandardScaler().fit(dt_embed[train_slice]) + if name == 'ts2vec_online_retail_II_data': + dt_scaler = MinMaxScaler().fit(dt_embed[train_slice]) + else: + dt_scaler = StandardScaler().fit(dt_embed[train_slice]) dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) + # Concatenating the time embeddings to the data + ''' NOTE: + The np.repeat(dt_embed, data.shape[0], axis=0) function is used to repeat the time embeddings + for each instance only in the case of the 'electricity' dataset. This ensures that the time + embeddings are correctly aligned with the data instances''' data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) if name in ('ETTh1', 'ETTh2', 'electricity'): pred_lens = [24, 48, 168, 336, 720] + elif name == 'ts2vec_online_retail_II_data': + pred_lens = [1, 2, 3, 4, 5] else: pred_lens = [24, 48, 96, 288, 672] @@ -240,119 +254,89 @@ def _get_time_features(dt): dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(), # Week of the year ], axis=1).astype(np.float) -# -# def load_online_retail(name='Cleaned_Online_Retail', agg_freq='D'): + +def load_online_retail(name, repr_dims): # """ # Loads and preprocesses the Online Retail dataset for forecasting tasks. -# -# Parameters: -# name (str): Name of the dataset file (without extension). -# agg_freq (str): Aggregation frequency (e.g., 'D' for daily, 'W' for weekly, '2W' for bi-weekly, 'M' for monthly). +# Ensures both Price, Quantity, and customer embeddings are included throughout. """ # # Returns: -# data (np.ndarray): Preprocessed time series data. -# train_slice (slice): Slice object for training data. -# valid_slice (slice): Slice object for validation data. -# test_slice (slice): Slice object for testing data. -# scaler (StandardScaler): Fitted scaler object. -# pred_lens (list): List of prediction lengths. -# n_covariate_cols (int): Number of covariate columns. 
+# train_data: Dictionary mapping customer_id to their training data (DataFrame). +# valid_data: Dictionary mapping customer_id to their validation data (DataFrame). +# test_data: Dictionary mapping customer_id to their test data (DataFrame). +# customer_embeddings: Fixed embeddings for each customer ID. +# customer_id_to_index: Mapping from customer IDs to embedding indices. +# scaler: Fitted scaler for data normalization. # """ -# -# # Load data -# file_path = f'datasets/UEA/{name}.csv' -# df = pd.read_csv(file_path, parse_dates=['InvoiceDate']) -# -# # Convert 'InvoiceDate' to datetime if not already done -# if not pd.api.types.is_datetime64_any_dtype(df['InvoiceDate']): -# df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']) -# -# # Set 'InvoiceDate' as index -# df.set_index('InvoiceDate', inplace=True) -# -# # Handle different aggregation frequencies -# if agg_freq == 'D': -# freq = 'D' -# elif agg_freq == 'W': -# freq = 'W' -# elif agg_freq == '2W': -# freq = '2W' -# elif agg_freq == 'M': -# freq = 'M' -# else: -# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") -# -# # Aggregate quantity sold per specified frequency -# df_agg = df.resample(freq).sum()['Quantity'] -# -# # Handle missing values by filling with zeros -# df_agg = df_agg.fillna(0) -# -# # Generate time features -# time_features = _get_time_features(df_agg.index) -# n_covariate_cols = time_features.shape[1] -# -# # Convert to numpy array -# data = df_agg.values -# data = data.reshape(-1, 1) # Reshape to (timesteps, features) -# -# # Combine data with time features -# data = np.concatenate([time_features, data], axis=1) -# -# # Train/Validation/Test split -# total_length = len(data) -# train_size = int(total_length * 0.6) -# valid_size = int(total_length * 0.2) -# -# train_slice = slice(None, train_size) -# valid_slice = slice(train_size, train_size + valid_size) -# test_slice = slice(train_size + valid_size, None) -# -# # Scaling -# scaler = StandardScaler() -# data[train_slice] = scaler.fit_transform(data[train_slice]) -# data[valid_slice] = scaler.transform(data[valid_slice]) -# data[test_slice] = scaler.transform(data[test_slice]) -# -# # Reshape to (1, timesteps, features) as expected by TS2Vec -# data = data[np.newaxis, ...] -# -# # Define prediction lengths based on aggregation frequency -# if agg_freq == 'D': -# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week -# elif agg_freq == 'W': -# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) -# elif agg_freq == '2W': -# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) -# elif agg_freq == 'M': -# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) -# else: -# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") -# -# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols -# train_slice = slice(None, train_size) -# valid_slice = slice(train_size, train_size + valid_size) -# test_slice = slice(train_size + valid_size, None) -# -# # Scaling -# scaler = StandardScaler() -# data[train_slice] = scaler.fit_transform(data[train_slice]) -# data[valid_slice] = scaler.transform(data[valid_slice]) -# data[test_slice] = scaler.transform(data[test_slice]) -# -# # Reshape to (1, timesteps, features) as expected by TS2Vec -# data = data[np.newaxis, ...] 
-# -# # Define prediction lengths based on aggregation frequency -# if agg_freq == 'D': -# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week -# elif agg_freq == 'W': -# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) -# elif agg_freq == '2W': -# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) -# elif agg_freq == 'M': -# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) -# else: -# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") -# -# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols + + # Load data + data = pd.read_csv(f'datasets/{name}.csv', parse_dates=['InvoiceDate']) + + # Convert 'InvoiceDate' to datetime if not already done + if not pd.api.types.is_datetime64_any_dtype(data['InvoiceDate']): + data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate']) + + # Sort data by 'InvoiceDate' + data.sort_values(by='InvoiceDate', inplace=True) + + # Rename 'Customer ID' to 'CustomerID' + if 'Customer ID' in data.columns: + data.rename(columns={'Customer ID': 'CustomerID'}, inplace=True) + + if 'CustomerID' not in data.columns: + raise KeyError("The 'CustomerID' column is missing from the dataset.") + + # Extract customerIDs and create numerical mapping + customer_ids = data['CustomerID'].unique() + customer_id_to_index = {cid: idx for idx, cid in enumerate(customer_ids)} + + # Create fixed embeddings for each unique customer ID + customer_embeddings = torch.nn.Embedding(len(customer_ids), repr_dims) + customer_embeddings.weight.requires_grad = False # Fixed embeddings + + # Map customer IDs to their embeddings + customer_embeddings_tensor = customer_embeddings(torch.tensor(data['CustomerID'].map(customer_id_to_index).values)) + + # Store customer embeddings as a new column 'customer_embed' in the DataFrame + data['customer_embed'] = list(customer_embeddings_tensor.detach().numpy()) + + # Group by CustomerID + customer_data = data.groupby('CustomerID') + + # Split the data into train, valid, and test sets + train_data = {} + valid_data = {} + test_data = {} + + '''Split the data into train, valid, and test sets such that valid set includes second-last transaction + of each customerID and test set includes last transaction of each customerID''' + for customer_id, customer_df in customer_data: + if len(customer_df) >= 3: + train_data[customer_id] = customer_df.iloc[: -2] + valid_data[customer_id] = customer_df.iloc[-2 : -1] + test_data[customer_id] = customer_df.iloc[-1 :] + + return train_data, valid_data, test_data, customer_embeddings + + + + # # Filter customerIDs with at least 5 records + # customer_counts = data['CustomerID'].value_counts() + # valid_customers = customer_counts[customer_counts >= 5].index + # data = data[data['CustomerID'].isin(valid_customers)] + # + # # Group by CustomerID and sort by InvoiceDate + # grouped = data.groupby('CustomerID').apply(lambda x: x.sort_values('InvoiceDate')).reset_index(drop=True) + # + # # Create time series data for each CustomerID + # customer_data = {} + # for customer_id, group in grouped.groupby('CustomerID'): + # group.set_index('InvoiceDate', inplace=True) + # customer_data[customer_id] = group[['CustomerID', 'Quantity', 'Price']] + + # Set other forecasting parameters + # scaler = MinMaxScaler() # alternative to StandardScaler which avoid negative values + # pred_lens = [1, 2, 3] + # n_covariate_cols = 2 + # + # return customer_data diff 
--git a/models/encoder.py b/models/encoder.py index cedb13b0..c4aad5d5 100644 --- a/models/encoder.py +++ b/models/encoder.py @@ -41,7 +41,7 @@ def __init__(self, input_dims, output_dims, hidden_dims=64, depth=10, mask_mode= def forward(self, x, mask=None): # x: B x T x input_dims nan_mask = ~x.isnan().any(axis=-1) x[~nan_mask] = 0 - x = self.input_fc(x) # B x T x Ch + x = self.input_fc(x) # B x T x Ch NOTE: Ch = output_dims (I think...) # generate & apply mask if mask is None: diff --git a/requirements.txt b/requirements.txt index d5ee03d8..0173de5d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,6 @@ statsmodels==0.12.2 pandas==1.0.1 scikit_learn==0.24.2 xlrd==1.2.0 + +matplotlib==3.3.4 +scikit-learn~=0.24.2 \ No newline at end of file diff --git a/tasks/_eval_protocols.py b/tasks/_eval_protocols.py index e413bcf4..d0e9af8b 100644 --- a/tasks/_eval_protocols.py +++ b/tasks/_eval_protocols.py @@ -79,6 +79,18 @@ def fit_knn(features, y): return pipe def fit_ridge(train_features, train_y, valid_features, valid_y, MAX_SAMPLES=100000): + # Replace NaN and infinite values with 0 + train_features = np.nan_to_num(train_features, nan=0.0, posinf=0.0, neginf=0.0) + train_y = np.nan_to_num(train_y, nan=0.0, posinf=0.0, neginf=0.0) + valid_features = np.nan_to_num(valid_features, nan=0.0, posinf=0.0, neginf=0.0) + valid_y = np.nan_to_num(valid_y, nan=0.0, posinf=0.0, neginf=0.0) + + # Ensure values are within the range of float64 + train_features = np.clip(train_features, -1e308, 1e308) + train_y = np.clip(train_y, -1e308, 1e308) + valid_features = np.clip(valid_features, -1e308, 1e308) + valid_y = np.clip(valid_y, -1e308, 1e308) + # If the training set is too large, subsample MAX_SAMPLES examples if train_features.shape[0] > MAX_SAMPLES: split = train_test_split( diff --git a/tasks/forecasting.py b/tasks/forecasting.py index d2139cce..ac8faa78 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -17,7 +17,7 @@ def cal_metrics(pred, target): 'MAE': np.abs(pred - target).mean() } -def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): +def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, dataset_name = None): padding = 200 t = time.time() @@ -55,13 +55,28 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, test_pred = lr.predict(test_features) lr_infer_time[pred_len] = time.time() - t - ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] - test_pred = test_pred.reshape(ori_shape) - test_labels = test_labels.reshape(ori_shape) + if dataset_name =='ts2vec_online_retail_II_data': + test_pred = test_pred.reshape(-1, 2) + test_labels = test_labels.reshape(-1, 2) + else: + ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] + test_pred = test_pred.reshape(ori_shape) + test_labels = test_labels.reshape(ori_shape) if test_data.shape[0] > 1: test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3) test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3) + elif dataset_name == 'ts2vec_online_retail_II_data': + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) + test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + + # Remove NaN values from all datasets + valid_indices = ~np.isnan(test_pred).any(axis=1) & ~np.isnan(test_pred_inv).any(axis=1) & ~np.isnan(test_labels).any(axis=1) & 
~np.isnan(test_labels_inv).any(axis=1) + test_pred = test_pred[valid_indices] + test_pred_inv = test_pred_inv[valid_indices] + test_labels = test_labels[valid_indices] + test_labels_inv = test_labels_inv[valid_indices] + else: test_pred_inv = scaler.inverse_transform(test_pred) test_labels_inv = scaler.inverse_transform(test_labels) diff --git a/train.py b/train.py index 916f2a3e..ff43e247 100644 --- a/train.py +++ b/train.py @@ -1,10 +1,19 @@ +import config +import device +import pandas as pd import torch +from matplotlib import pyplot as plt +from scipy.ndimage import label +from torch import nn import numpy as np import argparse import os import sys import time import datetime + +from sklearn.preprocessing import MinMaxScaler + from ts2vec import TS2Vec import tasks import datautils @@ -45,6 +54,11 @@ def callback(model, loss): device = init_dl_program(args.gpu, seed=args.seed, max_threads=args.max_threads) + # Define variables outside the conditional block for online retail dataset + customer_embedding_layer = None + customer_id_to_index = None + + print('Loading data... ', end='') if args.loader == 'UCR': task_type = 'classification' @@ -58,6 +72,7 @@ def callback(model, loss): task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset) train_data = data[:, train_slice] + test_data = data[:, test_slice] elif args.loader == 'forecast_csv_univar': task_type = 'forecasting' @@ -84,10 +99,15 @@ def callback(model, loss): all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data, _, _, _ = datautils.load_UCR('FordA') + # elif args.loader == 'retail': + # task_type = 'forecasting' + # + # # Load the data + # train_data, valid_data, test_data, customer_embeddings = datautils.load_online_retail(args.dataset, repr_dims=args.repr_dims) + else: raise ValueError(f"Unknown loader {args.loader}.") - if args.irregular > 0: if task_type == 'classification': train_data = data_dropout(train_data, args.irregular) @@ -112,31 +132,96 @@ def callback(model, loss): t = time.time() - model = TS2Vec( - input_dims=train_data.shape[-1], - device=device, - **config - ) - loss_log = model.fit( - train_data, - n_epochs=args.epochs, - n_iters=args.iters, - verbose=True - ) + # Define the TS2Vec model + if args.loader == 'retail': + # Initialise the loss log fir tracking losses during training + loss_log = [] + # Initialise the model + model = TS2Vec( + input_dims=2, # We are using 'Quantity' and 'Price' as input dimensions + device=device, + **config, + use_customer_embs=True # Specify that we are using customer embeddings + ) + + # Training loop with customer ID fixed embeddings as input + for epoch in range(args.epochs): + '''For retail loader, `train_data`, `valid_data`, and `test_data` are dictionaries where each key is + a `customer_id` and the value is a DataFrame with shape `(n_transactions, n_features)`. 
The model will + train on 'data_array`, which is reshaped to `(n_instances, n_timestamps, n_dimensions)`.''' + epoch_loss = 0 + for customer_id, data in train_data.items(): + + + # Get the customer embedding + customer_idx = torch.tensor(customer_id_to_index[customer_id], device = device) + customer_embedding = customer_embedding_layer(customer_idx).unsqueeze(0).unsqueeze(0) + + # Convert data to the required format + data_array = data.to_numpy() + data_array = torch.tensor(data_array, dtype=torch.float32, device=device) # Ensure data_array is a torch.Tensor + + # Add the customer embedding to the data + data_array += customer_embedding + + # Convert back to NumPy array before training the model + data_array = data_array.cpu().numpy() + + # train the model + loss_log = model.fit(data_array, n_epochs=1, n_iters=args.iters, verbose=True) + else: + model = TS2Vec( + input_dims=train_data.shape[-1], + device=device, + **config + ) + # Training code + loss_log = model.fit( + train_data, + n_epochs=args.epochs, + n_iters=args.iters, + verbose=True + ) + model.save(f'{run_dir}/model.pkl') + # Plot the training loss log + plt.plot(loss_log, label='Training Loss') + plt.xlabel('Epoch') + plt.ylabel('Loss') + plt.title('Training Loss Over Epochs') + plt.grid(True) + plt.savefig(f'{run_dir}/loss_plot.png') + # Draw a straight line to show the general trend + z = np.polyfit(range(len(loss_log)), loss_log, 1) + p = np.poly1d(z) + plt.plot(range(len(loss_log)), p(range(len(loss_log))), "r--", label='Training Loss Trend') + plt.legend() + # Save the plot + plt.savefig(f'{run_dir}/train_loss_plot.png') + plt.show() + t = time.time() - t print(f"\nTraining time: {datetime.timedelta(seconds=t)}\n") if args.eval: - out = None # Initialise out as None to avoid the NameError in the case of unsupervised tasks + out = None # Initialise 'out' as None to avoid the NameError in the case of unsupervised tasks if task_type == 'classification': out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm') elif task_type == 'forecasting': # add case for unsupervised evaluation on Online Retail dataset if args.dataset == 'ts2vec_online_retail_II_data': - eval_res = tasks.eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols) + # # print data shapes and check for NaN or infinite values + # print("Data shape:", data.shape) + # print("Train slice:", train_slice) + # print("Valid slice:", valid_slice) + # print("Test slice:", test_slice) + # print("Scaler:", scaler) + # print("Prediction lengths:", pred_lens) + # print("Number of covariate columns:", n_covariate_cols) + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, args.dataset) + else: out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) elif task_type == 'anomaly_detection': @@ -153,5 +238,5 @@ def callback(model, loss): pkl_save(f'{run_dir}/eval_res.pkl', eval_res) print('Evaluation result:', eval_res) - print("Finished.") + print("Finished.") From b65984c944a7b57ac92f749c64e9e224b061fd24 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Sun, 22 Sep 2024 23:41:38 +0100 Subject: [PATCH 05/10] uploading preprocessing scripts for Online Retail II dataset --- .../stage1_online_retail_pre_processing.py | 80 +++++++++++++++++++ .../stage2_online_retail_pre_processing_.py | 67 ++++++++++++++++ 2 files changed, 147 
insertions(+) create mode 100644 datasets/stage1_online_retail_pre_processing.py create mode 100644 datasets/stage2_online_retail_pre_processing_.py diff --git a/datasets/stage1_online_retail_pre_processing.py b/datasets/stage1_online_retail_pre_processing.py new file mode 100644 index 00000000..eee5d36d --- /dev/null +++ b/datasets/stage1_online_retail_pre_processing.py @@ -0,0 +1,80 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import MinMaxScaler +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Step 1: Load and Combine Data from Excel Sheets +def load_and_combine_sheets(file_path, sheets): + """ + Load data from multiple Excel sheets and combine into a single DataFrame. + """ + combined_data = pd.DataFrame() + for sheet in sheets: + logging.info(f"Loading data from sheet: {sheet}") + sheet_data = pd.read_excel(file_path, sheet_name=sheet) + combined_data = pd.concat([combined_data, sheet_data], ignore_index=True) + logging.info("Data successfully loaded and combined.") + return combined_data + +# Step 2: Preprocess Data +def preprocess_data(data): + """ + Preprocess data by converting dates, normalizing numeric columns, and handling missing values. + """ + # Convert 'InvoiceDate' to datetime and ensure numerical consistency in key columns + logging.info("Preprocessing data: converting dates and normalizing numeric columns.") + data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce') + data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce') + data['Price'] = pd.to_numeric(data['Price'], errors='coerce') + data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce') + + # Remove rows where the Invoice starts with 'C' (canceled orders) + data = data[~data['Invoice'].astype(str).str.startswith('C')] + + # Drop rows with missing critical data + data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price']) + + # Normalize 'Quantity' and 'Price' using Min-Max scaling to insure values are positive + scaler = MinMaxScaler() + data[['Quantity', 'Price']] = scaler.fit_transform(data[['Quantity', 'Price']]) + logging.info("Data normalized and missing values handled.") + + return data + +# Step 3: Aggregate Data +def aggregate_data(data): + """ + Aggregate data by summing 'Quantity' and averaging 'Price' daily. 
+ """ + logging.info("Aggregating data by Date.") + # Group by Date, aggregating Quantity and Price + data_agg = data.groupby(pd.Grouper(key='InvoiceDate', freq='D')).agg({ + 'Quantity': 'sum', + 'Price': 'mean' + }).reset_index() + + logging.info("Data aggregation complete.") + return data_agg + +# Main Function to Run All Steps +def main(): + # File path and sheets to load + file_path = 'online_retail_II.xlsx' + sheets = ['Year 2009-2010', 'Year 2010-2011'] + + # Load and preprocess the data + combined_data = load_and_combine_sheets(file_path, sheets) + cleaned_data = preprocess_data(combined_data) + + # Aggregate the data + aggregated_data = aggregate_data(cleaned_data) + + # Save the final reshaped and adjusted data to CSV + aggregated_data.to_csv('ts2vec_online_retail_II_data.csv', index=False) + logging.info("Final data saved successfully.") + +if __name__ == "__main__": + main() diff --git a/datasets/stage2_online_retail_pre_processing_.py b/datasets/stage2_online_retail_pre_processing_.py new file mode 100644 index 00000000..f33962ef --- /dev/null +++ b/datasets/stage2_online_retail_pre_processing_.py @@ -0,0 +1,67 @@ +import pandas as pd +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Step 1: Load the original Online Retail II dataset +def load_data(file_path): + logging.info(f"Loading data from: {file_path}") + data = pd.read_excel(file_path) + logging.info(f"Data successfully loaded with {len(data)} records.") + return data + +# Step 2: Clean and preprocess the dataset +def preprocess_data(data): + logging.info("Preprocessing data: cleaning and handling missing values.") + # Convert 'InvoiceDate' to datetime and ensure numerical consistency + data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce') + data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce') + data['Price'] = pd.to_numeric(data['Price'], errors='coerce') + data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce') + + # Remove cancelled orders (invoices starting with 'C') + data = data[~data['Invoice'].str.startswith('C', na=False)] + + # Drop rows with missing values in key columns + data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price']) + + logging.info(f"Data cleaned. 
Remaining records: {len(data)}.") + return data + +# Step 3: Group by CustomerID and InvoiceNo +def group_by_customer_invoice(data): + logging.info("Grouping by Customer ID and Invoice Number.") + # Group by CustomerID and InvoiceNo to represent each invoice as a time series record + grouped = data.groupby(['Customer ID', 'Invoice']).agg({ + 'InvoiceDate': 'first', # First date of the invoice + 'Quantity': 'sum', # Sum of quantities in the invoice + 'Price': 'mean' # Average price in the invoice + }).reset_index() + + logging.info(f"Grouped data created with {len(grouped)} records.") + return grouped + +# Step 4: Save the restructured dataset +def save_data(grouped_data, output_file): + logging.info(f"Saving restructured data to {output_file}.") + grouped_data.to_csv(output_file, index=False) + logging.info("Data successfully saved.") + +# Main function to run the entire preprocessing pipeline +def main(): + file_path = 'online_retail_II.xlsx' + output_file = ('restructured_ts2vec_online_retail.csv') + + # Load and preprocess the data + data = load_data(file_path) + cleaned_data = preprocess_data(data) + + # Group data by CustomerID and InvoiceNo + grouped_data = group_by_customer_invoice(cleaned_data) + + # Save the restructured dataset + save_data(grouped_data, output_file) + +if __name__ == "__main__": + main() \ No newline at end of file From 32f571f82e10e7641bb09bf4c8bc3b982067628b Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 23 Sep 2024 01:05:44 +0100 Subject: [PATCH 06/10] updating gitignore and comments --- .gitignore | 2 +- datautils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 9d59144c..bb10ee7d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ # Ignore Jupyter notebook checkpoints .ipynb_checkpoints - +*.idea/ # Ignore Python cache files __pycache__ diff --git a/datautils.py b/datautils.py index 2c12e3ae..03f6a461 100644 --- a/datautils.py +++ b/datautils.py @@ -192,7 +192,7 @@ def load_forecast_csv(name, univar=False): # Normalise data scaler = None - if name == 'ts2vec_online_retail_II_data': + if name == 'ts2vec_online_retail_II_data' or name == 'restructured_ts2vec_online_retail': scaler = MinMaxScaler().fit(data[train_slice]) else: scaler = StandardScaler().fit(data[train_slice]) @@ -206,7 +206,7 @@ def load_forecast_csv(name, univar=False): data = np.expand_dims(data, 0) # Single instance case if n_covariate_cols > 0: - if name == 'ts2vec_online_retail_II_data': + if name == 'ts2vec_online_retail_II_data' or name == 'restructured_ts2vec_online_retail': dt_scaler = MinMaxScaler().fit(dt_embed[train_slice]) else: dt_scaler = StandardScaler().fit(dt_embed[train_slice]) @@ -309,7 +309,7 @@ def load_online_retail(name, repr_dims): test_data = {} '''Split the data into train, valid, and test sets such that valid set includes second-last transaction - of each customerID and test set includes last transaction of each customerID''' + of each customerID and test set includes last transaction of each customerID ''' for customer_id, customer_df in customer_data: if len(customer_df) >= 3: train_data[customer_id] = customer_df.iloc[: -2] From 4fae48367a4fbf3d377bf2c4615a19866f9dc451 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 23 Sep 2024 01:07:39 +0100 Subject: [PATCH 07/10] test commit to save auth details for github --- datautils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datautils.py b/datautils.py index 03f6a461..2aff2e48 100644 --- a/datautils.py +++ 
b/datautils.py @@ -309,7 +309,7 @@ def load_online_retail(name, repr_dims): test_data = {} '''Split the data into train, valid, and test sets such that valid set includes second-last transaction - of each customerID and test set includes last transaction of each customerID ''' + of each customerID and test set includes last transaction of each customerID''' for customer_id, customer_df in customer_data: if len(customer_df) >= 3: train_data[customer_id] = customer_df.iloc[: -2] From 244a1424c5c73412220443c6a9f3dbe78d1e2ca4 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 23 Sep 2024 01:56:26 +0100 Subject: [PATCH 08/10] Added changes so that ts2vec now works with restructured_ts2vec_online_retail as well as ts2vec_online_retail_II_data (using load_forecast_csv). However, ts2vec currently trains on restructured_ts2vec_online_retail with Customer ID as a forecasting target feature rather than a covariate (as with datetime embedding features). TS2Vec is able to learn representations better for ts2vec_online_retail_II_data, which only includes Quantity and Price as target features, which is evident as restructured_ts2vec_online_retail have higher MSE and MAE values whilst also starting with a higher training loss value. Perhaps embedding the customer ID instead of feeding it as a forecasting feature will result is better representation learning --- tasks/__init__.py | 2 +- tasks/forecasting.py | 29 ++++++++++++++++++----------- train.py | 2 +- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/tasks/__init__.py b/tasks/__init__.py index c1e1208e..3a09b083 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -1,3 +1,3 @@ from .classification import eval_classification -from .forecasting import eval_forecasting, eval_forecasting_unsupervised +from .forecasting import eval_forecasting, eval_forecasting_customer_embed from .anomaly_detection import eval_anomaly_detection, eval_anomaly_detection_coldstart diff --git a/tasks/forecasting.py b/tasks/forecasting.py index ac8faa78..d6fe8bb5 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -55,9 +55,12 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, test_pred = lr.predict(test_features) lr_infer_time[pred_len] = time.time() - t - if dataset_name =='ts2vec_online_retail_II_data': + if dataset_name == 'ts2vec_online_retail_II_data': test_pred = test_pred.reshape(-1, 2) test_labels = test_labels.reshape(-1, 2) + elif dataset_name == 'restructured_ts2vec_online_retail': + test_pred = test_pred.reshape(-1, 4) + test_labels = test_labels.reshape(-1, 4) else: ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] test_pred = test_pred.reshape(ori_shape) @@ -66,9 +69,13 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, if test_data.shape[0] > 1: test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3) test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3) - elif dataset_name == 'ts2vec_online_retail_II_data': - test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) - test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + elif dataset_name in ['ts2vec_online_retail_II_data','restructured_ts2vec_online_retail']: + if dataset_name == 'ts2vec_online_retail_II_data': + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) + test_labels_inv = 
scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + else: # restructured_ts2vec_online_retail + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 4)).reshape(test_pred.shape) + test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 4)).reshape(test_labels.shape) # Remove NaN values from all datasets valid_indices = ~np.isnan(test_pred).any(axis=1) & ~np.isnan(test_pred_inv).any(axis=1) & ~np.isnan(test_labels).any(axis=1) & ~np.isnan(test_labels_inv).any(axis=1) @@ -92,15 +99,15 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, 'raw': cal_metrics(test_pred_inv, test_labels_inv) } - eval_res = { - 'ours': ours_result, - 'ts2vec_infer_time': ts2vec_infer_time, - 'lr_train_time': lr_train_time, - 'lr_infer_time': lr_infer_time - } + eval_res = { + 'ours': ours_result, + 'ts2vec_infer_time': ts2vec_infer_time, + 'lr_train_time': lr_train_time, + 'lr_infer_time': lr_infer_time + } return out_log, eval_res -def eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): +def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): padding = 200 t = time.time() diff --git a/train.py b/train.py index ff43e247..b2c14ca1 100644 --- a/train.py +++ b/train.py @@ -211,7 +211,7 @@ def callback(model, loss): out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm') elif task_type == 'forecasting': # add case for unsupervised evaluation on Online Retail dataset - if args.dataset == 'ts2vec_online_retail_II_data': + if args.dataset in 'ts2vec_online_retail_II_data' or 'restructured_ts2vec_online_retail': # # print data shapes and check for NaN or infinite values # print("Data shape:", data.shape) # print("Train slice:", train_slice) From d3af0340e051dd47e22a343519bf28445a90c227 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Tue, 24 Sep 2024 15:29:36 +0100 Subject: [PATCH 09/10] added new loader and evaluation function for online retail data to be trained with customer embeddings --- datautils.py | 83 +++++++++++++++++++++++++++----------- tasks/forecasting.py | 57 +++++++++++++++++++++----- train.py | 95 +++++++++++++++----------------------------- 3 files changed, 138 insertions(+), 97 deletions(-) diff --git a/datautils.py b/datautils.py index 2aff2e48..a31ec3d5 100644 --- a/datautils.py +++ b/datautils.py @@ -1,3 +1,4 @@ +import logging import os import numpy as np import pandas as pd @@ -254,8 +255,9 @@ def _get_time_features(dt): dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(), # Week of the year ], axis=1).astype(np.float) - -def load_online_retail(name, repr_dims): +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +def load_online_retail(name): # """ # Loads and preprocesses the Online Retail dataset for forecasting tasks. # Ensures both Price, Quantity, and customer embeddings are included throughout. 
""" @@ -270,14 +272,21 @@ def load_online_retail(name, repr_dims): # """ # Load data - data = pd.read_csv(f'datasets/{name}.csv', parse_dates=['InvoiceDate']) + data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True) # Convert 'InvoiceDate' to datetime if not already done - if not pd.api.types.is_datetime64_any_dtype(data['InvoiceDate']): - data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate']) + if not pd.api.types.is_datetime64_any_dtype(data.index): + data.index = pd.to_datetime(data.index) - # Sort data by 'InvoiceDate' - data.sort_values(by='InvoiceDate', inplace=True) + # Remove rows with any None or NaN values + data.dropna(inplace=True) + + # Remove rows with negative 'Quantity' or 'Price' values + data = data[(data['Quantity'] > 0) & (data['Price'] > 0)] + logging.info(f"Data shape after removing negative values: {data.shape}") + + # Remove Invoice column + data.drop(columns=['Invoice'], inplace=True) # Rename 'Customer ID' to 'CustomerID' if 'Customer ID' in data.columns: @@ -285,38 +294,64 @@ def load_online_retail(name, repr_dims): if 'CustomerID' not in data.columns: raise KeyError("The 'CustomerID' column is missing from the dataset.") + logging.info(f"Data columns and shape: {data.columns, data.shape}") # Extract customerIDs and create numerical mapping customer_ids = data['CustomerID'].unique() + logging.info(f"Unique customer ID count: {customer_ids.shape[0]}") customer_id_to_index = {cid: idx for idx, cid in enumerate(customer_ids)} + logging.info(f"Unique customer ID to index count: {len(customer_id_to_index)}") # Create fixed embeddings for each unique customer ID - customer_embeddings = torch.nn.Embedding(len(customer_ids), repr_dims) + customer_embeddings = torch.nn.Embedding(len(customer_ids), 4) customer_embeddings.weight.requires_grad = False # Fixed embeddings # Map customer IDs to their embeddings customer_embeddings_tensor = customer_embeddings(torch.tensor(data['CustomerID'].map(customer_id_to_index).values)) + logging.info(f"Customer embeddings shape: {customer_embeddings_tensor.shape}") + logging.info(f"Customer embeddings: {customer_embeddings_tensor}") + + # Replace 'CustomerID' column with the corresponding customer embedding + customer_embeddings_expanded = customer_embeddings_tensor.detach().numpy() + logging.info(f"Customer embeddings expanded shape: {customer_embeddings_expanded.shape}") - # Store customer embeddings as a new column 'customer_embed' in the DataFrame - data['customer_embed'] = list(customer_embeddings_tensor.detach().numpy()) + # Drop the original 'CustomerID' column + data = data.drop(columns=['CustomerID']) - # Group by CustomerID - customer_data = data.groupby('CustomerID') + # Extract time features for the date/index column + dt_embed = _get_time_features(data.index) + n_covariate_cols = dt_embed.shape[-1] + customer_embeddings_expanded.shape[-1] + logging.info(f"Number of covariate columns: {n_covariate_cols}") + logging.info(f"Time features shape: {dt_embed.shape}") - # Split the data into train, valid, and test sets - train_data = {} - valid_data = {} - test_data = {} + # Convert the DataFrame to a NumPy array + data = data.to_numpy() - '''Split the data into train, valid, and test sets such that valid set includes second-last transaction - of each customerID and test set includes last transaction of each customerID''' - for customer_id, customer_df in customer_data: - if len(customer_df) >= 3: - train_data[customer_id] = customer_df.iloc[: -2] - valid_data[customer_id] = customer_df.iloc[-2 : -1] 
- test_data[customer_id] = customer_df.iloc[-1 :] + train_slice = slice(0, int(0.6 * len(data))) + valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) + test_slice = slice(int(0.8 * len(data)), len(data)) - return train_data, valid_data, test_data, customer_embeddings + # Normalise data + scaler = MinMaxScaler().fit(data[train_slice]) + data = scaler.transform(data) + + # Reshape data based on dataset structure + data = np.expand_dims(data, 0) # Single instance case + + if n_covariate_cols > 0: + dt_scaler = MinMaxScaler().fit(dt_embed[train_slice]) + dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) + # Concatenating the time embeddings to the data + data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) + + cid_scaler = MinMaxScaler().fit(customer_embeddings_expanded[train_slice]) + cid_embed = np.expand_dims(cid_scaler.transform(customer_embeddings_expanded), 0) + # Concatenating the customer ID embeddings to the data + data = np.concatenate([np.repeat(cid_embed, data.shape[0], axis=0), data], axis=-1) + + pred_lens = [1, 2, 3, 4, 5] + + return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols diff --git a/tasks/forecasting.py b/tasks/forecasting.py index d6fe8bb5..c5b0276c 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -107,7 +107,7 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, } return out_log, eval_res -def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): +def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): padding = 200 t = time.time() @@ -125,14 +125,53 @@ def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_ valid_repr = all_repr[:, valid_slice] test_repr = all_repr[:, test_slice] - # You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization) + # Extract data for each time slice + train_data = data[:, train_slice, n_covariate_cols:] + valid_data = data[:, valid_slice, n_covariate_cols:] + test_data = data[:, test_slice, n_covariate_cols:] - eval_res = { - 'ts2vec_infer_time': ts2vec_infer_time, - 'train_repr_shape': train_repr.shape, - 'valid_repr_shape': valid_repr.shape, - 'test_repr_shape': test_repr.shape - } + ours_result = {} + lr_train_time = {} + lr_infer_time = {} + out_log = {} + for pred_len in pred_lens: + train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding) + valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len) + test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len) + + t = time.time() + lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels) + lr_train_time[pred_len] = time.time() - t + + t = time.time() + test_pred = lr.predict(test_features) + lr_infer_time[pred_len] = time.time() - t + + test_pred = test_pred.reshape(-1, 2) + test_labels = test_labels.reshape(-1, 2) - return eval_res + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) + test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + + # NOTE: You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization) + + out_log[pred_len] = { + 'norm': test_pred, + 'raw': test_pred_inv, + 'norm_gt': 
test_labels, + 'raw_gt': test_labels_inv + } + ours_result[pred_len] = { + 'norm': cal_metrics(test_pred, test_labels), + 'raw': cal_metrics(test_pred_inv, test_labels_inv) + } + + eval_res = { + 'ours': ours_result, + 'ts2vec_infer_time': ts2vec_infer_time, + 'lr_train_time': lr_train_time, + 'lr_infer_time': lr_infer_time + } + + return out_log, eval_res diff --git a/train.py b/train.py index b2c14ca1..fb8346c2 100644 --- a/train.py +++ b/train.py @@ -72,7 +72,6 @@ def callback(model, loss): task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset) train_data = data[:, train_slice] - test_data = data[:, test_slice] elif args.loader == 'forecast_csv_univar': task_type = 'forecasting' @@ -99,11 +98,11 @@ def callback(model, loss): all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data, _, _, _ = datautils.load_UCR('FordA') - # elif args.loader == 'retail': - # task_type = 'forecasting' - # - # # Load the data - # train_data, valid_data, test_data, customer_embeddings = datautils.load_online_retail(args.dataset, repr_dims=args.repr_dims) + elif args.loader == 'retail': + task_type = 'forecasting' + # Load the data + data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_online_retail(args.dataset) + train_data = data[:, train_slice] else: raise ValueError(f"Unknown loader {args.loader}.") @@ -133,55 +132,20 @@ def callback(model, loss): t = time.time() # Define the TS2Vec model - if args.loader == 'retail': - # Initialise the loss log fir tracking losses during training - loss_log = [] - # Initialise the model - model = TS2Vec( - input_dims=2, # We are using 'Quantity' and 'Price' as input dimensions - device=device, - **config, - use_customer_embs=True # Specify that we are using customer embeddings - ) - - # Training loop with customer ID fixed embeddings as input - for epoch in range(args.epochs): - '''For retail loader, `train_data`, `valid_data`, and `test_data` are dictionaries where each key is - a `customer_id` and the value is a DataFrame with shape `(n_transactions, n_features)`. 
The model will - train on 'data_array`, which is reshaped to `(n_instances, n_timestamps, n_dimensions)`.''' - epoch_loss = 0 - for customer_id, data in train_data.items(): - - - # Get the customer embedding - customer_idx = torch.tensor(customer_id_to_index[customer_id], device = device) - customer_embedding = customer_embedding_layer(customer_idx).unsqueeze(0).unsqueeze(0) - - # Convert data to the required format - data_array = data.to_numpy() - data_array = torch.tensor(data_array, dtype=torch.float32, device=device) # Ensure data_array is a torch.Tensor - - # Add the customer embedding to the data - data_array += customer_embedding - - # Convert back to NumPy array before training the model - data_array = data_array.cpu().numpy() - - # train the model - loss_log = model.fit(data_array, n_epochs=1, n_iters=args.iters, verbose=True) - else: - model = TS2Vec( - input_dims=train_data.shape[-1], - device=device, - **config - ) - # Training code - loss_log = model.fit( - train_data, - n_epochs=args.epochs, - n_iters=args.iters, - verbose=True - ) + + + model = TS2Vec( + input_dims=train_data.shape[-1], + device=device, + **config + ) + # Training code + loss_log = model.fit( + train_data, + n_epochs=args.epochs, + n_iters=args.iters, + verbose=True + ) model.save(f'{run_dir}/model.pkl') @@ -212,15 +176,18 @@ def callback(model, loss): elif task_type == 'forecasting': # add case for unsupervised evaluation on Online Retail dataset if args.dataset in 'ts2vec_online_retail_II_data' or 'restructured_ts2vec_online_retail': - # # print data shapes and check for NaN or infinite values - # print("Data shape:", data.shape) - # print("Train slice:", train_slice) - # print("Valid slice:", valid_slice) - # print("Test slice:", test_slice) - # print("Scaler:", scaler) - # print("Prediction lengths:", pred_lens) - # print("Number of covariate columns:", n_covariate_cols) - out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, args.dataset) + # print data shapes and check for NaN or infinite values + print("Data shape:", data.shape) + print("Train slice:", train_slice) + print("Valid slice:", valid_slice) + print("Test slice:", test_slice) + print("Scaler:", scaler) + print("Prediction lengths:", pred_lens) + print("Number of covariate columns:", n_covariate_cols) + if args.loader == 'retail': + out, eval_res = tasks.eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) + else: + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, args.dataset) else: out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) From c893c630e5fcbb8d70e2dc928c2b14b74fe74780 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Tue, 24 Sep 2024 21:00:12 +0100 Subject: [PATCH 10/10] test commit --- ts2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ts2vec.py b/ts2vec.py index de909bd5..76dc227d 100644 --- a/ts2vec.py +++ b/ts2vec.py @@ -5,7 +5,6 @@ from models import TSEncoder from models.losses import hierarchical_contrastive_loss from utils import take_per_row, split_with_nan, centerize_vary_length_series, torch_pad_nan -import math class TS2Vec: '''The TS2Vec model'''
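
For reference, a minimal, self-contained sketch of the NaN/inf sanitisation that the fit_ridge change to tasks/_eval_protocols.py relies on. The toy arrays are illustrative only; the actual function additionally clips values to the float64 range and subsamples large training sets before fitting the ridge regressions.

import numpy as np
from sklearn.linear_model import Ridge

# Toy feature matrix and targets containing non-finite entries (illustrative only).
X = np.array([[1.0, np.nan],
              [2.0, np.inf],
              [3.0, 0.5],
              [4.0, -0.5]])
y = np.array([1.0, 2.0, 3.0, np.nan])

# Same sanitisation step as the patched fit_ridge: scikit-learn refuses
# non-finite inputs, so NaN/inf are replaced with 0 before fitting.
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
y = np.nan_to_num(y, nan=0.0, posinf=0.0, neginf=0.0)

ridge = Ridge(alpha=0.1).fit(X, y)
print(ridge.predict(X))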
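
Likewise, a hedged sketch of the covariate construction introduced for the retail loader (load_online_retail) in PATCH 09/10: a frozen torch.nn.Embedding maps each Customer ID to a fixed 4-dimensional vector, which is min-max scaled on the training slice and concatenated in front of the Quantity/Price targets. The toy DataFrame below is an assumption made for illustration, and the calendar features produced by _get_time_features are omitted for brevity; the real loader reads datasets/<name>.csv and prepends both sets of covariates.

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler

# Toy transaction history (illustrative only).
dates = pd.date_range('2010-01-01', periods=8, freq='D')
df = pd.DataFrame({
    'CustomerID': [101, 101, 102, 103, 102, 101, 103, 102],
    'Quantity':   [3,   1,   7,   2,   4,   5,   6,   2],
    'Price':      [2.5, 3.0, 1.2, 4.4, 2.2, 3.3, 1.9, 2.7],
}, index=dates)

# Fixed (non-trainable) embeddings, one row per unique customer.
customer_ids = df['CustomerID'].unique()
cid_to_idx = {cid: i for i, cid in enumerate(customer_ids)}
emb = torch.nn.Embedding(len(customer_ids), 4)
emb.weight.requires_grad = False
cid_embed = emb(torch.tensor(df['CustomerID'].map(cid_to_idx).values)).detach().numpy()

targets = df[['Quantity', 'Price']].to_numpy()
train_slice = slice(0, int(0.6 * len(targets)))

# Scale targets and covariates on the training slice only, then stack the
# customer-ID covariates in front of the targets: (1, T, n_covariates + 2).
scaler = MinMaxScaler().fit(targets[train_slice])
data = np.expand_dims(scaler.transform(targets), 0)
cid_scaler = MinMaxScaler().fit(cid_embed[train_slice])
cid_cov = np.expand_dims(cid_scaler.transform(cid_embed), 0)
data = np.concatenate([cid_cov, data], axis=-1)
print(data.shape)  # (1, 8, 6)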
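
One behavioural note on the evaluation dispatch in train.py: the condition `args.dataset in 'ts2vec_online_retail_II_data' or 'restructured_ts2vec_online_retail'` (added in PATCH 08/10 and still present after PATCH 09/10) is always truthy, because the right-hand operand of `or` is a non-empty string. The snippet below demonstrates the effect and sketches a tuple-based membership test as one possible tightening; the dataset name used is hypothetical.

# `x in 'a' or 'b'` parses as `(x in 'a') or 'b'`, and a non-empty string is truthy.
dataset = 'ETTh1'  # hypothetical value for illustration
broken = dataset in 'ts2vec_online_retail_II_data' or 'restructured_ts2vec_online_retail'
print(bool(broken))  # True, even though ETTh1 is not a retail dataset

# A membership test over a tuple restricts the branch to the two retail datasets.
retail_datasets = ('ts2vec_online_retail_II_data', 'restructured_ts2vec_online_retail')
print(dataset in retail_datasets)  # False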