From 297e087a9e7132b3b83adbfb887089230b2658c4 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Wed, 28 Aug 2024 17:11:14 +0100 Subject: [PATCH 01/10] . --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index d0ade884..ecc30a60 100644 --- a/train.py +++ b/train.py @@ -32,7 +32,7 @@ def callback(model, loss): parser.add_argument('--repr-dims', type=int, default=320, help='The representation dimension (defaults to 320)') parser.add_argument('--max-train-length', type=int, default=3000, help='For sequence with a length greater than , it would be cropped into some sequences, each of which has a length less than (defaults to 3000)') parser.add_argument('--iters', type=int, default=None, help='The number of iterations') - parser.add_argument('--epochs', type=int, default=None, help='The number of epochs') + parser.add_argument('--epochs', type=int, default=8, help='The number of epochs') parser.add_argument('--save-every', type=int, default=None, help='Save the checkpoint every iterations/epochs') parser.add_argument('--seed', type=int, default=None, help='The random seed') parser.add_argument('--max-threads', type=int, default=None, help='The maximum allowed number of threads used by this process') From cda6a296578190fa350ac18450c5f0833699753e Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 9 Sep 2024 00:42:52 +0100 Subject: [PATCH 02/10] TS2Vec is now able to process the Online Retail II dataset using an editing version of load_forecasting_csv. It can also evaluate this dataset with a new function eval_forecasting_unsupervised. Also includes additional utility functions and configuration --- .gitignore | 15 ++++ datautils.py | 202 ++++++++++++++++++++++++++++++++++++++----- tasks/__init__.py | 2 +- tasks/forecasting.py | 52 ++++++++--- train.py | 53 +++++++----- 5 files changed, 270 insertions(+), 54 deletions(-) diff --git a/.gitignore b/.gitignore index 78a0cc4e..9d59144c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,21 @@ +# Ignore dataset files that are not Python /datasets/*[!.py] + +# Ignore Jupyter notebook checkpoints .ipynb_checkpoints + +# Ignore Python cache files __pycache__ + +# Ignore everything in the training folder except .py files training/* +!training/*.py + +# Ignore specific nohup files in the scripts folder scripts/_nohup + +# Ignore rsync filter file .rsync-filter + +# Ignore the ts2vec_env directory (virtual environment) +ts2vec_env/ \ No newline at end of file diff --git a/datautils.py b/datautils.py index f8cfddd7..c618fb66 100644 --- a/datautils.py +++ b/datautils.py @@ -68,7 +68,7 @@ def load_UCR(dataset): 'UMD' ]: return train[..., np.newaxis], train_labels, test[..., np.newaxis], test_labels - + mean = np.nanmean(train) std = np.nanstd(train) train = (train - mean) / std @@ -79,7 +79,7 @@ def load_UCR(dataset): def load_UEA(dataset): train_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TRAIN.arff')[0] test_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TEST.arff')[0] - + def extract_data(data): res_data = [] res_labels = [] @@ -89,31 +89,31 @@ def extract_data(data): res_data.append(t_data) res_labels.append(t_label) return np.array(res_data).swapaxes(1, 2), np.array(res_labels) - + train_X, train_y = extract_data(train_data) test_X, test_y = extract_data(test_data) - + scaler = StandardScaler() scaler.fit(train_X.reshape(-1, train_X.shape[-1])) train_X = scaler.transform(train_X.reshape(-1, train_X.shape[-1])).reshape(train_X.shape) test_X = scaler.transform(test_X.reshape(-1, 
test_X.shape[-1])).reshape(test_X.shape) - + labels = np.unique(train_y) transform = { k : i for i, k in enumerate(labels)} train_y = np.vectorize(transform.get)(train_y) test_y = np.vectorize(transform.get)(test_y) return train_X, train_y, test_X, test_y - - + + def load_forecast_npy(name, univar=False): - data = np.load(f'datasets/{name}.npy') + data = np.load(f'datasets/{name}.npy') if univar: data = data[: -1:] - + train_slice = slice(None, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) test_slice = slice(int(0.8 * len(data)), None) - + scaler = StandardScaler().fit(data[train_slice]) data = scaler.transform(data) data = np.expand_dims(data, 0) @@ -135,10 +135,32 @@ def _get_time_features(dt): def load_forecast_csv(name, univar=False): - data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + """ + Loads and processes time series data for forecasting tasks, supporting both the pre-processed + Online Retail II dataset and existing datasets like ETTh1, ETTm1, and electricity. + + Parameters: + name (str): The name of the dataset file (without the .csv extension). + univar (bool): Whether to load the univariate version of the data. + + Returns: + tuple: data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols + """ + # Try loading with 'date' as index, if it fails, try with 'InvoiceDate' + try: + data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + except ValueError: + data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True) + + # Ensure index is parsed as datetime + if not pd.api.types.is_datetime64_any_dtype(data.index): + data.index = pd.to_datetime(data.index) + + # Extract time features for the date/index column dt_embed = _get_time_features(data.index) n_covariate_cols = dt_embed.shape[-1] - + + # Handle univariate or multivariate cases if univar: if name in ('ETTh1', 'ETTh2', 'ETTm1', 'ETTm2'): data = data[['OT']] @@ -146,8 +168,15 @@ def load_forecast_csv(name, univar=False): data = data[['MT_001']] else: data = data.iloc[:, -1:] - + else: + # Online Retail II case + if name == 'ts2vec_online_retail_II_data': + data = data[['Quantity', 'Price']] + + # Convert data to numpy array data = data.to_numpy() + + # Define train, validation, and test splits based on dataset if name == 'ETTh1' or name == 'ETTh2': train_slice = slice(None, 12*30*24) valid_slice = slice(12*30*24, 16*30*24) @@ -157,35 +186,39 @@ def load_forecast_csv(name, univar=False): valid_slice = slice(12*30*24*4, 16*30*24*4) test_slice = slice(16*30*24*4, 20*30*24*4) else: + # Default case for custom datasets train_slice = slice(None, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) test_slice = slice(int(0.8 * len(data)), None) - + + # Normalise data scaler = StandardScaler().fit(data[train_slice]) data = scaler.transform(data) - if name in ('electricity'): + + # Reshape data based on dataset structure + if name in 'electricity': data = np.expand_dims(data.T, -1) # Each variable is an instance rather than a feature else: - data = np.expand_dims(data, 0) - + data = np.expand_dims(data, 0) # Single instance case + if n_covariate_cols > 0: dt_scaler = StandardScaler().fit(dt_embed[train_slice]) dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) - + if name in ('ETTh1', 'ETTh2', 'electricity'): pred_lens = [24, 48, 168, 336, 720] else: pred_lens = [24, 48, 
96, 288, 672] - + return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols def load_anomaly(name): res = pkl_load(f'datasets/{name}.pkl') return res['all_train_data'], res['all_train_labels'], res['all_train_timestamps'], \ - res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ - res['delay'] + res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ + res['delay'] def gen_ano_train_data(all_train_data): @@ -196,3 +229,130 @@ def gen_ano_train_data(all_train_data): pretrain_data.append(train_data) pretrain_data = np.expand_dims(np.stack(pretrain_data), 2) return pretrain_data + +#----------------------------------------------------------------------------- +def _get_time_features(dt): + return np.stack([ + dt.dayofweek.to_numpy(), # Day of the week + dt.day.to_numpy(), # Day of the month + dt.dayofyear.to_numpy(), # Day of the year + dt.month.to_numpy(), # Month + dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(), # Week of the year + ], axis=1).astype(np.float) + +# +# def load_online_retail(name='Cleaned_Online_Retail', agg_freq='D'): +# """ +# Loads and preprocesses the Online Retail dataset for forecasting tasks. +# +# Parameters: +# name (str): Name of the dataset file (without extension). +# agg_freq (str): Aggregation frequency (e.g., 'D' for daily, 'W' for weekly, '2W' for bi-weekly, 'M' for monthly). +# +# Returns: +# data (np.ndarray): Preprocessed time series data. +# train_slice (slice): Slice object for training data. +# valid_slice (slice): Slice object for validation data. +# test_slice (slice): Slice object for testing data. +# scaler (StandardScaler): Fitted scaler object. +# pred_lens (list): List of prediction lengths. +# n_covariate_cols (int): Number of covariate columns. +# """ +# +# # Load data +# file_path = f'datasets/UEA/{name}.csv' +# df = pd.read_csv(file_path, parse_dates=['InvoiceDate']) +# +# # Convert 'InvoiceDate' to datetime if not already done +# if not pd.api.types.is_datetime64_any_dtype(df['InvoiceDate']): +# df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']) +# +# # Set 'InvoiceDate' as index +# df.set_index('InvoiceDate', inplace=True) +# +# # Handle different aggregation frequencies +# if agg_freq == 'D': +# freq = 'D' +# elif agg_freq == 'W': +# freq = 'W' +# elif agg_freq == '2W': +# freq = '2W' +# elif agg_freq == 'M': +# freq = 'M' +# else: +# raise ValueError("Invalid agg_freq value. 
Use 'D', 'W', '2W', or 'M'.") +# +# # Aggregate quantity sold per specified frequency +# df_agg = df.resample(freq).sum()['Quantity'] +# +# # Handle missing values by filling with zeros +# df_agg = df_agg.fillna(0) +# +# # Generate time features +# time_features = _get_time_features(df_agg.index) +# n_covariate_cols = time_features.shape[1] +# +# # Convert to numpy array +# data = df_agg.values +# data = data.reshape(-1, 1) # Reshape to (timesteps, features) +# +# # Combine data with time features +# data = np.concatenate([time_features, data], axis=1) +# +# # Train/Validation/Test split +# total_length = len(data) +# train_size = int(total_length * 0.6) +# valid_size = int(total_length * 0.2) +# +# train_slice = slice(None, train_size) +# valid_slice = slice(train_size, train_size + valid_size) +# test_slice = slice(train_size + valid_size, None) +# +# # Scaling +# scaler = StandardScaler() +# data[train_slice] = scaler.fit_transform(data[train_slice]) +# data[valid_slice] = scaler.transform(data[valid_slice]) +# data[test_slice] = scaler.transform(data[test_slice]) +# +# # Reshape to (1, timesteps, features) as expected by TS2Vec +# data = data[np.newaxis, ...] +# +# # Define prediction lengths based on aggregation frequency +# if agg_freq == 'D': +# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week +# elif agg_freq == 'W': +# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) +# elif agg_freq == '2W': +# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) +# elif agg_freq == 'M': +# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) +# else: +# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") +# +# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols +# train_slice = slice(None, train_size) +# valid_slice = slice(train_size, train_size + valid_size) +# test_slice = slice(train_size + valid_size, None) +# +# # Scaling +# scaler = StandardScaler() +# data[train_slice] = scaler.fit_transform(data[train_slice]) +# data[valid_slice] = scaler.transform(data[valid_slice]) +# data[test_slice] = scaler.transform(data[test_slice]) +# +# # Reshape to (1, timesteps, features) as expected by TS2Vec +# data = data[np.newaxis, ...] +# +# # Define prediction lengths based on aggregation frequency +# if agg_freq == 'D': +# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week +# elif agg_freq == 'W': +# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) +# elif agg_freq == '2W': +# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) +# elif agg_freq == 'M': +# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) +# else: +# raise ValueError("Invalid agg_freq value. 
Use 'D', 'W', '2W', or 'M'.") +# +# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols diff --git a/tasks/__init__.py b/tasks/__init__.py index 5e38a07c..c1e1208e 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -1,3 +1,3 @@ from .classification import eval_classification -from .forecasting import eval_forecasting +from .forecasting import eval_forecasting, eval_forecasting_unsupervised from .anomaly_detection import eval_anomaly_detection, eval_anomaly_detection_coldstart diff --git a/tasks/forecasting.py b/tasks/forecasting.py index b3493d83..d2139cce 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -9,17 +9,17 @@ def generate_pred_samples(features, data, pred_len, drop=0): features = features[:, drop:] labels = labels[:, drop:] return features.reshape(-1, features.shape[-1]), \ - labels.reshape(-1, labels.shape[2]*labels.shape[3]) + labels.reshape(-1, labels.shape[2]*labels.shape[3]) def cal_metrics(pred, target): return { 'MSE': ((pred - target) ** 2).mean(), 'MAE': np.abs(pred - target).mean() } - + def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): padding = 200 - + t = time.time() all_repr = model.encode( data, @@ -29,15 +29,15 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, batch_size=256 ) ts2vec_infer_time = time.time() - t - + train_repr = all_repr[:, train_slice] valid_repr = all_repr[:, valid_slice] test_repr = all_repr[:, test_slice] - + train_data = data[:, train_slice, n_covariate_cols:] valid_data = data[:, valid_slice, n_covariate_cols:] test_data = data[:, test_slice, n_covariate_cols:] - + ours_result = {} lr_train_time = {} lr_infer_time = {} @@ -46,11 +46,11 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding) valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len) test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len) - + t = time.time() lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels) lr_train_time[pred_len] = time.time() - t - + t = time.time() test_pred = lr.predict(test_features) lr_infer_time[pred_len] = time.time() - t @@ -58,14 +58,14 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] test_pred = test_pred.reshape(ori_shape) test_labels = test_labels.reshape(ori_shape) - + if test_data.shape[0] > 1: test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3) test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3) else: test_pred_inv = scaler.inverse_transform(test_pred) test_labels_inv = scaler.inverse_transform(test_labels) - + out_log[pred_len] = { 'norm': test_pred, 'raw': test_pred_inv, @@ -76,7 +76,7 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, 'norm': cal_metrics(test_pred, test_labels), 'raw': cal_metrics(test_pred_inv, test_labels_inv) } - + eval_res = { 'ours': ours_result, 'ts2vec_infer_time': ts2vec_infer_time, @@ -84,3 +84,33 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, 'lr_infer_time': lr_infer_time } return out_log, eval_res + +def eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): + padding 
= 200 + + t = time.time() + all_repr = model.encode( + data, + causal=True, + sliding_length=1, + sliding_padding=padding, + batch_size=256 + ) + ts2vec_infer_time = time.time() - t + + # Extract representations (embeddings) for each time slice + train_repr = all_repr[:, train_slice] + valid_repr = all_repr[:, valid_slice] + test_repr = all_repr[:, test_slice] + + # You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization) + + eval_res = { + 'ts2vec_infer_time': ts2vec_infer_time, + 'train_repr_shape': train_repr.shape, + 'valid_repr_shape': valid_repr.shape, + 'test_repr_shape': test_repr.shape + } + + return eval_res + diff --git a/train.py b/train.py index ecc30a60..916f2a3e 100644 --- a/train.py +++ b/train.py @@ -11,8 +11,8 @@ from utils import init_dl_program, name_with_datetime, pkl_save, data_dropout def save_checkpoint_callback( - save_every=1, - unit='epoch' + save_every=1, + unit='epoch' ): assert unit in ('epoch', 'iter') def callback(model, loss): @@ -39,55 +39,55 @@ def callback(model, loss): parser.add_argument('--eval', action="store_true", help='Whether to perform evaluation after training') parser.add_argument('--irregular', type=float, default=0, help='The ratio of missing observations (defaults to 0)') args = parser.parse_args() - + print("Dataset:", args.dataset) print("Arguments:", str(args)) - + device = init_dl_program(args.gpu, seed=args.seed, max_threads=args.max_threads) - + print('Loading data... ', end='') if args.loader == 'UCR': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UCR(args.dataset) - + elif args.loader == 'UEA': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UEA(args.dataset) - + elif args.loader == 'forecast_csv': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_csv_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'anomaly': task_type = 'anomaly_detection' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data = datautils.gen_ano_train_data(all_train_data) - + elif args.loader == 'anomaly_coldstart': task_type = 'anomaly_detection_coldstart' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data, _, _, _ = datautils.load_UCR('FordA') - + else: raise ValueError(f"Unknown loader {args.loader}.") - - + + if args.irregular > 0: if task_type == 'classification': train_data = data_dropout(train_data, args.irregular) @@ -95,23 +95,23 @@ def callback(model, loss): else: raise 
ValueError(f"Task type {task_type} is not supported when irregular>0.") print('done') - + config = dict( batch_size=args.batch_size, lr=args.lr, output_dims=args.repr_dims, max_train_length=args.max_train_length ) - + if args.save_every is not None: unit = 'epoch' if args.epochs is not None else 'iter' config[f'after_{unit}_callback'] = save_checkpoint_callback(args.save_every, unit) run_dir = 'training/' + args.dataset + '__' + name_with_datetime(args.run_name) os.makedirs(run_dir, exist_ok=True) - + t = time.time() - + model = TS2Vec( input_dims=train_data.shape[-1], device=device, @@ -129,18 +129,29 @@ def callback(model, loss): print(f"\nTraining time: {datetime.timedelta(seconds=t)}\n") if args.eval: + out = None # Initialise out as None to avoid the NameError in the case of unsupervised tasks + if task_type == 'classification': out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm') elif task_type == 'forecasting': - out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) + # add case for unsupervised evaluation on Online Retail dataset + if args.dataset == 'ts2vec_online_retail_II_data': + eval_res = tasks.eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols) + else: + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) elif task_type == 'anomaly_detection': out, eval_res = tasks.eval_anomaly_detection(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) elif task_type == 'anomaly_detection_coldstart': out, eval_res = tasks.eval_anomaly_detection_coldstart(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) else: assert False - pkl_save(f'{run_dir}/out.pkl', out) + + # Save only eval_res when 'out' is None (i.e., for unsupervised tasks) + if out is not None: + pkl_save(f'{run_dir}/out.pkl', out) + pkl_save(f'{run_dir}/eval_res.pkl', eval_res) print('Evaluation result:', eval_res) print("Finished.") + From 8852d6861e4a493ad6a15ebb85f8c0b6c492e150 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 9 Sep 2024 00:42:52 +0100 Subject: [PATCH 03/10] TS2Vec is now able to process the Online Retail II dataset using an editing version of load_forecasting_csv. It can also evaluate this dataset with a new function eval_forecasting_unsupervised. 
Also includes additional utility functions and configuration --- .gitignore | 15 ++++ .idea/misc.xml | 10 +++ .idea/ts2vec.iml | 7 ++ .idea/vcs.xml | 6 ++ .idea/workspace.xml | 138 +++++++++++++++++++++++++++++ datautils.py | 202 ++++++++++++++++++++++++++++++++++++++----- requirements.txt | 1 + tasks/__init__.py | 2 +- tasks/forecasting.py | 52 ++++++++--- train.py | 53 +++++++----- 10 files changed, 432 insertions(+), 54 deletions(-) create mode 100644 .idea/misc.xml create mode 100644 .idea/ts2vec.iml create mode 100644 .idea/vcs.xml create mode 100644 .idea/workspace.xml diff --git a/.gitignore b/.gitignore index 78a0cc4e..9d59144c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,21 @@ +# Ignore dataset files that are not Python /datasets/*[!.py] + +# Ignore Jupyter notebook checkpoints .ipynb_checkpoints + +# Ignore Python cache files __pycache__ + +# Ignore everything in the training folder except .py files training/* +!training/*.py + +# Ignore specific nohup files in the scripts folder scripts/_nohup + +# Ignore rsync filter file .rsync-filter + +# Ignore the ts2vec_env directory (virtual environment) +ts2vec_env/ \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..feccba82 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,10 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/ts2vec.iml b/.idea/ts2vec.iml new file mode 100644 index 00000000..ec63674c --- /dev/null +++ b/.idea/ts2vec.iml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..35eb1ddf --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 00000000..ef09f3fa --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,138 @@ + + + + + + + + + + + + + + + + { + "lastFilter": { + "state": "OPEN", + "assignee": "jumairamiller" + } +} + { + "selectedUrlAndAccountId": { + "url": "https://github.com/jumairamiller/ts2vec.git", + "accountId": "1c182f61-b702-458b-8f9a-03f9dbad0d94" + }, + "recentNewPullRequestHead": { + "server": { + "useHttp": false, + "host": "github.com", + "port": null, + "suffix": null + }, + "owner": "jumairamiller", + "repository": "ts2vec" + } +} + + + { + "associatedIndex": 3 +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/datautils.py b/datautils.py index f8cfddd7..c618fb66 100644 --- a/datautils.py +++ b/datautils.py @@ -68,7 +68,7 @@ def load_UCR(dataset): 'UMD' ]: return train[..., np.newaxis], train_labels, test[..., np.newaxis], test_labels - + mean = np.nanmean(train) std = np.nanstd(train) train = (train - mean) / std @@ -79,7 +79,7 @@ def load_UCR(dataset): def load_UEA(dataset): train_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TRAIN.arff')[0] test_data = loadarff(f'datasets/UEA/{dataset}/{dataset}_TEST.arff')[0] - + def extract_data(data): res_data = [] res_labels = [] @@ -89,31 +89,31 @@ def extract_data(data): res_data.append(t_data) res_labels.append(t_label) return np.array(res_data).swapaxes(1, 2), np.array(res_labels) - + train_X, train_y = extract_data(train_data) test_X, test_y = extract_data(test_data) - + scaler = StandardScaler() scaler.fit(train_X.reshape(-1, train_X.shape[-1])) train_X = scaler.transform(train_X.reshape(-1, train_X.shape[-1])).reshape(train_X.shape) test_X = scaler.transform(test_X.reshape(-1, test_X.shape[-1])).reshape(test_X.shape) - + labels = 
np.unique(train_y) transform = { k : i for i, k in enumerate(labels)} train_y = np.vectorize(transform.get)(train_y) test_y = np.vectorize(transform.get)(test_y) return train_X, train_y, test_X, test_y - - + + def load_forecast_npy(name, univar=False): - data = np.load(f'datasets/{name}.npy') + data = np.load(f'datasets/{name}.npy') if univar: data = data[: -1:] - + train_slice = slice(None, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) test_slice = slice(int(0.8 * len(data)), None) - + scaler = StandardScaler().fit(data[train_slice]) data = scaler.transform(data) data = np.expand_dims(data, 0) @@ -135,10 +135,32 @@ def _get_time_features(dt): def load_forecast_csv(name, univar=False): - data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + """ + Loads and processes time series data for forecasting tasks, supporting both the pre-processed + Online Retail II dataset and existing datasets like ETTh1, ETTm1, and electricity. + + Parameters: + name (str): The name of the dataset file (without the .csv extension). + univar (bool): Whether to load the univariate version of the data. + + Returns: + tuple: data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols + """ + # Try loading with 'date' as index, if it fails, try with 'InvoiceDate' + try: + data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) + except ValueError: + data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True) + + # Ensure index is parsed as datetime + if not pd.api.types.is_datetime64_any_dtype(data.index): + data.index = pd.to_datetime(data.index) + + # Extract time features for the date/index column dt_embed = _get_time_features(data.index) n_covariate_cols = dt_embed.shape[-1] - + + # Handle univariate or multivariate cases if univar: if name in ('ETTh1', 'ETTh2', 'ETTm1', 'ETTm2'): data = data[['OT']] @@ -146,8 +168,15 @@ def load_forecast_csv(name, univar=False): data = data[['MT_001']] else: data = data.iloc[:, -1:] - + else: + # Online Retail II case + if name == 'ts2vec_online_retail_II_data': + data = data[['Quantity', 'Price']] + + # Convert data to numpy array data = data.to_numpy() + + # Define train, validation, and test splits based on dataset if name == 'ETTh1' or name == 'ETTh2': train_slice = slice(None, 12*30*24) valid_slice = slice(12*30*24, 16*30*24) @@ -157,35 +186,39 @@ def load_forecast_csv(name, univar=False): valid_slice = slice(12*30*24*4, 16*30*24*4) test_slice = slice(16*30*24*4, 20*30*24*4) else: + # Default case for custom datasets train_slice = slice(None, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) test_slice = slice(int(0.8 * len(data)), None) - + + # Normalise data scaler = StandardScaler().fit(data[train_slice]) data = scaler.transform(data) - if name in ('electricity'): + + # Reshape data based on dataset structure + if name in 'electricity': data = np.expand_dims(data.T, -1) # Each variable is an instance rather than a feature else: - data = np.expand_dims(data, 0) - + data = np.expand_dims(data, 0) # Single instance case + if n_covariate_cols > 0: dt_scaler = StandardScaler().fit(dt_embed[train_slice]) dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) - + if name in ('ETTh1', 'ETTh2', 'electricity'): pred_lens = [24, 48, 168, 336, 720] else: pred_lens = [24, 48, 96, 288, 672] - + return data, train_slice, 
valid_slice, test_slice, scaler, pred_lens, n_covariate_cols def load_anomaly(name): res = pkl_load(f'datasets/{name}.pkl') return res['all_train_data'], res['all_train_labels'], res['all_train_timestamps'], \ - res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ - res['delay'] + res['all_test_data'], res['all_test_labels'], res['all_test_timestamps'], \ + res['delay'] def gen_ano_train_data(all_train_data): @@ -196,3 +229,130 @@ def gen_ano_train_data(all_train_data): pretrain_data.append(train_data) pretrain_data = np.expand_dims(np.stack(pretrain_data), 2) return pretrain_data + +#----------------------------------------------------------------------------- +def _get_time_features(dt): + return np.stack([ + dt.dayofweek.to_numpy(), # Day of the week + dt.day.to_numpy(), # Day of the month + dt.dayofyear.to_numpy(), # Day of the year + dt.month.to_numpy(), # Month + dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(), # Week of the year + ], axis=1).astype(np.float) + +# +# def load_online_retail(name='Cleaned_Online_Retail', agg_freq='D'): +# """ +# Loads and preprocesses the Online Retail dataset for forecasting tasks. +# +# Parameters: +# name (str): Name of the dataset file (without extension). +# agg_freq (str): Aggregation frequency (e.g., 'D' for daily, 'W' for weekly, '2W' for bi-weekly, 'M' for monthly). +# +# Returns: +# data (np.ndarray): Preprocessed time series data. +# train_slice (slice): Slice object for training data. +# valid_slice (slice): Slice object for validation data. +# test_slice (slice): Slice object for testing data. +# scaler (StandardScaler): Fitted scaler object. +# pred_lens (list): List of prediction lengths. +# n_covariate_cols (int): Number of covariate columns. +# """ +# +# # Load data +# file_path = f'datasets/UEA/{name}.csv' +# df = pd.read_csv(file_path, parse_dates=['InvoiceDate']) +# +# # Convert 'InvoiceDate' to datetime if not already done +# if not pd.api.types.is_datetime64_any_dtype(df['InvoiceDate']): +# df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']) +# +# # Set 'InvoiceDate' as index +# df.set_index('InvoiceDate', inplace=True) +# +# # Handle different aggregation frequencies +# if agg_freq == 'D': +# freq = 'D' +# elif agg_freq == 'W': +# freq = 'W' +# elif agg_freq == '2W': +# freq = '2W' +# elif agg_freq == 'M': +# freq = 'M' +# else: +# raise ValueError("Invalid agg_freq value. 
Use 'D', 'W', '2W', or 'M'.") +# +# # Aggregate quantity sold per specified frequency +# df_agg = df.resample(freq).sum()['Quantity'] +# +# # Handle missing values by filling with zeros +# df_agg = df_agg.fillna(0) +# +# # Generate time features +# time_features = _get_time_features(df_agg.index) +# n_covariate_cols = time_features.shape[1] +# +# # Convert to numpy array +# data = df_agg.values +# data = data.reshape(-1, 1) # Reshape to (timesteps, features) +# +# # Combine data with time features +# data = np.concatenate([time_features, data], axis=1) +# +# # Train/Validation/Test split +# total_length = len(data) +# train_size = int(total_length * 0.6) +# valid_size = int(total_length * 0.2) +# +# train_slice = slice(None, train_size) +# valid_slice = slice(train_size, train_size + valid_size) +# test_slice = slice(train_size + valid_size, None) +# +# # Scaling +# scaler = StandardScaler() +# data[train_slice] = scaler.fit_transform(data[train_slice]) +# data[valid_slice] = scaler.transform(data[valid_slice]) +# data[test_slice] = scaler.transform(data[test_slice]) +# +# # Reshape to (1, timesteps, features) as expected by TS2Vec +# data = data[np.newaxis, ...] +# +# # Define prediction lengths based on aggregation frequency +# if agg_freq == 'D': +# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week +# elif agg_freq == 'W': +# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) +# elif agg_freq == '2W': +# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) +# elif agg_freq == 'M': +# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) +# else: +# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") +# +# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols +# train_slice = slice(None, train_size) +# valid_slice = slice(train_size, train_size + valid_size) +# test_slice = slice(train_size + valid_size, None) +# +# # Scaling +# scaler = StandardScaler() +# data[train_slice] = scaler.fit_transform(data[train_slice]) +# data[valid_slice] = scaler.transform(data[valid_slice]) +# data[test_slice] = scaler.transform(data[test_slice]) +# +# # Reshape to (1, timesteps, features) as expected by TS2Vec +# data = data[np.newaxis, ...] +# +# # Define prediction lengths based on aggregation frequency +# if agg_freq == 'D': +# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week +# elif agg_freq == 'W': +# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) +# elif agg_freq == '2W': +# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) +# elif agg_freq == 'M': +# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) +# else: +# raise ValueError("Invalid agg_freq value. 
Use 'D', 'W', '2W', or 'M'.") +# +# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols diff --git a/requirements.txt b/requirements.txt index 9cc27049..d5ee03d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ numpy==1.19.2 statsmodels==0.12.2 pandas==1.0.1 scikit_learn==0.24.2 +xlrd==1.2.0 diff --git a/tasks/__init__.py b/tasks/__init__.py index 5e38a07c..c1e1208e 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -1,3 +1,3 @@ from .classification import eval_classification -from .forecasting import eval_forecasting +from .forecasting import eval_forecasting, eval_forecasting_unsupervised from .anomaly_detection import eval_anomaly_detection, eval_anomaly_detection_coldstart diff --git a/tasks/forecasting.py b/tasks/forecasting.py index b3493d83..d2139cce 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -9,17 +9,17 @@ def generate_pred_samples(features, data, pred_len, drop=0): features = features[:, drop:] labels = labels[:, drop:] return features.reshape(-1, features.shape[-1]), \ - labels.reshape(-1, labels.shape[2]*labels.shape[3]) + labels.reshape(-1, labels.shape[2]*labels.shape[3]) def cal_metrics(pred, target): return { 'MSE': ((pred - target) ** 2).mean(), 'MAE': np.abs(pred - target).mean() } - + def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): padding = 200 - + t = time.time() all_repr = model.encode( data, @@ -29,15 +29,15 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, batch_size=256 ) ts2vec_infer_time = time.time() - t - + train_repr = all_repr[:, train_slice] valid_repr = all_repr[:, valid_slice] test_repr = all_repr[:, test_slice] - + train_data = data[:, train_slice, n_covariate_cols:] valid_data = data[:, valid_slice, n_covariate_cols:] test_data = data[:, test_slice, n_covariate_cols:] - + ours_result = {} lr_train_time = {} lr_infer_time = {} @@ -46,11 +46,11 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding) valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len) test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len) - + t = time.time() lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels) lr_train_time[pred_len] = time.time() - t - + t = time.time() test_pred = lr.predict(test_features) lr_infer_time[pred_len] = time.time() - t @@ -58,14 +58,14 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] test_pred = test_pred.reshape(ori_shape) test_labels = test_labels.reshape(ori_shape) - + if test_data.shape[0] > 1: test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3) test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3) else: test_pred_inv = scaler.inverse_transform(test_pred) test_labels_inv = scaler.inverse_transform(test_labels) - + out_log[pred_len] = { 'norm': test_pred, 'raw': test_pred_inv, @@ -76,7 +76,7 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, 'norm': cal_metrics(test_pred, test_labels), 'raw': cal_metrics(test_pred_inv, test_labels_inv) } - + eval_res = { 'ours': ours_result, 'ts2vec_infer_time': ts2vec_infer_time, @@ -84,3 +84,33 @@ def eval_forecasting(model, data, 
train_slice, valid_slice, test_slice, scaler, 'lr_infer_time': lr_infer_time } return out_log, eval_res + +def eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): + padding = 200 + + t = time.time() + all_repr = model.encode( + data, + causal=True, + sliding_length=1, + sliding_padding=padding, + batch_size=256 + ) + ts2vec_infer_time = time.time() - t + + # Extract representations (embeddings) for each time slice + train_repr = all_repr[:, train_slice] + valid_repr = all_repr[:, valid_slice] + test_repr = all_repr[:, test_slice] + + # You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization) + + eval_res = { + 'ts2vec_infer_time': ts2vec_infer_time, + 'train_repr_shape': train_repr.shape, + 'valid_repr_shape': valid_repr.shape, + 'test_repr_shape': test_repr.shape + } + + return eval_res + diff --git a/train.py b/train.py index ecc30a60..916f2a3e 100644 --- a/train.py +++ b/train.py @@ -11,8 +11,8 @@ from utils import init_dl_program, name_with_datetime, pkl_save, data_dropout def save_checkpoint_callback( - save_every=1, - unit='epoch' + save_every=1, + unit='epoch' ): assert unit in ('epoch', 'iter') def callback(model, loss): @@ -39,55 +39,55 @@ def callback(model, loss): parser.add_argument('--eval', action="store_true", help='Whether to perform evaluation after training') parser.add_argument('--irregular', type=float, default=0, help='The ratio of missing observations (defaults to 0)') args = parser.parse_args() - + print("Dataset:", args.dataset) print("Arguments:", str(args)) - + device = init_dl_program(args.gpu, seed=args.seed, max_threads=args.max_threads) - + print('Loading data... ', end='') if args.loader == 'UCR': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UCR(args.dataset) - + elif args.loader == 'UEA': task_type = 'classification' train_data, train_labels, test_data, test_labels = datautils.load_UEA(args.dataset) - + elif args.loader == 'forecast_csv': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_csv_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset) train_data = data[:, train_slice] - + elif args.loader == 'forecast_npy_univar': task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_npy(args.dataset, univar=True) train_data = data[:, train_slice] - + elif args.loader == 'anomaly': task_type = 'anomaly_detection' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data = datautils.gen_ano_train_data(all_train_data) - + elif args.loader == 'anomaly_coldstart': task_type = 'anomaly_detection_coldstart' all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data, _, _, _ = datautils.load_UCR('FordA') - + else: raise 
ValueError(f"Unknown loader {args.loader}.") - - + + if args.irregular > 0: if task_type == 'classification': train_data = data_dropout(train_data, args.irregular) @@ -95,23 +95,23 @@ def callback(model, loss): else: raise ValueError(f"Task type {task_type} is not supported when irregular>0.") print('done') - + config = dict( batch_size=args.batch_size, lr=args.lr, output_dims=args.repr_dims, max_train_length=args.max_train_length ) - + if args.save_every is not None: unit = 'epoch' if args.epochs is not None else 'iter' config[f'after_{unit}_callback'] = save_checkpoint_callback(args.save_every, unit) run_dir = 'training/' + args.dataset + '__' + name_with_datetime(args.run_name) os.makedirs(run_dir, exist_ok=True) - + t = time.time() - + model = TS2Vec( input_dims=train_data.shape[-1], device=device, @@ -129,18 +129,29 @@ def callback(model, loss): print(f"\nTraining time: {datetime.timedelta(seconds=t)}\n") if args.eval: + out = None # Initialise out as None to avoid the NameError in the case of unsupervised tasks + if task_type == 'classification': out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm') elif task_type == 'forecasting': - out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) + # add case for unsupervised evaluation on Online Retail dataset + if args.dataset == 'ts2vec_online_retail_II_data': + eval_res = tasks.eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols) + else: + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) elif task_type == 'anomaly_detection': out, eval_res = tasks.eval_anomaly_detection(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) elif task_type == 'anomaly_detection_coldstart': out, eval_res = tasks.eval_anomaly_detection_coldstart(model, all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay) else: assert False - pkl_save(f'{run_dir}/out.pkl', out) + + # Save only eval_res when 'out' is None (i.e., for unsupervised tasks) + if out is not None: + pkl_save(f'{run_dir}/out.pkl', out) + pkl_save(f'{run_dir}/eval_res.pkl', eval_res) print('Evaluation result:', eval_res) print("Finished.") + From ac644d4d5ef3239a2dc70c3ab803f2e6bad24808 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Sun, 22 Sep 2024 22:49:56 +0100 Subject: [PATCH 04/10] ts2vec now run with ts2vec_online_retail_II_data using forecast_csv as loader, whilst also generating MSE and MAE (metrics were previously not being calculated correctly becasue of incompatible data reshaping) --- datautils.py | 226 ++++++++++++++++++--------------------- models/encoder.py | 2 +- requirements.txt | 3 + tasks/_eval_protocols.py | 12 +++ tasks/forecasting.py | 23 +++- train.py | 115 +++++++++++++++++--- 6 files changed, 240 insertions(+), 141 deletions(-) diff --git a/datautils.py b/datautils.py index c618fb66..2c12e3ae 100644 --- a/datautils.py +++ b/datautils.py @@ -5,6 +5,9 @@ import random from datetime import datetime import pickle + +import torch + from utils import pkl_load, pad_nan_to_target from scipy.io.arff import loadarff from sklearn.preprocessing import StandardScaler, MinMaxScaler @@ -146,7 +149,7 @@ def load_forecast_csv(name, univar=False): Returns: tuple: data, train_slice, valid_slice, 
test_slice, scaler, pred_lens, n_covariate_cols """ - # Try loading with 'date' as index, if it fails, try with 'InvoiceDate' + # Try loading with 'date' as index, if it fails, try with 'InvoiceDate' to load online retail data try: data = pd.read_csv(f'datasets/{name}.csv', index_col='date', parse_dates=True) except ValueError: @@ -168,10 +171,6 @@ def load_forecast_csv(name, univar=False): data = data[['MT_001']] else: data = data.iloc[:, -1:] - else: - # Online Retail II case - if name == 'ts2vec_online_retail_II_data': - data = data[['Quantity', 'Price']] # Convert data to numpy array data = data.to_numpy() @@ -186,13 +185,18 @@ def load_forecast_csv(name, univar=False): valid_slice = slice(12*30*24*4, 16*30*24*4) test_slice = slice(16*30*24*4, 20*30*24*4) else: - # Default case for custom datasets - train_slice = slice(None, int(0.6 * len(data))) + # Default case for other or custom datasets + train_slice = slice(0, int(0.6 * len(data))) valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) - test_slice = slice(int(0.8 * len(data)), None) + test_slice = slice(int(0.8 * len(data)), len(data)) # Normalise data - scaler = StandardScaler().fit(data[train_slice]) + scaler = None + if name == 'ts2vec_online_retail_II_data': + scaler = MinMaxScaler().fit(data[train_slice]) + else: + scaler = StandardScaler().fit(data[train_slice]) + data = scaler.transform(data) # Reshape data based on dataset structure @@ -202,12 +206,22 @@ def load_forecast_csv(name, univar=False): data = np.expand_dims(data, 0) # Single instance case if n_covariate_cols > 0: - dt_scaler = StandardScaler().fit(dt_embed[train_slice]) + if name == 'ts2vec_online_retail_II_data': + dt_scaler = MinMaxScaler().fit(dt_embed[train_slice]) + else: + dt_scaler = StandardScaler().fit(dt_embed[train_slice]) dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) + # Concatenating the time embeddings to the data + ''' NOTE: + The np.repeat(dt_embed, data.shape[0], axis=0) function is used to repeat the time embeddings + for each instance only in the case of the 'electricity' dataset. This ensures that the time + embeddings are correctly aligned with the data instances''' data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) if name in ('ETTh1', 'ETTh2', 'electricity'): pred_lens = [24, 48, 168, 336, 720] + elif name == 'ts2vec_online_retail_II_data': + pred_lens = [1, 2, 3, 4, 5] else: pred_lens = [24, 48, 96, 288, 672] @@ -240,119 +254,89 @@ def _get_time_features(dt): dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(), # Week of the year ], axis=1).astype(np.float) -# -# def load_online_retail(name='Cleaned_Online_Retail', agg_freq='D'): + +def load_online_retail(name, repr_dims): # """ # Loads and preprocesses the Online Retail dataset for forecasting tasks. -# -# Parameters: -# name (str): Name of the dataset file (without extension). -# agg_freq (str): Aggregation frequency (e.g., 'D' for daily, 'W' for weekly, '2W' for bi-weekly, 'M' for monthly). +# Ensures both Price, Quantity, and customer embeddings are included throughout. """ # # Returns: -# data (np.ndarray): Preprocessed time series data. -# train_slice (slice): Slice object for training data. -# valid_slice (slice): Slice object for validation data. -# test_slice (slice): Slice object for testing data. -# scaler (StandardScaler): Fitted scaler object. -# pred_lens (list): List of prediction lengths. -# n_covariate_cols (int): Number of covariate columns. 
+# train_data: Dictionary mapping customer_id to their training data (DataFrame). +# valid_data: Dictionary mapping customer_id to their validation data (DataFrame). +# test_data: Dictionary mapping customer_id to their test data (DataFrame). +# customer_embeddings: Fixed embeddings for each customer ID. +# customer_id_to_index: Mapping from customer IDs to embedding indices. +# scaler: Fitted scaler for data normalization. # """ -# -# # Load data -# file_path = f'datasets/UEA/{name}.csv' -# df = pd.read_csv(file_path, parse_dates=['InvoiceDate']) -# -# # Convert 'InvoiceDate' to datetime if not already done -# if not pd.api.types.is_datetime64_any_dtype(df['InvoiceDate']): -# df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']) -# -# # Set 'InvoiceDate' as index -# df.set_index('InvoiceDate', inplace=True) -# -# # Handle different aggregation frequencies -# if agg_freq == 'D': -# freq = 'D' -# elif agg_freq == 'W': -# freq = 'W' -# elif agg_freq == '2W': -# freq = '2W' -# elif agg_freq == 'M': -# freq = 'M' -# else: -# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") -# -# # Aggregate quantity sold per specified frequency -# df_agg = df.resample(freq).sum()['Quantity'] -# -# # Handle missing values by filling with zeros -# df_agg = df_agg.fillna(0) -# -# # Generate time features -# time_features = _get_time_features(df_agg.index) -# n_covariate_cols = time_features.shape[1] -# -# # Convert to numpy array -# data = df_agg.values -# data = data.reshape(-1, 1) # Reshape to (timesteps, features) -# -# # Combine data with time features -# data = np.concatenate([time_features, data], axis=1) -# -# # Train/Validation/Test split -# total_length = len(data) -# train_size = int(total_length * 0.6) -# valid_size = int(total_length * 0.2) -# -# train_slice = slice(None, train_size) -# valid_slice = slice(train_size, train_size + valid_size) -# test_slice = slice(train_size + valid_size, None) -# -# # Scaling -# scaler = StandardScaler() -# data[train_slice] = scaler.fit_transform(data[train_slice]) -# data[valid_slice] = scaler.transform(data[valid_slice]) -# data[test_slice] = scaler.transform(data[test_slice]) -# -# # Reshape to (1, timesteps, features) as expected by TS2Vec -# data = data[np.newaxis, ...] -# -# # Define prediction lengths based on aggregation frequency -# if agg_freq == 'D': -# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week -# elif agg_freq == 'W': -# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) -# elif agg_freq == '2W': -# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) -# elif agg_freq == 'M': -# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) -# else: -# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") -# -# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols -# train_slice = slice(None, train_size) -# valid_slice = slice(train_size, train_size + valid_size) -# test_slice = slice(train_size + valid_size, None) -# -# # Scaling -# scaler = StandardScaler() -# data[train_slice] = scaler.fit_transform(data[train_slice]) -# data[valid_slice] = scaler.transform(data[valid_slice]) -# data[test_slice] = scaler.transform(data[test_slice]) -# -# # Reshape to (1, timesteps, features) as expected by TS2Vec -# data = data[np.newaxis, ...] 
-# -# # Define prediction lengths based on aggregation frequency -# if agg_freq == 'D': -# pred_lens = [1, 2, 3, 4, 5, 6, 7] # Predicting each day for the upcoming week -# elif agg_freq == 'W': -# pred_lens = [1, 2, 3, 4] # Predicting each week for the upcoming month (4 weeks) -# elif agg_freq == '2W': -# pred_lens = [1, 2, 3] # Predicting 2 weeks, 4 weeks, and 6 weeks ahead (in bi-weekly intervals) -# elif agg_freq == 'M': -# pred_lens = [1, 2, 3] # Predicting 1 month, 2 months, and 3 months ahead (in months) -# else: -# raise ValueError("Invalid agg_freq value. Use 'D', 'W', '2W', or 'M'.") -# -# return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols + + # Load data + data = pd.read_csv(f'datasets/{name}.csv', parse_dates=['InvoiceDate']) + + # Convert 'InvoiceDate' to datetime if not already done + if not pd.api.types.is_datetime64_any_dtype(data['InvoiceDate']): + data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate']) + + # Sort data by 'InvoiceDate' + data.sort_values(by='InvoiceDate', inplace=True) + + # Rename 'Customer ID' to 'CustomerID' + if 'Customer ID' in data.columns: + data.rename(columns={'Customer ID': 'CustomerID'}, inplace=True) + + if 'CustomerID' not in data.columns: + raise KeyError("The 'CustomerID' column is missing from the dataset.") + + # Extract customerIDs and create numerical mapping + customer_ids = data['CustomerID'].unique() + customer_id_to_index = {cid: idx for idx, cid in enumerate(customer_ids)} + + # Create fixed embeddings for each unique customer ID + customer_embeddings = torch.nn.Embedding(len(customer_ids), repr_dims) + customer_embeddings.weight.requires_grad = False # Fixed embeddings + + # Map customer IDs to their embeddings + customer_embeddings_tensor = customer_embeddings(torch.tensor(data['CustomerID'].map(customer_id_to_index).values)) + + # Store customer embeddings as a new column 'customer_embed' in the DataFrame + data['customer_embed'] = list(customer_embeddings_tensor.detach().numpy()) + + # Group by CustomerID + customer_data = data.groupby('CustomerID') + + # Split the data into train, valid, and test sets + train_data = {} + valid_data = {} + test_data = {} + + '''Split the data into train, valid, and test sets such that valid set includes second-last transaction + of each customerID and test set includes last transaction of each customerID''' + for customer_id, customer_df in customer_data: + if len(customer_df) >= 3: + train_data[customer_id] = customer_df.iloc[: -2] + valid_data[customer_id] = customer_df.iloc[-2 : -1] + test_data[customer_id] = customer_df.iloc[-1 :] + + return train_data, valid_data, test_data, customer_embeddings + + + + # # Filter customerIDs with at least 5 records + # customer_counts = data['CustomerID'].value_counts() + # valid_customers = customer_counts[customer_counts >= 5].index + # data = data[data['CustomerID'].isin(valid_customers)] + # + # # Group by CustomerID and sort by InvoiceDate + # grouped = data.groupby('CustomerID').apply(lambda x: x.sort_values('InvoiceDate')).reset_index(drop=True) + # + # # Create time series data for each CustomerID + # customer_data = {} + # for customer_id, group in grouped.groupby('CustomerID'): + # group.set_index('InvoiceDate', inplace=True) + # customer_data[customer_id] = group[['CustomerID', 'Quantity', 'Price']] + + # Set other forecasting parameters + # scaler = MinMaxScaler() # alternative to StandardScaler which avoid negative values + # pred_lens = [1, 2, 3] + # n_covariate_cols = 2 + # + # return customer_data diff 
--git a/models/encoder.py b/models/encoder.py index cedb13b0..c4aad5d5 100644 --- a/models/encoder.py +++ b/models/encoder.py @@ -41,7 +41,7 @@ def __init__(self, input_dims, output_dims, hidden_dims=64, depth=10, mask_mode= def forward(self, x, mask=None): # x: B x T x input_dims nan_mask = ~x.isnan().any(axis=-1) x[~nan_mask] = 0 - x = self.input_fc(x) # B x T x Ch + x = self.input_fc(x) # B x T x Ch NOTE: Ch = output_dims (I think...) # generate & apply mask if mask is None: diff --git a/requirements.txt b/requirements.txt index d5ee03d8..0173de5d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,6 @@ statsmodels==0.12.2 pandas==1.0.1 scikit_learn==0.24.2 xlrd==1.2.0 + +matplotlib==3.3.4 +scikit-learn~=0.24.2 \ No newline at end of file diff --git a/tasks/_eval_protocols.py b/tasks/_eval_protocols.py index e413bcf4..d0e9af8b 100644 --- a/tasks/_eval_protocols.py +++ b/tasks/_eval_protocols.py @@ -79,6 +79,18 @@ def fit_knn(features, y): return pipe def fit_ridge(train_features, train_y, valid_features, valid_y, MAX_SAMPLES=100000): + # Replace NaN and infinite values with 0 + train_features = np.nan_to_num(train_features, nan=0.0, posinf=0.0, neginf=0.0) + train_y = np.nan_to_num(train_y, nan=0.0, posinf=0.0, neginf=0.0) + valid_features = np.nan_to_num(valid_features, nan=0.0, posinf=0.0, neginf=0.0) + valid_y = np.nan_to_num(valid_y, nan=0.0, posinf=0.0, neginf=0.0) + + # Ensure values are within the range of float64 + train_features = np.clip(train_features, -1e308, 1e308) + train_y = np.clip(train_y, -1e308, 1e308) + valid_features = np.clip(valid_features, -1e308, 1e308) + valid_y = np.clip(valid_y, -1e308, 1e308) + # If the training set is too large, subsample MAX_SAMPLES examples if train_features.shape[0] > MAX_SAMPLES: split = train_test_split( diff --git a/tasks/forecasting.py b/tasks/forecasting.py index d2139cce..ac8faa78 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -17,7 +17,7 @@ def cal_metrics(pred, target): 'MAE': np.abs(pred - target).mean() } -def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): +def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, dataset_name = None): padding = 200 t = time.time() @@ -55,13 +55,28 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, test_pred = lr.predict(test_features) lr_infer_time[pred_len] = time.time() - t - ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] - test_pred = test_pred.reshape(ori_shape) - test_labels = test_labels.reshape(ori_shape) + if dataset_name =='ts2vec_online_retail_II_data': + test_pred = test_pred.reshape(-1, 2) + test_labels = test_labels.reshape(-1, 2) + else: + ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] + test_pred = test_pred.reshape(ori_shape) + test_labels = test_labels.reshape(ori_shape) if test_data.shape[0] > 1: test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3) test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3) + elif dataset_name == 'ts2vec_online_retail_II_data': + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) + test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + + # Remove NaN values from all datasets + valid_indices = ~np.isnan(test_pred).any(axis=1) & ~np.isnan(test_pred_inv).any(axis=1) & ~np.isnan(test_labels).any(axis=1) & 
~np.isnan(test_labels_inv).any(axis=1) + test_pred = test_pred[valid_indices] + test_pred_inv = test_pred_inv[valid_indices] + test_labels = test_labels[valid_indices] + test_labels_inv = test_labels_inv[valid_indices] + else: test_pred_inv = scaler.inverse_transform(test_pred) test_labels_inv = scaler.inverse_transform(test_labels) diff --git a/train.py b/train.py index 916f2a3e..ff43e247 100644 --- a/train.py +++ b/train.py @@ -1,10 +1,19 @@ +import config +import device +import pandas as pd import torch +from matplotlib import pyplot as plt +from scipy.ndimage import label +from torch import nn import numpy as np import argparse import os import sys import time import datetime + +from sklearn.preprocessing import MinMaxScaler + from ts2vec import TS2Vec import tasks import datautils @@ -45,6 +54,11 @@ def callback(model, loss): device = init_dl_program(args.gpu, seed=args.seed, max_threads=args.max_threads) + # Define variables outside the conditional block for online retail dataset + customer_embedding_layer = None + customer_id_to_index = None + + print('Loading data... ', end='') if args.loader == 'UCR': task_type = 'classification' @@ -58,6 +72,7 @@ def callback(model, loss): task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset) train_data = data[:, train_slice] + test_data = data[:, test_slice] elif args.loader == 'forecast_csv_univar': task_type = 'forecasting' @@ -84,10 +99,15 @@ def callback(model, loss): all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data, _, _, _ = datautils.load_UCR('FordA') + # elif args.loader == 'retail': + # task_type = 'forecasting' + # + # # Load the data + # train_data, valid_data, test_data, customer_embeddings = datautils.load_online_retail(args.dataset, repr_dims=args.repr_dims) + else: raise ValueError(f"Unknown loader {args.loader}.") - if args.irregular > 0: if task_type == 'classification': train_data = data_dropout(train_data, args.irregular) @@ -112,31 +132,96 @@ def callback(model, loss): t = time.time() - model = TS2Vec( - input_dims=train_data.shape[-1], - device=device, - **config - ) - loss_log = model.fit( - train_data, - n_epochs=args.epochs, - n_iters=args.iters, - verbose=True - ) + # Define the TS2Vec model + if args.loader == 'retail': + # Initialise the loss log fir tracking losses during training + loss_log = [] + # Initialise the model + model = TS2Vec( + input_dims=2, # We are using 'Quantity' and 'Price' as input dimensions + device=device, + **config, + use_customer_embs=True # Specify that we are using customer embeddings + ) + + # Training loop with customer ID fixed embeddings as input + for epoch in range(args.epochs): + '''For retail loader, `train_data`, `valid_data`, and `test_data` are dictionaries where each key is + a `customer_id` and the value is a DataFrame with shape `(n_transactions, n_features)`. 
The model will + train on 'data_array`, which is reshaped to `(n_instances, n_timestamps, n_dimensions)`.''' + epoch_loss = 0 + for customer_id, data in train_data.items(): + + + # Get the customer embedding + customer_idx = torch.tensor(customer_id_to_index[customer_id], device = device) + customer_embedding = customer_embedding_layer(customer_idx).unsqueeze(0).unsqueeze(0) + + # Convert data to the required format + data_array = data.to_numpy() + data_array = torch.tensor(data_array, dtype=torch.float32, device=device) # Ensure data_array is a torch.Tensor + + # Add the customer embedding to the data + data_array += customer_embedding + + # Convert back to NumPy array before training the model + data_array = data_array.cpu().numpy() + + # train the model + loss_log = model.fit(data_array, n_epochs=1, n_iters=args.iters, verbose=True) + else: + model = TS2Vec( + input_dims=train_data.shape[-1], + device=device, + **config + ) + # Training code + loss_log = model.fit( + train_data, + n_epochs=args.epochs, + n_iters=args.iters, + verbose=True + ) + model.save(f'{run_dir}/model.pkl') + # Plot the training loss log + plt.plot(loss_log, label='Training Loss') + plt.xlabel('Epoch') + plt.ylabel('Loss') + plt.title('Training Loss Over Epochs') + plt.grid(True) + plt.savefig(f'{run_dir}/loss_plot.png') + # Draw a straight line to show the general trend + z = np.polyfit(range(len(loss_log)), loss_log, 1) + p = np.poly1d(z) + plt.plot(range(len(loss_log)), p(range(len(loss_log))), "r--", label='Training Loss Trend') + plt.legend() + # Save the plot + plt.savefig(f'{run_dir}/train_loss_plot.png') + plt.show() + t = time.time() - t print(f"\nTraining time: {datetime.timedelta(seconds=t)}\n") if args.eval: - out = None # Initialise out as None to avoid the NameError in the case of unsupervised tasks + out = None # Initialise 'out' as None to avoid the NameError in the case of unsupervised tasks if task_type == 'classification': out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm') elif task_type == 'forecasting': # add case for unsupervised evaluation on Online Retail dataset if args.dataset == 'ts2vec_online_retail_II_data': - eval_res = tasks.eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols) + # # print data shapes and check for NaN or infinite values + # print("Data shape:", data.shape) + # print("Train slice:", train_slice) + # print("Valid slice:", valid_slice) + # print("Test slice:", test_slice) + # print("Scaler:", scaler) + # print("Prediction lengths:", pred_lens) + # print("Number of covariate columns:", n_covariate_cols) + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, args.dataset) + else: out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) elif task_type == 'anomaly_detection': @@ -153,5 +238,5 @@ def callback(model, loss): pkl_save(f'{run_dir}/eval_res.pkl', eval_res) print('Evaluation result:', eval_res) - print("Finished.") + print("Finished.") From b65984c944a7b57ac92f749c64e9e224b061fd24 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Sun, 22 Sep 2024 23:41:38 +0100 Subject: [PATCH 05/10] uploading preprocessing scripts for Online Retail II dataset --- .../stage1_online_retail_pre_processing.py | 80 +++++++++++++++++++ .../stage2_online_retail_pre_processing_.py | 67 ++++++++++++++++ 2 files changed, 147 
insertions(+) create mode 100644 datasets/stage1_online_retail_pre_processing.py create mode 100644 datasets/stage2_online_retail_pre_processing_.py diff --git a/datasets/stage1_online_retail_pre_processing.py b/datasets/stage1_online_retail_pre_processing.py new file mode 100644 index 00000000..eee5d36d --- /dev/null +++ b/datasets/stage1_online_retail_pre_processing.py @@ -0,0 +1,80 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import MinMaxScaler +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Step 1: Load and Combine Data from Excel Sheets +def load_and_combine_sheets(file_path, sheets): + """ + Load data from multiple Excel sheets and combine into a single DataFrame. + """ + combined_data = pd.DataFrame() + for sheet in sheets: + logging.info(f"Loading data from sheet: {sheet}") + sheet_data = pd.read_excel(file_path, sheet_name=sheet) + combined_data = pd.concat([combined_data, sheet_data], ignore_index=True) + logging.info("Data successfully loaded and combined.") + return combined_data + +# Step 2: Preprocess Data +def preprocess_data(data): + """ + Preprocess data by converting dates, normalizing numeric columns, and handling missing values. + """ + # Convert 'InvoiceDate' to datetime and ensure numerical consistency in key columns + logging.info("Preprocessing data: converting dates and normalizing numeric columns.") + data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce') + data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce') + data['Price'] = pd.to_numeric(data['Price'], errors='coerce') + data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce') + + # Remove rows where the Invoice starts with 'C' (canceled orders) + data = data[~data['Invoice'].astype(str).str.startswith('C')] + + # Drop rows with missing critical data + data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price']) + + # Normalize 'Quantity' and 'Price' using Min-Max scaling to insure values are positive + scaler = MinMaxScaler() + data[['Quantity', 'Price']] = scaler.fit_transform(data[['Quantity', 'Price']]) + logging.info("Data normalized and missing values handled.") + + return data + +# Step 3: Aggregate Data +def aggregate_data(data): + """ + Aggregate data by summing 'Quantity' and averaging 'Price' daily. 
+ """ + logging.info("Aggregating data by Date.") + # Group by Date, aggregating Quantity and Price + data_agg = data.groupby(pd.Grouper(key='InvoiceDate', freq='D')).agg({ + 'Quantity': 'sum', + 'Price': 'mean' + }).reset_index() + + logging.info("Data aggregation complete.") + return data_agg + +# Main Function to Run All Steps +def main(): + # File path and sheets to load + file_path = 'online_retail_II.xlsx' + sheets = ['Year 2009-2010', 'Year 2010-2011'] + + # Load and preprocess the data + combined_data = load_and_combine_sheets(file_path, sheets) + cleaned_data = preprocess_data(combined_data) + + # Aggregate the data + aggregated_data = aggregate_data(cleaned_data) + + # Save the final reshaped and adjusted data to CSV + aggregated_data.to_csv('ts2vec_online_retail_II_data.csv', index=False) + logging.info("Final data saved successfully.") + +if __name__ == "__main__": + main() diff --git a/datasets/stage2_online_retail_pre_processing_.py b/datasets/stage2_online_retail_pre_processing_.py new file mode 100644 index 00000000..f33962ef --- /dev/null +++ b/datasets/stage2_online_retail_pre_processing_.py @@ -0,0 +1,67 @@ +import pandas as pd +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Step 1: Load the original Online Retail II dataset +def load_data(file_path): + logging.info(f"Loading data from: {file_path}") + data = pd.read_excel(file_path) + logging.info(f"Data successfully loaded with {len(data)} records.") + return data + +# Step 2: Clean and preprocess the dataset +def preprocess_data(data): + logging.info("Preprocessing data: cleaning and handling missing values.") + # Convert 'InvoiceDate' to datetime and ensure numerical consistency + data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce') + data['Quantity'] = pd.to_numeric(data['Quantity'], errors='coerce') + data['Price'] = pd.to_numeric(data['Price'], errors='coerce') + data['Customer ID'] = pd.to_numeric(data['Customer ID'], errors='coerce') + + # Remove cancelled orders (invoices starting with 'C') + data = data[~data['Invoice'].str.startswith('C', na=False)] + + # Drop rows with missing values in key columns + data = data.dropna(subset=['InvoiceDate', 'Customer ID', 'Quantity', 'Price']) + + logging.info(f"Data cleaned. 
Remaining records: {len(data)}.") + return data + +# Step 3: Group by CustomerID and InvoiceNo +def group_by_customer_invoice(data): + logging.info("Grouping by Customer ID and Invoice Number.") + # Group by CustomerID and InvoiceNo to represent each invoice as a time series record + grouped = data.groupby(['Customer ID', 'Invoice']).agg({ + 'InvoiceDate': 'first', # First date of the invoice + 'Quantity': 'sum', # Sum of quantities in the invoice + 'Price': 'mean' # Average price in the invoice + }).reset_index() + + logging.info(f"Grouped data created with {len(grouped)} records.") + return grouped + +# Step 4: Save the restructured dataset +def save_data(grouped_data, output_file): + logging.info(f"Saving restructured data to {output_file}.") + grouped_data.to_csv(output_file, index=False) + logging.info("Data successfully saved.") + +# Main function to run the entire preprocessing pipeline +def main(): + file_path = 'online_retail_II.xlsx' + output_file = ('restructured_ts2vec_online_retail.csv') + + # Load and preprocess the data + data = load_data(file_path) + cleaned_data = preprocess_data(data) + + # Group data by CustomerID and InvoiceNo + grouped_data = group_by_customer_invoice(cleaned_data) + + # Save the restructured dataset + save_data(grouped_data, output_file) + +if __name__ == "__main__": + main() \ No newline at end of file From 32f571f82e10e7641bb09bf4c8bc3b982067628b Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 23 Sep 2024 01:05:44 +0100 Subject: [PATCH 06/10] updating gitignore and comments --- .gitignore | 2 +- datautils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 9d59144c..bb10ee7d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ # Ignore Jupyter notebook checkpoints .ipynb_checkpoints - +*.idea/ # Ignore Python cache files __pycache__ diff --git a/datautils.py b/datautils.py index 2c12e3ae..03f6a461 100644 --- a/datautils.py +++ b/datautils.py @@ -192,7 +192,7 @@ def load_forecast_csv(name, univar=False): # Normalise data scaler = None - if name == 'ts2vec_online_retail_II_data': + if name == 'ts2vec_online_retail_II_data' or name == 'restructured_ts2vec_online_retail': scaler = MinMaxScaler().fit(data[train_slice]) else: scaler = StandardScaler().fit(data[train_slice]) @@ -206,7 +206,7 @@ def load_forecast_csv(name, univar=False): data = np.expand_dims(data, 0) # Single instance case if n_covariate_cols > 0: - if name == 'ts2vec_online_retail_II_data': + if name == 'ts2vec_online_retail_II_data' or name == 'restructured_ts2vec_online_retail': dt_scaler = MinMaxScaler().fit(dt_embed[train_slice]) else: dt_scaler = StandardScaler().fit(dt_embed[train_slice]) @@ -309,7 +309,7 @@ def load_online_retail(name, repr_dims): test_data = {} '''Split the data into train, valid, and test sets such that valid set includes second-last transaction - of each customerID and test set includes last transaction of each customerID''' + of each customerID and test set includes last transaction of each customerID ''' for customer_id, customer_df in customer_data: if len(customer_df) >= 3: train_data[customer_id] = customer_df.iloc[: -2] From 4fae48367a4fbf3d377bf2c4615a19866f9dc451 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 23 Sep 2024 01:07:39 +0100 Subject: [PATCH 07/10] test commit to save auth details for github --- datautils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datautils.py b/datautils.py index 03f6a461..2aff2e48 100644 --- a/datautils.py +++ 
b/datautils.py @@ -309,7 +309,7 @@ def load_online_retail(name, repr_dims): test_data = {} '''Split the data into train, valid, and test sets such that valid set includes second-last transaction - of each customerID and test set includes last transaction of each customerID ''' + of each customerID and test set includes last transaction of each customerID''' for customer_id, customer_df in customer_data: if len(customer_df) >= 3: train_data[customer_id] = customer_df.iloc[: -2] From 244a1424c5c73412220443c6a9f3dbe78d1e2ca4 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Mon, 23 Sep 2024 01:56:26 +0100 Subject: [PATCH 08/10] Added changes so that ts2vec now works with restructured_ts2vec_online_retail as well as ts2vec_online_retail_II_data (using load_forecast_csv). However, ts2vec currently trains on restructured_ts2vec_online_retail with Customer ID as a forecasting target feature rather than a covariate (as with datetime embedding features). TS2Vec is able to learn representations better for ts2vec_online_retail_II_data, which only includes Quantity and Price as target features, which is evident as restructured_ts2vec_online_retail have higher MSE and MAE values whilst also starting with a higher training loss value. Perhaps embedding the customer ID instead of feeding it as a forecasting feature will result is better representation learning --- tasks/__init__.py | 2 +- tasks/forecasting.py | 29 ++++++++++++++++++----------- train.py | 2 +- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/tasks/__init__.py b/tasks/__init__.py index c1e1208e..3a09b083 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -1,3 +1,3 @@ from .classification import eval_classification -from .forecasting import eval_forecasting, eval_forecasting_unsupervised +from .forecasting import eval_forecasting, eval_forecasting_customer_embed from .anomaly_detection import eval_anomaly_detection, eval_anomaly_detection_coldstart diff --git a/tasks/forecasting.py b/tasks/forecasting.py index ac8faa78..d6fe8bb5 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -55,9 +55,12 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, test_pred = lr.predict(test_features) lr_infer_time[pred_len] = time.time() - t - if dataset_name =='ts2vec_online_retail_II_data': + if dataset_name == 'ts2vec_online_retail_II_data': test_pred = test_pred.reshape(-1, 2) test_labels = test_labels.reshape(-1, 2) + elif dataset_name == 'restructured_ts2vec_online_retail': + test_pred = test_pred.reshape(-1, 4) + test_labels = test_labels.reshape(-1, 4) else: ori_shape = test_data.shape[0], -1, pred_len, test_data.shape[2] test_pred = test_pred.reshape(ori_shape) @@ -66,9 +69,13 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, if test_data.shape[0] > 1: test_pred_inv = scaler.inverse_transform(test_pred.swapaxes(0, 3)).swapaxes(0, 3) test_labels_inv = scaler.inverse_transform(test_labels.swapaxes(0, 3)).swapaxes(0, 3) - elif dataset_name == 'ts2vec_online_retail_II_data': - test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) - test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + elif dataset_name in ['ts2vec_online_retail_II_data','restructured_ts2vec_online_retail']: + if dataset_name == 'ts2vec_online_retail_II_data': + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) + test_labels_inv = 
scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + else: # restructured_ts2vec_online_retail + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 4)).reshape(test_pred.shape) + test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 4)).reshape(test_labels.shape) # Remove NaN values from all datasets valid_indices = ~np.isnan(test_pred).any(axis=1) & ~np.isnan(test_pred_inv).any(axis=1) & ~np.isnan(test_labels).any(axis=1) & ~np.isnan(test_labels_inv).any(axis=1) @@ -92,15 +99,15 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, 'raw': cal_metrics(test_pred_inv, test_labels_inv) } - eval_res = { - 'ours': ours_result, - 'ts2vec_infer_time': ts2vec_infer_time, - 'lr_train_time': lr_train_time, - 'lr_infer_time': lr_infer_time - } + eval_res = { + 'ours': ours_result, + 'ts2vec_infer_time': ts2vec_infer_time, + 'lr_train_time': lr_train_time, + 'lr_infer_time': lr_infer_time + } return out_log, eval_res -def eval_forecasting_unsupervised(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): +def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): padding = 200 t = time.time() diff --git a/train.py b/train.py index ff43e247..b2c14ca1 100644 --- a/train.py +++ b/train.py @@ -211,7 +211,7 @@ def callback(model, loss): out, eval_res = tasks.eval_classification(model, train_data, train_labels, test_data, test_labels, eval_protocol='svm') elif task_type == 'forecasting': # add case for unsupervised evaluation on Online Retail dataset - if args.dataset == 'ts2vec_online_retail_II_data': + if args.dataset in 'ts2vec_online_retail_II_data' or 'restructured_ts2vec_online_retail': # # print data shapes and check for NaN or infinite values # print("Data shape:", data.shape) # print("Train slice:", train_slice) From d3af0340e051dd47e22a343519bf28445a90c227 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Tue, 24 Sep 2024 15:29:36 +0100 Subject: [PATCH 09/10] added new loader and evaluation function for online retail data to be trained with customer embeddings --- datautils.py | 83 +++++++++++++++++++++++++++----------- tasks/forecasting.py | 57 +++++++++++++++++++++----- train.py | 95 +++++++++++++++----------------------------- 3 files changed, 138 insertions(+), 97 deletions(-) diff --git a/datautils.py b/datautils.py index 2aff2e48..a31ec3d5 100644 --- a/datautils.py +++ b/datautils.py @@ -1,3 +1,4 @@ +import logging import os import numpy as np import pandas as pd @@ -254,8 +255,9 @@ def _get_time_features(dt): dt.to_series().apply(lambda x: x.strftime("%V")).astype(int).to_numpy(), # Week of the year ], axis=1).astype(np.float) - -def load_online_retail(name, repr_dims): +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +def load_online_retail(name): # """ # Loads and preprocesses the Online Retail dataset for forecasting tasks. # Ensures both Price, Quantity, and customer embeddings are included throughout. 
""" @@ -270,14 +272,21 @@ def load_online_retail(name, repr_dims): # """ # Load data - data = pd.read_csv(f'datasets/{name}.csv', parse_dates=['InvoiceDate']) + data = pd.read_csv(f'datasets/{name}.csv', index_col='InvoiceDate', parse_dates=True) # Convert 'InvoiceDate' to datetime if not already done - if not pd.api.types.is_datetime64_any_dtype(data['InvoiceDate']): - data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate']) + if not pd.api.types.is_datetime64_any_dtype(data.index): + data.index = pd.to_datetime(data.index) - # Sort data by 'InvoiceDate' - data.sort_values(by='InvoiceDate', inplace=True) + # Remove rows with any None or NaN values + data.dropna(inplace=True) + + # Remove rows with negative 'Quantity' or 'Price' values + data = data[(data['Quantity'] > 0) & (data['Price'] > 0)] + logging.info(f"Data shape after removing negative values: {data.shape}") + + # Remove Invoice column + data.drop(columns=['Invoice'], inplace=True) # Rename 'Customer ID' to 'CustomerID' if 'Customer ID' in data.columns: @@ -285,38 +294,64 @@ def load_online_retail(name, repr_dims): if 'CustomerID' not in data.columns: raise KeyError("The 'CustomerID' column is missing from the dataset.") + logging.info(f"Data columns and shape: {data.columns, data.shape}") # Extract customerIDs and create numerical mapping customer_ids = data['CustomerID'].unique() + logging.info(f"Unique customer ID count: {customer_ids.shape[0]}") customer_id_to_index = {cid: idx for idx, cid in enumerate(customer_ids)} + logging.info(f"Unique customer ID to index count: {len(customer_id_to_index)}") # Create fixed embeddings for each unique customer ID - customer_embeddings = torch.nn.Embedding(len(customer_ids), repr_dims) + customer_embeddings = torch.nn.Embedding(len(customer_ids), 4) customer_embeddings.weight.requires_grad = False # Fixed embeddings # Map customer IDs to their embeddings customer_embeddings_tensor = customer_embeddings(torch.tensor(data['CustomerID'].map(customer_id_to_index).values)) + logging.info(f"Customer embeddings shape: {customer_embeddings_tensor.shape}") + logging.info(f"Customer embeddings: {customer_embeddings_tensor}") + + # Replace 'CustomerID' column with the corresponding customer embedding + customer_embeddings_expanded = customer_embeddings_tensor.detach().numpy() + logging.info(f"Customer embeddings expanded shape: {customer_embeddings_expanded.shape}") - # Store customer embeddings as a new column 'customer_embed' in the DataFrame - data['customer_embed'] = list(customer_embeddings_tensor.detach().numpy()) + # Drop the original 'CustomerID' column + data = data.drop(columns=['CustomerID']) - # Group by CustomerID - customer_data = data.groupby('CustomerID') + # Extract time features for the date/index column + dt_embed = _get_time_features(data.index) + n_covariate_cols = dt_embed.shape[-1] + customer_embeddings_expanded.shape[-1] + logging.info(f"Number of covariate columns: {n_covariate_cols}") + logging.info(f"Time features shape: {dt_embed.shape}") - # Split the data into train, valid, and test sets - train_data = {} - valid_data = {} - test_data = {} + # Convert the DataFrame to a NumPy array + data = data.to_numpy() - '''Split the data into train, valid, and test sets such that valid set includes second-last transaction - of each customerID and test set includes last transaction of each customerID''' - for customer_id, customer_df in customer_data: - if len(customer_df) >= 3: - train_data[customer_id] = customer_df.iloc[: -2] - valid_data[customer_id] = customer_df.iloc[-2 : -1] 
- test_data[customer_id] = customer_df.iloc[-1 :] + train_slice = slice(0, int(0.6 * len(data))) + valid_slice = slice(int(0.6 * len(data)), int(0.8 * len(data))) + test_slice = slice(int(0.8 * len(data)), len(data)) - return train_data, valid_data, test_data, customer_embeddings + # Normalise data + scaler = MinMaxScaler().fit(data[train_slice]) + data = scaler.transform(data) + + # Reshape data based on dataset structure + data = np.expand_dims(data, 0) # Single instance case + + if n_covariate_cols > 0: + dt_scaler = MinMaxScaler().fit(dt_embed[train_slice]) + dt_embed = np.expand_dims(dt_scaler.transform(dt_embed), 0) + # Concatenating the time embeddings to the data + data = np.concatenate([np.repeat(dt_embed, data.shape[0], axis=0), data], axis=-1) + + cid_scaler = MinMaxScaler().fit(customer_embeddings_expanded[train_slice]) + cid_embed = np.expand_dims(cid_scaler.transform(customer_embeddings_expanded), 0) + # Concatenating the customer ID embeddings to the data + data = np.concatenate([np.repeat(cid_embed, data.shape[0], axis=0), data], axis=-1) + + pred_lens = [1, 2, 3, 4, 5] + + return data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols diff --git a/tasks/forecasting.py b/tasks/forecasting.py index d6fe8bb5..c5b0276c 100644 --- a/tasks/forecasting.py +++ b/tasks/forecasting.py @@ -107,7 +107,7 @@ def eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, } return out_log, eval_res -def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, n_covariate_cols): +def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols): padding = 200 t = time.time() @@ -125,14 +125,53 @@ def eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_ valid_repr = all_repr[:, valid_slice] test_repr = all_repr[:, test_slice] - # You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization) + # Extract data for each time slice + train_data = data[:, train_slice, n_covariate_cols:] + valid_data = data[:, valid_slice, n_covariate_cols:] + test_data = data[:, test_slice, n_covariate_cols:] - eval_res = { - 'ts2vec_infer_time': ts2vec_infer_time, - 'train_repr_shape': train_repr.shape, - 'valid_repr_shape': valid_repr.shape, - 'test_repr_shape': test_repr.shape - } + ours_result = {} + lr_train_time = {} + lr_infer_time = {} + out_log = {} + for pred_len in pred_lens: + train_features, train_labels = generate_pred_samples(train_repr, train_data, pred_len, drop=padding) + valid_features, valid_labels = generate_pred_samples(valid_repr, valid_data, pred_len) + test_features, test_labels = generate_pred_samples(test_repr, test_data, pred_len) + + t = time.time() + lr = eval_protocols.fit_ridge(train_features, train_labels, valid_features, valid_labels) + lr_train_time[pred_len] = time.time() - t + + t = time.time() + test_pred = lr.predict(test_features) + lr_infer_time[pred_len] = time.time() - t + + test_pred = test_pred.reshape(-1, 2) + test_labels = test_labels.reshape(-1, 2) - return eval_res + test_pred_inv = scaler.inverse_transform(test_pred.reshape(-1, 2)).reshape(test_pred.shape) + test_labels_inv = scaler.inverse_transform(test_labels.reshape(-1, 2)).reshape(test_labels.shape) + + # NOTE: You can now use train_repr, valid_repr, and test_repr for unsupervised tasks (e.g., clustering, visualization) + + out_log[pred_len] = { + 'norm': test_pred, + 'raw': test_pred_inv, + 'norm_gt': 
test_labels, + 'raw_gt': test_labels_inv + } + ours_result[pred_len] = { + 'norm': cal_metrics(test_pred, test_labels), + 'raw': cal_metrics(test_pred_inv, test_labels_inv) + } + + eval_res = { + 'ours': ours_result, + 'ts2vec_infer_time': ts2vec_infer_time, + 'lr_train_time': lr_train_time, + 'lr_infer_time': lr_infer_time + } + + return out_log, eval_res diff --git a/train.py b/train.py index b2c14ca1..fb8346c2 100644 --- a/train.py +++ b/train.py @@ -72,7 +72,6 @@ def callback(model, loss): task_type = 'forecasting' data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_forecast_csv(args.dataset) train_data = data[:, train_slice] - test_data = data[:, test_slice] elif args.loader == 'forecast_csv_univar': task_type = 'forecasting' @@ -99,11 +98,11 @@ def callback(model, loss): all_train_data, all_train_labels, all_train_timestamps, all_test_data, all_test_labels, all_test_timestamps, delay = datautils.load_anomaly(args.dataset) train_data, _, _, _ = datautils.load_UCR('FordA') - # elif args.loader == 'retail': - # task_type = 'forecasting' - # - # # Load the data - # train_data, valid_data, test_data, customer_embeddings = datautils.load_online_retail(args.dataset, repr_dims=args.repr_dims) + elif args.loader == 'retail': + task_type = 'forecasting' + # Load the data + data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols = datautils.load_online_retail(args.dataset) + train_data = data[:, train_slice] else: raise ValueError(f"Unknown loader {args.loader}.") @@ -133,55 +132,20 @@ def callback(model, loss): t = time.time() # Define the TS2Vec model - if args.loader == 'retail': - # Initialise the loss log fir tracking losses during training - loss_log = [] - # Initialise the model - model = TS2Vec( - input_dims=2, # We are using 'Quantity' and 'Price' as input dimensions - device=device, - **config, - use_customer_embs=True # Specify that we are using customer embeddings - ) - - # Training loop with customer ID fixed embeddings as input - for epoch in range(args.epochs): - '''For retail loader, `train_data`, `valid_data`, and `test_data` are dictionaries where each key is - a `customer_id` and the value is a DataFrame with shape `(n_transactions, n_features)`. 
The model will - train on 'data_array`, which is reshaped to `(n_instances, n_timestamps, n_dimensions)`.''' - epoch_loss = 0 - for customer_id, data in train_data.items(): - - - # Get the customer embedding - customer_idx = torch.tensor(customer_id_to_index[customer_id], device = device) - customer_embedding = customer_embedding_layer(customer_idx).unsqueeze(0).unsqueeze(0) - - # Convert data to the required format - data_array = data.to_numpy() - data_array = torch.tensor(data_array, dtype=torch.float32, device=device) # Ensure data_array is a torch.Tensor - - # Add the customer embedding to the data - data_array += customer_embedding - - # Convert back to NumPy array before training the model - data_array = data_array.cpu().numpy() - - # train the model - loss_log = model.fit(data_array, n_epochs=1, n_iters=args.iters, verbose=True) - else: - model = TS2Vec( - input_dims=train_data.shape[-1], - device=device, - **config - ) - # Training code - loss_log = model.fit( - train_data, - n_epochs=args.epochs, - n_iters=args.iters, - verbose=True - ) + + + model = TS2Vec( + input_dims=train_data.shape[-1], + device=device, + **config + ) + # Training code + loss_log = model.fit( + train_data, + n_epochs=args.epochs, + n_iters=args.iters, + verbose=True + ) model.save(f'{run_dir}/model.pkl') @@ -212,15 +176,18 @@ def callback(model, loss): elif task_type == 'forecasting': # add case for unsupervised evaluation on Online Retail dataset if args.dataset in 'ts2vec_online_retail_II_data' or 'restructured_ts2vec_online_retail': - # # print data shapes and check for NaN or infinite values - # print("Data shape:", data.shape) - # print("Train slice:", train_slice) - # print("Valid slice:", valid_slice) - # print("Test slice:", test_slice) - # print("Scaler:", scaler) - # print("Prediction lengths:", pred_lens) - # print("Number of covariate columns:", n_covariate_cols) - out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, args.dataset) + # print data shapes and check for NaN or infinite values + print("Data shape:", data.shape) + print("Train slice:", train_slice) + print("Valid slice:", valid_slice) + print("Test slice:", test_slice) + print("Scaler:", scaler) + print("Prediction lengths:", pred_lens) + print("Number of covariate columns:", n_covariate_cols) + if args.loader == 'retail': + out, eval_res = tasks.eval_forecasting_customer_embed(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) + else: + out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols, args.dataset) else: out, eval_res = tasks.eval_forecasting(model, data, train_slice, valid_slice, test_slice, scaler, pred_lens, n_covariate_cols) From c893c630e5fcbb8d70e2dc928c2b14b74fe74780 Mon Sep 17 00:00:00 2001 From: "jumaira.miller" Date: Tue, 24 Sep 2024 21:00:12 +0100 Subject: [PATCH 10/10] test commit --- ts2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ts2vec.py b/ts2vec.py index de909bd5..76dc227d 100644 --- a/ts2vec.py +++ b/ts2vec.py @@ -5,7 +5,6 @@ from models import TSEncoder from models.losses import hierarchical_contrastive_loss from utils import take_per_row, split_with_nan, centerize_vary_length_series, torch_pad_nan -import math class TS2Vec: '''The TS2Vec model'''
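
For reference, a minimal, self-contained sketch of the NaN/inf sanitisation that the fit_ridge change to tasks/_eval_protocols.py relies on. The toy arrays are illustrative only; the actual function additionally clips values to the float64 range and subsamples large training sets before fitting the ridge regressions.

import numpy as np
from sklearn.linear_model import Ridge

# Toy feature matrix and targets containing non-finite entries (illustrative only).
X = np.array([[1.0, np.nan],
              [2.0, np.inf],
              [3.0, 0.5],
              [4.0, -0.5]])
y = np.array([1.0, 2.0, 3.0, np.nan])

# Same sanitisation step as the patched fit_ridge: scikit-learn refuses
# non-finite inputs, so NaN/inf are replaced with 0 before fitting.
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
y = np.nan_to_num(y, nan=0.0, posinf=0.0, neginf=0.0)

ridge = Ridge(alpha=0.1).fit(X, y)
print(ridge.predict(X))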
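
Likewise, a hedged sketch of the covariate construction introduced for the retail loader (load_online_retail) in PATCH 09/10: a frozen torch.nn.Embedding maps each Customer ID to a fixed 4-dimensional vector, which is min-max scaled on the training slice and concatenated in front of the Quantity/Price targets. The toy DataFrame below is an assumption made for illustration, and the calendar features produced by _get_time_features are omitted for brevity; the real loader reads datasets/<name>.csv and prepends both sets of covariates.

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler

# Toy transaction history (illustrative only).
dates = pd.date_range('2010-01-01', periods=8, freq='D')
df = pd.DataFrame({
    'CustomerID': [101, 101, 102, 103, 102, 101, 103, 102],
    'Quantity':   [3,   1,   7,   2,   4,   5,   6,   2],
    'Price':      [2.5, 3.0, 1.2, 4.4, 2.2, 3.3, 1.9, 2.7],
}, index=dates)

# Fixed (non-trainable) embeddings, one row per unique customer.
customer_ids = df['CustomerID'].unique()
cid_to_idx = {cid: i for i, cid in enumerate(customer_ids)}
emb = torch.nn.Embedding(len(customer_ids), 4)
emb.weight.requires_grad = False
cid_embed = emb(torch.tensor(df['CustomerID'].map(cid_to_idx).values)).detach().numpy()

targets = df[['Quantity', 'Price']].to_numpy()
train_slice = slice(0, int(0.6 * len(targets)))

# Scale targets and covariates on the training slice only, then stack the
# customer-ID covariates in front of the targets: (1, T, n_covariates + 2).
scaler = MinMaxScaler().fit(targets[train_slice])
data = np.expand_dims(scaler.transform(targets), 0)
cid_scaler = MinMaxScaler().fit(cid_embed[train_slice])
cid_cov = np.expand_dims(cid_scaler.transform(cid_embed), 0)
data = np.concatenate([cid_cov, data], axis=-1)
print(data.shape)  # (1, 8, 6)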
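
One behavioural note on the evaluation dispatch in train.py: the condition `args.dataset in 'ts2vec_online_retail_II_data' or 'restructured_ts2vec_online_retail'` (added in PATCH 08/10 and still present after PATCH 09/10) is always truthy, because the right-hand operand of `or` is a non-empty string. The snippet below demonstrates the effect and sketches a tuple-based membership test as one possible tightening; the dataset name used is hypothetical.

# `x in 'a' or 'b'` parses as `(x in 'a') or 'b'`, and a non-empty string is truthy.
dataset = 'ETTh1'  # hypothetical value for illustration
broken = dataset in 'ts2vec_online_retail_II_data' or 'restructured_ts2vec_online_retail'
print(bool(broken))  # True, even though ETTh1 is not a retail dataset

# A membership test over a tuple restricts the branch to the two retail datasets.
retail_datasets = ('ts2vec_online_retail_II_data', 'restructured_ts2vec_online_retail')
print(dataset in retail_datasets)  # False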