OpenSynth-energy · charlotte-avery · Dec 16, 2025 · Dec 17, 2025 · Dec 17, 2025 · Dec 17, 2025
diff --git a/app/app.py b/app/app.py
@@ -32,6 +32,19 @@ def download_lcl_data(
 
 @app.command()
 def preprocess_data(
+    split: Annotated[
+        bool,
+        typer.Option(
+            "--split", help="Splits LCL households into training/holdout set"
+        ),
+    ] = False,
+    preprocess: Annotated[
+        bool,
+        typer.Option(
+            "--preprocess",
+            help="Preprocesses LCL data into daily load profiles",
+        ),
+    ] = False,
     data_dir: Annotated[
         str, typer.Option("--loc", help="Location of data directory.")
     ] = "./data",
@@ -145,6 +158,8 @@ def preprocess_data(
     """
 
     get_data.split_preprocess_data(
+        split,
+        preprocess,
         data_dir,
         csv_data_path,
         sample_fraction,

diff --git a/src/opensynth/datasets/low_carbon_london/get_data.py b/src/opensynth/datasets/low_carbon_london/get_data.py
@@ -34,6 +34,8 @@ def download_lcl_data(data_dir: str = "./data"):
 
 
 def split_preprocess_data(
+    split: bool,
+    preprocess: bool,
     data_dir: str,
     csv_data_path: str,
     sample_fraction: float,
@@ -93,38 +95,46 @@ def split_preprocess_data(
         f"Reading data from {CSV_FILE_NAME}. Storing data in {data_dir}."
     )
 
-    # Split dataset into training/ holdout sets
-    split_households.split_data(
-        data_dir,
-        CSV_FILE_NAME,
-        sample_fraction=sample_fraction,
-        id_col=id_col,
-        kwh_col=kwh_col,
-        datetime_col=datetime_col,
-        utc=utc,
-        datetime_format=datetime_format,
-        historical_start=historical_start,
-        historical_end=historical_end,
-        future_start=future_start,
-        future_end=future_end,
-    )
-    # Preprocess the data into daily load profiles
-    preprocess_lcl.preprocess_data(
-        data_dir,
-        datetime_col=datetime_col,
-        kwh_col=kwh_col,
-        id_col=id_col,
-        utc=utc,
-        datetime_format=datetime_format,
-        time_resolution=time_resolution,
-        feature_cols=feature_cols,
-        drop_nulls=drop_nulls,
-    )
+    if split:
+        # Split dataset into training/holdout sets
+        split_households.split_data(
+            data_dir,
+            CSV_FILE_NAME,
+            sample_fraction=sample_fraction,
+            id_col=id_col,
+            kwh_col=kwh_col,
+            datetime_col=datetime_col,
+            utc=utc,
+            datetime_format=datetime_format,
+            historical_start=historical_start,
+            historical_end=historical_end,
+            future_start=future_start,
+            future_end=future_end,
+        )
+    if preprocess:
+        # Preprocess the data into daily load profiles
+        preprocess_lcl.preprocess_data(
+            data_dir,
+            datetime_col=datetime_col,
+            kwh_col=kwh_col,
+            id_col=id_col,
+            utc=utc,
+            datetime_format=datetime_format,
+            time_resolution=time_resolution,
+            feature_cols=feature_cols,
+            drop_nulls=drop_nulls,
+        )
 
 
 if __name__ == "__main__":
+    # Whether to split and/or preprocess the data
+    split = True
+    preprocess = True
+
+    # Data directory
     data_dir = "./data"
 
+    # Fraction of households to include in training set
     sample_fraction = 0.75
 
     # Dataset location
@@ -147,6 +157,8 @@ def split_preprocess_data(
     drop_nulls = True
 
     split_preprocess_data(
+        split,
+        preprocess,
         data_dir,
         csv_data_path,
         sample_fraction,

diff --git a/src/opensynth/datasets/low_carbon_london/load.py b/src/opensynth/datasets/low_carbon_london/load.py
@@ -15,7 +15,8 @@ def load_lcl_data_by_year(
 ) -> pd.DataFrame | pl.DataFrame:
     """Load LCL data for a specific year.
 
-    Returns a DataFrame in wide format. The first column contains the timestamp.
+    Returns a DataFrame in wide format. The first column contains the
+    timestamp.
 
     Args:
         fname (str or Path): Location of the `train.csv` data file.
@@ -25,7 +26,8 @@ def load_lcl_data_by_year(
         pl.DataFrame with KWH/hh measurements.
     """
     fname = (
-        Path(__file__).parents[0] / "../../../../data/raw/historical/train.csv"
+        # parents[4] is the repository root: same default file as before.
+        Path(__file__).parents[4] / "data/raw/historical/train.csv"
         if fname is None
         else Path(fname)
     )

diff --git a/src/opensynth/utils/polars.py b/src/opensynth/utils/polars.py
@@ -17,9 +17,9 @@ def infer_date_column(df: pl.DataFrame) -> str:
 
     Returns the column name of a column in Date format, or a String column that
     matches a Date string. If the DataFrame contains only one matching column,
-    this function will return that column name. If multiple columns match, it will
-    return the column name that matches a canonical Date name, such as "DATUM".
-    In all other cases the function will raise a ValueError().
+    this function will return that column name. If multiple columns match, it
+    will return the column name that matches a canonical Date name, such as
+    "DATUM". In all other cases the function will raise a ValueError().
 
     Args:
         df (pl.DataFrame): DataFrame.
@@ -28,8 +28,8 @@ def infer_date_column(df: pl.DataFrame) -> str:
         str: column name of a column in Date or Date-like format.
 
     Raises:
-        ValueError: if no columns are in a Date-like format or multiple columns are
-        in Date-like format and match a canonical name.
+        ValueError: if no columns are in a Date-like format or multiple columns
+            are in Date-like format and match a canonical name.
 
     """
     date_columns = df.select(pl.col(pl.Date)).columns
@@ -49,7 +49,8 @@ def infer_date_column(df: pl.DataFrame) -> str:
             return list(canonical_columns)[0]
         case _:
             raise ValueError(
-                "Multiple Date-like columns found with a matching canonical name!"
+                "Multiple Date-like columns found with a matching canonical "
+                "name!"
             )
 
 
@@ -68,20 +69,21 @@ def semiwide_to_long(
     default in "%HH%mm" format.
 
     Args:
-        df (polars.DataFrame): DataFrame in semi-wide wide format, containing DateTime-
-            compatible column names.
-        on (list, optional): Columns to use as timepoints. By default, all columns that
-            match the pattern '[0-9][0-9][0-9][0-9]' will be used.
-        date_col (str, optional): Column that contains the Date values. By default,
-            a column that is in Date format, or that is a Date-compatible string, will
-            be used, if there is only one column in that format. If there are multiple
-            Date-compatible, columns, but only one matches a canonical name such as
-            DATUM, that column will be used. Otherwise, this method will fail, and the
-            date_col needs to be explicitly specified.
+        df (polars.DataFrame): DataFrame in semi-wide format, containing
+            DateTime-compatible column names.
+        on (list, optional): Columns to use as timepoints. By default, all
+            columns that match the pattern '[0-9][0-9][0-9][0-9]' will be used.
+        date_col (str, optional): Column that contains the Date values. By
+            default, a column that is in Date format, or that is a
+            Date-compatible string, will be used, if there is only one column
+            in that format. If there are multiple Date-compatible columns, but
+            only one matches a canonical name such as DATUM, that column will
+            be used. Otherwise, this method will fail, and the date_col needs
+            to be explicitly specified.
         datetime_name (str, optional): Name for the DateTime column in the long
             DataFrame, "DATUM_TIJD" by default.
-        value_name (str, optional): Name to give to the value column. Defaults to
-            "value".
+        value_name (str, optional): Name to give to the value column. Defaults
+            to "value".
 
     Returns:
         polars.DataFrame in long format.
@@ -137,12 +139,13 @@ def semiwide_to_wide(
             DateTime-compatible column names.
         on (list, optional): Columns to use as timepoints. By default, all
             columns that match the pattern '[0-9][0-9][0-9][0-9]' will be used.
-        date_col (str, optional): Column that contains the Date values. By default,
-            a column that is in Date format, or that is a Date-compatible string,
-            will be used, if there is only one column in that format. If there are
-            multiple Date-compatible, columns, but only one matches a canonical name
-            such as DATUM, that column will be used. Otherwise, this method will fail,
-            and the date_col needs to be explicitly specified.
+        date_col (str, optional): Column that contains the Date values. By
+            default, a column that is in Date format, or that is a
+            Date-compatible string, will be used, if there is only one column
+            in that format. If there are multiple Date-compatible columns, but
+            only one matches a canonical name such as DATUM, that column will
+            be used. Otherwise, this method will fail, and the date_col needs
+            to be explicitly specified.
         datetime_name (str, optional): Name for the DateTime column in the long
             DataFrame, "datetime" by default.
 
@@ -178,7 +181,8 @@ def randomize_index_column(
     Args:
         df (DataFrame): Input DataFrame.
         index_col_name (str): Name of index column.
-        sample_col_name (str): Name of new column containing the randomized index.
+        sample_col_name (str): Name of new column containing the randomized
+            index.
 
     Returns:
         DataFrame with index column values randomized.

diff --git a/tests/evaluation/fidelity/test_autocorrelation.py b/tests/evaluation/fidelity/test_autocorrelation.py
@@ -47,7 +47,10 @@ def test_dataframe_half_hour_pandas(test_dataframe_half_hour):
 
 @pytest.fixture(scope="module")
 def test_dataframe_quarterly():
-    """ "DataFrame with 15-minute timesteps and high correlation with a week time-lag."""
+    """
+    DataFrame with 15-minute timesteps and high correlation with a week
+    time-lag.
+    """
     n_minutes = 15
     n_values = 60 // n_minutes * 24 * 7  # 1 week
     n_timesteps = 35041