diff --git a/app/app.py b/app/app.py index e225e9e..6d5ff72 100644 --- a/app/app.py +++ b/app/app.py @@ -32,6 +32,19 @@ def download_lcl_data( @app.command() def preprocess_data( + split: Annotated[ + bool, + typer.Option( + "--split", help="Splits LCL households into training/holdout set" + ), + ] = False, + preprocess: Annotated[ + bool, + typer.Option( + "--preprocess", + help="Preprocesses LCL data into daily load profiles", + ), + ] = False, data_dir: Annotated[ str, typer.Option("--loc", help="Location of data directory.") ] = "./data", @@ -145,6 +158,8 @@ def preprocess_data( """ get_data.split_preprocess_data( + split, + preprocess, data_dir, csv_data_path, sample_fraction, diff --git a/src/opensynth/datasets/low_carbon_london/get_data.py b/src/opensynth/datasets/low_carbon_london/get_data.py index 3bc0f5b..192899b 100644 --- a/src/opensynth/datasets/low_carbon_london/get_data.py +++ b/src/opensynth/datasets/low_carbon_london/get_data.py @@ -34,6 +34,8 @@ def download_lcl_data(data_dir: str = "./data"): def split_preprocess_data( + split: bool, + preprocess: bool, data_dir: str, csv_data_path: str, sample_fraction: float, @@ -93,38 +95,46 @@ def split_preprocess_data( f"Reading data from {CSV_FILE_NAME}. Storing data in {data_dir}." 
) - # Split dataset into training/ holdout sets - split_households.split_data( - data_dir, - CSV_FILE_NAME, - sample_fraction=sample_fraction, - id_col=id_col, - kwh_col=kwh_col, - datetime_col=datetime_col, - utc=utc, - datetime_format=datetime_format, - historical_start=historical_start, - historical_end=historical_end, - future_start=future_start, - future_end=future_end, - ) - # Preprocess the data into daily load profiles - preprocess_lcl.preprocess_data( - data_dir, - datetime_col=datetime_col, - kwh_col=kwh_col, - id_col=id_col, - utc=utc, - datetime_format=datetime_format, - time_resolution=time_resolution, - feature_cols=feature_cols, - drop_nulls=drop_nulls, - ) + if split: + # Split dataset into training/ holdout sets + split_households.split_data( + data_dir, + CSV_FILE_NAME, + sample_fraction=sample_fraction, + id_col=id_col, + kwh_col=kwh_col, + datetime_col=datetime_col, + utc=utc, + datetime_format=datetime_format, + historical_start=historical_start, + historical_end=historical_end, + future_start=future_start, + future_end=future_end, + ) + if preprocess: + # Preprocess the data into daily load profiles + preprocess_lcl.preprocess_data( + data_dir, + datetime_col=datetime_col, + kwh_col=kwh_col, + id_col=id_col, + utc=utc, + datetime_format=datetime_format, + time_resolution=time_resolution, + feature_cols=feature_cols, + drop_nulls=drop_nulls, + ) if __name__ == "__main__": + # Whether to split and/or preprocess the data + split = True + preprocess = True + + # Data directory data_dir = "./data" + # Fraction of households to include in training set sample_fraction = 0.75 # Dataset location @@ -147,6 +157,8 @@ def split_preprocess_data( drop_nulls = True split_preprocess_data( + split, + preprocess, data_dir, csv_data_path, sample_fraction, diff --git a/src/opensynth/datasets/low_carbon_london/load.py b/src/opensynth/datasets/low_carbon_london/load.py index c948209..2d91e0e 100644 --- a/src/opensynth/datasets/low_carbon_london/load.py +++ 
b/src/opensynth/datasets/low_carbon_london/load.py @@ -15,7 +15,8 @@ def load_lcl_data_by_year( ) -> pd.DataFrame | pl.DataFrame: """Load LCL data for a specific year. - Returns a DataFrame in wide format. The first column contains the timestamp. + Returns a DataFrame in wide format. The first column contains the + timestamp. Args: fname (str or Path): Location of the `train.csv` data file. @@ -25,7 +26,7 @@ def load_lcl_data_by_year( pl.DataFrame with KWH/hh measurements. """ fname = ( - Path(__file__).parents[0] / "../../../../data/raw/historical/train.csv" + Path(__file__).parents[4] / "data/raw/historical/train.csv" if fname is None else Path(fname) ) diff --git a/src/opensynth/utils/polars.py b/src/opensynth/utils/polars.py index fffa9b5..2296aca 100644 --- a/src/opensynth/utils/polars.py +++ b/src/opensynth/utils/polars.py @@ -17,9 +17,9 @@ def infer_date_column(df: pl.DataFrame) -> str: Returns the column name of a column in Date format, or a String column that matches a Date string. If the DataFrame contains only one matching column, - this function will return that column name. If multiple columns match, it will - return the column name that matches a canonical Date name, such as "DATUM". - In all other cases the function will raise a ValueError(). + this function will return that column name. If multiple columns match, it + will return the column name that matches a canonical Date name, such as + "DATUM". In all other cases the function will raise a ValueError(). Args: df (pl.DataFrame): DataFrame. Returns: str: column name of a column in Date or Date-like format. Raises: - ValueError: if no columns are in a Date-like format or multiple columns are - in Date-like format and match a canonical name. + ValueError: if no columns are in a Date-like format or multiple columns + are in Date-like format and match a canonical name. 
""" date_columns = df.select(pl.col(pl.Date)).columns @@ -49,7 +49,8 @@ def infer_date_column(df: pl.DataFrame) -> str: return list(canonical_columns)[0] case _: raise ValueError( - "Multiple Date-like columns found with a matching canonical name!" + "Multiple Date-like columns found with a matching canonical \ + name!" ) @@ -68,20 +69,21 @@ def semiwide_to_long( default in "%HH%mm" format. Args: - df (polars.DataFrame): DataFrame in semi-wide wide format, containing DateTime- - compatible column names. - on (list, optional): Columns to use as timepoints. By default, all columns that - match the pattern '[0-9][0-9][0-9][0-9]' will be used. - date_col (str, optional): Column that contains the Date values. By default, - a column that is in Date format, or that is a Date-compatible string, will - be used, if there is only one column in that format. If there are multiple - Date-compatible, columns, but only one matches a canonical name such as - DATUM, that column will be used. Otherwise, this method will fail, and the - date_col needs to be explicitly specified. + df (polars.DataFrame): DataFrame in semi-wide wide format, containing + DateTime-compatible column names. + on (list, optional): Columns to use as timepoints. By default, all + columns that match the pattern '[0-9][0-9][0-9][0-9]' will be used. + date_col (str, optional): Column that contains the Date values. By + default, a column that is in Date format, or that is a + Date-compatible string, will be used, if there is only one column + in that format. If there are multiple Date-compatible columns, but + only one matches a canonical name such as DATUM, that column will + be used. Otherwise, this method will fail, and the date_col needs + to be explicitly specified. datetime_name (str, optional): Name for the DateTime column in the long DataFrame, "DATUM_TIJD" by default. - value_name (str, optional): Name to give to the value column. Defaults to - "value". 
+ value_name (str, optional): Name to give to the value column. Defaults + to "value". Returns: polars.DataFrame in long format. @@ -137,12 +139,13 @@ def semiwide_to_wide( DateTime-compatible column names. on (list, optional): Columns to use as timepoints. By default, all columns that match the pattern '[0-9][0-9][0-9][0-9]' will be used. - date_col (str, optional): Column that contains the Date values. By default, - a column that is in Date format, or that is a Date-compatible string, - will be used, if there is only one column in that format. If there are - multiple Date-compatible, columns, but only one matches a canonical name - such as DATUM, that column will be used. Otherwise, this method will fail, - and the date_col needs to be explicitly specified. + date_col (str, optional): Column that contains the Date values. By + default, a column that is in Date format, or that is a + Date-compatible string, will be used, if there is only one column + in that format. If there are multiple Date-compatible columns, but + only one matches a canonical name such as DATUM, that column will + be used. Otherwise, this method will fail, and the date_col needs + to be explicitly specified. datetime_name (str, optional): Name for the DateTime column in the long DataFrame, "datetime" by default. @@ -178,7 +181,8 @@ def randomize_index_column( Args: df (DataFrame): Input DataFrame. index_col_name (str): Name of index column. - sample_col_name (str): Name of new column containing the randomized index. + sample_col_name (str): Name of new column containing the randomized + index. Returns: DataFrame with index column values randomized. 
diff --git a/tests/evaluation/fidelity/test_autocorrelation.py b/tests/evaluation/fidelity/test_autocorrelation.py index 27b3ca0..d7ab203 100644 --- a/tests/evaluation/fidelity/test_autocorrelation.py +++ b/tests/evaluation/fidelity/test_autocorrelation.py @@ -47,7 +47,10 @@ def test_dataframe_half_hour_pandas(test_dataframe_half_hour): @pytest.fixture(scope="module") def test_dataframe_quarterly(): - """ "DataFrame with 15-minute timesteps and high correlation with a week time-lag.""" + """ + DataFrame with 15-minute timesteps and high correlation with a week + time-lag. + """ n_minutes = 15 n_values = 60 // n_minutes * 24 * 7 # 1 week n_timesteps = 35041