Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,19 @@ def download_lcl_data(

@app.command()
def preprocess_data(
split: Annotated[
bool,
typer.Option(
"--split", help="Splits LCL households into training/holdout set"
),
] = False,
preprocess: Annotated[
bool,
typer.Option(
"--preprocess",
help="Preprocesses LCL data into daily load profiles",
),
] = False,
data_dir: Annotated[
str, typer.Option("--loc", help="Location of data directory.")
] = "./data",
Expand Down Expand Up @@ -145,6 +158,8 @@ def preprocess_data(
"""

get_data.split_preprocess_data(
split,
preprocess,
data_dir,
csv_data_path,
sample_fraction,
Expand Down
66 changes: 39 additions & 27 deletions src/opensynth/datasets/low_carbon_london/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def download_lcl_data(data_dir: str = "./data"):


def split_preprocess_data(
split: bool,
preprocess: bool,
data_dir: str,
csv_data_path: str,
sample_fraction: float,
Expand Down Expand Up @@ -93,38 +95,46 @@ def split_preprocess_data(
f"Reading data from {CSV_FILE_NAME}. Storing data in {data_dir}."
)

# Split dataset into training/ holdout sets
split_households.split_data(
data_dir,
CSV_FILE_NAME,
sample_fraction=sample_fraction,
id_col=id_col,
kwh_col=kwh_col,
datetime_col=datetime_col,
utc=utc,
datetime_format=datetime_format,
historical_start=historical_start,
historical_end=historical_end,
future_start=future_start,
future_end=future_end,
)
# Preprocess the data into daily load profiles
preprocess_lcl.preprocess_data(
data_dir,
datetime_col=datetime_col,
kwh_col=kwh_col,
id_col=id_col,
utc=utc,
datetime_format=datetime_format,
time_resolution=time_resolution,
feature_cols=feature_cols,
drop_nulls=drop_nulls,
)
if split:
# Split dataset into training/ holdout sets
split_households.split_data(
data_dir,
CSV_FILE_NAME,
sample_fraction=sample_fraction,
id_col=id_col,
kwh_col=kwh_col,
datetime_col=datetime_col,
utc=utc,
datetime_format=datetime_format,
historical_start=historical_start,
historical_end=historical_end,
future_start=future_start,
future_end=future_end,
)
if preprocess:
# Preprocess the data into daily load profiles
preprocess_lcl.preprocess_data(
data_dir,
datetime_col=datetime_col,
kwh_col=kwh_col,
id_col=id_col,
utc=utc,
datetime_format=datetime_format,
time_resolution=time_resolution,
feature_cols=feature_cols,
drop_nulls=drop_nulls,
)


if __name__ == "__main__":
# Whether to split and/or preprocess the data
split = True
preprocess = True

# Data directory
data_dir = "./data"

# Fraction of households to include in training set
sample_fraction = 0.75

# Dataset location
Expand All @@ -147,6 +157,8 @@ def split_preprocess_data(
drop_nulls = True

split_preprocess_data(
split,
preprocess,
data_dir,
csv_data_path,
sample_fraction,
Expand Down
5 changes: 3 additions & 2 deletions src/opensynth/datasets/low_carbon_london/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def load_lcl_data_by_year(
) -> pd.DataFrame | pl.DataFrame:
"""Load LCL data for a specific year.

Returns a DataFrame in wide format. The first column contains the timestamp.
Returns a DataFrame in wide format. The first column contains the
timestamp.

Args:
fname (str or Path): Location of the `train.csv` data file.
Expand All @@ -25,7 +26,7 @@ def load_lcl_data_by_year(
pl.DataFrame with KWH/hh measurements.
"""
fname = (
Path(__file__).parents[0] / "../../../../data/raw/historical/train.csv"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should store this kind of path in an environment variable or something?

Path(__file__).parents[0] / "./data/raw/historical/train.csv"
if fname is None
else Path(fname)
)
Expand Down
54 changes: 29 additions & 25 deletions src/opensynth/utils/polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ def infer_date_column(df: pl.DataFrame) -> str:

Returns the column name of a column in Date format, or a String column that
matches a Date string. If the DataFrame contains only one matching column,
this function will return that column name. If multiple columns match, it will
return the column name that matches a canonical Date name, such as "DATUM".
In all other cases the function will raise a ValueError().
this function will return that column name. If multiple columns match, it
will return the column name that matches a canonical Date name, such as
"DATUM". In all other cases the function will raise a ValueError().

Args:
df (pl.DataFrame): DataFrame.
Expand All @@ -28,8 +28,8 @@ def infer_date_column(df: pl.DataFrame) -> str:
str: column name of a column in Date or Date-like format.

Raises:
ValueError: if no columns are in a Date-like format or multiple columns are
in Date-like format and match a canonical name.
ValueError: if no columns are in a Date-like format or multiple columns
are in Date-like format and match a canonical name.

"""
date_columns = df.select(pl.col(pl.Date)).columns
Expand All @@ -49,7 +49,8 @@ def infer_date_column(df: pl.DataFrame) -> str:
return list(canonical_columns)[0]
case _:
raise ValueError(
"Multiple Date-like columns found with a matching canonical name!"
"Multiple Date-like columns found with a matching canonical \
name!"
)


Expand All @@ -68,20 +69,21 @@ def semiwide_to_long(
default in "%HH%mm" format.

Args:
df (polars.DataFrame): DataFrame in semi-wide wide format, containing DateTime-
compatible column names.
on (list, optional): Columns to use as timepoints. By default, all columns that
match the pattern '[0-9][0-9][0-9][0-9]' will be used.
date_col (str, optional): Column that contains the Date values. By default,
a column that is in Date format, or that is a Date-compatible string, will
be used, if there is only one column in that format. If there are multiple
Date-compatible, columns, but only one matches a canonical name such as
DATUM, that column will be used. Otherwise, this method will fail, and the
date_col needs to be explicitly specified.
df (polars.DataFrame): DataFrame in semi-wide format, containing
DateTime-compatible column names.
on (list, optional): Columns to use as timepoints. By default, all
columns that match the pattern '[0-9][0-9][0-9][0-9]' will be used.
date_col (str, optional): Column that contains the Date values. By
default, a column that is in Date format, or that is a
Date-compatible string, will be used, if there is only one column
in that format. If there are multiple Date-compatible columns, but
only one matches a canonical name such as DATUM, that column will
be used. Otherwise, this method will fail, and the date_col needs
to be explicitly specified.
datetime_name (str, optional): Name for the DateTime column in the long
DataFrame, "DATUM_TIJD" by default.
value_name (str, optional): Name to give to the value column. Defaults to
"value".
value_name (str, optional): Name to give to the value column. Defaults
to "value".

Returns:
polars.DataFrame in long format.
Expand Down Expand Up @@ -137,12 +139,13 @@ def semiwide_to_wide(
DateTime-compatible column names.
on (list, optional): Columns to use as timepoints. By default, all
columns that match the pattern '[0-9][0-9][0-9][0-9]' will be used.
date_col (str, optional): Column that contains the Date values. By default,
a column that is in Date format, or that is a Date-compatible string,
will be used, if there is only one column in that format. If there are
multiple Date-compatible, columns, but only one matches a canonical name
such as DATUM, that column will be used. Otherwise, this method will fail,
and the date_col needs to be explicitly specified.
date_col (str, optional): Column that contains the Date values. By
default, a column that is in Date format, or that is a
Date-compatible string, will be used, if there is only one column
in that format. If there are multiple Date-compatible columns, but
only one matches a canonical name such as DATUM, that column will
be used. Otherwise, this method will fail, and the date_col needs
to be explicitly specified.
datetime_name (str, optional): Name for the DateTime column in the long
DataFrame, "datetime" by default.

Expand Down Expand Up @@ -178,7 +181,8 @@ def randomize_index_column(
Args:
df (DataFrame): Input DataFrame.
index_col_name (str): Name of index column.
sample_col_name (str): Name of new column containing the randomized index.
sample_col_name (str): Name of new column containing the randomized
index.

Returns:
DataFrame with index column values randomized.
Expand Down
5 changes: 4 additions & 1 deletion tests/evaluation/fidelity/test_autocorrelation.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,10 @@ def test_dataframe_half_hour_pandas(test_dataframe_half_hour):

@pytest.fixture(scope="module")
def test_dataframe_quarterly():
""" "DataFrame with 15-minute timesteps and high correlation with a week time-lag."""
"""
DataFrame with 15-minute timesteps and high correlation with a week
time-lag.
"""
n_minutes = 15
n_values = 60 // n_minutes * 24 * 7 # 1 week
n_timesteps = 35041
Expand Down
Loading