# %load q01_load_data/build.py
import pandas as pd
import numpy as np

path = 'data/elecdemand.csv'


def q01_load_data(path):
    """Read the CSV at *path* and convert its ``Datetime`` column to timestamps.

    Args:
        path: Location of a CSV file that has a ``Datetime`` column.

    Returns:
        A ``(shape, data)`` tuple: the ``(rows, columns)`` shape of the loaded
        frame, and the frame itself with ``Datetime`` parsed by
        ``pd.to_datetime``.
    """
    data = pd.read_csv(path)
    data['Datetime'] = pd.to_datetime(data['Datetime'])
    return data.shape, data


# %load q02_data_splitter/build.py
def q02_data_splitter(path):
    """Return the train/validation index pairs of a 2-fold time-series split.

    The ``path`` argument is honoured; it used to be overwritten by a
    hard-coded ``'data/elecdemand.csv'``, so any other value was ignored.

    Args:
        path: CSV location, forwarded unchanged to ``q01_load_data``.

    Returns:
        A list of ``(train_index, valid_index)`` tuples, one per split, exactly
        as produced by ``TimeSeriesSplit(n_splits=2).split``.
    """
    # Imported inside the function so the loader above can be imported
    # without scikit-learn or the greyatomlib package being installed.
    from sklearn.model_selection import TimeSeriesSplit
    from greyatomlib.time_series_day_02_project.q01_load_data.build import q01_load_data

    _, df = q01_load_data(path)
    tscv = TimeSeriesSplit(n_splits=2)
    return list(tscv.split(df))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from greyatomlib.time_series_day_02_project.q01_load_data.build import q01_load_data

# Select matplotlib's 'agg' backend before any figure is created.
plt.switch_backend('agg')


# %load q03_time_plot/build.py
def q03_time_plot(path):
    """Draw a line plot of ``Demand`` against ``Datetime``.

    Args:
        path: CSV location, forwarded to ``q01_load_data``. It used to be
            replaced by a hard-coded path inside the function.

    Returns:
        None. The plot is drawn on the current matplotlib figure.
    """
    _, df = q01_load_data(path)
    plt.plot(df['Datetime'], df['Demand'])
    plt.title('Electricity Demand for Australia for a year')
    plt.xlabel('Year-Month')
    plt.ylabel('Demand')
    plt.show()


# %load q04_boxplot/build.py
def q04_boxplot(path):
    """Draw a box plot of ``Demand`` grouped by the ``WorkDay`` column.

    Args:
        path: CSV location, forwarded to ``q01_load_data``. It used to be
            replaced by a hard-coded path inside the function.

    Returns:
        None.
    """
    _, df = q01_load_data(path)
    df.boxplot(column=['Demand'], by=['WorkDay'])
    plt.show()


# %load q05_feature_engineering/build.py
def q05_feature_engineering(path):
    """Scatter-plot ``Demand`` against ``Temperature``.

    Args:
        path: CSV location, forwarded to ``q01_load_data``. It used to be
            replaced by a hard-coded path inside the function.

    Returns:
        None.
    """
    _, df = q01_load_data(path)
    temperature = df['Temperature']
    demand = df['Demand']
    # NOTE(review): the correlation is computed but neither used nor returned.
    # Kept so the call (and any error it raises on bad data) is unchanged;
    # confirm whether it should be reported or dropped.
    corr, p_value = pearsonr(temperature, demand)
    plt.scatter(temperature, demand)
    plt.xlabel('Temperature')
    plt.ylabel('Demand')
    plt.title('Temperature vs Demand')
    plt.show()


def _plot_demand_by_hour_and_month(path):
    """Draw stacked box plots of ``Demand`` per hour of day and per month."""
    _, data = q01_load_data(path)
    stamps = data['Datetime']
    demand = data['Demand']
    hour_of_day = stamps.dt.hour
    month_of_year = stamps.dt.month

    # ``dt.hour`` yields 0-23. The previous ``range(1, 25)`` dropped midnight
    # and drew an empty box labelled "24".
    hours = range(24)
    months = range(1, 13)
    demand_hours = [demand[hour_of_day == h].values for h in hours]
    demand_months = [demand[month_of_year == m].values for m in months]

    plt.figure(figsize=(16, 6))

    plt.subplot(211)
    plt.boxplot(demand_hours, labels=[str(h) for h in hours])
    plt.xlabel('Hour')
    plt.ylabel('Demand')
    plt.title('Change in Electricity demand wrt to Hour')

    plt.subplot(212)
    plt.boxplot(demand_months, labels=[str(m) for m in months])
    plt.xlabel('Months')
    plt.ylabel('Demand')
    plt.title('Change in Electricity demand wrt to months')

    plt.show()


# %load q05_feature_engineering_part2/build.py
def q05_feature_engineering_part2(path):
    """Box-plot ``Demand`` by hour of day (top) and by month (bottom).

    Args:
        path: CSV location, forwarded to ``q01_load_data``. It used to be
            replaced by a hard-coded path inside the function.

    Returns:
        None.
    """
    _plot_demand_by_hour_and_month(path)


# %load q05_feature_engineering_part3/build.py
def q05_feature_engineering_part3(path):
    """Box-plot ``Demand`` by hour of day (top) and by month (bottom).

    The original body was identical to ``q05_feature_engineering_part2``, so
    both now delegate to one helper.

    Args:
        path: CSV location, forwarded to ``q01_load_data``.

    Returns:
        None.
    """
    _plot_demand_by_hour_and_month(path)
# %load q05_feature_engineering_part4/build.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from greyatomlib.time_series_day_02_project.q01_load_data.build import q01_load_data

path = 'data/elecdemand.csv'


def q05_feature_engineering_part4(path):
    """Load the data and add hour, month and peak-indicator columns.

    Columns added to the loaded frame:

    * ``hour``       - ``Datetime.dt.hour``.
    * ``month``      - abbreviated month name from ``strftime('%b')``.
    * ``Peakhours``  - 1 when the hour is 6..19 inclusive, else 0.
    * ``Peakmonths`` - 1 for February, May, June, July and August, else 0.

    Args:
        path: CSV location, forwarded to ``q01_load_data``.

    Returns:
        The loaded DataFrame with the four columns above added.
    """
    _, data = q01_load_data(path)
    stamps = data['Datetime']

    data['hour'] = stamps.dt.hour
    data['month'] = stamps.dt.strftime('%b')

    # Same membership test as the old ``x in range(6, 20)``, done column-wise.
    data['Peakhours'] = data['hour'].between(6, 19).astype('int64')

    # Compare month numbers rather than '%b' strings: the abbreviations are
    # locale-dependent, the numbers are not. 2, 5, 6, 7, 8 correspond to the
    # original 'Feb', 'May', 'Jun', 'Jul', 'Aug'.
    data['Peakmonths'] = stamps.dt.month.isin([2, 5, 6, 7, 8]).astype('int64')
    return data
# %load q06_linear_regression/build.py
import pandas as pd
import numpy as np
import math
# NOTE(review): the two sklearn imports below are not visible in the hunk this
# block was recovered from. The function needs both names, so they are assumed
# to be the elided import lines - confirm against the full file.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from greyatomlib.time_series_day_02_project.q05_feature_engineering_part4.build import q05_feature_engineering_part4
from greyatomlib.time_series_day_02_project.q02_data_splitter.build import q02_data_splitter

fe = ['WorkDay', 'Peakhours', 'Peakmonths']


def q06_linear_regression(path, columns=fe, random_state=9):
    """Return the mean validation RMSE of a linear regression on ``Demand``.

    For every ``(train, valid)`` index pair from ``q02_data_splitter`` a fresh
    ``LinearRegression`` is fitted on the training rows and scored on the
    validation rows.

    Args:
        path: CSV location, forwarded to ``q05_feature_engineering_part4`` and
            ``q02_data_splitter``.
        columns: Feature column names. Previously ignored in favour of the
            module-level ``fe`` list; the default is still ``fe``.
        random_state: Seed passed to ``np.random.seed``.

    Returns:
        The mean of the per-split RMSE values.
    """
    np.random.seed(random_state)
    data = q05_feature_engineering_part4(path)

    # Select once, outside the fold loop.
    features = data[list(columns)]
    target = data['Demand']

    fold_rmse = []
    for train_idx, valid_idx in q02_data_splitter(path):
        # ``.ix`` was removed in pandas 1.0. The split indices are treated as
        # row positions, so ``.iloc`` is used.
        model = LinearRegression()
        model.fit(features.iloc[train_idx], target.iloc[train_idx])
        predicted = model.predict(features.iloc[valid_idx])
        fold_rmse.append(mean_squared_error(target.iloc[valid_idx], predicted) ** 0.5)
    return np.mean(fold_rmse)
import pandas as pd
import numpy as np
import math
# NOTE(review): the sklearn imports below are not visible in the hunks these
# functions were recovered from. The functions need these names, so they are
# assumed to be the elided import lines - confirm against the full files.
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from greyatomlib.time_series_day_02_project.q05_feature_engineering_part4.build import q05_feature_engineering_part4
from greyatomlib.time_series_day_02_project.q02_data_splitter.build import q02_data_splitter

fe = ['WorkDay', 'Peakhours', 'Peakmonths']


def _mean_split_rmse(path, columns, random_state, make_model):
    """Fit ``make_model()`` on each split of ``Demand`` and average the RMSE."""
    np.random.seed(random_state)
    data = q05_feature_engineering_part4(path)

    # Select once, outside the fold loop.
    features = data[list(columns)]
    target = data['Demand']

    fold_rmse = []
    for train_idx, valid_idx in q02_data_splitter(path):
        # ``.ix`` was removed in pandas 1.0. The split indices are treated as
        # row positions, so ``.iloc`` is used.
        model = make_model()
        model.fit(features.iloc[train_idx], target.iloc[train_idx])
        predicted = model.predict(features.iloc[valid_idx])
        fold_rmse.append(mean_squared_error(target.iloc[valid_idx], predicted) ** 0.5)
    return np.mean(fold_rmse)


# %load q07_randomforest_regressor/build.py
def q07_randomforest_regressor(path, columns=fe, random_state=9):
    """Return the mean validation RMSE of a random forest on ``Demand``.

    Args:
        path: CSV location, forwarded to the feature builder and the splitter.
        columns: Feature column names. Previously ignored in favour of the
            module-level ``fe`` list; the default is still ``fe``.
        random_state: Seed passed to ``np.random.seed``.

    Returns:
        The mean of the per-split RMSE values.
    """
    def make_model():
        # NOTE(review): the forest is seeded with a literal 10, not with the
        # ``random_state`` argument (the gradient-boosting function does use
        # it). Left as-is because changing it alters the returned score;
        # confirm which seed is intended.
        return RandomForestRegressor(n_estimators=50, min_samples_leaf=30, random_state=10)

    return _mean_split_rmse(path, columns, random_state, make_model)


# %load q08_gradientboosting_regressor/build.py
def q08_gradientboosting_regressor(path, columns=fe, random_state=9):
    """Return the mean validation RMSE of gradient boosting on ``Demand``.

    Args:
        path: CSV location, forwarded to the feature builder and the splitter.
        columns: Feature column names. Previously ignored in favour of the
            module-level ``fe`` list; the default is still ``fe``.
        random_state: Seed passed to ``np.random.seed`` and to the regressor.

    Returns:
        The mean of the per-split RMSE values.
    """
    def make_model():
        return GradientBoostingRegressor(
            n_estimators=200,
            min_samples_leaf=10,
            learning_rate=0.01,
            random_state=random_state,
        )

    return _mean_split_rmse(path, columns, random_state, make_model)