def plot_corr(data, size=11):
    """Draw the correlation matrix of ``data`` as a colour-coded grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose pairwise column correlations (``data.corr()``) are drawn.
    size : int or float, optional
        Side length of the square figure; used for both entries of
        ``figsize``. Defaults to 11.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the ``matshow`` image, with one tick per column of
        the correlation matrix on each axis, labelled with the column name.
    """
    corr = data.corr()
    labels = list(corr.columns)
    positions = list(range(len(labels)))

    _, ax = subplots(figsize=(size, size))

    # Give the colormap to this image only. pyplot.set_cmap() would also
    # overwrite the process-wide default colormap used by every later plot.
    ax.matshow(corr, cmap='YlOrRd')

    # Set ticks on the axes object that is returned instead of going through
    # pyplot's implicit "current axes".
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    return ax
def percentile_k_features(data, k=20):
    """Return the features kept by a top-``k``-percentile F-test selection.

    Every column except ``'SalePrice'`` is scored against ``'SalePrice'``
    with ``f_regression``; ``SelectPercentile`` keeps the best ``k`` percent.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'SalePrice'`` column (the target); all other
        columns are treated as candidate features.
    k : int or float, optional
        Percentile of features to keep, forwarded to
        ``SelectPercentile(percentile=k)``. Defaults to 20.

    Returns
    -------
    list
        Names of the selected columns, ordered from highest to lowest
        ``f_regression`` score.
    """
    X = data.drop('SalePrice', axis=1)
    y = data['SalePrice']

    selector = SelectPercentile(f_regression, percentile=k)
    # Only the fitted scores and support mask are needed, so fit() is enough;
    # the transformed matrix was never used.
    selector.fit(X, y)

    # Let the selector decide how many features survive. A hard-coded slice
    # would make ``k`` irrelevant to the result.
    kept = selector.get_support()
    by_score_desc = np.argsort(selector.scores_)[::-1]
    return [X.columns[i] for i in by_score_desc if kept[i]]
def rf_rfe(data):
    """Select half of the features by recursive feature elimination.

    An ``RFE`` selector wrapped around a ``RandomForestClassifier`` is fitted
    with ``'SalePrice'`` as the target and every other column as a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'SalePrice'`` column; all other columns are
        candidate features.

    Returns
    -------
    list
        Names of the columns flagged in ``rfe.support_``, in their original
        column order. ``len(X.columns) // 2`` features are requested.
    """
    X = data.drop('SalePrice', axis=1)
    y = data['SalePrice']

    # NOTE(review): no random_state is passed, so the selected set may vary
    # between runs -- confirm whether reproducibility is required.
    estimator = RandomForestClassifier()

    # Floor division keeps the feature count an integer. In Python 3,
    # ``len(...) / 2`` is a float, which RFE does not accept as a count.
    rfe = RFE(estimator, n_features_to_select=len(X.columns) // 2)
    rfe.fit(X, y)

    return list(X.columns[rfe.support_])
def select_from_model(data):
    """Select features with ``SelectFromModel`` over a random forest.

    A ``RandomForestClassifier`` is fitted through ``SelectFromModel`` with
    ``'SalePrice'`` as the target and every other column as a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'SalePrice'`` column; all other columns are
        candidate features.

    Returns
    -------
    list
        Names of the columns flagged by ``get_support()``, in their original
        column order.
    """
    X = data.drop('SalePrice', axis=1)
    y = data['SalePrice']

    # NOTE(review): no random_state is passed, so the selected set may vary
    # between runs -- confirm whether reproducibility is required.
    selector = SelectFromModel(RandomForestClassifier())

    # Only the support mask is needed, so fit() replaces fit_transform(),
    # whose reduced matrix was built and then thrown away.
    selector.fit(X, y)

    return list(X.columns[selector.get_support()])
def forward_selected(data, model):
    """Greedy forward feature selection driven by the R^2 score.

    Starting from no features, each round fits ``model`` once per remaining
    candidate (already-selected features plus that candidate), scores the
    fit with ``r2_score`` on the same data, and keeps the candidate with the
    highest score. Selection stops when the best candidate does not raise
    the score above the previous round's, or when no candidates remain.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'SalePrice'`` column (the target); all other
        columns are candidate features.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
        repeatedly, so its fitted state on return is that of the last
        candidate tried.

    Returns
    -------
    tuple of (list, list)
        ``selected_features`` -- column names in the order they were added;
        ``r2_score_features`` -- the score reached after each addition.
    """
    y = data['SalePrice']
    remaining = list(data.drop('SalePrice', axis=1).columns)
    selected_features = []
    r2_score_features = []
    best_so_far = 0

    # Stop once every feature has been picked. An unconditional loop would
    # eventually take the max of an empty score array and raise ValueError.
    while remaining:
        scores = []
        for candidate in remaining:
            trial = data[selected_features + [candidate]]
            model.fit(trial, y)
            scores.append(r2_score(y, model.predict(trial)))

        scores = np.asarray(scores)
        best_idx = int(np.argmax(scores))
        best_score = scores[best_idx]

        # Written as "not >" so that a NaN score also ends the search, the
        # same way a failed ``new > old`` comparison does.
        if not best_score > best_so_far:
            break

        best_so_far = best_score
        selected_features.append(remaining.pop(best_idx))
        r2_score_features.append(best_score)

    return selected_features, r2_score_features