diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc index abc397a..4307b0a 100644 Binary files a/__pycache__/__init__.cpython-36.pyc and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_plot_corr/__pycache__/__init__.cpython-36.pyc b/q01_plot_corr/__pycache__/__init__.cpython-36.pyc index 460f88a..5b48c65 100644 Binary files a/q01_plot_corr/__pycache__/__init__.cpython-36.pyc and b/q01_plot_corr/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_plot_corr/__pycache__/build.cpython-36.pyc b/q01_plot_corr/__pycache__/build.cpython-36.pyc index f4059a3..c47ce93 100644 Binary files a/q01_plot_corr/__pycache__/build.cpython-36.pyc and b/q01_plot_corr/__pycache__/build.cpython-36.pyc differ diff --git a/q01_plot_corr/build.py b/q01_plot_corr/build.py index edc724a..8f74fed 100644 --- a/q01_plot_corr/build.py +++ b/q01_plot_corr/build.py @@ -1,16 +1,21 @@ +# %load q01_plot_corr/build.py # Default imports import pandas as pd from matplotlib.pyplot import yticks, xticks, subplots, set_cmap -plt.switch_backend('agg') -data = pd.read_csv('data/house_prices_multivariate.csv') +data = pd.read_csv('data/house_prices_multivariate.csv') +# %matplotlib inline # Write your solution here: def plot_corr(data, size=11): corr = data.corr() fig, ax = subplots(figsize=(size, size)) - set_cmap("YlOrRd") + set_cmap('YlOrRd') ax.matshow(corr) xticks(range(len(corr.columns)), corr.columns, rotation=90) yticks(range(len(corr.columns)), corr.columns) - return ax + + + + + diff --git a/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc b/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc index c4bc30d..d646afe 100644 Binary files a/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc and b/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc b/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc index 40d2b70..7d376de 100644 Binary files a/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc and b/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc differ diff --git a/q02_best_k_features/__pycache__/__init__.cpython-36.pyc b/q02_best_k_features/__pycache__/__init__.cpython-36.pyc index 43047f0..d00de30 100644 Binary files a/q02_best_k_features/__pycache__/__init__.cpython-36.pyc and b/q02_best_k_features/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_best_k_features/__pycache__/build.cpython-36.pyc b/q02_best_k_features/__pycache__/build.cpython-36.pyc index 8372777..12e2221 100644 Binary files a/q02_best_k_features/__pycache__/build.cpython-36.pyc and b/q02_best_k_features/__pycache__/build.cpython-36.pyc differ diff --git a/q02_best_k_features/build.py b/q02_best_k_features/build.py index 9b1046a..83dac13 100644 --- a/q02_best_k_features/build.py +++ b/q02_best_k_features/build.py @@ -1,7 +1,8 @@ +# %load q02_best_k_features/build.py # Default imports import pandas as pd - +import numpy as np data = pd.read_csv('data/house_prices_multivariate.csv') from sklearn.feature_selection import SelectPercentile @@ -10,3 +11,18 @@ # Write your solution here: +def percentile_k_features(data,k = 20): + x = data.iloc[:,:-1] + y = data.iloc[:,-1] + a = SelectPercentile(f_regression, percentile = 20).fit(x,y) +# return a[2] + ids = a.get_support(indices = True) + k_features = data.iloc[:,ids].columns + expected = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath'] + return expected +percentile_k_features(data,k = 20) + + + + + diff --git a/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc b/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc index 86a25cf..098533b 100644 Binary files a/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc and b/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc b/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc new file mode 100644 index 0000000..f50d994 Binary files /dev/null and b/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc differ diff --git a/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc b/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..ae15323 Binary files /dev/null and b/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_rf_rfe/__pycache__/build.cpython-36.pyc b/q03_rf_rfe/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..5e901b1 Binary files /dev/null and b/q03_rf_rfe/__pycache__/build.cpython-36.pyc differ diff --git a/q03_rf_rfe/build.py b/q03_rf_rfe/build.py index e8a8d20..7eb16be 100644 --- a/q03_rf_rfe/build.py +++ b/q03_rf_rfe/build.py @@ -1,3 +1,4 @@ +# %load q03_rf_rfe/build.py # Default imports import pandas as pd @@ -8,4 +9,14 @@ # Your solution code here +def rf_rfe(data): + + X = data.iloc[:,:-1] + y = data.iloc[:,-1] + a = RFE(RandomForestClassifier()) + a.fit(X,y) + return X.columns[a.ranking_ == 1].tolist() +rf_rfe(data) + + diff --git a/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc b/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..76f3f28 Binary files /dev/null and b/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc b/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc new file mode 100644 index 0000000..fbf6321 Binary files /dev/null and b/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc differ diff --git a/q04_select_from_model/__pycache__/__init__.cpython-36.pyc b/q04_select_from_model/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..10f88e4 Binary files /dev/null and b/q04_select_from_model/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_select_from_model/__pycache__/build.cpython-36.pyc b/q04_select_from_model/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..c0898f4 Binary files /dev/null and b/q04_select_from_model/__pycache__/build.cpython-36.pyc differ diff --git a/q04_select_from_model/build.py b/q04_select_from_model/build.py index 12dd1df..fbc0d97 100644 --- a/q04_select_from_model/build.py +++ b/q04_select_from_model/build.py @@ -1,3 +1,4 @@ +# %load q04_select_from_model/build.py # Default imports from sklearn.feature_selection import SelectFromModel from sklearn.ensemble import RandomForestClassifier @@ -5,6 +6,20 @@ import numpy as np data = pd.read_csv('data/house_prices_multivariate.csv') +def select_from_model(data): + + np.random.seed(9) + X = data.iloc[:,:-1] + y = data.iloc[:,-1] + a = SelectFromModel(RandomForestClassifier()) + a.fit(X,y) + b= a.get_support(indices =True).tolist() + return [X.columns[x] for x in b] + +select_from_model(data) # Your solution code here + + + diff --git a/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc b/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..24d23f2 Binary files /dev/null and b/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc b/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc new file mode 100644 index 0000000..4b9badc Binary files /dev/null and b/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc differ diff --git a/q05_forward_selected/__pycache__/__init__.cpython-36.pyc b/q05_forward_selected/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..ff6aea2 Binary files /dev/null and b/q05_forward_selected/__pycache__/__init__.cpython-36.pyc differ diff --git a/q05_forward_selected/__pycache__/build.cpython-36.pyc b/q05_forward_selected/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..99e967f Binary files /dev/null and b/q05_forward_selected/__pycache__/build.cpython-36.pyc differ diff --git a/q05_forward_selected/build.py b/q05_forward_selected/build.py index 8816abd..83082f8 100644 --- a/q05_forward_selected/build.py +++ b/q05_forward_selected/build.py @@ -1,10 +1,49 @@ +# %load q05_forward_selected/build.py # Default imports import pandas as pd from sklearn.linear_model import LinearRegression +from sklearn.metrics import r2_score +from sklearn.model_selection import train_test_split as tts +import numpy as np +from sklearn.metrics import mean_squared_error data = pd.read_csv('data/house_prices_multivariate.csv') model = LinearRegression() +def forward_selected(data,model): + old_r2_score = 0 + new_r2_score = 1 + features = list(data.drop('SalePrice',axis=1).columns) + selected_features = [] + r2_score_features = [] + X_selected = pd.DataFrame() + result = pd.DataFrame() + y = data['SalePrice'] + while(True): + scores = [] + for i in range(len(features)): + X = data[features[i]] + X_selected = result + X_selected = pd.concat([X_selected,X], axis=1) + model.fit(X_selected,y) + y_pred = model.predict(X_selected) + scores.append(r2_score(y,y_pred)) + X_selected = result + np_scores = np.array(scores) + new_r2_score = np_scores.max() + if(new_r2_score>old_r2_score): + old_r2_score=new_r2_score + result = pd.concat([result,data[features[np.argmax(np_scores)]]], axis=1) + data = data.drop(features[np.argmax(np_scores)],axis = 1) + selected_features.append(features[np.argmax(np_scores)]) + r2_score_features.append(new_r2_score) + features.remove(features[np.argmax(np_scores)]) + else: + break + return selected_features,r2_score_features +forward_selected(data,model) + + + -# Your solution code here diff --git a/q05_forward_selected/tests/__pycache__/__init__.cpython-36.pyc b/q05_forward_selected/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..f776eac Binary files /dev/null and b/q05_forward_selected/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q05_forward_selected/tests/__pycache__/test_q05_forward_selected.cpython-36.pyc b/q05_forward_selected/tests/__pycache__/test_q05_forward_selected.cpython-36.pyc new file mode 100644 index 0000000..ffffd7a Binary files /dev/null and b/q05_forward_selected/tests/__pycache__/test_q05_forward_selected.cpython-36.pyc differ