diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc index abc397a..3334d9c 100644 Binary files a/__pycache__/__init__.cpython-36.pyc and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_plot_corr/__pycache__/__init__.cpython-36.pyc b/q01_plot_corr/__pycache__/__init__.cpython-36.pyc index 460f88a..577d1b0 100644 Binary files a/q01_plot_corr/__pycache__/__init__.cpython-36.pyc and b/q01_plot_corr/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_plot_corr/__pycache__/build.cpython-36.pyc b/q01_plot_corr/__pycache__/build.cpython-36.pyc index f4059a3..ac4dca5 100644 Binary files a/q01_plot_corr/__pycache__/build.cpython-36.pyc and b/q01_plot_corr/__pycache__/build.cpython-36.pyc differ diff --git a/q01_plot_corr/build.py b/q01_plot_corr/build.py index edc724a..2c456aa 100644 --- a/q01_plot_corr/build.py +++ b/q01_plot_corr/build.py @@ -1,16 +1,25 @@ +# %load q01_plot_corr/build.py # Default imports import pandas as pd from matplotlib.pyplot import yticks, xticks, subplots, set_cmap +import matplotlib.pyplot as plt plt.switch_backend('agg') data = pd.read_csv('data/house_prices_multivariate.csv') -# Write your solution here: + +#Write your solution here: def plot_corr(data, size=11): corr = data.corr() fig, ax = subplots(figsize=(size, size)) - set_cmap("YlOrRd") + set_cmap('YlOrRd') ax.matshow(corr) xticks(range(len(corr.columns)), corr.columns, rotation=90) yticks(range(len(corr.columns)), corr.columns) - return ax + #return ax + +plot_corr(data,) +data._get_numeric_data().columns + + + diff --git a/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc b/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc index c4bc30d..ad5b424 100644 Binary files a/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc and b/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc b/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc index 40d2b70..899f703 100644 Binary files a/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc and b/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc differ diff --git a/q02_best_k_features/__pycache__/__init__.cpython-36.pyc b/q02_best_k_features/__pycache__/__init__.cpython-36.pyc index 43047f0..ada48c3 100644 Binary files a/q02_best_k_features/__pycache__/__init__.cpython-36.pyc and b/q02_best_k_features/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_best_k_features/__pycache__/build.cpython-36.pyc b/q02_best_k_features/__pycache__/build.cpython-36.pyc index 8372777..007f00f 100644 Binary files a/q02_best_k_features/__pycache__/build.cpython-36.pyc and b/q02_best_k_features/__pycache__/build.cpython-36.pyc differ diff --git a/q02_best_k_features/build.py b/q02_best_k_features/build.py index 9b1046a..bb437f6 100644 --- a/q02_best_k_features/build.py +++ b/q02_best_k_features/build.py @@ -1,3 +1,4 @@ +# %load q02_best_k_features/build.py # Default imports import pandas as pd @@ -10,3 +11,21 @@ # Write your solution here: +def percentile_k_features(df, K=20): + x = df.iloc[:,:-1] + y = df.iloc[:,-1] + #selecting features on the basis of p-value i.e whose value less than percentile is true + best_feature = SelectPercentile(f_regression, percentile=K) + #selecting best features from X + best_feature.fit_transform(x,y) + #creating dataframe from score, get_support, result + d = {'support': best_feature.get_support(),'values':best_feature.scores_} + df1 = pd.DataFrame(d,index = x.columns) + #sorting values according get_support + df1 = df1.sort_values('values', ascending=False) + #selecting only rows whose value of support is True + col = df1[df1.support].index + return list(col) # returning list of features +percentile_k_features(data ,20) + + diff --git a/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc b/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc index 86a25cf..4b92724 100644 Binary files a/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc and b/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc b/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc new file mode 100644 index 0000000..7d1e9e9 Binary files /dev/null and b/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc differ diff --git a/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc b/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..5b44655 Binary files /dev/null and b/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_rf_rfe/__pycache__/build.cpython-36.pyc b/q03_rf_rfe/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..251efa0 Binary files /dev/null and b/q03_rf_rfe/__pycache__/build.cpython-36.pyc differ diff --git a/q03_rf_rfe/build.py b/q03_rf_rfe/build.py index e8a8d20..7483767 100644 --- a/q03_rf_rfe/build.py +++ b/q03_rf_rfe/build.py @@ -1,3 +1,4 @@ +# %load q03_rf_rfe/build.py # Default imports import pandas as pd @@ -7,5 +8,18 @@ from sklearn.ensemble import RandomForestClassifier -# Your solution code here +# Your solution code hered +def rf_rfe(df): + + x = df.iloc[:,:-1] + y = df.iloc[:,-1] + #creating a model + Ra = RandomForestClassifier() + rf = RFE(Ra) + #selecting best features all avaiable features + rf.fit(x,y) + most_sig = list(x.columns[rf.support_]) + return most_sig +rf_rfe(data) + diff --git a/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc b/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..2c093d8 Binary files /dev/null and b/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc b/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc new file mode 100644 index 0000000..177a35f Binary files /dev/null and b/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc differ diff --git a/q04_select_from_model/__pycache__/__init__.cpython-36.pyc b/q04_select_from_model/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..2fd8e3e Binary files /dev/null and b/q04_select_from_model/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_select_from_model/__pycache__/build.cpython-36.pyc b/q04_select_from_model/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..43ef503 Binary files /dev/null and b/q04_select_from_model/__pycache__/build.cpython-36.pyc differ diff --git a/q04_select_from_model/build.py b/q04_select_from_model/build.py index 12dd1df..45b2b33 100644 --- a/q04_select_from_model/build.py +++ b/q04_select_from_model/build.py @@ -1,3 +1,4 @@ +# %load q04_select_from_model/build.py # Default imports from sklearn.feature_selection import SelectFromModel from sklearn.ensemble import RandomForestClassifier @@ -6,5 +7,16 @@ data = pd.read_csv('data/house_prices_multivariate.csv') - +np.random.seed(9) # Your solution code here +def select_from_model(df): + x = df.iloc[:,:-1] + y = df.iloc[:,-1] + Random = RandomForestClassifier() + best_features = SelectFromModel(Random) + best_features.fit(x,y) + feature_name = list(x.columns[best_features.get_support()]) + return feature_name +select_from_model(data) + + diff --git a/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc b/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..f11ce02 Binary files /dev/null and b/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc b/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc new file mode 100644 index 0000000..62bb0a3 Binary files /dev/null and b/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc differ diff --git a/q05_forward_selected/__pycache__/__init__.cpython-36.pyc b/q05_forward_selected/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..edc7811 Binary files /dev/null and b/q05_forward_selected/__pycache__/__init__.cpython-36.pyc differ diff --git a/q05_forward_selected/__pycache__/build.cpython-36.pyc b/q05_forward_selected/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..8f323ba Binary files /dev/null and b/q05_forward_selected/__pycache__/build.cpython-36.pyc differ diff --git a/q05_forward_selected/build.py b/q05_forward_selected/build.py index 8816abd..b73e5ed 100644 --- a/q05_forward_selected/build.py +++ b/q05_forward_selected/build.py @@ -1,6 +1,10 @@ +# %load q05_forward_selected/build.py # Default imports import pandas as pd from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, accuracy_score +model = LinearRegression() data = pd.read_csv('data/house_prices_multivariate.csv') @@ -8,3 +12,92 @@ # Your solution code here + + + +def forward_selected(data, model): + X = data.iloc[:,:-1] + y = data.iloc[:,-1] + l = [] + + score = -1000 + c = '' + variable_1 = [] + variable_2 = [] + column = list(X.columns) + for i in range(len(column)): + + + for col in column: + #print(col) + l.append(col) + model.fit(X[l],y) + acc = model.score(X[l],y) + #print(col, acc) + if acc > score: + score = acc + c = col + l.pop(len(l)-1) +# print(' ') + #print(col,c, score, acc) + #print(' ') + if c in l: + pass + else: + l.append(c) + column.remove(c) + variable_2.append(c) + variable_1.append(score) + return variable_2, variable_1 + + +var1 , var2 = forward_selected(data, model) +var2 +# from sklearn.linear_model import LinearRegression +# from sklearn.model_selection import train_test_split +# from sklearn.metrics import r2_score, accuracy_score +# model = LinearRegression() + + + +# def forward_selected(X,y,i): +# X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state =i) +# l = [] + +# score = -1000 +# c = '' +# variable_1 = [] +# variable_2 = [] +# column = list(X_train.columns) +# for i in range(len(column)): + + +# for col in column: +# #print(col) +# l.append(col) +# model.fit(X_train[l],y_train) +# y_pred = model.predict(X_test[l]) +# acc = r2_score(y_pred, y_test) +# #print(col, acc) +# if acc > score: +# score = acc +# c = col +# l.pop(len(l)-1) +# # print(' ') +# # print(c, score) +# if c in l: +# pass +# else: +# l.append(c) +# column.remove(c) +# variable_2.append(c) +# variable_1.append(score) +# return variable_2 +# data.columns + + + + + + + diff --git a/q05_forward_selected/tests/__pycache__/__init__.cpython-36.pyc b/q05_forward_selected/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..d5ebebd Binary files /dev/null and b/q05_forward_selected/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q05_forward_selected/tests/__pycache__/test_q05_forward_selected.cpython-36.pyc b/q05_forward_selected/tests/__pycache__/test_q05_forward_selected.cpython-36.pyc new file mode 100644 index 0000000..daf02de Binary files /dev/null and b/q05_forward_selected/tests/__pycache__/test_q05_forward_selected.cpython-36.pyc differ