diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc index 3337b5d..f188c6d 100644 Binary files a/__pycache__/__init__.cpython-36.pyc and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_Unique_users_subreddit/__pycache__/__init__.cpython-36.pyc b/q01_Unique_users_subreddit/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..e8efba7 Binary files /dev/null and b/q01_Unique_users_subreddit/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_Unique_users_subreddit/__pycache__/build.cpython-36.pyc b/q01_Unique_users_subreddit/__pycache__/build.cpython-36.pyc index ca2efed..fd43e03 100644 Binary files a/q01_Unique_users_subreddit/__pycache__/build.cpython-36.pyc and b/q01_Unique_users_subreddit/__pycache__/build.cpython-36.pyc differ diff --git a/q01_Unique_users_subreddit/build.py b/q01_Unique_users_subreddit/build.py index 82fe7e5..a8e9d31 100644 --- a/q01_Unique_users_subreddit/build.py +++ b/q01_Unique_users_subreddit/build.py @@ -1,7 +1,20 @@ +# %load q01_Unique_users_subreddit/build.py import pandas as pd import numpy as np from sklearn.model_selection import train_test_split -def q01_Unique_users_subreddit(): +def q01_Unique_users_subreddit(path): + + df = pd.read_csv(path, compression='zip') + variable1 = len(df['username'].unique()) + variable2 = len(df['subreddit'].unique()) + + return df,variable1, variable2 + + +path = 'data/subreddit-interactions-for-25000-users.zip' +q01_Unique_users_subreddit(path) + + diff --git a/q01_Unique_users_subreddit/tests/__pycache__/__init__.cpython-36.pyc b/q01_Unique_users_subreddit/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..2dac1b9 Binary files /dev/null and b/q01_Unique_users_subreddit/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_Unique_users_subreddit/tests/__pycache__/test.cpython-36.pyc b/q01_Unique_users_subreddit/tests/__pycache__/test.cpython-36.pyc new file mode 100644 index 0000000..91111fa Binary files /dev/null and b/q01_Unique_users_subreddit/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q02_top_subreddits_wordcloud/__pycache__/__init__.cpython-36.pyc b/q02_top_subreddits_wordcloud/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..ad71a27 Binary files /dev/null and b/q02_top_subreddits_wordcloud/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_top_subreddits_wordcloud/__pycache__/build.cpython-36.pyc b/q02_top_subreddits_wordcloud/__pycache__/build.cpython-36.pyc index 98c1cbb..d724505 100644 Binary files a/q02_top_subreddits_wordcloud/__pycache__/build.cpython-36.pyc and b/q02_top_subreddits_wordcloud/__pycache__/build.cpython-36.pyc differ diff --git a/q02_top_subreddits_wordcloud/build.py b/q02_top_subreddits_wordcloud/build.py index ead5d42..899a6da 100644 --- a/q02_top_subreddits_wordcloud/build.py +++ b/q02_top_subreddits_wordcloud/build.py @@ -1,3 +1,4 @@ +# %load q02_top_subreddits_wordcloud/build.py import pandas as pd import numpy as np import matplotlib.pyplot as plt @@ -5,6 +6,29 @@ from sklearn.model_selection import train_test_split from greyatomlib.recommendor_system_project.q01_Unique_users_subreddit.build import q01_Unique_users_subreddit -def q02_top_subreddits_wordcloud(): +def q02_top_subreddits_wordcloud(path): + + # importing data + df, u_user, u_subreddit = q01_Unique_users_subreddit(path) + # Generating a DataFrame that comprise count of each username by subreddit + df_count_subreddit = df.groupby('subreddit')['username'].count().reset_index().sort_values('username',ascending = False) + # setting subreddit name as index of dataframe + df_count_subreddit.index = df_count_subreddit['subreddit'] + df_count_subreddit.drop('subreddit', inplace = True,axis = 1) + #creating dictionary of dataframe where key is subreddit name and value is frequency of particular subreddit + d = df_count_subreddit.to_dict()['username'] + # creating object of wordCloud + wordcloud = WordCloud() + # generating wordcloud with frequencies store in dictionary + wordcloud.generate_from_frequencies(frequencies=d) + plt.figure() + plt.imshow(wordcloud, interpolation='bilinear') + plt.axis('off') + plt.show() + + +path = 'data/subreddit-interactions-for-25000-users.zip' +q02_top_subreddits_wordcloud(path) +ls diff --git a/q02_top_subreddits_wordcloud/tests/__pycache__/__init__.cpython-36.pyc b/q02_top_subreddits_wordcloud/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..1c349b2 Binary files /dev/null and b/q02_top_subreddits_wordcloud/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_top_subreddits_wordcloud/tests/__pycache__/test.cpython-36.pyc b/q02_top_subreddits_wordcloud/tests/__pycache__/test.cpython-36.pyc new file mode 100644 index 0000000..205eba2 Binary files /dev/null and b/q02_top_subreddits_wordcloud/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q03_plot_topK_subreddit_of_a_user/__pycache__/__init__.cpython-36.pyc b/q03_plot_topK_subreddit_of_a_user/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..b8d2571 Binary files /dev/null and b/q03_plot_topK_subreddit_of_a_user/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_plot_topK_subreddit_of_a_user/__pycache__/build.cpython-36.pyc b/q03_plot_topK_subreddit_of_a_user/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..e02ab36 Binary files /dev/null and b/q03_plot_topK_subreddit_of_a_user/__pycache__/build.cpython-36.pyc differ diff --git a/q03_plot_topK_subreddit_of_a_user/build.py b/q03_plot_topK_subreddit_of_a_user/build.py index 59a1799..9993eb8 100644 --- a/q03_plot_topK_subreddit_of_a_user/build.py +++ b/q03_plot_topK_subreddit_of_a_user/build.py @@ -1,8 +1,18 @@ +# %load q03_plot_topK_subreddit_of_a_user/build.py import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from greyatomlib.recommendor_system_project.q01_Unique_users_subreddit.build import q01_Unique_users_subreddit -def q03_plot_topK_subreddit_of_a_user(): +def q03_plot_topK_subreddit_of_a_user(path, user='kabanossi', k= 14): + df, u_user, u_subreddit = q01_Unique_users_subreddit(path) + df1= df.groupby('subreddit')['username'].count().reset_index().sort_values('username',ascending=False) + df1['percentage'] = df1['username'].apply(lambda value: (float(value)/total_user)*100) + return df1[:k] + +path = 'data/subreddit-interactions-for-25000-users.zip' +q03_plot_topK_subreddit_of_a_user(path, user='kabanossi', k= 14) + + diff --git a/q03_plot_topK_subreddit_of_a_user/tests/__pycache__/__init__.cpython-36.pyc b/q03_plot_topK_subreddit_of_a_user/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..3886876 Binary files /dev/null and b/q03_plot_topK_subreddit_of_a_user/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_plot_topK_subreddit_of_a_user/tests/__pycache__/test.cpython-36.pyc b/q03_plot_topK_subreddit_of_a_user/tests/__pycache__/test.cpython-36.pyc new file mode 100644 index 0000000..200d9e7 Binary files /dev/null and b/q03_plot_topK_subreddit_of_a_user/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q04_weightage/__pycache__/__init__.cpython-36.pyc b/q04_weightage/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..a885472 Binary files /dev/null and b/q04_weightage/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_weightage/__pycache__/build.cpython-36.pyc b/q04_weightage/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..a06bdec Binary files /dev/null and b/q04_weightage/__pycache__/build.cpython-36.pyc differ diff --git a/q04_weightage/build.py b/q04_weightage/build.py index f7c9792..7fe61f7 100644 --- a/q04_weightage/build.py +++ b/q04_weightage/build.py @@ -1,6 +1,24 @@ +# %load q04_weightage/build.py import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from greyatomlib.recommendor_system_project.q01_Unique_users_subreddit.build import q01_Unique_users_subreddit -def q04_weightage(): +def q04_weightage(path): + + df, u_user, u_subreddit = q01_Unique_users_subreddit(path) + # minimum value in utc + mininum = min(df['utc']) + #maximum value in utc for normalization + maximum = max(df['utc']) + + # creating weight column + df['weight'] = ((df['utc'] - mininum)+1)/maximum + + return df + + +path = 'data/subreddit-interactions-for-25000-users.zip' +q04_weightage(path) + + diff --git a/q04_weightage/tests/__pycache__/__init__.cpython-36.pyc b/q04_weightage/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..cc04cfd Binary files /dev/null and b/q04_weightage/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_weightage/tests/__pycache__/test.cpython-36.pyc b/q04_weightage/tests/__pycache__/test.cpython-36.pyc new file mode 100644 index 0000000..8a875ad Binary files /dev/null and b/q04_weightage/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q05_groupby_users_subreddit/__pycache__/__init__.cpython-36.pyc b/q05_groupby_users_subreddit/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..1dcad8b Binary files /dev/null and b/q05_groupby_users_subreddit/__pycache__/__init__.cpython-36.pyc differ diff --git a/q05_groupby_users_subreddit/__pycache__/build.cpython-36.pyc b/q05_groupby_users_subreddit/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..a12cbcd Binary files /dev/null and b/q05_groupby_users_subreddit/__pycache__/build.cpython-36.pyc differ diff --git a/q05_groupby_users_subreddit/build.py b/q05_groupby_users_subreddit/build.py index 556dbc4..766336a 100644 --- a/q05_groupby_users_subreddit/build.py +++ b/q05_groupby_users_subreddit/build.py @@ -1,7 +1,17 @@ +# %load q05_groupby_users_subreddit/build.py import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from greyatomlib.recommendor_system_project.q04_weightage.build import q04_weightage -def q05_groupby_users_subreddit(): +def q05_groupby_users_subreddit(path): + + df = q04_weightage(path) + df1 = df.groupby(['username','subreddit'])['weights'].sum().reset_index() + + return df1 + +path = 'data/subreddit-interactions-for-25000-users.zip' +q05_groupby_users_subreddit(path) + diff --git a/q05_groupby_users_subreddit/tests/__pycache__/__init__.cpython-36.pyc b/q05_groupby_users_subreddit/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..0c74a31 Binary files /dev/null and b/q05_groupby_users_subreddit/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q05_groupby_users_subreddit/tests/__pycache__/test.cpython-36.pyc b/q05_groupby_users_subreddit/tests/__pycache__/test.cpython-36.pyc new file mode 100644 index 0000000..d5f6d50 Binary files /dev/null and b/q05_groupby_users_subreddit/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q06_similarity/__pycache__/__init__.cpython-36.pyc b/q06_similarity/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..4dedadc Binary files /dev/null and b/q06_similarity/__pycache__/__init__.cpython-36.pyc differ diff --git a/q06_similarity/__pycache__/build.cpython-36.pyc b/q06_similarity/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..3f59fd0 Binary files /dev/null and b/q06_similarity/__pycache__/build.cpython-36.pyc differ diff --git a/q06_similarity/build.py b/q06_similarity/build.py index 387e3b0..547e5c9 100644 --- a/q06_similarity/build.py +++ b/q06_similarity/build.py @@ -1,3 +1,4 @@ +# %load q06_similarity/build.py import pandas as pd import numpy as np @@ -6,7 +7,7 @@ from greyatomlib.recommendor_system_project.q05_groupby_users_subreddit.build import q05_groupby_users_subreddit def q06_similarity(path, kind='subreddit', similarity_function=cosine_similarity): - "write your solution here" + 'write your solution here' df = q05_groupby_users_subreddit(path) df01 = df.iloc[:100,:] matrix= df01.pivot_table(values='weights',columns='subreddit',index='username') @@ -22,3 +23,6 @@ def q06_similarity(path, kind='subreddit', similarity_function=cosine_similarity a = q06_similarity('data/subreddit-interactions-for-25000-users.zip') print(a) + + + diff --git a/q06_similarity/tests/__pycache__/__init__.cpython-36.pyc b/q06_similarity/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..9c29d90 Binary files /dev/null and b/q06_similarity/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q06_similarity/tests/__pycache__/test.cpython-36.pyc b/q06_similarity/tests/__pycache__/test.cpython-36.pyc new file mode 100644 index 0000000..e46588d Binary files /dev/null and b/q06_similarity/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q06_similarity/tests/test_sol.pkl b/q06_similarity/tests/test_sol.pkl new file mode 100644 index 0000000..c143620 Binary files /dev/null and b/q06_similarity/tests/test_sol.pkl differ diff --git a/q06_similarity/tests/user_sol.pkl b/q06_similarity/tests/user_sol.pkl new file mode 100644 index 0000000..ed8b5e6 Binary files /dev/null and b/q06_similarity/tests/user_sol.pkl differ diff --git a/q07_recommendations/__pycache__/__init__.cpython-36.pyc b/q07_recommendations/__pycache__/__init__.cpython-36.pyc index 6fe9d54..b6a5c34 100644 Binary files a/q07_recommendations/__pycache__/__init__.cpython-36.pyc and b/q07_recommendations/__pycache__/__init__.cpython-36.pyc differ diff --git a/q07_recommendations/__pycache__/build.cpython-36.pyc b/q07_recommendations/__pycache__/build.cpython-36.pyc index 5e2b99a..b4fabcd 100644 Binary files a/q07_recommendations/__pycache__/build.cpython-36.pyc and b/q07_recommendations/__pycache__/build.cpython-36.pyc differ diff --git a/q07_recommendations/build.py b/q07_recommendations/build.py index f3ed7f3..3b1d9f2 100644 --- a/q07_recommendations/build.py +++ b/q07_recommendations/build.py @@ -1,3 +1,4 @@ +# %load q07_recommendations/build.py import pandas as pd import numpy as np from sklearn.model_selection import train_test_split @@ -5,7 +6,7 @@ from greyatomlib.recommendor_system_project.q06_similarity.build import q06_similarity def q07_recommendations(path, user='--ANUSTART-', similarity_function=cosine_similarity, kind='subreddit', number=5): - "write your solution here" + 'write your solution here' new_df, matrix = q06_similarity(path, kind='subreddit', similarity_function=cosine_similarity) final_dict = dict() sorted_sub = matrix.loc[user,:].sort_values(ascending=False).index @@ -19,3 +20,7 @@ def q07_recommendations(path, user='--ANUSTART-', similarity_function=cosine_sim final = [x[0] for x in sorted_dict] recommend = [x for x in final if matrix.loc[user,x]==0.0] return recommend[0:number] + + + + diff --git a/q07_recommendations/tests/__pycache__/__init__.cpython-36.pyc b/q07_recommendations/tests/__pycache__/__init__.cpython-36.pyc index 9328f69..d5af7c8 100644 Binary files a/q07_recommendations/tests/__pycache__/__init__.cpython-36.pyc and b/q07_recommendations/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q07_recommendations/tests/__pycache__/test.cpython-36.pyc b/q07_recommendations/tests/__pycache__/test.cpython-36.pyc index f2918bb..bee1417 100644 Binary files a/q07_recommendations/tests/__pycache__/test.cpython-36.pyc and b/q07_recommendations/tests/__pycache__/test.cpython-36.pyc differ