diff --git a/analysis_of_tags.ipynb b/analysis_of_tags.ipynb new file mode 100644 index 0000000..0236607 --- /dev/null +++ b/analysis_of_tags.ipynb @@ -0,0 +1,639 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analysis of tags" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tags are our class labels. Since we are trying to predict them, we should dive deep and understand them well. After removing all duplicated data, the full dataset is left with 4.2 million data points and 42k unique tags.\n", + "How many times each tag appears is worth understanding, so I counted the occurrences and stored them in a dictionary. In the full dataset, for example, the “.a” tag appears in 18 questions, the “.app” tag appears in 37 questions, and so on; the table below shows the same kind of counts for the subset used here. Remember that a tag never appears more than once in the same question." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save time, only a subset of the original dataset is used here." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn as sk\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.feature_extraction.text import CountVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdTitleBodyTags
01How to check if an uploaded file is an image w...<p>I'd like to check if an uploaded file is an...php image-processing file-upload upload mime-t...
12How can I prevent firefox from closing when I ...<p>In my favorite editor (vim), I regularly us...firefox
23R Error Invalid type (list) for variable<p>I am import matlab file and construct a dat...r matlab machine-learning
34How do I replace special characters in a URL?<p>This is probably very simple, but I simply ...c# url encoding
45How to modify whois contact details?<pre><code>function modify(.......)\\r\\n{\\r\\n ...php api file-get-contents
\n", + "
" + ], + "text/plain": [ + " Id Title \\\n", + "0 1 How to check if an uploaded file is an image w... \n", + "1 2 How can I prevent firefox from closing when I ... \n", + "2 3 R Error Invalid type (list) for variable \n", + "3 4 How do I replace special characters in a URL? \n", + "4 5 How to modify whois contact details? \n", + "\n", + " Body \\\n", + "0

I'd like to check if an uploaded file is an... \n", + "1

In my favorite editor (vim), I regularly us... \n", + "2

I am import matlab file and construct a dat... \n", + "3

This is probably very simple, but I simply ... \n", + "4

function modify(.......)\\r\\n{\\r\\n  ...   \n",
+       "\n",
+       "                                                Tags  \n",
+       "0  php image-processing file-upload upload mime-t...  \n",
+       "1                                            firefox  \n",
+       "2                          r matlab machine-learning  \n",
+       "3                                    c# url encoding  \n",
+       "4                          php api file-get-contents  "
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(\"train_000.csv\")\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataframe shape:  (41234, 4)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Dataframe shape: \",df.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Removing Duplicates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of duplicate questions :  0\n",
+      "Dataframe shape after removing duplicate values :  (41234, 4)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\lenovo\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
+      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
+     ]
+    }
+   ],
+   "source": [
+    "duplicates = df.duplicated('Title')  # mask built on df itself, so it is already aligned with df's index (no sort, no reindex warning)\n",
+    "print(\"Total number of duplicate questions : \", duplicates.sum())\n",
+    "df = df[~duplicates].copy()  # keep the first occurrence of each title; copy so later column assignments act on an independent frame\n",
+    "print(\"Dataframe shape after removing duplicate values : \", df.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Data Analysis on Tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdTitleBodyTagstag_count
01How to check if an uploaded file is an image w...<p>I'd like to check if an uploaded file is an...php image-processing file-upload upload mime-t...5
12How can I prevent firefox from closing when I ...<p>In my favorite editor (vim), I regularly us...firefox1
23R Error Invalid type (list) for variable<p>I am import matlab file and construct a dat...r matlab machine-learning3
34How do I replace special characters in a URL?<p>This is probably very simple, but I simply ...c# url encoding3
45How to modify whois contact details?<pre><code>function modify(.......)\\r\\n{\\r\\n ...php api file-get-contents3
\n", + "
" + ], + "text/plain": [ + " Id Title \\\n", + "0 1 How to check if an uploaded file is an image w... \n", + "1 2 How can I prevent firefox from closing when I ... \n", + "2 3 R Error Invalid type (list) for variable \n", + "3 4 How do I replace special characters in a URL? \n", + "4 5 How to modify whois contact details? \n", + "\n", + " Body \\\n", + "0

I'd like to check if an uploaded file is an... \n", + "1

In my favorite editor (vim), I regularly us... \n", + "2

I am import matlab file and construct a dat... \n", + "3

This is probably very simple, but I simply ... \n", + "4

function modify(.......)\\r\\n{\\r\\n  ...   \n",
+       "\n",
+       "                                                Tags  tag_count  \n",
+       "0  php image-processing file-upload upload mime-t...          5  \n",
+       "1                                            firefox          1  \n",
+       "2                          r matlab machine-learning          3  \n",
+       "3                                    c# url encoding          3  \n",
+       "4                          php api file-get-contents          3  "
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"tag_count\"] = df[\"Tags\"].str.split().str.len()  # vectorised whitespace split + length; a missing Tags value gives NaN instead of raising\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3    11922\n",
+       "2    10963\n",
+       "4     7870\n",
+       "1     5641\n",
+       "5     4838\n",
+       "Name: tag_count, dtype: int64"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"tag_count\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Maximum number of tags in a question:  5\n",
+      "Minimum number of tags in a question:  1\n",
+      "Average number of tags in a question:  2.8860406460687784\n"
+     ]
+    }
+   ],
+   "source": [
+    "print( \"Maximum number of tags in a question: \", df[\"tag_count\"].max())\n",
+    "print( \"Minimum number of tags in a question: \", df[\"tag_count\"].min())\n",
+    "print( \"Average number of tags in a question: \", df[\"tag_count\"].mean())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Text(0, 0.5, 'Frequency')"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.countplot(df[\"tag_count\"])\n", + "plt.title(\"Number of tags in a questions \")\n", + "plt.xlabel(\"Number of Tags\")\n", + "plt.ylabel(\"Frequency\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Counting frequency of different tags and saving it to a dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = CountVectorizer(tokenizer= lambda text : text.split(\" \"))\n", + "tag_dtm = vectorizer.fit_transform(df[\"Tags\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['.class-file',\n", + " '.each',\n", + " '.emf',\n", + " '.hgtags',\n", + " '.htaccess',\n", + " '.htpasswd',\n", + " '.mov',\n", + " '.net',\n", + " '.net-1.1',\n", + " '.net-2.0',\n", + " '.net-3.5',\n", + " '.net-4.0',\n", + " '.net-4.5',\n", + " '.net-assembly',\n", + " '.net-cf-3.5']" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tags = vectorizer.get_feature_names()\n", + "tags[:15]" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "freqs = tag_dtm.sum(axis=0).A1\n", + "result = dict(zip(tags,freqs))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TagsCounts
0.class-file1
1.each6
2.emf1
3.hgtags1
4.htaccess146
\n", + "
" + ], + "text/plain": [ + " Tags Counts\n", + "0 .class-file 1\n", + "1 .each 6\n", + "2 .emf 1\n", + "3 .hgtags 1\n", + "4 .htaccess 146" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag_df = pd.DataFrame(result.items(), columns=[\"Tags\", \"Counts\"])\n", + "tag_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "tag_df_sorted = tag_df.sort_values(['Counts'], ascending=False)\n", + "tag_counts = tag_df_sorted[\"Counts\"].values" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(tag_counts[:50])\n", + "plt.scatter(x= list(range(0,50,2)), y = tag_counts[0:50:2], c= 'yellow')\n", + "plt.scatter(x= list(range(0,50,10)), y = tag_counts[0:50:10], c = \"blue\")\n", + "plt.grid()\n", + "plt.xlabel(\"Tag Number\")\n", + "plt.ylabel(\"Number of times the tag Appear\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "itr=np.arange(20)\n", + "tag_df_sorted.head(20).plot(kind='bar')\n", + "plt.title('Frequency of top 20 tags')\n", + "plt.xticks(itr, tag_df_sorted['Tags'])\n", + "plt.xlabel('Tags')\n", + "plt.ylabel('Counts')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### From the above bar plot we can see that C# ,java,php are the most searched tags on stackoverflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}