From 377e9e48052b9c40ea65b509dd0d39e291e27b15 Mon Sep 17 00:00:00 2001 From: alexjmsherman Date: Fri, 27 Apr 2018 23:13:41 -0400 Subject: [PATCH] remove N (size of corpus vocabulary) from phrase model formula as it is not used in gensim implementation --- executable/Modern_NLP_in_Python.ipynb | 258 +++++++------------------- 1 file changed, 67 insertions(+), 191 deletions(-) diff --git a/executable/Modern_NLP_in_Python.ipynb b/executable/Modern_NLP_in_Python.ipynb index ed4d489..d92a696 100644 --- a/executable/Modern_NLP_in_Python.ipynb +++ b/executable/Modern_NLP_in_Python.ipynb @@ -81,9 +81,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -131,9 +129,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -178,9 +174,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -242,9 +236,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -335,9 +327,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import spacy\n", @@ -357,9 +347,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -414,9 +402,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -442,9 +428,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -497,9 +481,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -712,7 +694,6 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [ @@ -805,9 +786,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1222,9 +1201,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1701,9 +1678,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2185,9 +2160,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3004,13 +2977,12 @@ "source": [ "_Phrase modeling_ is another approach to learning combinations of tokens that together represent meaningful multi-word concepts. We can develop phrase models by looping over the the words in our reviews and looking for words that _co-occur_ (i.e., appear one after another) together much more frequently than you would expect them to by random chance. The formula our phrase models will use to determine whether two tokens $A$ and $B$ constitute a phrase is:\n", "\n", - "$$\\frac{count(A\\ B) - count_{min}}{count(A) * count(B)} * N > threshold$$\n", + "$$\\frac{count(A\\ B) - count_{min}}{count(A) * count(B)} > threshold$$\n", "\n", "...where:\n", "* $count(A)$ is the number of times token $A$ appears in the corpus\n", "* $count(B)$ is the number of times token $B$ appears in the corpus\n", "* $count(A\\ B)$ is the number of times the tokens $A\\ B$ appear in the corpus *in order*\n", - "* $N$ is the total size of the corpus vocabulary\n", "* $count_{min}$ is a user-defined parameter to ensure that accepted phrases occur a minimum number of times\n", "* $threshold$ is a user-defined parameter to control how strong of a relationship between two tokens the model requires before accepting them as a phrase\n", "\n", @@ -3122,9 +3094,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3157,9 +3127,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "unigram_sentences = LineSentence(unigram_sentences_filepath)" @@ -3175,9 +3143,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3233,9 +3199,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3283,9 +3247,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3326,9 +3288,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3385,9 +3345,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3435,9 +3393,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3478,9 +3434,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3548,9 +3502,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3600,9 +3552,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3688,9 +3638,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from gensim.corpora import Dictionary, MmCorpus\n", @@ -3724,9 +3672,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3804,9 +3750,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3854,9 +3798,7 @@ { "cell_type": "code", "execution_count": 44, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3920,9 +3862,7 @@ { "cell_type": "code", "execution_count": 46, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -4034,9 +3974,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')\n", @@ -4072,9 +4010,7 @@ { "cell_type": "code", "execution_count": 50, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -4113,9 +4049,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -4259,9 +4193,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lda_description(review_text, min_topic_freq=0.05):\n", @@ -4309,7 +4241,6 @@ "cell_type": "code", "execution_count": 54, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [ @@ -4330,9 +4261,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -4351,9 +4280,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -4376,9 +4303,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -4457,9 +4382,7 @@ { "cell_type": "code", "execution_count": 58, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from gensim.models import Word2Vec\n", @@ -4478,9 +4401,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -4528,9 +4449,7 @@ { "cell_type": "code", "execution_count": 60, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -4554,9 +4473,7 @@ { "cell_type": "code", "execution_count": 90, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -6449,9 +6366,7 @@ { "cell_type": "code", "execution_count": 64, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6491,9 +6406,7 @@ { "cell_type": "code", "execution_count": 65, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6535,9 +6448,7 @@ { "cell_type": "code", "execution_count": 66, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6618,9 +6529,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6651,9 +6560,7 @@ { "cell_type": "code", "execution_count": 69, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6690,9 +6597,7 @@ { "cell_type": "code", "execution_count": 70, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6729,9 +6634,7 @@ { "cell_type": "code", "execution_count": 71, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6764,9 +6667,7 @@ { "cell_type": "code", "execution_count": 72, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6799,9 +6700,7 @@ { "cell_type": "code", "execution_count": 73, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6834,9 +6733,7 @@ { "cell_type": "code", "execution_count": 74, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6869,9 +6766,7 @@ { "cell_type": "code", "execution_count": 75, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6898,7 +6793,6 @@ "cell_type": "code", "execution_count": 76, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [ @@ -6924,9 +6818,7 @@ { "cell_type": "code", "execution_count": 77, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6950,9 +6842,7 @@ { "cell_type": "code", "execution_count": 78, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6983,9 +6873,7 @@ { "cell_type": "code", "execution_count": 79, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -7038,9 +6926,7 @@ { "cell_type": "code", "execution_count": 81, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "tsne_input = word_vectors.drop(spacy.en.STOPWORDS, errors=u'ignore')\n", @@ -7050,9 +6936,7 @@ { "cell_type": "code", "execution_count": 82, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -7263,9 +7147,7 @@ { "cell_type": "code", "execution_count": 93, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -7309,9 +7191,7 @@ { "cell_type": "code", "execution_count": 94, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -7394,9 +7274,7 @@ { "cell_type": "code", "execution_count": 88, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -7564,9 +7442,7 @@ { "cell_type": "code", "execution_count": 89, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -7783,23 +7659,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.6.3" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 }