From 228f585fb29e3d94c6b3b402e92f3b344cdcfdb3 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 5 May 2020 12:54:20 +0000 Subject: [PATCH 1/4] Small update to Project.ipynb for the first part of the tutorial --- .gitignore | 4 +- Project/SageMaker Project.ipynb | 244 ++++++++++++++++++++++++++------ 2 files changed, 202 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index f48746098..2a693d3be 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ __pycache__/ # Distribution / packaging .Python env/ +data/ +cache/ build/ develop-eggs/ dist/ @@ -105,4 +107,4 @@ ENV/ # Notebook files Sentiment Analysis/aclImdb_v1.tar.gz -Sentiment Analysis/aclImdb/ \ No newline at end of file +Sentiment Analysis/aclImdb/ diff --git a/Project/SageMaker Project.ipynb b/Project/SageMaker Project.ipynb index af1816cf2..20da9f4c2 100644 --- a/Project/SageMaker Project.ipynb +++ b/Project/SageMaker Project.ipynb @@ -53,9 +53,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-05-05 10:05:05-- http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", + "Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10\n", + "Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 84125825 (80M) [application/x-gzip]\n", + "Saving to: ‘../data/aclImdb_v1.tar.gz’\n", + "\n", + "../data/aclImdb_v1. 
100%[===================>] 80.23M 19.2MB/s in 6.7s \n", + "\n", + "2020-05-05 10:05:12 (12.0 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]\n", + "\n" + ] + } + ], "source": [ "%mkdir ../data\n", "!wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", @@ -73,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -109,9 +127,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg\n" + ] + } + ], "source": [ "data, labels = read_imdb_data()\n", "print(\"IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg\".format(\n", @@ -128,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -153,9 +179,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IMDb reviews (combined): train = 25000, test = 25000\n" + ] + } + ], "source": [ "train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)\n", "print(\"IMDb reviews (combined): train = {}, test = {}\".format(len(train_X), len(test_X)))" @@ -170,12 +204,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\"Meatball Machine\" has got to be one of the most complex ridiculous, awful and over-exaggerated sci-fi horror films that I have ever came across. 
It is about good against evil and a coming-of-age tale, with the aim of to entertain with bloody, sleazy and humorous context. Because of that the violence isn't particularly gruesome and it doesn't make you squirm, but the gratuitous bloodletting and nudity does run freely. The performances by Issei Takahashi and Toru Tezuka is the worst i have seen, if that was not enough it is also directed by an unheard of director called Yudai Yamaguchi. This movie just have it all, it is bad to the bone!, A must see for every b-movie freak!!!... Simply: an enjoying and rare gem.\n", + "0\n" + ] + } + ], "source": [ - "print(train_X[100])\n", - "print(train_y[100])" + "print(train_X[3])\n", + "print(train_y[3])" ] }, { @@ -187,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -220,11 +263,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['first', 'saw', 'film', 'cabl', 'instantli', 'becam', 'one', 'favorit', 'movi', 'big', 'fan', 'jame', 'earl', 'jone', 'robert', 'duval', 'movi', 'paint', 'accur', 'pictur', 'south', 'racist', 'attitud', 'attitud', 'came', 'soll', 'old', 'plantat', 'owner', 'use', 'convict', 'labor', 'soll', 'make', 'move', 'funni', 'rambl', 'give', 'us', 'insight', 'way', 'south', 'back', 'suppos', 'soll', 'live', 'today', 'would', 'diagnos', 'alzheim', 'diseas', 'none', 'less', 'attitud', 'toward', 'littl', 'boy', 'come', 'work', 'convict', 'complex', 'racist', 'view', 'grown', 'trust', 'convict', 'black', 'two', 'convict', 'trust', 'jackson', 'mel', 'winkler', 'ben', 'jame', 'earl', 'jone', 'convers', 'ben', 'soll', 'best', 'movi', 'real', 'chemistri', 'jame', 'earl', 'jone', 'mel', 'winkler', 'great', 'perform', 'well', 'hass', 'movi', 'gotten', 'notorieti', 'howev', 'dvd', 'worth', 'money', 'rayvyn']\n" + ] + } + ], "source": 
[ - "# TODO: Apply review_to_words to a review (train_X[100] or any other review)\n" + "# TODO: Apply review_to_words to a review (train_X[100] or any other review)\n", + "print(review_to_words(train_X[100]))\n" ] }, { @@ -238,7 +290,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Answer:**" + "**Answer:** \n", + "\n", + "* It removes html tags from the review.\n", + "* It converts all characters to lower case.\n", + "* It splits the review into separate words.\n", + "* It removes stop words." ] }, { @@ -250,7 +307,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -298,9 +355,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrote preprocessed data to cache file: preprocessed_data.pkl\n" + ] + } + ], "source": [ "# Preprocess data\n", "train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)" ] }, { @@ -330,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -341,13 +406,16 @@ " \n", " # TODO: Determine how often each word appears in `data`. 
Note that `data` is a list of sentences and that a\n", " # sentence is a list of words.\n", - " \n", - " word_count = {} # A dict storing the words that appear in the reviews along with how often they occur\n", + " word_count = {}\n", + " for sentence in data:\n", + " for word in sentence:\n", + " word_count[word] = word_count[word] + 1 if (word in word_count) else 1 \n", " \n", " # TODO: Sort the words found in `data` so that sorted_words[0] is the most frequently appearing word and\n", " # sorted_words[-1] is the least frequently appearing word.\n", - " \n", - " sorted_words = None\n", + "\n", + " sorted_words = [key for key, value in sorted(word_count.items(), key=lambda item: item[1])]\n", + " sorted_words.reverse() \n", " \n", " word_dict = {} # This is what we are building, a dictionary that translates words into integers\n", " for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'\n", @@ -358,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -381,11 +449,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['movi', 'film', 'one', 'like', 'time']\n" + ] + } + ], "source": [ - "# TODO: Use this space to determine the five most frequently appearing words in the training set." 
+ "# TODO: Use this space to determine the five most frequently appearing words in the training set.\n", + "frequent = list(word_dict.keys())[:5]\n", + "# Yes it does as most of these words are common when discussing movies (either negative or positive)" ] }, { @@ -399,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -410,7 +488,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -429,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -461,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -478,11 +556,55 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use this cell to examine one of the processed reviews to make sure everything is working as intended." 
+ "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 28 135 3 1693 3018 812 4 395 2 116 123 508 3841 1273\n", + " 497 3780 2 989 1441 269 1120 2408 1516 1516 333 1 72 1\n", + " 1520 65 1975 3928 1 8 195 84 3699 57 98 1530 31 1120\n", + " 64 278 1 75 433 15 1 1 2627 537 264 1516 611 52\n", + " 237 45 40 1975 991 2408 234 1854 1341 1975 247 42 1975 1341\n", + " 1452 3192 1 893 508 3841 1273 1415 893 1 53 2 71 1084\n", + " 508 3841 1273 3192 1 26 60 17 1 2 1690 1 115 197\n", + " 218 200 1 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0] 101\n" + ] + } + ], + "source": [ + "# Use this cell to examine one of the processed reviews to make sure everything is working as intended.\n", + "print(train_X[100])" ] }, { @@ -514,7 +636,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -536,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
59, "metadata": {}, "outputs": [], "source": [ @@ -552,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -566,6 +688,38 @@ "**NOTE:** The cell above uploads the entire contents of our data directory. This includes the `word_dict.pkl` file. This is fortunate as we will need this later on when we create an endpoint that accepts an arbitrary review. For now, we will just take note of the fact that it resides in the data directory (and so also in the S3 training bucket) and that we will need to make sure it gets saved in the model directory." ] }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "On branch master\r\n", + "Your branch is up-to-date with 'origin/master'.\r\n", + "\r\n", + "Changes not staged for commit:\r\n", + " (use \"git add ...\" to update what will be committed)\r\n", + " (use \"git checkout -- ...\" to discard changes in working directory)\r\n", + "\r\n", + "\t\u001b[31mmodified: SageMaker Project.ipynb\u001b[m\r\n", + "\r\n", + "Untracked files:\r\n", + " (use \"git add ...\" to include in what will be committed)\r\n", + "\r\n", + "\t\u001b[31m../cache/\u001b[m\r\n", + "\t\u001b[31m../data/\u001b[m\r\n", + "\r\n", + "no changes added to commit (use \"git add\" and/or \"git commit -a\")\r\n" + ] + } + ], + "source": [ + "!cat ../.giti" + ] + }, { "cell_type": "markdown", "metadata": {}, From 8ee5a1db175906ff0cc46d473eeab10601067646 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 5 May 2020 15:02:28 +0000 Subject: [PATCH 2/4] Start on pytorch --- Project/SageMaker Project.ipynb | 103 ++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 39 deletions(-) diff --git a/Project/SageMaker Project.ipynb b/Project/SageMaker Project.ipynb index 20da9f4c2..93e71437e 100644 --- a/Project/SageMaker Project.ipynb +++ b/Project/SageMaker Project.ipynb @@ 
-688,38 +688,6 @@ "**NOTE:** The cell above uploads the entire contents of our data directory. This includes the `word_dict.pkl` file. This is fortunate as we will need this later on when we create an endpoint that accepts an arbitrary review. For now, we will just take note of the fact that it resides in the data directory (and so also in the S3 training bucket) and that we will need to make sure it gets saved in the model directory." ] }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "On branch master\r\n", - "Your branch is up-to-date with 'origin/master'.\r\n", - "\r\n", - "Changes not staged for commit:\r\n", - " (use \"git add ...\" to update what will be committed)\r\n", - " (use \"git checkout -- ...\" to discard changes in working directory)\r\n", - "\r\n", - "\t\u001b[31mmodified: SageMaker Project.ipynb\u001b[m\r\n", - "\r\n", - "Untracked files:\r\n", - " (use \"git add ...\" to include in what will be committed)\r\n", - "\r\n", - "\t\u001b[31m../cache/\u001b[m\r\n", - "\t\u001b[31m../data/\u001b[m\r\n", - "\r\n", - "no changes added to commit (use \"git add\" and/or \"git commit -a\")\r\n" - ] - } - ], - "source": [ - "!cat ../.giti" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -739,9 +707,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 75, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch.nn\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mnn\u001b[39;49;00m\r\n", + "\r\n", + "\u001b[34mclass\u001b[39;49;00m \u001b[04m\u001b[32mLSTMClassifier\u001b[39;49;00m(nn.Module):\r\n", + " \u001b[33m\"\"\"\u001b[39;49;00m\r\n", + "\u001b[33m This is the simple RNN model we will be using to perform Sentiment Analysis.\u001b[39;49;00m\r\n", + "\u001b[33m \"\"\"\u001b[39;49;00m\r\n", 
+ "\r\n", + " \u001b[34mdef\u001b[39;49;00m \u001b[32m__init__\u001b[39;49;00m(\u001b[36mself\u001b[39;49;00m, embedding_dim, hidden_dim, vocab_size):\r\n", + " \u001b[33m\"\"\"\u001b[39;49;00m\r\n", + "\u001b[33m Initialize the model by settingg up the various layers.\u001b[39;49;00m\r\n", + "\u001b[33m \"\"\"\u001b[39;49;00m\r\n", + " \u001b[36msuper\u001b[39;49;00m(LSTMClassifier, \u001b[36mself\u001b[39;49;00m).\u001b[32m__init__\u001b[39;49;00m()\r\n", + "\r\n", + " \u001b[36mself\u001b[39;49;00m.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=\u001b[34m0\u001b[39;49;00m)\r\n", + " \u001b[36mself\u001b[39;49;00m.lstm = nn.LSTM(embedding_dim, hidden_dim)\r\n", + " \u001b[36mself\u001b[39;49;00m.dense = nn.Linear(in_features=hidden_dim, out_features=\u001b[34m1\u001b[39;49;00m)\r\n", + " \u001b[36mself\u001b[39;49;00m.sig = nn.Sigmoid()\r\n", + " \r\n", + " \u001b[36mself\u001b[39;49;00m.word_dict = \u001b[36mNone\u001b[39;49;00m\r\n", + "\r\n", + " \u001b[34mdef\u001b[39;49;00m \u001b[32mforward\u001b[39;49;00m(\u001b[36mself\u001b[39;49;00m, x):\r\n", + " \u001b[33m\"\"\"\u001b[39;49;00m\r\n", + "\u001b[33m Perform a forward pass of our model on some input.\u001b[39;49;00m\r\n", + "\u001b[33m \"\"\"\u001b[39;49;00m\r\n", + " x = x.t()\r\n", + " lengths = x[\u001b[34m0\u001b[39;49;00m,:]\r\n", + " reviews = x[\u001b[34m1\u001b[39;49;00m:,:]\r\n", + " embeds = \u001b[36mself\u001b[39;49;00m.embedding(reviews)\r\n", + " lstm_out, _ = \u001b[36mself\u001b[39;49;00m.lstm(embeds)\r\n", + " out = \u001b[36mself\u001b[39;49;00m.dense(lstm_out)\r\n", + " out = out[lengths - \u001b[34m1\u001b[39;49;00m, \u001b[36mrange\u001b[39;49;00m(\u001b[36mlen\u001b[39;49;00m(lengths))]\r\n", + " \u001b[34mreturn\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m.sig(out.squeeze())\r\n" + ] + } + ], "source": [ "!pygmentize train/model.py" ] @@ -757,7 +764,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 85, "metadata": {}, "outputs": [], 
"source": [ @@ -768,6 +775,7 @@ "train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None, nrows=250)\n", "\n", "# Turn the input pandas dataframe into tensors\n", + "\n", "train_sample_y = torch.from_numpy(train_sample[[0]].values).float().squeeze()\n", "train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()\n", "\n", @@ -788,7 +796,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -803,8 +811,13 @@ " batch_y = batch_y.to(device)\n", " \n", " # TODO: Complete this train method to train the model provided.\n", - " \n", + " optimizer.zero_grad()\n", + " output = model(batch_X)\n", + " loss = loss_fn(output, batch_y)\n", + " loss.backward()\n", " total_loss += loss.data.item()\n", + " optimizer.step()\n", + " \n", " print(\"Epoch: {}, BCELoss: {}\".format(epoch, total_loss / len(train_loader)))" ] }, @@ -817,9 +830,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 91, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch: 1, BCELoss: 0.691946291923523\n", + "Epoch: 2, BCELoss: 0.6817818284034729\n", + "Epoch: 3, BCELoss: 0.6726435661315918\n", + "Epoch: 4, BCELoss: 0.6623459339141846\n", + "Epoch: 5, BCELoss: 0.6493294477462769\n" + ] + } + ], "source": [ "import torch.optim as optim\n", "from train.model import LSTMClassifier\n", From 7b2568f0a73445539d2ff65b0700f910d2a921ff Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 7 May 2020 13:50:25 +0000 Subject: [PATCH 3/4] Add endpoint and and update html --- ...GBoost (Updating a Model) - Solution.ipynb | 79 ++- Project/SageMaker Project.ipynb | 494 +++++++++++++++--- Project/serve/predict.py | 9 +- Project/train/train.py | 33 +- Project/website/index.html | 2 +- 5 files changed, 520 insertions(+), 97 deletions(-) diff --git a/Mini-Projects/IMDB Sentiment Analysis - XGBoost 
(Updating a Model) - Solution.ipynb b/Mini-Projects/IMDB Sentiment Analysis - XGBoost (Updating a Model) - Solution.ipynb index 49f38937b..1f77e8fa2 100644 --- a/Mini-Projects/IMDB Sentiment Analysis - XGBoost (Updating a Model) - Solution.ipynb +++ b/Mini-Projects/IMDB Sentiment Analysis - XGBoost (Updating a Model) - Solution.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -433,9 +447,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'os' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprefix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'sentiment-update'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mtest_location\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'test.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprefix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mval_location\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'validation.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprefix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mtrain_location\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'train.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprefix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'os' is not defined" + ] + } + ], "source": [ "import sagemaker\n", "\n", @@ -470,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -483,9 +509,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='0.90-1'. 
For example:\n", + "\tget_image_uri(region, 'xgboost', '0.90-1').\n" + ] + } + ], "source": [ "# We need to retrieve the location of the container which is provided by Amazon for using XGBoost.\n", "# As a matter of convenience, the training and inference code both use the same container.\n", @@ -496,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -531,9 +566,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'train_location' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ms3_input_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msagemaker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ms3_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_location\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontent_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0ms3_input_validation\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msagemaker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ms3_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mval_location\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontent_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'train_location' is not defined" + ] + } + ], "source": [ "s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')\n", "s3_input_validation = 
sagemaker.s3_input(s3_data=val_location, content_type='csv')" @@ -561,9 +608,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'xgb' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mxgb_transformer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransformer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minstance_count\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minstance_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'ml.m4.xlarge'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'xgb' is not defined" + ] + } + ], "source": [ "xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')" ] diff --git a/Project/SageMaker Project.ipynb b/Project/SageMaker Project.ipynb index 93e71437e..1a51b36e7 100644 --- a/Project/SageMaker Project.ipynb +++ b/Project/SageMaker Project.ipynb @@ -91,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -127,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -154,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -179,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -204,15 +204,15 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 30, "metadata": 
{}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\"Meatball Machine\" has got to be one of the most complex ridiculous, awful and over-exaggerated sci-fi horror films that I have ever came across. It is about good against evil and a coming-of-age tale, with the aim of to entertain with bloody, sleazy and humorous context. Because of that the violence isn't particularly gruesome and it doesn't make you squirm, but the gratuitous bloodletting and nudity does run freely. The performances by Issei Takahashi and Toru Tezuka is the worst i have seen, if that was not enough it is also directed by an unheard of director called Yudai Yamaguchi. This movie just have it all, it is bad to the bone!, A must see for every b-movie freak!!!... Simply: an enjoying and rare gem.\n", - "0\n" + "The blend of biography with poetry and live action with animation makes this a true work of art. The narration by Sir Michael Redgrave is moving. The length of the work makes it easily accessible for class room exposure or TV/Video time slots.\n", + "1\n" ] } ], @@ -230,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -263,14 +263,14 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['first', 'saw', 'film', 'cabl', 'instantli', 'becam', 'one', 'favorit', 'movi', 'big', 'fan', 'jame', 'earl', 'jone', 'robert', 'duval', 'movi', 'paint', 'accur', 'pictur', 'south', 'racist', 'attitud', 'attitud', 'came', 'soll', 'old', 'plantat', 'owner', 'use', 'convict', 'labor', 'soll', 'make', 'move', 'funni', 'rambl', 'give', 'us', 'insight', 'way', 'south', 'back', 'suppos', 'soll', 'live', 'today', 'would', 'diagnos', 'alzheim', 'diseas', 'none', 'less', 'attitud', 'toward', 'littl', 'boy', 'come', 'work', 'convict', 'complex', 'racist', 'view', 'grown', 'trust', 'convict', 'black', 
'two', 'convict', 'trust', 'jackson', 'mel', 'winkler', 'ben', 'jame', 'earl', 'jone', 'convers', 'ben', 'soll', 'best', 'movi', 'real', 'chemistri', 'jame', 'earl', 'jone', 'mel', 'winkler', 'great', 'perform', 'well', 'hass', 'movi', 'gotten', 'notorieti', 'howev', 'dvd', 'worth', 'money', 'rayvyn']\n" + "['devil', 'hunter', 'gain', 'notorieti', 'fact', 'dpp', 'video', 'nasti', 'list', 'realli', 'mani', 'film', 'list', 'god', 'dpp', 'known', 'reason', 'tamest', 'bunch', 'lot', 'warrant', 'ban', 'shame', 'never', 'would', 'sat', 'fact', 'shop', 'list', 'plot', 'actual', 'give', 'film', 'decent', 'base', 'least', 'decent', 'base', 'cannib', 'film', 'follow', 'actress', 'kidnap', 'drag', 'amazon', 'jungl', 'hunter', 'hire', 'find', 'along', 'way', 'brave', 'nativ', 'lead', 'man', 'call', 'devil', 'henc', 'titl', 'film', 'basic', 'plod', 'along', 'eighti', 'five', 'minut', 'realli', 'mani', 'scene', 'interest', 'real', 'shame', 'jess', 'franco', 'end', 'make', 'film', 'like', 'man', 'clearli', 'talent', 'seen', 'film', 'diabol', 'dr', 'z', 'venu', 'fur', 'faceless', 'kill', 'ecstasi', 'unfortun', 'good', 'film', 'gem', 'amongst', 'heap', 'crap', 'devil', 'hunter', 'much', 'part', 'crap', 'saw', 'film', 'pure', 'want', 'abl', 'say', 'seen', 'everyth', 'dpp', 'list', 'two', 'go', 'guess', 'peopl', 'seen', 'saw', 'lookout', 'nasti', 'realli', 'reason', 'bother', 'one']\n" ] } ], @@ -307,7 +307,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -355,14 +355,14 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wrote preprocessed data to cache file: preprocessed_data.pkl\n" + "Read preprocessed data from cache file: preprocessed_data.pkl\n" ] } ], @@ -395,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ 
@@ -426,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -449,17 +449,9 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 37, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['movi', 'film', 'one', 'like', 'time']\n" - ] - } - ], + "outputs": [], "source": [ "# TODO: Use this space to determine the five most frequently appearing words in the training set.\n", "frequent = list(word_dict.keys())[:5]\n", @@ -477,7 +469,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -488,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -507,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -539,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -618,7 +610,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Answer:**" + "**Answer:** This could be a problem as the results are stored as variables as opposed to in a cached file. This means the results are not being stored."
] }, { @@ -636,9 +628,21 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'train_y' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_y\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_X_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_X\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'train.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'train_y' is not defined" + ] + } + ], "source": [ "import pandas as pd\n", " \n", @@ -658,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -674,10 +678,11 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ + "data_dir = '../data/pytorch'\n", "input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)" ] }, @@ -879,18 +884,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from sagemaker.pytorch import PyTorch\n", "\n", + "\n", "estimator = PyTorch(entry_point=\"train.py\",\n", " source_dir=\"train\",\n", " role=role,\n", " framework_version='0.4.0',\n", " train_instance_count=1,\n", - " train_instance_type='ml.p2.xlarge',\n", + " train_instance_type='ml.m4.xlarge',\n", " hyperparameters={\n", " 'epochs': 10,\n", " 'hidden_dim': 200,\n", @@ -901,7 +907,180 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-05-07 10:30:45 Starting - Starting the training job......\n", + "2020-05-07 10:31:26 Starting - Launching requested ML instances......\n", + "2020-05-07 10:32:24 Starting - Preparing the instances for training......\n", + "2020-05-07 10:33:43 Downloading - Downloading input data\n", + "2020-05-07 10:33:43 Training - Downloading the training image...\n", + "2020-05-07 10:34:02 Training - Training image download completed. 
Training in progress.\u001b[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device\u001b[0m\n", + "\u001b[34mbash: no job control in this shell\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:03,146 sagemaker-containers INFO Imported framework sagemaker_pytorch_container.training\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:03,149 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:03,162 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,578 sagemaker_pytorch_container.training INFO Invoking user training script.\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,861 sagemaker-containers INFO Module train does not provide a setup.py. \u001b[0m\n", + "\u001b[34mGenerating setup.py\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,861 sagemaker-containers INFO Generating setup.cfg\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,861 sagemaker-containers INFO Generating MANIFEST.in\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,861 sagemaker-containers INFO Installing module with the following command:\u001b[0m\n", + "\u001b[34m/usr/bin/python -m pip install -U . 
-r requirements.txt\u001b[0m\n", + "\u001b[34mProcessing /opt/ml/code\u001b[0m\n", + "\u001b[34mCollecting pandas (from -r requirements.txt (line 1))\n", + " Downloading https://files.pythonhosted.org/packages/74/24/0cdbf8907e1e3bc5a8da03345c23cbed7044330bb8f73bb12e711a640a00/pandas-0.24.2-cp35-cp35m-manylinux1_x86_64.whl (10.0MB)\u001b[0m\n", + "\u001b[34mCollecting numpy (from -r requirements.txt (line 2))\n", + " Downloading https://files.pythonhosted.org/packages/38/92/fa5295d9755c7876cb8490eab866e1780154033fa45978d9cf74ffbd4c68/numpy-1.18.4-cp35-cp35m-manylinux1_x86_64.whl (20.0MB)\u001b[0m\n", + "\u001b[34mCollecting nltk (from -r requirements.txt (line 3))\n", + " Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)\u001b[0m\n", + "\u001b[34mCollecting beautifulsoup4 (from -r requirements.txt (line 4))\n", + " Downloading https://files.pythonhosted.org/packages/e8/b5/7bb03a696f2c9b7af792a8f51b82974e51c268f15e925fc834876a4efa0b/beautifulsoup4-4.9.0-py3-none-any.whl (109kB)\u001b[0m\n", + "\u001b[34mCollecting html5lib (from -r requirements.txt (line 5))\n", + " Downloading https://files.pythonhosted.org/packages/a5/62/bbd2be0e7943ec8504b517e62bab011b4946e1258842bc159e5dfde15b96/html5lib-1.0.1-py2.py3-none-any.whl (117kB)\u001b[0m\n", + "\u001b[34mCollecting pytz>=2011k (from pandas->-r requirements.txt (line 1))\n", + " Downloading https://files.pythonhosted.org/packages/4f/a4/879454d49688e2fad93e59d7d4efda580b783c745fd2ec2a3adf87b0808d/pytz-2020.1-py2.py3-none-any.whl (510kB)\u001b[0m\n", + "\u001b[34mRequirement already satisfied, skipping upgrade: python-dateutil>=2.5.0 in /usr/local/lib/python3.5/dist-packages (from pandas->-r requirements.txt (line 1)) (2.7.5)\u001b[0m\n", + "\u001b[34mRequirement already satisfied, skipping upgrade: click in /usr/local/lib/python3.5/dist-packages (from nltk->-r requirements.txt (line 3)) (7.0)\u001b[0m\n", + "\u001b[34mCollecting 
joblib (from nltk->-r requirements.txt (line 3))\n", + " Downloading https://files.pythonhosted.org/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14.1-py2.py3-none-any.whl (294kB)\u001b[0m\n", + "\u001b[34mCollecting regex (from nltk->-r requirements.txt (line 3))\u001b[0m\n", + "\u001b[34m Downloading https://files.pythonhosted.org/packages/4c/e7/eee73c42c1193fecc0e91361a163cbb8dfbea62c3db7618ad986e5b43a14/regex-2020.4.4.tar.gz (695kB)\u001b[0m\n", + "\u001b[34mCollecting tqdm (from nltk->-r requirements.txt (line 3))\n", + " Downloading https://files.pythonhosted.org/packages/c9/40/058b12e8ba10e35f89c9b1fdfc2d4c7f8c05947df2d5eb3c7b258019fda0/tqdm-4.46.0-py2.py3-none-any.whl (63kB)\u001b[0m\n", + "\u001b[34mCollecting soupsieve>1.2 (from beautifulsoup4->-r requirements.txt (line 4))\n", + " Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl\u001b[0m\n", + "\u001b[34mRequirement already satisfied, skipping upgrade: six>=1.9 in /usr/local/lib/python3.5/dist-packages (from html5lib->-r requirements.txt (line 5)) (1.11.0)\u001b[0m\n", + "\u001b[34mCollecting webencodings (from html5lib->-r requirements.txt (line 5))\n", + " Downloading https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl\u001b[0m\n", + "\u001b[34mBuilding wheels for collected packages: nltk, train, regex\n", + " Running setup.py bdist_wheel for nltk: started\u001b[0m\n", + "\u001b[34m Running setup.py bdist_wheel for nltk: finished with status 'done'\n", + " Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306\n", + " Running setup.py bdist_wheel for train: started\u001b[0m\n", + "\u001b[34m Running setup.py bdist_wheel for train: finished with status 'done'\n", + " Stored in directory: 
/tmp/pip-ephem-wheel-cache-4dxw0rzu/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3\n", + " Running setup.py bdist_wheel for regex: started\u001b[0m\n", + "\u001b[34m Running setup.py bdist_wheel for regex: finished with status 'done'\n", + " Stored in directory: /root/.cache/pip/wheels/e6/9b/ae/2972da29cc7759b71dee015813b7c6931917d6a51e64ed5e79\u001b[0m\n", + "\u001b[34mSuccessfully built nltk train regex\u001b[0m\n", + "\u001b[34mInstalling collected packages: numpy, pytz, pandas, joblib, regex, tqdm, nltk, soupsieve, beautifulsoup4, webencodings, html5lib, train\n", + " Found existing installation: numpy 1.15.4\u001b[0m\n", + "\u001b[34m Uninstalling numpy-1.15.4:\u001b[0m\n", + "\u001b[34m Successfully uninstalled numpy-1.15.4\u001b[0m\n", + "\u001b[34mSuccessfully installed beautifulsoup4-4.9.0 html5lib-1.0.1 joblib-0.14.1 nltk-3.5 numpy-1.18.4 pandas-0.24.2 pytz-2020.1 regex-2020.4.4 soupsieve-2.0 tqdm-4.46.0 train-1.0.0 webencodings-0.5.1\u001b[0m\n", + "\u001b[34mYou are using pip version 18.1, however version 20.1 is available.\u001b[0m\n", + "\u001b[34mYou should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:27,264 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:27,278 sagemaker-containers INFO Invoking user script\n", + "\u001b[0m\n", + "\u001b[34mTraining Env:\n", + "\u001b[0m\n", + "\u001b[34m{\n", + " \"module_name\": \"train\",\n", + " \"input_dir\": \"/opt/ml/input\",\n", + " \"hyperparameters\": {\n", + " \"hidden_dim\": 200,\n", + " \"epochs\": 10\n", + " },\n", + " \"log_level\": 20,\n", + " \"module_dir\": \"s3://sagemaker-eu-west-2-705833918113/sagemaker-pytorch-2020-05-07-10-30-44-825/source/sourcedir.tar.gz\",\n", + " \"num_gpus\": 0,\n", + " \"input_data_config\": {\n", + " \"training\": {\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " 
\"RecordWrapperType\": \"None\"\n", + " }\n", + " },\n", + " \"network_interface_name\": \"eth0\",\n", + " \"output_dir\": \"/opt/ml/output\",\n", + " \"channel_input_dirs\": {\n", + " \"training\": \"/opt/ml/input/data/training\"\n", + " },\n", + " \"output_data_dir\": \"/opt/ml/output/data\",\n", + " \"input_config_dir\": \"/opt/ml/input/config\",\n", + " \"num_cpus\": 4,\n", + " \"job_name\": \"sagemaker-pytorch-2020-05-07-10-30-44-825\",\n", + " \"additional_framework_parameters\": {},\n", + " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", + " \"framework_module\": \"sagemaker_pytorch_container.training:main\",\n", + " \"user_entry_point\": \"train.py\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"current_host\": \"algo-1\",\n", + " \"resource_config\": {\n", + " \"network_interface_name\": \"eth0\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"current_host\": \"algo-1\"\n", + " },\n", + " \"model_dir\": \"/opt/ml/model\"\u001b[0m\n", + "\u001b[34m}\n", + "\u001b[0m\n", + "\u001b[34mEnvironment variables:\n", + "\u001b[0m\n", + "\u001b[34mSM_HP_HIDDEN_DIM=200\u001b[0m\n", + "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", + "\u001b[34mSM_HPS={\"epochs\":10,\"hidden_dim\":200}\u001b[0m\n", + "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", + "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", + "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", + "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", + "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", + "\u001b[34mSM_CHANNELS=[\"training\"]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", + "\u001b[34mPYTHONPATH=/usr/local/bin:/usr/lib/python35.zip:/usr/lib/python3.5:/usr/lib/python3.5/plat-x86_64-linux-gnu:/usr/lib/python3.5/lib-dynload:/usr/local/lib/python3.5/dist-packages:/usr/lib/python3/dist-packages\u001b[0m\n", + 
"\u001b[34mSM_USER_ARGS=[\"--epochs\",\"10\",\"--hidden_dim\",\"200\"]\u001b[0m\n", + "\u001b[34mSM_INPUT_DATA_CONFIG={\"training\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", + "\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", + "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", + "\u001b[34mSM_CHANNEL_TRAINING=/opt/ml/input/data/training\u001b[0m\n", + "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", + "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", + "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", + "\u001b[34mSM_MODULE_DIR=s3://sagemaker-eu-west-2-705833918113/sagemaker-pytorch-2020-05-07-10-30-44-825/source/sourcedir.tar.gz\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main\u001b[0m\n", + "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", + "\u001b[34mSM_HP_EPOCHS=10\u001b[0m\n", + 
"\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"training\":\"/opt/ml/input/data/training\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_pytorch_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"epochs\":10,\"hidden_dim\":200},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"training\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"job_name\":\"sagemaker-pytorch-2020-05-07-10-30-44-825\",\"log_level\":20,\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-eu-west-2-705833918113/sagemaker-pytorch-2020-05-07-10-30-44-825/source/sourcedir.tar.gz\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\n", + "\u001b[0m\n", + "\u001b[34mInvoking script with the following command:\n", + "\u001b[0m\n", + "\u001b[34m/usr/bin/python -m train --epochs 10 --hidden_dim 200\n", + "\n", + "\u001b[0m\n", + "\u001b[34mUsing device cpu.\u001b[0m\n", + "\u001b[34mGet train data loader.\u001b[0m\n", + "\u001b[34mModel loaded with embedding_dim 32, hidden_dim 200, vocab_size 5000.\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mEpoch: 1, BCELoss: 0.6686128937468236\u001b[0m\n", + "\u001b[34mEpoch: 5, BCELoss: 0.36999733411535923\u001b[0m\n", + "\u001b[34mEpoch: 7, BCELoss: 0.3187107614108494\u001b[0m\n", + "\u001b[34mEpoch: 8, BCELoss: 0.31308953798547084\u001b[0m\n", + "\u001b[34mEpoch: 9, BCELoss: 0.2818753390896077\u001b[0m\n", + "\u001b[34mEpoch: 10, BCELoss: 0.2627122727583866\u001b[0m\n", + 
"\u001b[34m2020-05-07 12:24:00,607 sagemaker-containers INFO Reporting training SUCCESS\u001b[0m\n", + "\n", + "2020-05-07 12:24:10 Uploading - Uploading generated training model\n", + "2020-05-07 12:24:10 Completed - Training job completed\n", + "Training seconds: 6643\n", + "Billable seconds: 6643\n" + ] + } + ], "source": [ "estimator.fit({'training': input_data})" ] @@ -933,11 +1112,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ - "# TODO: Deploy the trained model" + "# TODO: Deploy the trained model\n", + "predictor = estimator.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')" ] }, { @@ -951,7 +1131,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -960,7 +1140,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -977,7 +1157,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -987,9 +1167,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.84836" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import accuracy_score\n", "accuracy_score(test_y, predictions)" @@ -1006,7 +1197,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Answer:**" + "**Answer:**\n", + "This models accuracy score is very close to the XGBoost models accuracy score.\n", + "\n", + "These two models might perform differently as they work better with different sizes of data sets whilst neural networks work better with very large data sets they are often out performed by alternatives with smaller data sets. 
So even though neural network might be better designed for natural language processing the size of the data set meant it was unable to outperform the XGBoost model. Also XGBoost tends to perform better on more structured data like the one used in this project.\n", + "\n", + "In this instance an XGBoost might be preferable as it needs less computing power to be trained however if the data sample was much larger the neural network model would likly out perform the XGBoost model.\n" ] }, { @@ -1020,7 +1216,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1044,12 +1240,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 1374, 50, 53, 3, 4, 878, 173, 392, 682, 29, 723, 2, 4412, 275, 2081, 1059, 760, 1, 581, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n" + ] + } + ], "source": [ "# TODO: Convert test_review into a form usable by the model and save the results in test_data\n", - "test_data = None" + "test_review_to_words = review_to_words(test_review)\n", + "test_data, _ = convert_and_pad(word_dict, test_review_to_words)\n", + "print(test_data)" ] }, { @@ -1061,11 +1267,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array(0.7069678, dtype=float32)" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "predictor.predict(test_data)" + "predictor.predict([test_data])" ] }, { @@ -1120,9 +1337,108 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36margparse\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mjson\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mos\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mpickle\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36msys\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36msagemaker_containers\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mpandas\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mpd\u001b[39;49;00m\r\n", + 
"\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mnumpy\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mnp\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch.nn\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mnn\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch.optim\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36moptim\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch.utils.data\u001b[39;49;00m\r\n", + "\r\n", + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mmodel\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m LSTMClassifier\r\n", + "\r\n", + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mutils\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m review_to_words, convert_and_pad\r\n", + "\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32mmodel_fn\u001b[39;49;00m(model_dir):\r\n", + " \u001b[33m\"\"\"Load the PyTorch model from the `model_dir` directory.\"\"\"\u001b[39;49;00m\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mLoading model.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + "\r\n", + " \u001b[37m# First, load the parameters used to create the model.\u001b[39;49;00m\r\n", + " model_info = {}\r\n", + " model_info_path = os.path.join(model_dir, \u001b[33m'\u001b[39;49;00m\u001b[33mmodel_info.pth\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mwith\u001b[39;49;00m \u001b[36mopen\u001b[39;49;00m(model_info_path, \u001b[33m'\u001b[39;49;00m\u001b[33mrb\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[34mas\u001b[39;49;00m f:\r\n", + " model_info = torch.load(f)\r\n", + "\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mmodel_info: {}\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m.format(model_info))\r\n", + "\r\n", + " \u001b[37m# 
Determine the device and construct the model.\u001b[39;49;00m\r\n", + " device = torch.device(\u001b[33m\"\u001b[39;49;00m\u001b[33mcuda\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m \u001b[34mif\u001b[39;49;00m torch.cuda.is_available() \u001b[34melse\u001b[39;49;00m \u001b[33m\"\u001b[39;49;00m\u001b[33mcpu\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " model = LSTMClassifier(model_info[\u001b[33m'\u001b[39;49;00m\u001b[33membedding_dim\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m], model_info[\u001b[33m'\u001b[39;49;00m\u001b[33mhidden_dim\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m], model_info[\u001b[33m'\u001b[39;49;00m\u001b[33mvocab_size\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m])\r\n", + "\r\n", + " \u001b[37m# Load the store model parameters.\u001b[39;49;00m\r\n", + " model_path = os.path.join(model_dir, \u001b[33m'\u001b[39;49;00m\u001b[33mmodel.pth\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mwith\u001b[39;49;00m \u001b[36mopen\u001b[39;49;00m(model_path, \u001b[33m'\u001b[39;49;00m\u001b[33mrb\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[34mas\u001b[39;49;00m f:\r\n", + " model.load_state_dict(torch.load(f))\r\n", + "\r\n", + " \u001b[37m# Load the saved word_dict.\u001b[39;49;00m\r\n", + " word_dict_path = os.path.join(model_dir, \u001b[33m'\u001b[39;49;00m\u001b[33mword_dict.pkl\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mwith\u001b[39;49;00m \u001b[36mopen\u001b[39;49;00m(word_dict_path, \u001b[33m'\u001b[39;49;00m\u001b[33mrb\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[34mas\u001b[39;49;00m f:\r\n", + " model.word_dict = pickle.load(f)\r\n", + "\r\n", + " model.to(device).eval()\r\n", + "\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mDone loading model.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " \u001b[34mreturn\u001b[39;49;00m model\r\n", + "\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32minput_fn\u001b[39;49;00m(serialized_input_data, 
content_type):\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mDeserializing the input data.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mif\u001b[39;49;00m content_type == \u001b[33m'\u001b[39;49;00m\u001b[33mtext/plain\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m:\r\n", + " data = serialized_input_data.decode(\u001b[33m'\u001b[39;49;00m\u001b[33mutf-8\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mreturn\u001b[39;49;00m data\r\n", + " \u001b[34mraise\u001b[39;49;00m \u001b[36mException\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mRequested unsupported ContentType in content_type: \u001b[39;49;00m\u001b[33m'\u001b[39;49;00m + content_type)\r\n", + "\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32moutput_fn\u001b[39;49;00m(prediction_output, accept):\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mSerializing the generated output.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mreturn\u001b[39;49;00m \u001b[36mstr\u001b[39;49;00m(prediction_output)\r\n", + "\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32mpredict_fn\u001b[39;49;00m(input_data, model):\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mInferring sentiment of input data.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + "\r\n", + " device = torch.device(\u001b[33m\"\u001b[39;49;00m\u001b[33mcuda\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m \u001b[34mif\u001b[39;49;00m torch.cuda.is_available() \u001b[34melse\u001b[39;49;00m \u001b[33m\"\u001b[39;49;00m\u001b[33mcpu\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " \r\n", + " \u001b[34mif\u001b[39;49;00m model.word_dict \u001b[35mis\u001b[39;49;00m \u001b[36mNone\u001b[39;49;00m:\r\n", + " \u001b[34mraise\u001b[39;49;00m \u001b[36mException\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mModel has not been loaded properly, no word_dict.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " 
\r\n", + " \u001b[37m# TODO: Process input_data so that it is ready to be sent to our model.\u001b[39;49;00m\r\n", + " \u001b[37m# You should produce two variables:\u001b[39;49;00m\r\n", + " \u001b[37m# data_X - A sequence of length 500 which represents the converted review\u001b[39;49;00m\r\n", + " \u001b[37m# data_len - The length of the review\u001b[39;49;00m\r\n", + "\r\n", + " data_X = \u001b[36mNone\u001b[39;49;00m\r\n", + " data_len = \u001b[36mNone\u001b[39;49;00m\r\n", + "\r\n", + " \u001b[37m# Using data_X and data_len we construct an appropriate input tensor. Remember\u001b[39;49;00m\r\n", + " \u001b[37m# that our model expects input data of the form 'len, review[500]'.\u001b[39;49;00m\r\n", + " data_pack = np.hstack((data_len, data_X))\r\n", + " data_pack = data_pack.reshape(\u001b[34m1\u001b[39;49;00m, -\u001b[34m1\u001b[39;49;00m)\r\n", + " \r\n", + " data = torch.from_numpy(data_pack)\r\n", + " data = data.to(device)\r\n", + "\r\n", + " \u001b[37m# Make sure to put the model into evaluation mode\u001b[39;49;00m\r\n", + " model.eval()\r\n", + "\r\n", + " \u001b[37m# TODO: Compute the result of applying the model to the input data. The variable `result` should\u001b[39;49;00m\r\n", + " \u001b[37m# be a numpy array which contains a single integer which is either 1 or 0\u001b[39;49;00m\r\n", + "\r\n", + " result = \u001b[36mNone\u001b[39;49;00m\r\n", + "\r\n", + " \u001b[34mreturn\u001b[39;49;00m result\r\n" + ] + } + ], "source": [ "!pygmentize serve/predict.py" ] @@ -1149,9 +1465,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------!" 
+ ] + } + ], "source": [ "from sagemaker.predictor import RealTimePredictor\n", "from sagemaker.pytorch import PyTorchModel\n", @@ -1180,11 +1504,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import glob\n", + "import os\n", "\n", "def test_reviews(data_dir='../data/aclImdb', stop=250):\n", " \n", @@ -1212,7 +1537,10 @@ " # Read in the review and convert to 'utf-8' for transmission via HTTP\n", " review_input = review.read().encode('utf-8')\n", " # Send the review to the predictor and store the results\n", - " results.append(int(predictor.predict(review_input)))\n", + " result = predictor.predict(review_input)\n", + " result = result.decode('UTF-8')\n", + " result = eval(result)\n", + " results.append(int(result[0]))\n", " \n", " # Sending reviews to our endpoint one at a time takes a while so we\n", " # only send a small number of reviews\n", @@ -1225,18 +1553,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting pos files\n", + "Starting neg files\n" + ] + } + ], "source": [ - "ground, results = test_reviews()" + "ground, results= test_reviews()\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.848" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import accuracy_score\n", "accuracy_score(ground, results)" @@ -1251,9 +1599,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 226, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "b'[1.]'" + ] + }, + "execution_count": 226, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "predictor.predict(test_review)" 
] @@ -1401,7 +1760,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Answer:**" + "**Answer:**\n", + "\"You might have hoped that any commentary around a new drama about Nazis trying to establish a fourth Reich in modern-day America would not need include the word “timely”. But we are where we are.\"\n", + "\n", + "It predicted that the review was bad, which it was." ] }, { diff --git a/Project/serve/predict.py b/Project/serve/predict.py index 00c9149e6..450ebf307 100644 --- a/Project/serve/predict.py +++ b/Project/serve/predict.py @@ -69,9 +69,9 @@ def predict_fn(input_data, model): # You should produce two variables: # data_X - A sequence of length 500 which represents the converted review # data_len - The length of the review + review_words = review_to_words(input_data) - data_X = None - data_len = None + data_X, data_len = convert_and_pad(model.word_dict, review_words) # Using data_X and data_len we construct an appropriate input tensor. Remember # that our model expects input data of the form 'len, review[500]'. @@ -86,7 +86,6 @@ def predict_fn(input_data, model): # TODO: Compute the result of applying the model to the input data. The variable `result` should # be a numpy array which contains a single integer which is either 1 or 0 - - result = None - + result = model(data) + result = np.array([round(result.item())]) return result diff --git a/Project/train/train.py b/Project/train/train.py index 9cf9915b8..ac0a6e474 100644 --- a/Project/train/train.py +++ b/Project/train/train.py @@ -54,22 +54,25 @@ def _get_train_data_loader(batch_size, training_dir): return torch.utils.data.DataLoader(train_ds, batch_size=batch_size) - def train(model, train_loader, epochs, optimizer, loss_fn, device): - """ - This is the training method that is called by the PyTorch training script. The parameters - passed are as follows: - model - The PyTorch model that we wish to train. - train_loader - The PyTorch DataLoader that should be used during training. 
- epochs - The total number of epochs to train for. - optimizer - The optimizer to use during training. - loss_fn - The loss function used for training. - device - Where the model and data should be loaded (gpu or cpu). - """ - - # TODO: Paste the train() method developed in the notebook here. - - pass + for epoch in range(1, epochs + 1): + model.train() + total_loss = 0 + for batch in train_loader: + batch_X, batch_y = batch + + batch_X = batch_X.to(device) + batch_y = batch_y.to(device) + + # TODO: Complete this train method to train the model provided. + optimizer.zero_grad() + output = model(batch_X) + loss = loss_fn(output, batch_y) + loss.backward() + total_loss += loss.data.item() + optimizer.step() + + print("Epoch: {}, BCELoss: {}".format(epoch, total_loss / len(train_loader))) if __name__ == '__main__': diff --git a/Project/website/index.html b/Project/website/index.html index 6ae4feffb..fae8276e2 100644 --- a/Project/website/index.html +++ b/Project/website/index.html @@ -37,7 +37,7 @@

Is your review positive, or negative?

Enter your review below and click submit to find out...

From 2e851ebb4ee4bb72189c9714b8d7741430c02390 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 7 May 2020 15:37:40 +0000 Subject: [PATCH 4/4] Update a unfilled question --- Project/SageMaker Project.ipynb | 78 ++++++++++++++++----------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/Project/SageMaker Project.ipynb b/Project/SageMaker Project.ipynb index 1a51b36e7..b0fe3880e 100644 --- a/Project/SageMaker Project.ipynb +++ b/Project/SageMaker Project.ipynb @@ -53,23 +53,24 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-05-05 10:05:05-- http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", + "mkdir: cannot create directory ‘../data’: File exists\n", + "--2020-05-07 14:11:39-- http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", "Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10\n", "Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 84125825 (80M) [application/x-gzip]\n", "Saving to: ‘../data/aclImdb_v1.tar.gz’\n", "\n", - "../data/aclImdb_v1. 100%[===================>] 80.23M 19.2MB/s in 6.7s \n", + "../data/aclImdb_v1. 
100%[===================>] 80.23M 19.7MB/s in 6.9s \n", "\n", - "2020-05-05 10:05:12 (12.0 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]\n", + "2020-05-07 14:11:47 (11.6 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]\n", "\n" ] } @@ -91,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -127,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -154,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -179,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -204,15 +205,15 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The blend of biography with poetry and live action with animation makes this a true work of art. The narration by Sir Michael Redgrave is moving. The length of the work makes it easily accessible for class room exposure or TV/Video time slots.\n", - "1\n" + "I can not believe such slanted, jingoistic material is getting passed off to Americans as art house material. Early on, from such telling lines like \"we want to make sure they are playing for the right team\" and manipulative framing and lighting, A Love Divided shows it's true face. The crass manner in which the Irish Catholics are shown as hegemonic, the Protestants as peaceful and downtrodden, is as poor a representation of history as early US westerns that depict the struggle between cowboys and American Indians. The truth of the story is distorted with the stereotypes and outright vilification of the Irish Catholics in the story; a corruption admitted by the filmmakers themselves! 
It is sad that people today still think that they can win moral sway by making a film so easily recognized for it's obvious intent, so far from attempting art. This film has no business being anywhere in any legitimate cinema or library.\n", + "0\n" ] } ], @@ -230,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -263,14 +264,14 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['devil', 'hunter', 'gain', 'notorieti', 'fact', 'dpp', 'video', 'nasti', 'list', 'realli', 'mani', 'film', 'list', 'god', 'dpp', 'known', 'reason', 'tamest', 'bunch', 'lot', 'warrant', 'ban', 'shame', 'never', 'would', 'sat', 'fact', 'shop', 'list', 'plot', 'actual', 'give', 'film', 'decent', 'base', 'least', 'decent', 'base', 'cannib', 'film', 'follow', 'actress', 'kidnap', 'drag', 'amazon', 'jungl', 'hunter', 'hire', 'find', 'along', 'way', 'brave', 'nativ', 'lead', 'man', 'call', 'devil', 'henc', 'titl', 'film', 'basic', 'plod', 'along', 'eighti', 'five', 'minut', 'realli', 'mani', 'scene', 'interest', 'real', 'shame', 'jess', 'franco', 'end', 'make', 'film', 'like', 'man', 'clearli', 'talent', 'seen', 'film', 'diabol', 'dr', 'z', 'venu', 'fur', 'faceless', 'kill', 'ecstasi', 'unfortun', 'good', 'film', 'gem', 'amongst', 'heap', 'crap', 'devil', 'hunter', 'much', 'part', 'crap', 'saw', 'film', 'pure', 'want', 'abl', 'say', 'seen', 'everyth', 'dpp', 'list', 'two', 'go', 'guess', 'peopl', 'seen', 'saw', 'lookout', 'nasti', 'realli', 'reason', 'bother', 'one']\n" + "['mani', 'sourc', 'routin', 'lump', 'thought', 'provok', 'period', 'drama', 'part', 'base', 'histor', 'fact', 'togeth', 'superfici', 'similar', 'nunsploit', 'mainstay', '70', 'euro', 'trash', 'cinema', 'overlook', 'righteou', 'anger', 'drive', 'whole', 'endeavor', 'perhap', 'coincident', 'also', 'director', 'gianfranco', 'mingozzi', 
'singular', 'attempt', 'narr', 'film', 'make', 'outsid', 'mani', 'well', 'receiv', 'documentari', 'safe', 'set', 'within', 'histor', 'context', 'flavia', 'chart', 'grow', 'rebellion', 'earli', '15th', 'centuri', 'italian', 'nun', 'florinda', 'bolkan', 'career', 'perform', 'even', 'surpass', 'sterl', 'work', 'lucio', 'fulci', 'devast', 'tortur', 'duckl', 'lock', 'away', 'convent', 'nobleman', 'father', 'desper', 'attempt', 'curb', 'girl', 'bud', 'sensuou', 'natur', 'wonder', 'women', 'releg', 'secondari', 'role', 'best', 'life', 'holi', 'scriptur', 'confront', 'way', 'male', 'domin', 'ruptur', 'femal', 'live', 'inspir', 'revolt', 'fuel', 'rant', 'semi', 'craze', 'older', 'sister', 'agatha', 'indel', 'portray', 'veteran', 'actress', 'maria', 'casar', 'marcel', 'carn', 'le', 'enfant', 'du', 'paradi', 'construct', 'muslim', 'invas', 'join', 'oppressor', 'perhap', 'unwittingli', 'manipul', 'bid', 'flavia', 'truli', 'becom', 'outcast', 'alreadi', 'felt', 'expect', 'tragic', 'result', 'breathtak', 'widescreen', 'composit', 'alfio', 'contini', 'shot', 'michelangelo', 'antonioni', 'zabriski', 'point', 'uncompromis', 'auster', 'account', 'one', 'woman', 'fierc', 'yet', 'ultim', 'futil', 'fight', 'patriarch', 'societi', 'allot', 'right', 'beyond', 'childbear', 'whore', 'sister', 'agatha', 'wryli', 'remark', 'lengthi', 'drug', 'induc', 'fantasi', 'sequenc', 'clearli', 'model', 'ken', 'russel', 'otherwis', 'far', 'flamboy', 'devil', 'notwithstand', 'movi', 'turn', 'rel', 'stingi', 'skin', 'depart', 'make', 'someth', 'mockeri', 'semi', 'porn', 'reput', 'seriou', 'work', 'deserv', 'rediscoveri', 'restor', 'unjustli', 'tarnish', 'reput']\n" ] } ], @@ -307,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -355,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -395,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 35, + 
"execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -426,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -449,12 +450,21 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 43, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['movi', 'film', 'one', 'like', 'time']\n" + ] + } + ], "source": [ "# TODO: Use this space to determine the five most frequently appearing words in the training set.\n", "frequent = list(word_dict.keys())[:5]\n", + "# ['movi', 'film', 'one', 'like', 'time']\n", "# Yes it does as most of these words are common when discussing movies (either negative or positive)" ] }, @@ -469,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -480,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -499,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -531,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -548,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -590,7 +600,7 @@ " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", - " 0 0 0 0 0 0 0 0 0 0] 101\n" + " 0 0 0 0 0 0 0 0 0 0]\n" ] } ], @@ -628,21 +638,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 49, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'train_y' is not defined", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_y\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_X_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_X\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'train.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'train_y' is not defined" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", " \n",