diff --git a/course/Problem009_Linear/009.ipynb b/course/Problem009_Linear/009.ipynb index 0839d2d..9542b01 100644 --- a/course/Problem009_Linear/009.ipynb +++ b/course/Problem009_Linear/009.ipynb @@ -1,183 +1,272 @@ { - "cells": [ - { - "source": [ - "## What does linear model do\n" - ], - "cell_type": "markdown", - "metadata": {} + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2-final" + }, + "colab": { + "name": "009.ipynb", + "provenance": [] + } }, - { - "source": [ - "## 1-dimensional input (feature)" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "from sklearn import linear_model\n", - "X_train = np.array([[1],[2],[3],[4]])\n", - "Y_train = np.array([1,2,3,4])\n", - "\n", - "#TBD fit linear model and predict for x=7\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "X_train = np.array([[1],[2],[3],[4]])\n", - "Y_train = np.array([3,4,5,6])\n", - "\n", - "#TBD fit linear model and predict for x=7" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_train = np.array([[0],[0.1],[2],[2.1],[4],[4.1]])\n", - "Y_train = np.array([0,2,1,3,2,4])\n", - "\n", - "#TBD fit linear model and predict for x=4\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "#TBD Plot above 3 models" - ] - }, - { - "source": [ - "## 2 dimensional input (features)" - ], - "cell_type": "markdown", - 
"metadata": {} - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from random import random, seed\n", - "\n", - "n = 50\n", - "seed(1) \n", - "X_train = [[random(), random()] for i in range(n)] \n", - "Y_train = [random() for i in range(n)]\n", - "\n", - "#TBD fit linear model and predict for x=(7,8)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "#Plot above model" - ] - }, - { - "source": [ - "## 13 dimensional input (features)" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#TBD Load boston dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#TBD Split into Train and Test dataset. random 90% 10%" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#TBD Fit linear model on Training data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#TBD Predict for Test data" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "#TBD Mean squared error for your predictions" - ] - }, - { - "source": [ - "## Problems with linear model\n", - "![title](../../img/anscombe_quartet.png)\n", - "\n", - "### Ansombe Quartet \n", - "### y = x/2 + 3" - ], - "cell_type": "markdown", - "metadata": {} - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.2-final" - } - }, - "nbformat": 4, - 
"nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "jbjvfrGgOEt9" + }, + "source": [ + "## What does linear model do\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1-0V54gtOEuB" + }, + "source": [ + "## 1-dimensional input (feature)" + ] + }, + { + "cell_type": "code", + "metadata": { + "scrolled": true, + "id": "5f1YGDlzOEuE" + }, + "source": [ + "import numpy as np\n", + "from sklearn import linear_model\n", + "X_train = np.array([[1],[2],[3],[4]])\n", + "Y_train = np.array([1,2,3,4])\n", + "\n", + "#TBD fit linear model and predict for x=7\n" + ], + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "lo0J-zXaOEuV" + }, + "source": [ + "X_train = np.array([[1],[2],[3],[4]])\n", + "Y_train = np.array([3,4,5,6])\n", + "\n", + "#TBD fit linear model and predict for x=7" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Xz3ZTdR7OEuc", + "outputId": "19f604f9-8728-461b-b833-815f5d9c3d7f", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "X_train = np.array([[0],[0.1],[2],[2.1],[4],[4.1]])\n", + "Y_train = np.array([0,2,1,3,2,4])\n", + "\n", + "#TBD fit linear model and predict for x=4\n", + "reg = linear_model.LinearRegression().fit(X_train, Y_train)\n", + "reg.predict(np.array([[4]]))" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([3.01061505])" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "N04ZwbaWOEuk" + }, + "source": [ + "#TBD Plot above 3 models\n", + "import matplotlib.pyplot as plt\n", + "plt.scatter(X_diabetes_X_test, diabetes_y_test, color='black')\n", + "plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)\n", + "\n", + "plt.xticks(())\n", + "plt.yticks(())\n", + "\n", + "plt.show()\n" + ], + 
"execution_count": 27, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IAc6HdwhOEut" + }, + "source": [ + "## 2 dimensional input (features)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "XjOeDXoiOEuv" + }, + "source": [ + "from random import random, seed\n", + "\n", + "n = 50\n", + "seed(1) \n", + "X_train = [[random(), random()] for i in range(n)] \n", + "Y_train = [random() for i in range(n)]\n", + "\n", + "#TBD fit linear model and predict for x=(7,8)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ihfs8cYOOEu3" + }, + "source": [ + "#Plot above model\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a-aCUHidOEu9" + }, + "source": [ + "## 13 dimensional input (features)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "zz7qHJAhOEu-" + }, + "source": [ + "#TBD Load boston dataset\n", + "from sklearn.datasets import load_boston\n", + "X, Y = load_boston(return_X_y=True)" + ], + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "DndPbHJJOEvD" + }, + "source": [ + "#TBD Split into Train and Test dataset. 
random 90% 10%\n", + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y)" + ], + "execution_count": 18, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VRg5q8Y0OEvI" + }, + "source": [ + "#TBD Fit linear model on Training data\n", + "\n", + "reg = linear_model.LinearRegression().fit(X_train, y_train)\n" + ], + "execution_count": 20, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "wZv25sakOEvP" + }, + "source": [ + "#TBD Predict for Test data\n", + "y_hat = reg.predict(X_test)" + ], + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_cyh6pyJOEvU", + "outputId": "b78d4aae-8193-468e-f0a5-1a2405c7ce95", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "#TBD Mean squared error for your predictions\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", + "\n", + "mean_squared_error(y_test,y_hat)" + ], + "execution_count": 25, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "26.87119408116287" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 25 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IEgYG8mhOEva" + }, + "source": [ + "## Problems with linear model\n", + "![title](https://github.com/wrwills/pml/blob/master/img/anscombe_quartet.png?raw=1)\n", + "\n", + "### Anscombe Quartet \n", + "### y = x/2 + 3" + ] + } + ] } \ No newline at end of file diff --git a/course/Problem010_NonLinear/010.ipynb b/course/Problem010_NonLinear/010.ipynb index 6917e31..042bd03 100644 --- a/course/Problem010_NonLinear/010.ipynb +++ b/course/Problem010_NonLinear/010.ipynb @@ -1,78 +1,96 @@ { - "cells": [ - { - "source": [ - "## Non Linear functions\n", - "$\n", - "sigmoid(x)={\\frac {1}{1+e^{-x}}}\n", - "$\n", - "\n", - "$\n", - "tanh(x)={\\frac {e^{x} - e^{-x}}{e^{x}+e^{-x}}}\n", - "$\n", - "\n", - "$\n", - 
"ReLU(x) = \\left\\{\n", - " \\begin{array}{lll}\n", - " 0 & for & x \\leq 0 \\\\\n", - " x & for & x > 0\n", - " \\end{array}\n", - " \\right.\n", - "$\n", - "\n" - ], - "cell_type": "markdown", - "metadata": { - "scrolled": true - } + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2-final" + }, + "colab": { + "name": "010.ipynb", + "provenance": [] + } }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "#TBD implement following non-linear functions\n", - "\n", - "def sigmoid(x):\n", - " pass\n", - "\n", - "def tanh(x):\n", - " pass\n", - "\n", - "def relu(x):\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "#TBD plot above non-linear functions" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.2-final" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "scrolled": true, + "id": "ena-s6S0TAoE" + }, + "source": [ + "## Non Linear functions\n", + "$\n", + "sigmoid(x)={\\frac {1}{1+e^{-x}}}\n", + "$\n", + "\n", + "$\n", + "tanh(x)={\\frac {e^{x} - e^{-x}}{e^{x}+e^{-x}}}\n", + "$\n", + "\n", + "$\n", + "ReLU(x) = \\left\\{\n", + " \\begin{array}{lll}\n", + " 0 & for & x \\leq 0 \\\\\n", + " x & for & x > 0\n", + " 
\\end{array}\n", + " \\right.\n", + "$\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "P9RebSbYTAoG" + }, + "source": [ + "#TBD implement following non-linear functions\n", + "import numpy as np\n", + "\n", + "def sigmoid(x):\n", + " return 1 / (1 + np.exp(-x))\n", + "\n", + "def tanh(x):\n", + " return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x)) \n", + "\n", + "def relu(x):\n", + " if (x > 0):\n", + " return x\n", + " else:\n", + " return 0\n", + " \n", + "\n", + "\n", + "import matplotlib.pyplot as plt\n", + " " + ], + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "v6M0gH_HTAoR" + }, + "source": [ + "#TBD plot above non-linear functions" + ], + "execution_count": null, + "outputs": [] + } + ] } \ No newline at end of file diff --git a/course/Problem014_Weights/014.ipynb b/course/Problem014_Weights/014.ipynb index e101b56..a36797a 100644 --- a/course/Problem014_Weights/014.ipynb +++ b/course/Problem014_Weights/014.ipynb @@ -1,77 +1,116 @@ { - "cells": [ - { - "source": [ - "## Weights or Coeffecients or Theta or Strength of Connection etc" - ], - "cell_type": "markdown", - "metadata": { - "scrolled": true - } - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "#TBD Load boston dataset\n", - "#TBD Split into Train and Test dataset. 
random 90% 10%\n", - "#TBD Fit linear model on Training data\n", - "#TBD From the model get the weights\n" - ] - }, - { - "source": [ - "## A special weight: Bias or Intercept" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# TBD: From above model get the Bias term" - ] - }, - { - "source": [ - "# TBD: Using weights and bias to simulate above model predict for training data Y_hat1\n", - "# TBD: Using model.predict predict for training data Y_hat2\n", - "# TBD: Mean squared error for Y_hat1, Y_hat2 (They must be very close, nearly equal)\n" - ], - "cell_type": "code", - "metadata": {}, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "name": "Python 3.8.2 64-bit ('pml')", - "display_name": "Python 3.8.2 64-bit ('pml')", - "metadata": { - "interpreter": { - "hash": "a4c9474aacc61cf72d0f1c29f4a339e5d6b2171c287541cfd684cf058783219b" + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "name": "Python 3.8.2 64-bit ('pml')", + "display_name": "Python 3.8.2 64-bit ('pml')", + "metadata": { + "interpreter": { + "hash": "a4c9474aacc61cf72d0f1c29f4a339e5d6b2171c287541cfd684cf058783219b" + } + } + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2-final" + }, + "colab": { + "name": "014.ipynb", + "provenance": [] } - } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.2-final" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "scrolled": true, + "id": 
"n4xSXLYs-zDl" + }, + "source": [ + "## Weights or Coeffecients or Theta or Strength of Connection etc" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vs-klc3P-zDo", + "outputId": "ed776736-bfcc-4b4e-8dc4-06c7b82eaecd", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "#TBD Load boston dataset\n", + "#TBD Split into Train and Test dataset. random 90% 10%\n", + "#TBD Fit linear model on Training data\n", + "#TBD From the model get the weights\n", + "from sklearn.datasets import load_boston\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn import linear_model\n", + "\n", + "X, Y = load_boston(return_X_y=True)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size = 0.9)\n", + "reg = linear_model.LinearRegression().fit(X_train, y_train)\n", + "reg.predict(X_test)\n", + "print(reg.coef_)\n", + "print(reg.intercept_)\n" + ], + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[-7.70271024e-02 4.30241516e-02 2.37372970e-02 2.51900506e+00\n", + " -1.63731209e+01 4.05950395e+00 -5.77856631e-03 -1.37688221e+00\n", + " 2.90955226e-01 -1.34401015e-02 -9.42813017e-01 8.88950633e-03\n", + " -4.93583459e-01]\n", + "34.1861458109347\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-eu3MSl5-zD4" + }, + "source": [ + "## A special weight: Bias or Intercept" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "iuRZFCrF-zD5" + }, + "source": [ + "# TBD: From above model get the Bias term" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "gOK4usUn-zEC" + }, + "source": [ + "# TBD: Using weights and bias to simulate above model predict for training data Y_hat1\n", + "# TBD: Using model.predict predict for training data Y_hat2\n", + "# TBD: Mean squared error for Y_hat1, Y_hat2 (They must be very close, nearly equal)\n" + ], + "execution_count": null, + 
"outputs": [] + } + ] } \ No newline at end of file diff --git a/course/Problem032_knn_collaborative_filtering/032.ipynb b/course/Problem032_knn_collaborative_filtering/032.ipynb new file mode 100644 index 0000000..ec83ceb --- /dev/null +++ b/course/Problem032_knn_collaborative_filtering/032.ipynb @@ -0,0 +1,470 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "colab": { + "name": "032.ipynb", + "provenance": [] + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "8JptQO420ugZ" + }, + "source": [ + "# Recommender Systems\n", + "\n", + "\n", + "recommmendation system: presents items to users in a relevant way\n", + "\n", + "user: party that is receiving Recommendations\n", + "item: the passive party that is being recommended to users \n", + "\n", + "Content based vs collaborative filtering\n", + "In practice most will be hybrid.\n", + "\n", + "\n", + "## Recommendations at Expedia Group\n", + "\n", + "### EPS\n", + "\n", + "https://confluence.expedia.biz/pages/viewpage.action?pageId=890552932\n", + "\n", + "https://www.dropbox.com/s/cf77o15jlahabay/wid-eps-recommendations.pdf\n", + "\n", + "### BEX?\n", + "\n", + "### Hcom?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p9rUr94G1YPQ" + }, + "source": [ + "## Collaborative Filtering\n", + "\n", + "* Memory Based - KNearest Neighbours\n", + "* Model Based - SVD\n", + "* and others\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "erBRh2L22J7E" + }, + "source": [ + "## K Nearest Neighbours" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "H6Drt1tW0ugf" + }, + "source": [ + 
"import pandas as pd\n", + "\n", + "def display_matrix(data):\n", + " df = pd.DataFrame(data) \n", + " df.rename(columns=lambda x: \"item\" + str(x), inplace=True)\n", + " df.rename(index = lambda x: \"user\" + str(x), inplace=True)\n", + " return df\n" + ], + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "id": "vnbFZ4w52WIa", + "outputId": "cb34a176-9123-4036-d6f0-bdcae8dc9247" + }, + "source": [ + "from scipy import spatial\n", + "import matplotlib.pyplot as plt\n", + "import itertools\n", + "from sklearn.neighbors import NearestNeighbors\n", + "import math\n", + "\n", + "\n", + "def points(xs):\n", + " return [[x[0] for x in xs],[x[1] for x in xs]]\n", + "\n", + "def plot(xs):\n", + " ps = points(xs)\n", + " plt.plot(ps[0],ps[1], 'ro')\n", + " mx = max([max(ps[0]), max(ps[1])])\n", + " mn = min([min(ps[0]), min(ps[1]), 0])\n", + " gap = math.ceil((mx - mn) / 10)\n", + " plt.axis([mn, mx + gap, mn, mx + gap])\n", + "\n", + "\n", + "# https://realpython.com/build-recommendation-engine-collaborative-filtering/\n", + "# conceptually we can say that these are users with explicit ratings for movies x and y\n", + "# or we could say that they're\n", + "pts = [\n", + " [4, 1],\n", + " [3.5, 0.5],\n", + " [0.5, 4],\n", + " [1.5, 5]\n", + "]\n", + "\n", + "plot(pts)" + ], + "execution_count": 26, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW0AAAD8CAYAAAC8TPVwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAMgUlEQVR4nO3dX6ikd33H8c9n/4ieNZoLRwludk8KJUGEJnFIkRVpI0pag+1FL5SjF1I4N62stCDVRYoXeyt6UQpDEpvi1CCJgZK2qQEjaaDZOCduNNndFgnZzQZljxRJ1gNK4qcX8yzZzZ6TeU7yPDv7nfN+wWFmnv3t5DuEfe+zzzzPjJMIAFDDrnkPAABoj2gDQCFEGwAKIdoAUAjRBoBCiDYAFNIq2ravtX2/7VO2T9r+cN+DAQAut6flum9KejjJX9h+m6SlHmcCAGzBsy6usf1uSccl/V64EgcA5qrNnvYNktYlfcv2H0hak3Q4ya8vXmR7VdKqJO3bt+9DN910U9ezAsDCWltb+2WSwax1bfa0h5KekHQoyTHb35T0UpKvbvV7hsNhJpPJdmcGgB3L9lqS4ax1bd6IPCvpbJJjzeP7Jd36VoYDALw5M6Od5BeSXrB9Y7PpY5JO9DoVAGBTbc8e+YKkcXPmyHOSPt/fSACArbSKdpLjkmYeawEA9IsrIgGgEKINAIUQbQAohGgDQCFEGwAKIdoAUAjRBoBCiDYAFEK0AaAQog0AhRBtACiEaANAIUQbAAoh2gBQCNEGgEKINgAUQrQBoBCiDQCFEG0AKIRoA0AhRBsACiHaAFAI0QaAQog2ABRCtAGgEKINAIUQbQAoZE+bRbafl/SypFclvZJk2OdQAIDNbWdP+4+T3Eywr5DxWFpelnbtmt6Ox/OeCMBVoNWeNq6w8VhaXZU2NqaPT5+ePpaklZX5zQVg7truaUfS922v2V7tcyBIOnLktWBfsLEx3Q5gR2u7p/2RJC/afq+kR2yfSvLYxQuamK9K0oEDBzoec4c5c2Z72wHsGK32tJO82Nyek/SgpNs2WTNKMkwyHAwG3U6502z1lx5/GQI73sxo295n+5oL9yV9QtIzfQ+2ox09Ki0tXbptaWm6HcCO1mZP+32SHrf9tKQnJf1bkof7HWuHW1mRRiPp4EHJnt6ORrwJCUBO0vmTDofDTCaTzp8XABaV7bU2p1RzRSQAFEK0AaAQog0AhRBtACiEaANAIUQbAAoh2gBQCNEGgEKINgAUQrQBoBCiDQCFEG0AKIRoA0AhRBsACiHaAFAI0QaAQog2ABRCtAGgEKINAIUQbQAohGgDQCFEGwAKIdoAUAjRBoBCiDYAFEK0AaAQog0AhRBtACikdbRt77b9Y9sP9TlQa+OxtLws7do1vR2P5z0RAPRuzzbWHpZ0UtK7epqlvfFYWl2VNjamj0+fnj6WpJWV+c0FAD1rtadte7+kT0q6q99xWjpy5LVgX7CxMd0OAAus7eGRb0j6kqTfbbXA9qrtie3J+vp6J8Nt6cyZ7W0HgAUxM9q275R0LsnaG61LMkoyTDIcDAadDbipAwe2tx0AFkSbPe1Dkj5l+3lJ90m63fa3e51qlqNHpaWlS7ctLU23A8ACmxntJF9Osj/JsqRPS/pBks/2PtkbWVmRRiPp4EHJnt6ORrwJCWDhbefskavLygqRBrDjbCvaSX4o6Ye9TAIAmIkrIgGgEKINAIUQbQAohGgDQCFEGwAKIdoAUAjRBoBCiDYAFEK0AaAQog0AhRBtACiEaANAIUQbAAoh2gBQCNEGgEKINgAUQrQBoBCiDQCFEG0AKIRoA0AhRBsACiHaAFAI0QaAQog2ABRCtAGgEKINAIUQbQAoZGa0bb/d9pO2n7b9rO2vXYnBAACX29NizW8k3Z7kvO29kh63/R9Jnuh5NgDA68yMdpJIOt883Nv8pM+hAACba3VM2/Zu28clnZP0SJJjm6xZtT2xPVl
fX+96TgCAWkY7yatJbpa0X9Jttj+4yZpRkmGS4WAw6HpOAIC2efZIkl9JelTSHf2MAwB4I23OHhnYvra5/w5JH5d0qu/BAACXa3P2yHWS7rW9W9PIfzfJQ/2OBQDYTJuzR34i6ZYrMAsAYAauiASAQog2ABRCtAGgEKINAIUQbQAohGgDQCFEGwAKIdoAUAjRBoBCiDYAFEK0AaAQog0AhRBtACiEaANAIUQbAAoh2gBQCNEGgEKINgAUQrQBoBCiDQCFEG0AKIRoA0AhRBsACiHaAFAI0QaAQog2ABRCtAGgkJnRtn297Udtn7D9rO3DV2IwAMDl9rRY84qkv03ylO1rJK3ZfiTJiZ5nAwC8zsw97SQ/T/JUc/9lSSclvb/vwQAAl9vWMW3by5JukXRsk19btT2xPVlfX+9mOgDAJVpH2/Y7JT0g6YtJXnr9rycZJRkmGQ4Ggy5nBAA0WkXb9l5Ngz1O8r1+RwIAbKXN2SOWdLekk0m+3v9IAICttNnTPiTpc5Jut328+fnTnucCAGxi5il/SR6X5CswCwBgBq6IBIBCiDYAFEK0AaAQog0AhRBtACiEaANAIUQbAAoh2gBQCNEGgEKINgAUQrQBoBCiDQCFEG0AKIRoA0AhRBsACiHaAFAI0QaAQog2ABRCtAGgEKINAIUQbQAohGgDQCFEGwAKIdoAUAjRBoBCiDYAFEK0AaCQmdG2fY/tc7afuRIDAQC21mZP+58k3dHzHADeivFYWl6Wdu2a3o7H854IPdkza0GSx2wv9z8KgDdlPJZWV6WNjenj06enjyVpZWV+c6EXHNMGqjty5LVgX7CxMd2OhdNZtG2v2p7Ynqyvr3f1tABmOXNme9tRWmfRTjJKMkwyHAwGXT0tgFkOHNjedpTG4RGguqNHpaWlS7ctLU23Y+G0OeXvO5L+W9KNts/a/sv+xwLQ2sqKNBpJBw9K9vR2NOJNyAXlJJ0/6XA4zGQy6fx5AWBR2V5LMpy1jsMjAFAI0QaAQog2ABRCtAGgEKINAIUQbQAohGgDQCFEGwAKIdoAUAjRBoBCiDbQB75JBj2Z+c01ALaJb5JBj9jTBrrGN8mgR0Qb6BrfJIMeEW2ga3yTDHpEtIGu8U0y6BHRBrrGN8mgR5w9AvRhZYVIoxfsaQNAIUQbAAoh2gBQCNEGgEKINgAUQrQBoBCiDQCFEG0AKIRoA0AhRBsACmkVbdt32P4f2z+z/Xd9DwUA2NzMaNveLekfJP2JpA9I+oztD/Q9GADgcm32tG+T9LMkzyX5raT7JP1Zv2MBADbT5lP+3i/phYsen5X0h69fZHtVUvNFePqN7Wfe+nhXpfdI+uW8h+gRr682Xl9dN7ZZ1NlHsyYZSRpJku1JkmFXz301WeTXJvH6quP11WV70mZdm8MjL0q6/qLH+5ttAIArrE20fyTp923fYPttkj4t6V/7HQsAsJmZh0eSvGL7ryX9p6Tdku5J8uyM3zbqYrir1CK/NonXVx2vr65Wr81J+h4EANARrogEgEKINgAU0mm0F/lyd9v32D63qOef277e9qO2T9h+1vbhec/UJdtvt/2k7aeb1/e1ec/UNdu7bf/Y9kPznqVrtp+3/VPbx9ueGleJ7Wtt32/7lO2Ttj+85dqujmk3l7v/r6SPa3oBzo8kfSbJiU7+A3Nm+6OSzkv65yQfnPc8XbN9naTrkjxl+xpJa5L+fIH+/1nSviTnbe+V9Likw0memPNonbH9N5KGkt6V5M55z9Ml289LGiZZyAtrbN8r6b+S3NWcpbeU5Febre1yT3uhL3dP8pik/5v3HH1J8vMkTzX3X5Z0UtOrYRdCps43D/c2PwvzLrzt/ZI+Kemuec+C7bH9bkkflXS3JCX57VbBlrqN9maXuy/MH/qdxPaypFskHZvvJN1qDh8cl3RO0iNJFun1fUPSlyT9bt6D9CSSvm97rfnIjEVyg6R1Sd9qDm/dZXvfVot5IxKXsP1OSQ9I+mKSl+Y
9T5eSvJrkZk2v6r3N9kIc5rJ9p6RzSdbmPUuPPpLkVk0/bfSvmsOVi2KPpFsl/WOSWyT9WtKW7wl2GW0udy+uOdb7gKRxku/Ne56+NP/0fFTSHfOepSOHJH2qOe57n6TbbX97viN1K8mLze05SQ9qejh2UZyVdPaif/ndr2nEN9VltLncvbDmjbq7JZ1M8vV5z9M12wPb1zb336HpG+an5jtVN5J8Ocn+JMua/rn7QZLPznmsztje17w5ruawwSckLcxZXEl+IekF2xc+5e9jkrY8AaDLT/l7M5e7l2H7O5L+SNJ7bJ+V9PdJ7p7vVJ06JOlzkn7aHPeVpK8k+fc5ztSl6yTd25zltEvSd5Ms3KlxC+p9kh6c7ldoj6R/SfLwfEfq3BckjZsd3uckfX6rhVzGDgCF8EYkABRCtAGgEKINAIUQbQAohGgDQCFEGwAKIdoAUMj/A38CGauCEAmLAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + }, + "id": "5CHKtKgz7VI8", + "outputId": "20091c22-531d-430c-e7a2-660a74df2666" + }, + "source": [ + "display_matrix(pts)" + ], + "execution_count": 27, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item0item1
user04.01.0
user13.50.5
user20.54.0
user31.55.0
\n", + "
" + ], + "text/plain": [ + " item0 item1\n", + "user0 4.0 1.0\n", + "user1 3.5 0.5\n", + "user2 0.5 4.0\n", + "user3 1.5 5.0" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 27 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HDpISfFN7vzc" + }, + "source": [ + "#User-based collaborative filtering:\n", + "#Find the users who have similar taste of products as the current user , similarity is based on purchasing behavior of the user, so based on the neighbor purchasing behavior we can recommend items to the current user.\n", + "\n", + "# Let's build up intuitions\n", + "# Lets add a new user who gave movie x a certain rating but hasn't given one for movie y; what rating will she give to movie y?\n", + "# Let's say that her rating will be the average of the ratings given by the nearest 2 users for movie y\n", + "# TBD\n", + "def predicted_rating(x, pts):\n", + " nearest_users = None\n", + " return 0.0\n", + "\n", + "\n", + "# problems with this?" + ], + "execution_count": 28, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5skLWSs3DcE8", + "outputId": "94f925c4-1a37-4c79-dd6d-a4dda3d8c89e" + }, + "source": [ + "# now let's see if we can try using sklearn's nearest neighbour's library\n", + "neigh = NearestNeighbors(n_neighbors=2, metric='euclidean')\n", + "neigh.fit(pts)\n", + "\n", + "print(neigh.kneighbors([[4,1]]))\n", + "\n", + "print(neigh.kneighbors([[4,1.1]]))\n", + "\n", + "print(neigh.kneighbors([[5,2]]))\n", + "\n", + "print(neigh.kneighbors([[0,0.25]]))\n", + "\n", + "# what's wrong with the result here?" + ], + "execution_count": 29, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(array([[0. 
, 0.70710678]]), array([[0, 1]]))\n", + "(array([[0.1 , 0.78102497]]), array([[0, 1]]))\n", + "(array([[1.41421356, 2.12132034]]), array([[0, 1]]))\n", + "(array([[3.50891721, 3.78318649]]), array([[1, 2]]))\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zPmAdNSHD53I", + "outputId": "2bb12be7-443c-4851-cb2a-1494d94a9413" + }, + "source": [ + "def normalise(x):\n", + " avg = (x[0] + x[1]) / 2\n", + " return [x[0] - avg, x[1] - avg]\n", + "\n", + "neigh = NearestNeighbors(n_neighbors=2)\n", + "neigh.fit([normalise(pt) for pt in pts])\n", + "neigh.kneighbors([[0.1,0]])\n" + ], + "execution_count": 30, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(array([[2.05182845, 2.05182845]]), array([[0, 1]]))" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 30 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "A7K-czAtEvp6", + "outputId": "e11d2d20-7a4f-4e16-86d6-5bf053202712" + }, + "source": [ + "# let's add some items and introduce some sparsity\n", + "pts = [\n", + " [4, 1, None, None],\n", + " [3.5, 0.5, None, 2],\n", + " [None, None, 3, 4],\n", + " [0.5, 4, None, None],\n", + " [1.5, 5, None, None]\n", + "]\n", + "display_matrix(pts)" + ], + "execution_count": 31, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item0item1item2item3
user04.01.0NaNNaN
user13.50.5NaN2.0
user2NaNNaN3.04.0
user30.54.0NaNNaN
user41.55.0NaNNaN
\n", + "
" + ], + "text/plain": [ + " item0 item1 item2 item3\n", + "user0 4.0 1.0 NaN NaN\n", + "user1 3.5 0.5 NaN 2.0\n", + "user2 NaN NaN 3.0 4.0\n", + "user3 0.5 4.0 NaN NaN\n", + "user4 1.5 5.0 NaN NaN" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 31 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bJ-wPJdfFC7d" + }, + "source": [ + "def normalise(xs):\n", + " nxs = list(filter(lambda x: x is not None, xs))\n", + " avg = sum(nxs) / len(nxs)\n", + " return [0.0 if x is None else x - avg for x in xs]\n", + "\n" + ], + "execution_count": 33, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "rB0RwCBq0ugs" + }, + "source": [ + "# TDB: write a function to predict ratings for a new entry \n", + "def predicted_rating(x, pts):\n", + " return 0.0" + ], + "execution_count": 35, + "outputs": [] + } + ] +} \ No newline at end of file