From 0427da55147aa76aa618267046942c0d7d4b2581 Mon Sep 17 00:00:00 2001 From: ammaraamir Date: Fri, 12 May 2017 18:17:18 +0500 Subject: [PATCH] Added notebook for MongoDB Assignment --- AssignmentMongoDB-Ammar.ipynb | 370 ++++++++++++++++++++++++++++++++++ 1 file changed, 370 insertions(+) create mode 100644 AssignmentMongoDB-Ammar.ipynb diff --git a/AssignmentMongoDB-Ammar.ipynb b/AssignmentMongoDB-Ammar.ipynb new file mode 100644 index 0000000..8e65b6d --- /dev/null +++ b/AssignmentMongoDB-Ammar.ipynb @@ -0,0 +1,370 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import ijson\n", + "filename = \"../rows.json\"\n", + "with open(filename, 'r') as f:\n", + " objects = ijson.items(f, 'meta.view.columns.item')\n", + " columns = list(objects)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[u':sid',\n", + " u':id',\n", + " u':position',\n", + " u':created_at',\n", + " u':created_meta',\n", + " u':updated_at',\n", + " u':updated_meta',\n", + " u':meta',\n", + " u'date_of_stop',\n", + " u'time_of_stop',\n", + " u'agency',\n", + " u'subagency',\n", + " u'description',\n", + " u'location',\n", + " u'latitude',\n", + " u'longitude',\n", + " u'accident',\n", + " u'belts',\n", + " u'personal_injury',\n", + " u'property_damage',\n", + " u'fatal',\n", + " u'commercial_license',\n", + " u'hazmat',\n", + " u'commercial_vehicle',\n", + " u'alcohol',\n", + " u'work_zone',\n", + " u'state',\n", + " u'vehicle_type',\n", + " u'year',\n", + " u'make',\n", + " u'model',\n", + " u'color',\n", + " u'violation_type',\n", + " u'charge',\n", + " u'article',\n", + " u'contributed_to_accident',\n", + " u'race',\n", + " u'gender',\n", + " u'driver_city',\n", + " u'driver_state',\n", + " u'dl_state',\n", + " u'arrest_type',\n", + " u'geolocation']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "column_names = [col[\"fieldName\"] for col in columns]\n", + "column_names" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "good_columns = [\n", + " \"date_of_stop\", \n", + " \"time_of_stop\", \n", + " \"agency\", \n", + " \"subagency\",\n", + " \"description\",\n", + " \"location\", \n", + " \"latitude\", \n", + " \"longitude\", \n", + " \"vehicle_type\", \n", + " \"year\", \n", + " \"make\", \n", + " \"model\", \n", + " \"color\", \n", + " \"violation_type\",\n", + " \"race\", \n", + " \"gender\", \n", + " \"driver_state\", \n", + " \"driver_city\", \n", + " \"dl_state\",\n", + " \"arrest_type\"\n", + "]\n", + "data = []\n", + "with open(filename, 'r') as f:\n", + " objects = ijson.items(f, 'data.item')\n", + " for row in objects:\n", + " selected_row = []\n", + " for item in good_columns:\n", + " selected_row.append(row[column_names.index(item)])\n", + " data.append(selected_row)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1093289, 20)\n" + ] + } + ], + "source": [ + "df = pd.DataFrame(data, columns=good_columns)\n", + "print df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df.date_of_stop=pd.to_datetime(df.date_of_stop)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from pymongo import MongoClient\n", + "import json\n", + "client = MongoClient('localhost', 27017)\n", + "db = client['traffic_violations']\n", + "colls = db.collections" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "coll.insert_many(df.to_dict('records'))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "rColor=coll.aggregate([{\"$group\" : {\"_id\":\"$color\", \"count\":{\"$sum\":1}}}])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{u'count': 23, u'_id': u'CHROME'}\n", + "{u'count': 357, u'_id': u'COPPER'}\n", + "{u'count': 4247, u'_id': u'YELLOW'}\n", + "{u'count': 2588, u'_id': u'BRONZE'}\n", + "{u'count': 5245, u'_id': u'BROWN'}\n", + "{u'count': 6491, u'_id': u'GREEN, LGT'}\n", + "{u'count': 19330, u'_id': u'MAROON'}\n", + "{u'count': 2211, u'_id': u'PURPLE'}\n", + "{u'count': 35901, u'_id': u'GOLD'}\n", + "{u'count': 167, u'_id': u'PINK'}\n", + "{u'count': 746, u'_id': u'CREAM'}\n", + "{u'count': 23864, u'_id': u'TAN'}\n", + "{u'count': 12812, u'_id': u'GREEN, DK'}\n", + "{u'count': 14942, u'_id': u'BLUE, LIGHT'}\n", + "{u'count': 14254, u'_id': nan}\n", + "{u'count': 43944, u'_id': u'GREEN'}\n", + "{u'count': 3829, u'_id': u'ORANGE'}\n", + "{u'count': 164915, u'_id': u'WHITE'}\n", + "{u'count': 20, u'_id': u'CAMOUFLAGE'}\n", + "{u'count': 924, u'_id': u'MULTICOLOR'}\n", + "{u'count': 80581, u'_id': u'BLUE'}\n", + "{u'count': 199971, u'_id': u'SILVER'}\n", + "{u'count': 23227, u'_id': u'BLUE, DARK'}\n", + "{u'count': 86613, u'_id': u'RED'}\n", + "{u'count': 116635, u'_id': u'GRAY'}\n", + "{u'count': 216292, u'_id': u'BLACK'}\n", + "{u'count': 13160, u'_id': u'BEIGE'}\n" + ] + } + ], + "source": [ + "for item in rColor:\n", + " print item" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "rArrest=coll.aggregate([{\"$group\" : {\"_id\":\"$arrest_type\", \"count\":{\"$sum\":1}}}])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{u'count': 44, u'_id': u'K - Aircraft Assist'}\n", + "{u'count': 299, u'_id': u'J - Unmarked Moving Radar (Moving)'}\n", + "{u'count': 260, u'_id': u'P - Mounted Patrol'}\n", + "{u'count': 561, u'_id': u'H - Unmarked Moving Radar (Stationary)'}\n", + "{u'count': 1545, u'_id': u'I - Marked Moving Radar (Moving)'}\n", + "{u'count': 399, u'_id': u'C - Marked VASCAR'}\n", + "{u'count': 232, u'_id': u'D - Unmarked VASCAR'}\n", + "{u'count': 6882, u'_id': u'E - Marked Stationary Radar'}\n", + "{u'count': 12875, u'_id': u'S - License Plate Recognition'}\n", + "{u'count': 1632, u'_id': u'M - Marked (Off-Duty)'}\n", + "{u'count': 5083, u'_id': u'R - Unmarked Laser'}\n", + "{u'count': 161, u'_id': u'N - Unmarked (Off-Duty)'}\n", + "{u'count': 10336, u'_id': u'L - Motorcycle'}\n", + "{u'count': 110653, u'_id': u'Q - Marked Laser'}\n", + "{u'count': 4077, u'_id': u'G - Marked Moving Radar (Stationary)'}\n", + "{u'count': 11397, u'_id': u'O - Foot Patrol'}\n", + "{u'count': 35195, u'_id': u'B - Unmarked Patrol'}\n", + "{u'count': 772, u'_id': u'F - Unmarked Stationary Radar'}\n", + "{u'count': 890886, u'_id': u'A - Marked Patrol'}\n" + ] + } + ], + "source": [ + "for item in rArrest:\n", + " print item" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "rDay=coll.aggregate(\n", + " [\n", + " {\n", + " \"$project\":\n", + " {\n", + " \"dayRes\": { \"$dayOfWeek\": \"$date_of_stop\" },\n", + " } \n", + " \n", + " },{\n", + " \"$group\": {\n", + " \"_id\":\"$dayRes\", \"count\":{\"$sum\":1}\n", + " }\n", + " }\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'rDay' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mdays\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrDay\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mdays\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minsert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'rDay' is not defined" + ] + } + ], + "source": [ + "for item in rDay:\n", + " print item\n", + " \n", + "# 1 means Sunday and 7 means Saturday" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "client.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11+" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}