From 4ec419dcfc9c973a5f5472b390f772f8336f3686 Mon Sep 17 00:00:00 2001 From: further-reading Date: Wed, 10 Nov 2021 17:08:02 +0000 Subject: [PATCH 1/8] Making new version with modern code --- scrapy_price_monitor/.gitignore | 57 ++++++++++-- .../price_monitor/spiders/amazon.py | 0 .../price_monitor/spiders/base_spider.py | 0 .../price_monitor/spiders/bestbuy.py | 0 .../price_monitor/spiders/ebay.py | 0 scrapy_price_monitor/bin/monitor.py | 12 ++- .../price_monitor/collection_helper.py | 91 +++++++++++++++++++ scrapy_price_monitor/price_monitor/items.py | 27 ++++-- .../price_monitor/pipelines.py | 23 +++-- .../price_monitor/settings.py | 21 ++--- .../price_monitor/spiders/_base.py | 16 ++++ .../price_monitor/spiders/books_toscrape.py | 15 +++ .../price_monitor/templates/email.html | 3 +- scrapy_price_monitor/price_monitor/utils.py | 2 +- scrapy_price_monitor/scrapinghub.yml | 7 +- scrapy_price_monitor/scrapy.cfg | 2 +- scrapy_price_monitor/setup.py | 12 +-- 17 files changed, 231 insertions(+), 57 deletions(-) rename scrapy_price_monitor/{ => _scrapy_price_monitor_OLD}/price_monitor/spiders/amazon.py (100%) rename scrapy_price_monitor/{ => _scrapy_price_monitor_OLD}/price_monitor/spiders/base_spider.py (100%) rename scrapy_price_monitor/{ => _scrapy_price_monitor_OLD}/price_monitor/spiders/bestbuy.py (100%) rename scrapy_price_monitor/{ => _scrapy_price_monitor_OLD}/price_monitor/spiders/ebay.py (100%) create mode 100644 scrapy_price_monitor/price_monitor/collection_helper.py create mode 100644 scrapy_price_monitor/price_monitor/spiders/_base.py create mode 100644 scrapy_price_monitor/price_monitor/spiders/books_toscrape.py diff --git a/scrapy_price_monitor/.gitignore b/scrapy_price_monitor/.gitignore index 57c0c1e..872c981 100644 --- a/scrapy_price_monitor/.gitignore +++ b/scrapy_price_monitor/.gitignore @@ -8,7 +8,6 @@ __pycache__/ # Distribution / packaging .Python -env/ build/ develop-eggs/ dist/ @@ -20,9 +19,13 @@ lib64/ parts/ sdist/ var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ *.egg-info/ .installed.cfg *.egg +MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -37,13 +40,16 @@ pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ +.nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml -*,cover +*.cover +*.py,cover .hypothesis/ +.pytest_cache/ # Translations *.mo @@ -52,6 +58,8 @@ coverage.xml # Django stuff: *.log local_settings.py +db.sqlite3 +db.sqlite3-journal # Flask stuff: instance/ @@ -66,27 +74,58 @@ docs/_build/ # PyBuilder target/ -# IPython Notebook +# Jupyter Notebook .ipynb_checkpoints +# IPython +profile_default/ +ipython_config.py + # pyenv .python-version -# celery beat schedule file +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff celerybeat-schedule +celerybeat.pid -# dotenv -.env +# SageMath parsed files +*.sage.py -# virtualenv -.venv/ +# Environments +.env +.venv +env/ venv/ ENV/ +env.bak/ +venv.bak/ # Spyder project settings .spyderproject +.spyproject # Rope project settings .ropeproject -.scrapy \ No newline at end of file +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +.idea diff --git a/scrapy_price_monitor/price_monitor/spiders/amazon.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/amazon.py similarity index 100% rename from scrapy_price_monitor/price_monitor/spiders/amazon.py rename to scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/amazon.py diff --git a/scrapy_price_monitor/price_monitor/spiders/base_spider.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/base_spider.py similarity index 100% rename from scrapy_price_monitor/price_monitor/spiders/base_spider.py rename to scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/base_spider.py diff --git a/scrapy_price_monitor/price_monitor/spiders/bestbuy.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/bestbuy.py similarity index 100% rename from scrapy_price_monitor/price_monitor/spiders/bestbuy.py rename to scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/bestbuy.py diff --git a/scrapy_price_monitor/price_monitor/spiders/ebay.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/ebay.py similarity index 100% rename from scrapy_price_monitor/price_monitor/spiders/ebay.py rename to scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/ebay.py diff --git a/scrapy_price_monitor/bin/monitor.py b/scrapy_price_monitor/bin/monitor.py index a9dc370..9f4be62 100644 --- a/scrapy_price_monitor/bin/monitor.py +++ b/scrapy_price_monitor/bin/monitor.py @@ -5,10 +5,11 @@ from datetime import datetime, timedelta import boto -from hubstorage import HubstorageClient from jinja2 import Environment, PackageLoader + from price_monitor import settings from price_monitor.utils import get_product_names, get_retailers_for_product +from price_monitor.collection_helper import CollectionHelper from w3lib.html import remove_tags jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates')) @@ -42,8 +43,13 @@ class DealsFetcher(object): def __init__(self, product_name, apikey, project_id, hours): self.product_name = product_name - project = HubstorageClient(apikey).get_project(project_id) - self.item_store = project.collections.new_store(product_name) + collection = CollectionHelper( + proj_id=project_id, + collection_name=product_name, + api_key=apikey, + create=True, + ) + self.item_store = collection.store self.load_items_from_last_n_hours(hours) def load_items_from_last_n_hours(self, n=24): diff --git a/scrapy_price_monitor/price_monitor/collection_helper.py b/scrapy_price_monitor/price_monitor/collection_helper.py new file mode 100644 index 0000000..2134118 --- /dev/null +++ b/scrapy_price_monitor/price_monitor/collection_helper.py @@ -0,0 +1,91 @@ +import scrapinghub + + +class CollectionHelper: + """Adapter to make interacting with scraping collection easier""" + def __init__(self, proj_id, collection_name, api_key=None, create=False): + sh_client = scrapinghub.ScrapinghubClient(api_key) + project = sh_client.get_project(proj_id) + collections = project.collections + 
self.store = collections.get_store(collection_name)
+        self.writer = self.store.create_writer()
+        if create:
+            # a store is created by writing into it
+            # if create is true, write and then delete a placeholder
+            self.store.set({'_key': 'placeholder', 'value': 123})
+            self.delete(['placeholder'])
+
+    def get(self, key, default=None):
+        """
+        Gets value of key
+        Args:
+            key: Key searching for
+            default: What to return if key not present
+        Returns:
+            The value of item with the key in collection or the default if not present.
+        """
+        # I use the .list method here because .get only returns bytes.
+        search = self.store.list([key])
+        if not search:
+            return default
+        return search[0]['value']
+
+    def set(self, key, value, flush=False):
+        """
+        Set value at key
+        Args:
+            key: The key for the item
+            value: The value for the item
+            flush(bool): Whether to flush the writer
+        """
+        self.writer.write({'_key': key, 'value': value})
+        if flush:
+            # This is using a batch writer and will not write if the batch isn't filled
+            # The flush option will flush the writer, causing anything in the current batch to be written
+            self.flush_writer()
+
+    def delete(self, keys):
+        """
+        Delete keys from store.
+        Args:
+            keys(list): List of keys to delete
+        """
+        self.store.delete(keys)
+
+    def flush_writer(self):
+        """
+        Flush the writer
+        """
+        self.writer.flush()
+
+    def iter_items(self):
+        """
+        Create an iterable over all items in the collection
+        Returns(generator)
+        """
+        return self.store.iter()
+
+    def list_items(self):
+        """
+        Create a list of all items in the collection
+        Returns(list)
+        """
+        return list(self.iter_items())
+
+    def list_keys(self):
+        """
+        Get a list of all keys in the collection
+        Returns(list)
+        """
+        items_generator = self.iter_items()
+        keys = [i['_key'] for i in items_generator]
+        return keys
+
+    def list_values(self):
+        """
+        Get a list of all values in the collection
+        Returns(list)
+        """
+        items_generator = self.iter_items()
+        values = [i['value'] for i in items_generator]
+        return values
\ No newline at end of file
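
For orientation, here is a minimal usage sketch of the `CollectionHelper` adapter added above. The project ID, API key, collection name, and keys are placeholders, not values from this project:

    from price_monitor.collection_helper import CollectionHelper

    # Placeholders: substitute your own Scrapy Cloud project ID and API key.
    helper = CollectionHelper(proj_id='999999', collection_name='someproduct',
                              api_key='YOUR_SHUB_KEY', create=True)

    # set() goes through a batch writer; flush=True forces the batch out immediately.
    helper.set('some-key', {'title': 'Some Product', 'price': 9.99}, flush=True)

    # get() returns the stored value, or the given default when the key is absent.
    print(helper.get('some-key'))  # -> {'title': 'Some Product', 'price': 9.99}
    print(helper.list_keys())      # every key currently in the collection
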
diff --git a/scrapy_price_monitor/price_monitor/items.py b/scrapy_price_monitor/price_monitor/items.py
index 20a91f9..6c5824d 100644
--- a/scrapy_price_monitor/price_monitor/items.py
+++ b/scrapy_price_monitor/price_monitor/items.py
@@ -1,14 +1,21 @@
-# -*- coding: utf-8 -*-
+from scrapy import Item, Field
+from scrapy.loader import ItemLoader
+from itemloaders.processors import TakeFirst, MapCompose
+from price_parser import Price
+
+
+class PriceMonitorItem(Item):
+    url = Field()
+    title = Field()
+    price = Field()
+
+
+class PriceLoader(ItemLoader):
+    default_output_processor = TakeFirst()
+
+    title_in = MapCompose(lambda x: x.strip())
+    price_in = MapCompose(lambda x: Price.fromstring(x).amount_float)
 
-# Define here the models for your scraped items
-#
-# See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
-
-import scrapy
-
-
-class PriceMonitorItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
diff --git a/scrapy_price_monitor/price_monitor/pipelines.py b/scrapy_price_monitor/price_monitor/pipelines.py
index 18de561..3c8b4b2 100644
--- a/scrapy_price_monitor/price_monitor/pipelines.py
+++ b/scrapy_price_monitor/price_monitor/pipelines.py
@@ -1,21 +1,28 @@
-# -*- coding: utf-8 -*-
 from price_monitor import settings
-from hubstorage import HubstorageClient
+from price_monitor.collection_helper import CollectionHelper
 from price_monitor.utils import reversed_timestamp, get_product_names
 
 
-class CollectionStoragePipeline(object):
-
+class CollectionStoragePipeline:
     def open_spider(self, spider):
-        client = HubstorageClient(auth=settings.SHUB_KEY)
-        project = client.get_project(settings.SHUB_PROJ_ID)
         self.data_stores = {}
         for product_name in get_product_names():
-            self.data_stores[product_name] = project.collections.new_store(product_name)
+            store = CollectionHelper(
+                proj_id=settings.SHUB_PROJ_ID,
+                collection_name=product_name,
+                api_key=settings.SHUB_KEY,
+                create=True,
+            )
+            self.data_stores[product_name] = store
 
     def process_item(self, item, spider):
         key = "{}-{}-{}".format(
             reversed_timestamp(), item.get('product_name'), item.get('retailer')
         )
-        self.data_stores[item['product_name']].set({'_key': key, 'value': item})
+        store = self.data_stores[item['product_name']]
+        store.set(key, item)
         return item
+
+    def close_spider(self, spider):
+        for store in self.data_stores.values():
+            store.flush_writer()
diff --git a/scrapy_price_monitor/price_monitor/settings.py b/scrapy_price_monitor/price_monitor/settings.py
index 9888b56..b55aa4e 100644
--- a/scrapy_price_monitor/price_monitor/settings.py
+++ b/scrapy_price_monitor/price_monitor/settings.py
@@ -1,27 +1,20 @@
-# -*- coding: utf-8 -*-
 import os
-
 BOT_NAME = 'price_monitor'
+
 SPIDER_MODULES = ['price_monitor.spiders']
 NEWSPIDER_MODULE = 'price_monitor.spiders'
 
-ROBOTSTXT_OBEY = True
-
-SHUB_KEY = os.getenv('$SHUB_KEY')
-# if you want to run it locally, replace '999999' by your Scrapy Cloud project ID below
+# if you want to run it locally, replace None with your Scrapy Cloud API key
+SHUB_KEY = None
+# if you want to run it locally, replace '999999' with your Scrapy Cloud project ID
 SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '999999').split('/')[0]
 
+ITEM_PIPELINES = {
+    'price_monitor.pipelines.CollectionStoragePipeline': 400,
+}
 
 # settings for Amazon SES email service
 AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
 AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')
 EMAIL_ALERT_FROM = 'Price Monitor <SENDER_EMAIL@provider.com>'
 EMAIL_ALERT_TO = ['RECEIVER_EMAIL@provider.com']
-
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    'price_monitor.pipelines.CollectionStoragePipeline': 400,
-}
-
-AUTOTHROTTLE_ENABLED = True
-# HTTPCACHE_ENABLED = True
diff --git a/scrapy_price_monitor/price_monitor/spiders/_base.py b/scrapy_price_monitor/price_monitor/spiders/_base.py
new file mode 100644
index 0000000..e726c9c
--- /dev/null
+++ b/scrapy_price_monitor/price_monitor/spiders/_base.py
@@ -0,0 +1,16 @@
+import json
+import pkgutil
+import scrapy
+from datetime import datetime
+
+
+class BaseSpider(scrapy.Spider):
+
+    def start_requests(self):
+        products = json.loads(pkgutil.get_data('price_monitor', 'resources/urls.json').decode())
+        for name, urls in products.items():
+            for url in urls:
+                if self.name in url:
+                    now = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
+                    item = {'product_name': name, 'retailer': self.name, 'when': now}
+                    yield scrapy.Request(url, meta={'item': item})
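
As a pointer for the spider files that follow: `BaseSpider.start_requests` above routes URLs to spiders by checking whether the spider's `name` occurs in each URL, so a `resources/urls.json` entry shaped like this (the product key and URL are hypothetical) would be picked up by the `books.toscrape.com` spider defined below:

    {
        "someproduct": [
            "https://books.toscrape.com/catalogue/someproduct_123/index.html"
        ]
    }
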
diff --git a/scrapy_price_monitor/price_monitor/spiders/books_toscrape.py b/scrapy_price_monitor/price_monitor/spiders/books_toscrape.py
new file mode 100644
index 0000000..d1fdc32
--- /dev/null
+++ b/scrapy_price_monitor/price_monitor/spiders/books_toscrape.py
@@ -0,0 +1,15 @@
+from price_monitor.spiders._base import BaseSpider
+from price_monitor.items import PriceLoader
+
+
+class BooksSpider(BaseSpider):
+    name = "books.toscrape.com"
+
+    def parse(self, response):
+        item = response.meta.get('item', {})
+        loader = PriceLoader(item=item, response=response)
+        loader.add_value('url', response.url)
+        loader.add_css('title', 'h1::text')
+        loader.add_css('price', '.price_color::text')
+        yield loader.load_item()
+
diff --git a/scrapy_price_monitor/price_monitor/templates/email.html b/scrapy_price_monitor/price_monitor/templates/email.html
index c51ef0c..bdab7f6 100644
--- a/scrapy_price_monitor/price_monitor/templates/email.html
+++ b/scrapy_price_monitor/price_monitor/templates/email.html
@@ -10,5 +10,4 @@

🎉 Hey, we found a good deal! 🎁

Visit the product page at {{item.retailer}}: {{item.url}}

{% endfor %} - - + \ No newline at end of file diff --git a/scrapy_price_monitor/price_monitor/utils.py b/scrapy_price_monitor/price_monitor/utils.py index 8deb616..cc17f6a 100644 --- a/scrapy_price_monitor/price_monitor/utils.py +++ b/scrapy_price_monitor/price_monitor/utils.py @@ -32,4 +32,4 @@ def get_retailers_for_product(product_name): data = json.loads( pkgutil.get_data("price_monitor", "resources/urls.json").decode() ) - return {get_retailer_name_from_url(url) for url in data[product_name]} + return {get_retailer_name_from_url(url) for url in data[product_name]} \ No newline at end of file diff --git a/scrapy_price_monitor/scrapinghub.yml b/scrapy_price_monitor/scrapinghub.yml index 7a8527c..43503c6 100644 --- a/scrapy_price_monitor/scrapinghub.yml +++ b/scrapy_price_monitor/scrapinghub.yml @@ -1,3 +1,4 @@ -requirements_file: requirements.txt -stacks: - default: scrapy:1.1-py3 +stack: scrapy:2.5 +version: GIT +requirements: + file: requirements.txt diff --git a/scrapy_price_monitor/scrapy.cfg b/scrapy_price_monitor/scrapy.cfg index d34a107..966b999 100644 --- a/scrapy_price_monitor/scrapy.cfg +++ b/scrapy_price_monitor/scrapy.cfg @@ -1,7 +1,7 @@ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: -# https://scrapyd.readthedocs.org/en/latest/deploy.html +# https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = price_monitor.settings diff --git a/scrapy_price_monitor/setup.py b/scrapy_price_monitor/setup.py index 3e0698a..b22a90d 100644 --- a/scrapy_price_monitor/setup.py +++ b/scrapy_price_monitor/setup.py @@ -3,10 +3,10 @@ from setuptools import setup, find_packages setup( - name='project', - version='1.0', - packages=find_packages(), - package_data={'price_monitor': ['resources/*.json', 'templates/*.html']}, - scripts=['bin/monitor.py'], - entry_points={'scrapy': ['settings = price_monitor.settings']}, + name = 'project', + version = '1.0', + packages = find_packages(), + package_data = {'price_monitor': ['resources/*.json', 'templates/*.html']}, + scripts = ['bin/monitor.py'], + entry_points = {'scrapy': ['settings = price_monitor.settings']}, ) From 8c2404821cd2b7d287e5e2f45709b0ab9d5709fa Mon Sep 17 00:00:00 2001 From: further-reading Date: Wed, 10 Nov 2021 17:09:58 +0000 Subject: [PATCH 2/8] Making new version with modern code --- scrapy_price_monitor/README.md | 26 +-- .../_scrapy_price_monitor_OLD/.gitignore | 92 +++++++++++ .../_scrapy_price_monitor_OLD/README.md | 153 ++++++++++++++++++ .../_scrapy_price_monitor_OLD/bin/monitor.py | 119 ++++++++++++++ .../price_monitor/__init__.py | 0 .../price_monitor/items.py | 14 ++ .../price_monitor/pipelines.py | 21 +++ .../price_monitor/resources/urls.json | 27 ++++ .../price_monitor/settings.py | 27 ++++ .../price_monitor/spiders/__init__.py | 4 + .../price_monitor/templates/email.html | 14 ++ .../price_monitor/utils.py | 35 ++++ .../requirements.txt | 5 + .../_scrapy_price_monitor_OLD/scrapinghub.yml | 3 + .../_scrapy_price_monitor_OLD/scrapy.cfg | 11 ++ .../_scrapy_price_monitor_OLD/setup.py | 12 ++ .../price_monitor/resources/urls.json | 32 ++-- .../price_monitor/templates/email.html | 2 +- scrapy_price_monitor/requirements.txt | 1 + 19 files changed, 557 insertions(+), 41 deletions(-) create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/.gitignore create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/README.md create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/bin/monitor.py create mode 100644 
scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/__init__.py create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/items.py create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/pipelines.py create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/resources/urls.json create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/settings.py create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/__init__.py create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/templates/email.html create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/utils.py create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/requirements.txt create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapinghub.yml create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapy.cfg create mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/setup.py diff --git a/scrapy_price_monitor/README.md b/scrapy_price_monitor/README.md index dc32238..02c4134 100644 --- a/scrapy_price_monitor/README.md +++ b/scrapy_price_monitor/README.md @@ -2,7 +2,8 @@ Scrapy Price Monitor ==================== This is a simple price monitor built with [Scrapy](https://github.com/scrapy/scrapy) -and [Scrapy Cloud](https://scrapinghub.com/scrapy-cloud). +and [Scrapy Cloud](https://www.zyte.com/scrapy-cloud/). It is an updated version of +[this sample](https://github.com/scrapinghub/sample-projects/tree/master/scrapy_price_monitor/_scrapy_price_monitor_OLD). It is basically a Scrapy project with one spider for each online retailer that we want to monitor prices from. In addition to the spiders, there's a Python @@ -19,11 +20,6 @@ the already supported retailers, just add a new key for that product and add the URL list as its value, such as: { - "headsetlogitech": [ - "https://www.amazon.com/.../B005GTO07O/", - "http://www.bestbuy.com/.../3436118.p", - "http://www.ebay.com/.../110985874014" - ], "NewProduct": [ "http://url.for.retailer.x", "http://url.for.retailer.y", @@ -34,16 +30,8 @@ the URL list as its value, such as: ## Supporting Further Retailers -This project currently only works with 3 online retailers, and you can list them -running: - - $ scrapy list - amazon.com - bestbuy.com - ebay.com - -If the retailer that you want to monitor is not yet supported, just create a spider -to handle the product pages from it. To include a spider for samsclub.com, you +To add a retailer, just create a spider to handle the product pages from it. +To include a spider for samsclub.com, you could run: $ scrapy genspider samsclub.com samsclub.com @@ -74,7 +62,7 @@ later when showing how to schedule the project on Scrapy Cloud. 1. Clone this repo: - $ git clone git@github.com:stummjr/scrapy_price_monitor.git + $ git clone git@github.com:further-reading/price-monitoring-sample.git 2. Enter the folder and install the project dependencies: @@ -141,9 +129,9 @@ To do that, first add your Scrapy Cloud project id to [settings.py `SHUB_PROJ_ID Then run the spiders via command line: - $ scrapy crawl bestbuy.com + $ scrapy crawl books.toscrape.com -This will run the spider named as `bestbuy.com` and store the scraped data into +This will run the spider named as `books.toscrape.com` and store the scraped data into a Scrapy Cloud collection, under the project you set in the last step. 
You can also run the price monitor via command line: diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/.gitignore b/scrapy_price_monitor/_scrapy_price_monitor_OLD/.gitignore new file mode 100644 index 0000000..57c0c1e --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/.gitignore @@ -0,0 +1,92 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# IPython Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +.venv/ +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject + +.scrapy \ No newline at end of file diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/README.md b/scrapy_price_monitor/_scrapy_price_monitor_OLD/README.md new file mode 100644 index 0000000..8162616 --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/README.md @@ -0,0 +1,153 @@ +Scrapy Price Monitor - Old Version +==================== + +_This is a deprecated version of this project. It may not function on the latest versions of scrapy, python and the target websites_ + +This is a simple price monitor built with [Scrapy](https://github.com/scrapy/scrapy) +and [Scrapy Cloud](https://scrapinghub.com/scrapy-cloud). + +It is basically a Scrapy project with one spider for each online retailer that +we want to monitor prices from. In addition to the spiders, there's a Python +Script that is scheduled to run periodically on Scrapy Cloud, checking whether +the latest prices are the best ones in a given time span. If so, the monitor +sends an email alerting you about the price drops. + + +## Including Products to Monitor + +There's a `resources/urls.json` file that lists the URLs from the products that +we want to monitor. If you just want to include a new product to monitor from +the already supported retailers, just add a new key for that product and add +the URL list as its value, such as: + + { + "headsetlogitech": [ + "https://www.amazon.com/.../B005GTO07O/", + "http://www.bestbuy.com/.../3436118.p", + "http://www.ebay.com/.../110985874014" + ], + "NewProduct": [ + "http://url.for.retailer.x", + "http://url.for.retailer.y", + "http://url.for.retailer.z" + ] + } + + +## Supporting Further Retailers + +This project currently only works with 3 online retailers, and you can list them +running: + + $ scrapy list + amazon.com + bestbuy.com + ebay.com + +If the retailer that you want to monitor is not yet supported, just create a spider +to handle the product pages from it. 
To include a spider for samsclub.com, you +could run: + + $ scrapy genspider samsclub.com samsclub.com + +And then, open the spider and add the extraction rules: + + $ scrapy edit samsclub.com + +Have a look at the current spiders and implement the new ones using the same +structure, subclassing `BaseSpider` instead of `scrapy.Spider`. This way, your +spiders will automatically read the URLs list from `resources/urls.json`. + + +## Customizing the Price Monitor + +The price monitor sends an email using Amazon SES service, so to run it you +have to set both `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` variables in +`price_monitor/settings.py`. If you want to use another email service, +you have to rewrite the `send_email_alert` function in +`price_monitor/bin/monitor.py`. + +The price monitor can be further customized via parameters to the +`price_monitor/bin/monitor.py` script. We will dig on those parameters +later when showing how to schedule the project on Scrapy Cloud. + + +## Installing and Running + +1. Clone this repo: + + $ git clone git@github.com:stummjr/scrapy_price_monitor.git + +2. Enter the folder and install the project dependencies: + + $ cd scrapy_price_monitor + $ pip install -r requirements.txt + +3. Create a free forever account on Scrapy Cloud: +https://app.scrapinghub.com/account/signup/. + +4. Create a Scrapy project on Scrapy Cloud and copy the project id from the project URL. + +5. Install [Scrapinghub command line tool (shub)](https://github.com/scrapinghub/shub): + + $ pip install shub + +6. Authenticate using your Scrapinghub API key: + + $ shub login + +7. Finally, deploy the local project to your Scrapy Cloud project: + + $ shub deploy + +This video also explains how to deploy a Scrapy project to Scrapy Cloud: +https://youtu.be/JYch0zRmcgU + + +## How to Schedule on Scrapy Cloud + +After you have deployed the project to Scrapy Cloud, it's time to schedule its +execution on Scrapy Cloud. + +This project has two main components: + +- the [**spiders**](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/price_monitor/spiders) that collect prices from the retailers' websites +- the [**price monitor script**](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/bin/monitor.py) that checks whether there's a new deal in the latest prices + +You have to schedule both the spiders and the monitor to run periodically on +Scrapy Cloud. It's a good idea to schedule all the spiders to run at the same +time and schedule the monitor to run about 15 minutes after the spiders. + +Take a look at this video to learn how to schedule periodic jobs on Scrapy Cloud: +https://youtu.be/JYch0zRmcgU?t=1m51s + + +### Parameters for the Monitor Script + +The monitor script takes these parameters and you can pass them via the parameters box in the +scheduling dialog: + +- `--days`: how many days of data we want to compare with the scraped prices. +- `--threshold`: a margin that you can set to avoid getting alerts from minor price changes. For example, if you set it to 1.0, you will only get alerts when the price drop is bigger than $1.00. +- `--apikey`: your Scrapy Cloud API key. You can get it in: https://app.scrapinghub.com/account/apikey. +- `--project`: the Scrapy Cloud project where the monitor is deployed (you can grab it from your project URL at Scrapy Cloud). + + +## Running in a Local Environment + +You can run this project on Scrapy Cloud or on your local environment. 
The only dependency +from Scrapy Cloud is the [Collections API](https://doc.scrapinghub.com/api/collections.html), +but the spiders and the monitor can be executed locally. + +To do that, first add your Scrapy Cloud project id to [settings.py `SHUB_PROJ_ID` variable](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/price_monitor/settings.py#L11). + +Then run the spiders via command line: + + $ scrapy crawl bestbuy.com + +This will run the spider named as `bestbuy.com` and store the scraped data into +a Scrapy Cloud collection, under the project you set in the last step. + +You can also run the price monitor via command line: + + $ python bin/monitor.py --apikey --days 2 --threshold 1 --project diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/bin/monitor.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/bin/monitor.py new file mode 100644 index 0000000..a9dc370 --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/bin/monitor.py @@ -0,0 +1,119 @@ +"""Simple price monitor built with Scrapy and Scrapy Cloud +""" +import argparse +import os +from datetime import datetime, timedelta + +import boto +from hubstorage import HubstorageClient +from jinja2 import Environment, PackageLoader +from price_monitor import settings +from price_monitor.utils import get_product_names, get_retailers_for_product +from w3lib.html import remove_tags + +jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates')) + + +class DealsChecker(object): + + def __init__(self, latest_deals, previous_deals, price_threshold=0): + self.price_threshold = price_threshold + self.latest_deals = latest_deals + self.previous_deals = previous_deals + + def is_from_latest_crawl(self, deal): + """Checks whether the given deal is from the most recent execution. + """ + return deal in self.latest_deals + + def get_best_deal(self): + """Returns the item with the best overall price. self.price_threshold can be set to avoid + considering minor price drops. + """ + best_so_far = min(self.previous_deals, key=lambda x: x.get('price')) + best_from_last = min(self.latest_deals, key=lambda x: x.get('price')) + if best_from_last.get('price') + self.price_threshold < best_so_far.get('price'): + return best_from_last + else: + return best_so_far + + +class DealsFetcher(object): + + def __init__(self, product_name, apikey, project_id, hours): + self.product_name = product_name + project = HubstorageClient(apikey).get_project(project_id) + self.item_store = project.collections.new_store(product_name) + self.load_items_from_last_n_hours(hours) + + def load_items_from_last_n_hours(self, n=24): + """Load items from the last n hours, from the newest to the oldest. + """ + since_time = int((datetime.now() - timedelta(hours=n)).timestamp() * 1000) + self.deals = [item.get('value') for item in self.fetch_deals_newer_than(since_time)] + + def fetch_deals_newer_than(self, since_time): + return list(self.item_store.get(meta=['_key', '_ts'], startts=since_time)) + + def get_latest_deal_from_retailer(self, retailer): + """Returns the most recently extracted deal from a given retailer. 
+ """ + for deals in self.deals: + if retailer in deals.get('url'): + return deals + + def get_deals(self): + """Returns a tuple with (deals from latest crawl, deals from previous crawls) + """ + latest_deals = [ + self.get_latest_deal_from_retailer(retailer) + for retailer in get_retailers_for_product(self.product_name) + ] + previous_deals = [ + deal for deal in self.deals if deal not in latest_deals + ] + return latest_deals, previous_deals + + +def send_email_alert(items): + ses = boto.connect_ses(settings.AWS_ACCESS_KEY, settings.AWS_SECRET_KEY) + html_body = jinja_env.get_template('email.html').render(items=items) + + ses.send_email( + settings.EMAIL_ALERT_FROM, + 'Price drop alert', + remove_tags(html_body), + settings.EMAIL_ALERT_TO, + html_body=html_body + ) + + +def main(args): + items = [] + for prod_name in get_product_names(): + fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24) + checker = DealsChecker(*fetcher.get_deals(), args.threshold) + best_deal = checker.get_best_deal() + if checker.is_from_latest_crawl(best_deal): + items.append(best_deal) + + if items: + send_email_alert(items) + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--apikey', default=settings.SHUB_KEY or os.getenv('SHUB_KEY'), + help='API key to use for scrapinghub (fallbacks to SHUB_KEY variable)') + parser.add_argument('--days', type=int, default=1, + help='How many days back to compare with the last price') + parser.add_argument('--threshold', type=float, default=0, + help='A margin to avoid raising alerts with minor price drops') + parser.add_argument('--project', type=int, default=settings.SHUB_PROJ_ID, + help='Project ID to get info from') + + return parser.parse_args() + + +if __name__ == '__main__': + main(parse_args()) diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/__init__.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/items.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/items.py new file mode 100644 index 0000000..20a91f9 --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/items.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# http://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class PriceMonitorItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/pipelines.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/pipelines.py new file mode 100644 index 0000000..18de561 --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/pipelines.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +from price_monitor import settings +from hubstorage import HubstorageClient +from price_monitor.utils import reversed_timestamp, get_product_names + + +class CollectionStoragePipeline(object): + + def open_spider(self, spider): + client = HubstorageClient(auth=settings.SHUB_KEY) + project = client.get_project(settings.SHUB_PROJ_ID) + self.data_stores = {} + for product_name in get_product_names(): + self.data_stores[product_name] = project.collections.new_store(product_name) + + def process_item(self, item, spider): + key = "{}-{}-{}".format( + reversed_timestamp(), 
item.get('product_name'), item.get('retailer') + ) + self.data_stores[item['product_name']].set({'_key': key, 'value': item}) + return item diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/resources/urls.json b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/resources/urls.json new file mode 100644 index 0000000..6bc20ba --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/resources/urls.json @@ -0,0 +1,27 @@ +{ + "headsetlogitech": [ + "https://www.amazon.com/Logitech-Wireless-Headset-Over-Design/dp/B005GTO07O/", + "http://www.bestbuy.com/site/logitech-h600-wireless-headset-black/3436118.p", + "http://www.ebay.com/itm/N-Logitech-Wireless-Headset-H600-Over-The-Head-Design-981-000341-/110985874014" + ], + "webcamlogitech": [ + "https://www.amazon.com/Logitech-Widescreen-Calling-Recording-Desktop/dp/B006JH8T3S/", + "http://www.bestbuy.com/site/logitech-c920-pro-webcam-black/4612476.p?skuId=4612476", + "http://www.ebay.com/itm/Logitech-HD-Pro-Webcam-C920-1080p-Widescreen-Video-Calling-and-Recording-/272381890214" + ], + "amazonechodot": [ + "https://www.amazon.com/dp/B01DFKC2SO", + "http://www.bestbuy.com/site/amazon-echo-dot/5578851.p?skuId=5578851", + "http://www.ebay.com/itm/Amazon-Echo-Dot-2nd-Generation-w-Alexa-Voice-Media-Device-All-New-2016-/201668562192" + ], + "nikoncoolpix": [ + "https://www.amazon.com/Nikon-COOLPIX-B500-Digital-Camera/dp/B01C3LEE9G/", + "http://www.bestbuy.com/site/nikon-coolpix-b500-16-0-megapixel-digital-camera-red/4997500.p?skuId=4997500", + "http://www.ebay.com/itm/Nikon-COOLPIX-B500-Digital-Camera-Red-/162225974018" + ], + "bluemicrophone": [ + "https://www.amazon.com/Blue-Snowball-iCE-Condenser-Microphone/dp/B014PYGTUQ/", + "http://www.bestbuy.com/site/blue-microphones-snowball-usb-cardioid-and-omnidirectional-electret-condenser-vocal-microphone-black/9918056.p?skuId=9918056", + "http://www.ebay.com/itm/Blue-Microphones-Snowball-Black-iCE-Condenser-Microphone-/172260373002" + ] +} diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/settings.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/settings.py new file mode 100644 index 0000000..9888b56 --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/settings.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +import os + +BOT_NAME = 'price_monitor' +SPIDER_MODULES = ['price_monitor.spiders'] +NEWSPIDER_MODULE = 'price_monitor.spiders' + +ROBOTSTXT_OBEY = True + +SHUB_KEY = os.getenv('$SHUB_KEY') +# if you want to run it locally, replace '999999' by your Scrapy Cloud project ID below +SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '999999').split('/')[0] + + +# settings for Amazon SES email service +AWS_ACCESS_KEY = os.getenv('$AWS_ACCESS_KEY') +AWS_SECRET_KEY = os.getenv('$AWS_SECRET_KEY') +EMAIL_ALERT_FROM = 'Price Monitor ' +EMAIL_ALERT_TO = ['RECEIVER_EMAIL@provider.com'] + +# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'price_monitor.pipelines.CollectionStoragePipeline': 400, +} + +AUTOTHROTTLE_ENABLED = True +# HTTPCACHE_ENABLED = True diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/__init__.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please 
refer to the documentation for information on how to create and manage +# your spiders. diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/templates/email.html b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/templates/email.html new file mode 100644 index 0000000..c51ef0c --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/templates/email.html @@ -0,0 +1,14 @@ +

🎉 Hey, we found a good deal! 🎁
+{% for item in items %}
+Product: {{item.title}}
+Price: {{item.price}}
+Store: {{item.retailer}}
+Price obtained at: {{item.when}}
+Visit the product page at {{item.retailer}}: {{item.url}}
+{% endfor %}
+ diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/utils.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/utils.py new file mode 100644 index 0000000..8deb616 --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/utils.py @@ -0,0 +1,35 @@ +import json +import pkgutil +from datetime import datetime, timedelta + + +def timestamp_from_reversed(reversed): + return datetime(5000, 1, 1) - timedelta(seconds=float(reversed)) + + +def reversed_timestamp(): + return str((datetime(5000, 1, 1) - datetime.now()).total_seconds()) + + +def normalize_name(name): + return name.replace('-', '') + + +def get_product_names(): + return [ + normalize_name(name) + for name in json.loads( + pkgutil.get_data("price_monitor", "resources/urls.json").decode() + ).keys() + ] + + +def get_retailer_name_from_url(url): + return url.split("://")[1].split("/")[0].replace("www.", "") + + +def get_retailers_for_product(product_name): + data = json.loads( + pkgutil.get_data("price_monitor", "resources/urls.json").decode() + ) + return {get_retailer_name_from_url(url) for url in data[product_name]} diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/requirements.txt b/scrapy_price_monitor/_scrapy_price_monitor_OLD/requirements.txt new file mode 100644 index 0000000..2567afb --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/requirements.txt @@ -0,0 +1,5 @@ +scrapy +boto +extruct +w3lib +jinja2 diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapinghub.yml b/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapinghub.yml new file mode 100644 index 0000000..7a8527c --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapinghub.yml @@ -0,0 +1,3 @@ +requirements_file: requirements.txt +stacks: + default: scrapy:1.1-py3 diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapy.cfg b/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapy.cfg new file mode 100644 index 0000000..d34a107 --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.org/en/latest/deploy.html + +[settings] +default = price_monitor.settings + +[deploy] +#url = http://localhost:6800/ +project = price_monitor diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/setup.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/setup.py new file mode 100644 index 0000000..3e0698a --- /dev/null +++ b/scrapy_price_monitor/_scrapy_price_monitor_OLD/setup.py @@ -0,0 +1,12 @@ +# Automatically created by: shub deploy + +from setuptools import setup, find_packages + +setup( + name='project', + version='1.0', + packages=find_packages(), + package_data={'price_monitor': ['resources/*.json', 'templates/*.html']}, + scripts=['bin/monitor.py'], + entry_points={'scrapy': ['settings = price_monitor.settings']}, +) diff --git a/scrapy_price_monitor/price_monitor/resources/urls.json b/scrapy_price_monitor/price_monitor/resources/urls.json index 6bc20ba..046257f 100644 --- a/scrapy_price_monitor/price_monitor/resources/urls.json +++ b/scrapy_price_monitor/price_monitor/resources/urls.json @@ -1,27 +1,17 @@ { - "headsetlogitech": [ - "https://www.amazon.com/Logitech-Wireless-Headset-Over-Design/dp/B005GTO07O/", - "http://www.bestbuy.com/site/logitech-h600-wireless-headset-black/3436118.p", - 
"http://www.ebay.com/itm/N-Logitech-Wireless-Headset-H600-Over-The-Head-Design-981-000341-/110985874014" + "alightintheattic": [ + "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html", + "https://fakelink" ], - "webcamlogitech": [ - "https://www.amazon.com/Logitech-Widescreen-Calling-Recording-Desktop/dp/B006JH8T3S/", - "http://www.bestbuy.com/site/logitech-c920-pro-webcam-black/4612476.p?skuId=4612476", - "http://www.ebay.com/itm/Logitech-HD-Pro-Webcam-C920-1080p-Widescreen-Video-Calling-and-Recording-/272381890214" + "shakespearessonnets": [ + "https://books.toscrape.com/catalogue/shakespeares-sonnets_989/index.html", + "https://fakelink" ], - "amazonechodot": [ - "https://www.amazon.com/dp/B01DFKC2SO", - "http://www.bestbuy.com/site/amazon-echo-dot/5578851.p?skuId=5578851", - "http://www.ebay.com/itm/Amazon-Echo-Dot-2nd-Generation-w-Alexa-Voice-Media-Device-All-New-2016-/201668562192" + "soumission": [ + "https://books.toscrape.com/catalogue/soumission_998/index.html", + "https://fakelink" ], - "nikoncoolpix": [ - "https://www.amazon.com/Nikon-COOLPIX-B500-Digital-Camera/dp/B01C3LEE9G/", - "http://www.bestbuy.com/site/nikon-coolpix-b500-16-0-megapixel-digital-camera-red/4997500.p?skuId=4997500", - "http://www.ebay.com/itm/Nikon-COOLPIX-B500-Digital-Camera-Red-/162225974018" - ], - "bluemicrophone": [ - "https://www.amazon.com/Blue-Snowball-iCE-Condenser-Microphone/dp/B014PYGTUQ/", - "http://www.bestbuy.com/site/blue-microphones-snowball-usb-cardioid-and-omnidirectional-electret-condenser-vocal-microphone-black/9918056.p?skuId=9918056", - "http://www.ebay.com/itm/Blue-Microphones-Snowball-Black-iCE-Condenser-Microphone-/172260373002" + "other_product": [ + "https://fakelink" ] } diff --git a/scrapy_price_monitor/price_monitor/templates/email.html b/scrapy_price_monitor/price_monitor/templates/email.html index bdab7f6..af44913 100644 --- a/scrapy_price_monitor/price_monitor/templates/email.html +++ b/scrapy_price_monitor/price_monitor/templates/email.html @@ -10,4 +10,4 @@

🎉 Hey, we found a good deal! 🎁

Visit the product page at {{item.retailer}}: {{item.url}}

{% endfor %} - \ No newline at end of file + diff --git a/scrapy_price_monitor/requirements.txt b/scrapy_price_monitor/requirements.txt index 2567afb..f525147 100644 --- a/scrapy_price_monitor/requirements.txt +++ b/scrapy_price_monitor/requirements.txt @@ -3,3 +3,4 @@ boto extruct w3lib jinja2 +price-parser \ No newline at end of file From 72b906d666ad9e856f309c0c39f712f6af399b31 Mon Sep 17 00:00:00 2001 From: further-reading Date: Fri, 19 Nov 2021 13:25:58 +0000 Subject: [PATCH 3/8] Removing old code and seperating alert code --- .../_scrapy_price_monitor_OLD/.gitignore | 92 ----------- .../_scrapy_price_monitor_OLD/README.md | 153 ------------------ .../_scrapy_price_monitor_OLD/bin/monitor.py | 119 -------------- .../price_monitor/items.py | 14 -- .../price_monitor/pipelines.py | 21 --- .../price_monitor/resources/urls.json | 27 ---- .../price_monitor/settings.py | 27 ---- .../price_monitor/spiders/__init__.py | 4 - .../price_monitor/spiders/amazon.py | 14 -- .../price_monitor/spiders/base_spider.py | 16 -- .../price_monitor/spiders/bestbuy.py | 14 -- .../price_monitor/spiders/ebay.py | 17 -- .../price_monitor/templates/email.html | 14 -- .../price_monitor/utils.py | 35 ---- .../requirements.txt | 5 - .../_scrapy_price_monitor_OLD/scrapinghub.yml | 3 - .../_scrapy_price_monitor_OLD/scrapy.cfg | 11 -- .../_scrapy_price_monitor_OLD/setup.py | 12 -- .../price_monitor => bin}/__init__.py | 0 scrapy_price_monitor/bin/alert.py | 31 ++++ scrapy_price_monitor/bin/monitor.py | 36 ++--- .../{templates => alert_template}/email.html | 0 .../price_monitor/settings.py | 6 - scrapy_price_monitor/requirements.txt | 3 +- scrapy_price_monitor/setup.py | 2 +- 25 files changed, 43 insertions(+), 633 deletions(-) delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/.gitignore delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/README.md delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/bin/monitor.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/items.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/pipelines.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/resources/urls.json delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/settings.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/__init__.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/amazon.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/base_spider.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/bestbuy.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/ebay.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/templates/email.html delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/utils.py delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/requirements.txt delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapinghub.yml delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapy.cfg delete mode 100644 scrapy_price_monitor/_scrapy_price_monitor_OLD/setup.py rename scrapy_price_monitor/{_scrapy_price_monitor_OLD/price_monitor => bin}/__init__.py (100%) create mode 100644 scrapy_price_monitor/bin/alert.py rename scrapy_price_monitor/price_monitor/{templates => 
alert_template}/email.html (100%) diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/.gitignore b/scrapy_price_monitor/_scrapy_price_monitor_OLD/.gitignore deleted file mode 100644 index 57c0c1e..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/.gitignore +++ /dev/null @@ -1,92 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*,cover -.hypothesis/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# IPython Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# dotenv -.env - -# virtualenv -.venv/ -venv/ -ENV/ - -# Spyder project settings -.spyderproject - -# Rope project settings -.ropeproject - -.scrapy \ No newline at end of file diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/README.md b/scrapy_price_monitor/_scrapy_price_monitor_OLD/README.md deleted file mode 100644 index 8162616..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/README.md +++ /dev/null @@ -1,153 +0,0 @@ -Scrapy Price Monitor - Old Version -==================== - -_This is a deprecated version of this project. It may not function on the latest versions of scrapy, python and the target websites_ - -This is a simple price monitor built with [Scrapy](https://github.com/scrapy/scrapy) -and [Scrapy Cloud](https://scrapinghub.com/scrapy-cloud). - -It is basically a Scrapy project with one spider for each online retailer that -we want to monitor prices from. In addition to the spiders, there's a Python -Script that is scheduled to run periodically on Scrapy Cloud, checking whether -the latest prices are the best ones in a given time span. If so, the monitor -sends an email alerting you about the price drops. - - -## Including Products to Monitor - -There's a `resources/urls.json` file that lists the URLs from the products that -we want to monitor. If you just want to include a new product to monitor from -the already supported retailers, just add a new key for that product and add -the URL list as its value, such as: - - { - "headsetlogitech": [ - "https://www.amazon.com/.../B005GTO07O/", - "http://www.bestbuy.com/.../3436118.p", - "http://www.ebay.com/.../110985874014" - ], - "NewProduct": [ - "http://url.for.retailer.x", - "http://url.for.retailer.y", - "http://url.for.retailer.z" - ] - } - - -## Supporting Further Retailers - -This project currently only works with 3 online retailers, and you can list them -running: - - $ scrapy list - amazon.com - bestbuy.com - ebay.com - -If the retailer that you want to monitor is not yet supported, just create a spider -to handle the product pages from it. 
To include a spider for samsclub.com, you -could run: - - $ scrapy genspider samsclub.com samsclub.com - -And then, open the spider and add the extraction rules: - - $ scrapy edit samsclub.com - -Have a look at the current spiders and implement the new ones using the same -structure, subclassing `BaseSpider` instead of `scrapy.Spider`. This way, your -spiders will automatically read the URLs list from `resources/urls.json`. - - -## Customizing the Price Monitor - -The price monitor sends an email using Amazon SES service, so to run it you -have to set both `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` variables in -`price_monitor/settings.py`. If you want to use another email service, -you have to rewrite the `send_email_alert` function in -`price_monitor/bin/monitor.py`. - -The price monitor can be further customized via parameters to the -`price_monitor/bin/monitor.py` script. We will dig on those parameters -later when showing how to schedule the project on Scrapy Cloud. - - -## Installing and Running - -1. Clone this repo: - - $ git clone git@github.com:stummjr/scrapy_price_monitor.git - -2. Enter the folder and install the project dependencies: - - $ cd scrapy_price_monitor - $ pip install -r requirements.txt - -3. Create a free forever account on Scrapy Cloud: -https://app.scrapinghub.com/account/signup/. - -4. Create a Scrapy project on Scrapy Cloud and copy the project id from the project URL. - -5. Install [Scrapinghub command line tool (shub)](https://github.com/scrapinghub/shub): - - $ pip install shub - -6. Authenticate using your Scrapinghub API key: - - $ shub login - -7. Finally, deploy the local project to your Scrapy Cloud project: - - $ shub deploy - -This video also explains how to deploy a Scrapy project to Scrapy Cloud: -https://youtu.be/JYch0zRmcgU - - -## How to Schedule on Scrapy Cloud - -After you have deployed the project to Scrapy Cloud, it's time to schedule its -execution on Scrapy Cloud. - -This project has two main components: - -- the [**spiders**](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/price_monitor/spiders) that collect prices from the retailers' websites -- the [**price monitor script**](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/bin/monitor.py) that checks whether there's a new deal in the latest prices - -You have to schedule both the spiders and the monitor to run periodically on -Scrapy Cloud. It's a good idea to schedule all the spiders to run at the same -time and schedule the monitor to run about 15 minutes after the spiders. - -Take a look at this video to learn how to schedule periodic jobs on Scrapy Cloud: -https://youtu.be/JYch0zRmcgU?t=1m51s - - -### Parameters for the Monitor Script - -The monitor script takes these parameters and you can pass them via the parameters box in the -scheduling dialog: - -- `--days`: how many days of data we want to compare with the scraped prices. -- `--threshold`: a margin that you can set to avoid getting alerts from minor price changes. For example, if you set it to 1.0, you will only get alerts when the price drop is bigger than $1.00. -- `--apikey`: your Scrapy Cloud API key. You can get it in: https://app.scrapinghub.com/account/apikey. -- `--project`: the Scrapy Cloud project where the monitor is deployed (you can grab it from your project URL at Scrapy Cloud). - - -## Running in a Local Environment - -You can run this project on Scrapy Cloud or on your local environment. 
The only dependency -from Scrapy Cloud is the [Collections API](https://doc.scrapinghub.com/api/collections.html), -but the spiders and the monitor can be executed locally. - -To do that, first add your Scrapy Cloud project id to [settings.py `SHUB_PROJ_ID` variable](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/price_monitor/settings.py#L11). - -Then run the spiders via command line: - - $ scrapy crawl bestbuy.com - -This will run the spider named as `bestbuy.com` and store the scraped data into -a Scrapy Cloud collection, under the project you set in the last step. - -You can also run the price monitor via command line: - - $ python bin/monitor.py --apikey --days 2 --threshold 1 --project diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/bin/monitor.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/bin/monitor.py deleted file mode 100644 index a9dc370..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/bin/monitor.py +++ /dev/null @@ -1,119 +0,0 @@ -"""Simple price monitor built with Scrapy and Scrapy Cloud -""" -import argparse -import os -from datetime import datetime, timedelta - -import boto -from hubstorage import HubstorageClient -from jinja2 import Environment, PackageLoader -from price_monitor import settings -from price_monitor.utils import get_product_names, get_retailers_for_product -from w3lib.html import remove_tags - -jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates')) - - -class DealsChecker(object): - - def __init__(self, latest_deals, previous_deals, price_threshold=0): - self.price_threshold = price_threshold - self.latest_deals = latest_deals - self.previous_deals = previous_deals - - def is_from_latest_crawl(self, deal): - """Checks whether the given deal is from the most recent execution. - """ - return deal in self.latest_deals - - def get_best_deal(self): - """Returns the item with the best overall price. self.price_threshold can be set to avoid - considering minor price drops. - """ - best_so_far = min(self.previous_deals, key=lambda x: x.get('price')) - best_from_last = min(self.latest_deals, key=lambda x: x.get('price')) - if best_from_last.get('price') + self.price_threshold < best_so_far.get('price'): - return best_from_last - else: - return best_so_far - - -class DealsFetcher(object): - - def __init__(self, product_name, apikey, project_id, hours): - self.product_name = product_name - project = HubstorageClient(apikey).get_project(project_id) - self.item_store = project.collections.new_store(product_name) - self.load_items_from_last_n_hours(hours) - - def load_items_from_last_n_hours(self, n=24): - """Load items from the last n hours, from the newest to the oldest. - """ - since_time = int((datetime.now() - timedelta(hours=n)).timestamp() * 1000) - self.deals = [item.get('value') for item in self.fetch_deals_newer_than(since_time)] - - def fetch_deals_newer_than(self, since_time): - return list(self.item_store.get(meta=['_key', '_ts'], startts=since_time)) - - def get_latest_deal_from_retailer(self, retailer): - """Returns the most recently extracted deal from a given retailer. 
- """ - for deals in self.deals: - if retailer in deals.get('url'): - return deals - - def get_deals(self): - """Returns a tuple with (deals from latest crawl, deals from previous crawls) - """ - latest_deals = [ - self.get_latest_deal_from_retailer(retailer) - for retailer in get_retailers_for_product(self.product_name) - ] - previous_deals = [ - deal for deal in self.deals if deal not in latest_deals - ] - return latest_deals, previous_deals - - -def send_email_alert(items): - ses = boto.connect_ses(settings.AWS_ACCESS_KEY, settings.AWS_SECRET_KEY) - html_body = jinja_env.get_template('email.html').render(items=items) - - ses.send_email( - settings.EMAIL_ALERT_FROM, - 'Price drop alert', - remove_tags(html_body), - settings.EMAIL_ALERT_TO, - html_body=html_body - ) - - -def main(args): - items = [] - for prod_name in get_product_names(): - fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24) - checker = DealsChecker(*fetcher.get_deals(), args.threshold) - best_deal = checker.get_best_deal() - if checker.is_from_latest_crawl(best_deal): - items.append(best_deal) - - if items: - send_email_alert(items) - - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('--apikey', default=settings.SHUB_KEY or os.getenv('SHUB_KEY'), - help='API key to use for scrapinghub (fallbacks to SHUB_KEY variable)') - parser.add_argument('--days', type=int, default=1, - help='How many days back to compare with the last price') - parser.add_argument('--threshold', type=float, default=0, - help='A margin to avoid raising alerts with minor price drops') - parser.add_argument('--project', type=int, default=settings.SHUB_PROJ_ID, - help='Project ID to get info from') - - return parser.parse_args() - - -if __name__ == '__main__': - main(parse_args()) diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/items.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/items.py deleted file mode 100644 index 20a91f9..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/items.py +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# http://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class PriceMonitorItem(scrapy.Item): - # define the fields for your item here like: - # name = scrapy.Field() - pass diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/pipelines.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/pipelines.py deleted file mode 100644 index 18de561..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/pipelines.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -from price_monitor import settings -from hubstorage import HubstorageClient -from price_monitor.utils import reversed_timestamp, get_product_names - - -class CollectionStoragePipeline(object): - - def open_spider(self, spider): - client = HubstorageClient(auth=settings.SHUB_KEY) - project = client.get_project(settings.SHUB_PROJ_ID) - self.data_stores = {} - for product_name in get_product_names(): - self.data_stores[product_name] = project.collections.new_store(product_name) - - def process_item(self, item, spider): - key = "{}-{}-{}".format( - reversed_timestamp(), item.get('product_name'), item.get('retailer') - ) - self.data_stores[item['product_name']].set({'_key': key, 'value': item}) - return item diff --git 
a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/resources/urls.json b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/resources/urls.json deleted file mode 100644 index 6bc20ba..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/resources/urls.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "headsetlogitech": [ - "https://www.amazon.com/Logitech-Wireless-Headset-Over-Design/dp/B005GTO07O/", - "http://www.bestbuy.com/site/logitech-h600-wireless-headset-black/3436118.p", - "http://www.ebay.com/itm/N-Logitech-Wireless-Headset-H600-Over-The-Head-Design-981-000341-/110985874014" - ], - "webcamlogitech": [ - "https://www.amazon.com/Logitech-Widescreen-Calling-Recording-Desktop/dp/B006JH8T3S/", - "http://www.bestbuy.com/site/logitech-c920-pro-webcam-black/4612476.p?skuId=4612476", - "http://www.ebay.com/itm/Logitech-HD-Pro-Webcam-C920-1080p-Widescreen-Video-Calling-and-Recording-/272381890214" - ], - "amazonechodot": [ - "https://www.amazon.com/dp/B01DFKC2SO", - "http://www.bestbuy.com/site/amazon-echo-dot/5578851.p?skuId=5578851", - "http://www.ebay.com/itm/Amazon-Echo-Dot-2nd-Generation-w-Alexa-Voice-Media-Device-All-New-2016-/201668562192" - ], - "nikoncoolpix": [ - "https://www.amazon.com/Nikon-COOLPIX-B500-Digital-Camera/dp/B01C3LEE9G/", - "http://www.bestbuy.com/site/nikon-coolpix-b500-16-0-megapixel-digital-camera-red/4997500.p?skuId=4997500", - "http://www.ebay.com/itm/Nikon-COOLPIX-B500-Digital-Camera-Red-/162225974018" - ], - "bluemicrophone": [ - "https://www.amazon.com/Blue-Snowball-iCE-Condenser-Microphone/dp/B014PYGTUQ/", - "http://www.bestbuy.com/site/blue-microphones-snowball-usb-cardioid-and-omnidirectional-electret-condenser-vocal-microphone-black/9918056.p?skuId=9918056", - "http://www.ebay.com/itm/Blue-Microphones-Snowball-Black-iCE-Condenser-Microphone-/172260373002" - ] -} diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/settings.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/settings.py deleted file mode 100644 index 9888b56..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/settings.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -import os - -BOT_NAME = 'price_monitor' -SPIDER_MODULES = ['price_monitor.spiders'] -NEWSPIDER_MODULE = 'price_monitor.spiders' - -ROBOTSTXT_OBEY = True - -SHUB_KEY = os.getenv('$SHUB_KEY') -# if you want to run it locally, replace '999999' by your Scrapy Cloud project ID below -SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '999999').split('/')[0] - - -# settings for Amazon SES email service -AWS_ACCESS_KEY = os.getenv('$AWS_ACCESS_KEY') -AWS_SECRET_KEY = os.getenv('$AWS_SECRET_KEY') -EMAIL_ALERT_FROM = 'Price Monitor ' -EMAIL_ALERT_TO = ['RECEIVER_EMAIL@provider.com'] - -# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'price_monitor.pipelines.CollectionStoragePipeline': 400, -} - -AUTOTHROTTLE_ENABLED = True -# HTTPCACHE_ENABLED = True diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/__init__.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/__init__.py deleted file mode 100644 index ebd689a..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/amazon.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/amazon.py deleted file mode 100644 index 0f3ec3c..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/amazon.py +++ /dev/null @@ -1,14 +0,0 @@ -from .base_spider import BaseSpider - - -class AmazonSpider(BaseSpider): - name = "amazon.com" - - def parse(self, response): - item = response.meta.get('item', {}) - item['url'] = response.url - item['title'] = response.css("span#productTitle::text").extract_first("").strip() - item['price'] = float( - response.css("span#priceblock_ourprice::text").re_first("\$(.*)") or 0 - ) - yield item diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/base_spider.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/base_spider.py deleted file mode 100644 index e726c9c..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/base_spider.py +++ /dev/null @@ -1,16 +0,0 @@ -import json -import pkgutil -import scrapy -from datetime import datetime - - -class BaseSpider(scrapy.Spider): - - def start_requests(self): - products = json.loads(pkgutil.get_data('price_monitor', 'resources/urls.json').decode()) - for name, urls in products.items(): - for url in urls: - if self.name in url: - now = datetime.now().strftime('%Y/%m/%d %H:%M:%S') - item = {'product_name': name, 'retailer': self.name, 'when': now} - yield scrapy.Request(url, meta={'item': item}) diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/bestbuy.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/bestbuy.py deleted file mode 100644 index 03c49f6..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/bestbuy.py +++ /dev/null @@ -1,14 +0,0 @@ -from .base_spider import BaseSpider - - -class BestbuySpider(BaseSpider): - name = "bestbuy.com" - - def parse(self, response): - item = response.meta.get('item', {}) - item['url'] = response.url - item['title'] = response.css("div#sku-title > h1 ::text").extract_first().strip() - item['price'] = float( - response.css('div.price-block ::attr(data-customer-price)').extract_first(default=0) - ) - yield item diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/ebay.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/ebay.py deleted file mode 100644 index 7721fa6..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/spiders/ebay.py +++ /dev/null @@ -1,17 +0,0 @@ -from extruct.w3cmicrodata import MicrodataExtractor -from .base_spider import BaseSpider - - -class EbaySpider(BaseSpider): - name = "ebay.com" - - def parse(self, response): - extractor = MicrodataExtractor() - properties = extractor.extract(response.body_as_unicode()).get('items')[0].get('properties', {}) - item = response.meta.get('item', {}) - item['url'] = response.url - item['title'] = properties.get('name').replace('Details about', '').strip() - item['price'] = float( - properties.get('offers', {}).get('properties', {}).get('price', 0) - ) - yield item diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/templates/email.html b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/templates/email.html deleted file mode 100644 index c51ef0c..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/templates/email.html +++ /dev/null @@ -1,14 
+0,0 @@ -<html> -<body> -<h2>🎉 Hey, we found a good deal! 🎁</h2> -<table> -{% for item in items %} -<tr><td>Product: {{item.title}}</td></tr> -<tr><td>Price: {{item.price}}</td></tr> -<tr><td>Store: {{item.retailer}}</td></tr> -<tr><td>Price obtained at: {{item.when}}</td></tr> -<tr><td>Visit the product page at {{item.retailer}}: {{item.url}}</td></tr> -{% endfor %} -</table> -</body> -</html>
diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/utils.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/utils.py deleted file mode 100644 index 8deb616..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/utils.py +++ /dev/null @@ -1,35 +0,0 @@ -import json -import pkgutil -from datetime import datetime, timedelta - - -def timestamp_from_reversed(reversed): - return datetime(5000, 1, 1) - timedelta(seconds=float(reversed)) - - -def reversed_timestamp(): - return str((datetime(5000, 1, 1) - datetime.now()).total_seconds()) - - -def normalize_name(name): - return name.replace('-', '') - - -def get_product_names(): - return [ - normalize_name(name) - for name in json.loads( - pkgutil.get_data("price_monitor", "resources/urls.json").decode() - ).keys() - ] - - -def get_retailer_name_from_url(url): - return url.split("://")[1].split("/")[0].replace("www.", "") - - -def get_retailers_for_product(product_name): - data = json.loads( - pkgutil.get_data("price_monitor", "resources/urls.json").decode() - ) - return {get_retailer_name_from_url(url) for url in data[product_name]} diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/requirements.txt b/scrapy_price_monitor/_scrapy_price_monitor_OLD/requirements.txt deleted file mode 100644 index 2567afb..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -scrapy -boto -extruct -w3lib -jinja2 diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapinghub.yml b/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapinghub.yml deleted file mode 100644 index 7a8527c..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapinghub.yml +++ /dev/null @@ -1,3 +0,0 @@ -requirements_file: requirements.txt -stacks: - default: scrapy:1.1-py3 diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapy.cfg b/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapy.cfg deleted file mode 100644 index d34a107..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.org/en/latest/deploy.html - -[settings] -default = price_monitor.settings - -[deploy] -#url = http://localhost:6800/ -project = price_monitor diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/setup.py b/scrapy_price_monitor/_scrapy_price_monitor_OLD/setup.py deleted file mode 100644 index 3e0698a..0000000 --- a/scrapy_price_monitor/_scrapy_price_monitor_OLD/setup.py +++ /dev/null @@ -1,12 +0,0 @@ -# Automatically created by: shub deploy - -from setuptools import setup, find_packages - -setup( - name='project', - version='1.0', - packages=find_packages(), - package_data={'price_monitor': ['resources/*.json', 'templates/*.html']}, - scripts=['bin/monitor.py'], - entry_points={'scrapy': ['settings = price_monitor.settings']}, -) diff --git a/scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/__init__.py b/scrapy_price_monitor/bin/__init__.py similarity index 100% rename from scrapy_price_monitor/_scrapy_price_monitor_OLD/price_monitor/__init__.py rename to scrapy_price_monitor/bin/__init__.py diff --git a/scrapy_price_monitor/bin/alert.py b/scrapy_price_monitor/bin/alert.py new file mode 100644 index 0000000..0e985ec --- /dev/null +++ b/scrapy_price_monitor/bin/alert.py @@ -0,0 +1,31 @@ +# Below is sample code for sending alerts via the Amazon SES email service. +# If you
wish to alert through another means such as Slack, text, etc., replace this section with the appropriate code. + +import boto +from jinja2 import Environment, PackageLoader +import os + +from w3lib.html import remove_tags +import logging +logger = logging.getLogger(__name__) + +jinja_env = Environment(loader=PackageLoader('price_monitor', 'alert_template')) + +# settings for Amazon SES email service +AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY') +AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY') +EMAIL_ALERT_FROM = 'Price Monitor <SENDER_EMAIL@provider.com>' +EMAIL_ALERT_TO = ['RECEIVER_EMAIL@provider.com'] + + +def send_alert(items): + ses = boto.connect_ses(AWS_ACCESS_KEY, AWS_SECRET_KEY) + html_body = jinja_env.get_template('email.html').render(items=items) + + ses.send_email( + EMAIL_ALERT_FROM, + 'Price drop alert', + remove_tags(html_body), + EMAIL_ALERT_TO, + html_body=html_body + ) \ No newline at end of file diff --git a/scrapy_price_monitor/bin/monitor.py b/scrapy_price_monitor/bin/monitor.py index 9f4be62..8a93fe6 100644 --- a/scrapy_price_monitor/bin/monitor.py +++ b/scrapy_price_monitor/bin/monitor.py @@ -4,15 +4,10 @@ import os from datetime import datetime, timedelta -import boto -from jinja2 import Environment, PackageLoader - from price_monitor import settings from price_monitor.utils import get_product_names, get_retailers_for_product from price_monitor.collection_helper import CollectionHelper -from w3lib.html import remove_tags - -jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates')) +from bin.alert import send_alert class DealsChecker(object): @@ -81,30 +76,17 @@ def get_deals(self): return latest_deals, previous_deals -def send_email_alert(items): - ses = boto.connect_ses(settings.AWS_ACCESS_KEY, settings.AWS_SECRET_KEY) - html_body = jinja_env.get_template('email.html').render(items=items) - - ses.send_email( - settings.EMAIL_ALERT_FROM, - 'Price drop alert', - remove_tags(html_body), - settings.EMAIL_ALERT_TO, - html_body=html_body - ) - - def main(args): - items = [] - for prod_name in get_product_names(): - fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24) - checker = DealsChecker(*fetcher.get_deals(), args.threshold) - best_deal = checker.get_best_deal() - if checker.is_from_latest_crawl(best_deal): - items.append(best_deal) + items = ['stuff'] + # for prod_name in get_product_names(): + # fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24) + # checker = DealsChecker(*fetcher.get_deals(), args.threshold) + # best_deal = checker.get_best_deal() + # if checker.is_from_latest_crawl(best_deal): + # items.append(best_deal) if items: - send_email_alert(items) + send_alert(items) def parse_args(): diff --git a/scrapy_price_monitor/price_monitor/templates/email.html b/scrapy_price_monitor/price_monitor/alert_template/email.html similarity index 100% rename from scrapy_price_monitor/price_monitor/templates/email.html rename to scrapy_price_monitor/price_monitor/alert_template/email.html diff --git a/scrapy_price_monitor/price_monitor/settings.py b/scrapy_price_monitor/price_monitor/settings.py index b55aa4e..beed3ad 100644 --- a/scrapy_price_monitor/price_monitor/settings.py +++ b/scrapy_price_monitor/price_monitor/settings.py @@ -12,9 +12,3 @@ ITEM_PIPELINES = { 'price_monitor.pipelines.CollectionStoragePipeline': 400, } - -# settings for Amazon SES email service -AWS_ACCESS_KEY = os.getenv('$AWS_ACCESS_KEY') -AWS_SECRET_KEY = os.getenv('$AWS_SECRET_KEY') -EMAIL_ALERT_FROM = 'Price Monitor ' -EMAIL_ALERT_TO =
['RECEIVER_EMAIL@provider.com'] diff --git a/scrapy_price_monitor/requirements.txt b/scrapy_price_monitor/requirements.txt index f525147..c176940 100644 --- a/scrapy_price_monitor/requirements.txt +++ b/scrapy_price_monitor/requirements.txt @@ -3,4 +3,5 @@ boto extruct w3lib jinja2 -price-parser \ No newline at end of file +price-parser +scrapinghub diff --git a/scrapy_price_monitor/setup.py b/scrapy_price_monitor/setup.py index b22a90d..cc29a6e 100644 --- a/scrapy_price_monitor/setup.py +++ b/scrapy_price_monitor/setup.py @@ -7,6 +7,6 @@ version = '1.0', packages = find_packages(), - package_data = {'price_monitor': ['resources/*.json', 'templates/*.html']}, + package_data = {'price_monitor': ['resources/*.json', 'alert_template/*.html']}, - scripts = ['bin/monitor.py'], + scripts = ['bin/monitor.py', 'bin/alert.py'], entry_points = {'scrapy': ['settings = price_monitor.settings']}, ) From 25fe0e14d91ae14595e27f56027c6f42e36f726f Mon Sep 17 00:00:00 2001 From: further-reading Date: Fri, 19 Nov 2021 13:28:16 +0000 Subject: [PATCH 4/8] Removing placeholder project id --- scrapy_price_monitor/price_monitor/settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_price_monitor/price_monitor/settings.py b/scrapy_price_monitor/price_monitor/settings.py index beed3ad..0632144 100644 --- a/scrapy_price_monitor/price_monitor/settings.py +++ b/scrapy_price_monitor/price_monitor/settings.py @@ -6,8 +6,8 @@ # if you want to run it locally, replace None with your scrapy cloud API key SHUB_KEY = None -# if you want to run it locally, replace '999999' by your Scrapy Cloud project ID -SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '999999').split('/')[0] +# if you want to run it locally, replace PROJ_ID with your Scrapy Cloud project ID +SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', 'PROJ_ID').split('/')[0] ITEM_PIPELINES = { 'price_monitor.pipelines.CollectionStoragePipeline': 400, } From 065b77fa8534e625ed6785b37d8cd713285f3828 Mon Sep 17 00:00:00 2001 From: further-reading Date: Fri, 19 Nov 2021 13:52:49 +0000 Subject: [PATCH 5/8] PR feedback --- scrapy_price_monitor/README.md | 30 +++++++++---------- scrapy_price_monitor/bin/alert.py | 5 ++-- scrapy_price_monitor/bin/monitor.py | 12 ++++---- scrapy_price_monitor/price_monitor/items.py | 1 + .../price_monitor/spiders/_base.py | 2 +- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/scrapy_price_monitor/README.md b/scrapy_price_monitor/README.md index 02c4134..abb2aff 100644 --- a/scrapy_price_monitor/README.md +++ b/scrapy_price_monitor/README.md @@ -31,27 +31,25 @@ the URL list as its value, such as: ## Supporting Further Retailers To add a retailer, just create a spider to handle the product pages from it. -To include a spider for samsclub.com, you -could run: +To include a spider for fake-website.com, you could run: - $ scrapy genspider samsclub.com samsclub.com + $ scrapy genspider fake-website.com fake-website.com -And then, open the spider and add the extraction rules: +And then you can open the newly created `fake_website_com.py` file in your IDE and add the extraction rules. - $ scrapy edit samsclub.com -Have a look at the current spiders and implement the new ones using the same +Have a look at the sample books.toscrape.com spider and implement the new ones using the same structure, subclassing `BaseSpider` instead of `scrapy.Spider`. This way, your spiders will automatically read the URLs list from `resources/urls.json`.
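For illustration, a new retailer spider following that structure might look like the sketch below — the `fake-website.com` domain and both CSS selectors are placeholders to be adapted to the retailer's actual pages, mirroring the books.toscrape.com sample:

    from price_monitor.spiders._base import BaseSpider
    from price_monitor.items import ProductLoader


    class FakeWebsiteSpider(BaseSpider):
        # BaseSpider only requests the URLs in resources/urls.json whose
        # address contains this name, so it must match the retailer domain.
        name = "fake-website.com"

        def parse(self, response):
            # BaseSpider passes a partial item (product_name, retailer, when)
            # through the request meta; the loader fills in the rest.
            item = response.meta.get('item', {})
            loader = ProductLoader(item=item, response=response)
            loader.add_value('url', response.url)
            loader.add_css('title', 'h1::text')  # placeholder selector
            loader.add_css('price', '.price::text')  # placeholder selector
            yield loader.load_item()

Because the spider subclasses `BaseSpider`, it needs no `start_urls` or `start_requests` of its own: adding the retailer's URLs to `resources/urls.json` is enough.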
## Customizing the Price Monitor -The price monitor sends an email using Amazon SES service, so to run it you -have to set both `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` variables in -`price_monitor/settings.py`. If you want to use another email service, -you have to rewrite the `send_email_alert` function in -`price_monitor/bin/monitor.py`. +The price monitor script uses a `send_alert` function in the `price_monitor/bin/alert.py` +file to send an alert. The current sample sends an email using the Amazon SES +service, so to run it you have to set both `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` +variables in the file, along with details for the email sender and intended recipient. +If you want to use another email service or another form of alert altogether, +you can rewrite this file and include an equivalent `send_alert` function. The price monitor can be further customized via parameters to the `price_monitor/bin/monitor.py` script. We will dig into those parameters later when showing how to schedule the project on Scrapy Cloud. ## Installing and Running 1. Clone this repo: - $ git clone git@github.com:further-reading/price-monitoring-sample.git + $ git clone git@github.com:scrapinghub/sample-projects.git 2. Enter the folder and install the project dependencies: $ cd scrapy_price_monitor $ pip install -r requirements.txt -3. Create a free forever account on Scrapy Cloud: -https://app.scrapinghub.com/account/signup/. +3. Create an account on Zyte: +https://app.zyte.com/ -4. Create a Scrapy project on Scrapy Cloud and copy the project id from the project URL. +4. Scroll to Scrapy Cloud Projects, select Create Project, and take note of the project ID in the new project's URL. 5. Install [Scrapinghub command line tool (shub)](https://github.com/scrapinghub/shub): diff --git a/scrapy_price_monitor/bin/alert.py b/scrapy_price_monitor/bin/alert.py index 0e985ec..39e239f 100644 --- a/scrapy_price_monitor/bin/alert.py +++ b/scrapy_price_monitor/bin/alert.py @@ -3,7 +3,6 @@ import boto from jinja2 import Environment, PackageLoader -import os from w3lib.html import remove_tags import logging @@ -12,8 +11,8 @@ jinja_env = Environment(loader=PackageLoader('price_monitor', 'alert_template')) # settings for Amazon SES email service -AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY') -AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY') +AWS_ACCESS_KEY = 'AWS_ACCESS_KEY' +AWS_SECRET_KEY = 'AWS_SECRET_KEY' EMAIL_ALERT_FROM = 'Price Monitor <SENDER_EMAIL@provider.com>' EMAIL_ALERT_TO = ['RECEIVER_EMAIL@provider.com'] diff --git a/scrapy_price_monitor/bin/monitor.py b/scrapy_price_monitor/bin/monitor.py index 8a93fe6..1be49ef 100644 --- a/scrapy_price_monitor/bin/monitor.py +++ b/scrapy_price_monitor/bin/monitor.py @@ -78,12 +78,12 @@ def get_deals(self): def main(args): items = [] - # for prod_name in get_product_names(): - # fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24) - # checker = DealsChecker(*fetcher.get_deals(), args.threshold) - # best_deal = checker.get_best_deal() - # if checker.is_from_latest_crawl(best_deal): - # items.append(best_deal) + for prod_name in get_product_names(): + fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24) + checker = DealsChecker(*fetcher.get_deals(), args.threshold) + best_deal = checker.get_best_deal() + if checker.is_from_latest_crawl(best_deal): + items.append(best_deal) if items: send_alert(items)
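As the README change above notes, any replacement for `bin/alert.py` only needs to expose a `send_alert(items)` function for `monitor.py` to import. A minimal sketch of a non-email channel — here a Slack incoming webhook, where the webhook URL is a placeholder and `requests` is an assumed extra dependency, not in the project's requirements.txt — could look like:

    import requests

    # Placeholder: a real Slack incoming-webhook URL for your workspace.
    SLACK_WEBHOOK_URL = 'https://hooks.slack.com/services/T000/B000/XXXXXXXX'


    def send_alert(items):
        # One line per deal, using the fields the spiders store in the collection.
        lines = [
            '{title} is now {price} at {retailer}: {url}'.format(**item)
            for item in items
        ]
        # Slack incoming webhooks accept a JSON payload with a "text" field.
        response = requests.post(SLACK_WEBHOOK_URL, json={'text': '\n'.join(lines)})
        response.raise_for_status()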
diff --git a/scrapy_price_monitor/price_monitor/items.py b/scrapy_price_monitor/price_monitor/items.py index 6c5824d..d1dcb20 100644 --- a/scrapy_price_monitor/price_monitor/items.py +++ b/scrapy_price_monitor/price_monitor/items.py @@ -11,6 +11,7 @@ class PriceMonitorItem(Item): class PriceLoader(ItemLoader): + default_item_class = PriceMonitorItem default_output_processor = TakeFirst() title_in = MapCompose(lambda x: x.strip()) diff --git a/scrapy_price_monitor/price_monitor/spiders/_base.py b/scrapy_price_monitor/price_monitor/spiders/_base.py index e726c9c..dfc2ad5 100644 --- a/scrapy_price_monitor/price_monitor/spiders/_base.py +++ b/scrapy_price_monitor/price_monitor/spiders/_base.py @@ -11,6 +11,6 @@ def start_requests(self): for name, urls in products.items(): for url in urls: if self.name in url: - now = datetime.now().strftime('%Y/%m/%d %H:%M:%S') + now = datetime.now().isoformat() item = {'product_name': name, 'retailer': self.name, 'when': now} yield scrapy.Request(url, meta={'item': item}) From 36c425127c6b461deb482299be4b08950a135555 Mon Sep 17 00:00:00 2001 From: further-reading Date: Fri, 19 Nov 2021 13:55:44 +0000 Subject: [PATCH 6/8] PR feedback --- scrapy_price_monitor/requirements.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scrapy_price_monitor/requirements.txt b/scrapy_price_monitor/requirements.txt index c176940..0f278d9 100644 --- a/scrapy_price_monitor/requirements.txt +++ b/scrapy_price_monitor/requirements.txt @@ -1,7 +1,9 @@ scrapy -boto -extruct -w3lib -jinja2 price-parser scrapinghub + +# sample alert script requirements +# replace with appropriate packages if not using SES email alerts +boto +w3lib +jinja2 \ No newline at end of file From af3270726c8b91a271ffd137b5d46eb39fb34281 Mon Sep 17 00:00:00 2001 From: further-reading Date: Fri, 19 Nov 2021 14:23:17 +0000 Subject: [PATCH 7/8] PR feedback --- scrapy_price_monitor/price_monitor/items.py | 6 +++--- .../price_monitor/spiders/books_toscrape.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scrapy_price_monitor/price_monitor/items.py b/scrapy_price_monitor/price_monitor/items.py index d1dcb20..a89e554 100644 --- a/scrapy_price_monitor/price_monitor/items.py +++ b/scrapy_price_monitor/price_monitor/items.py @@ -4,14 +4,14 @@ from price_parser import Price -class PriceMonitorItem(Item): +class Product(Item): url = Field() title = Field() price = Field() -class PriceLoader(ItemLoader): - default_item_class = PriceMonitorItem +class ProductLoader(ItemLoader): + default_item_class = Product default_output_processor = TakeFirst() title_in = MapCompose(lambda x: x.strip()) diff --git a/scrapy_price_monitor/price_monitor/spiders/books_toscrape.py b/scrapy_price_monitor/price_monitor/spiders/books_toscrape.py index d1fdc32..aec475b 100644 --- a/scrapy_price_monitor/price_monitor/spiders/books_toscrape.py +++ b/scrapy_price_monitor/price_monitor/spiders/books_toscrape.py @@ -1,5 +1,5 @@ from price_monitor.spiders._base import BaseSpider -from price_monitor.items import PriceLoader +from price_monitor.items import ProductLoader class BooksSpider(BaseSpider): @@ -7,7 +7,7 @@ class BooksSpider(BaseSpider): def parse(self, response): item = response.meta.get('item', {}) - loader = PriceLoader(item=item, response=response) + loader = ProductLoader(item=item, response=response) loader.add_value('url', response.url) loader.add_css('title', 'h1::text') loader.add_css('price', '.price_color::text') From 793aa322ed7b10dc8a7e99a246dcf073e5b3e6ed Mon Sep 17 00:00:00 2001 From: further-reading Date: Fri, 19 Nov 2021 14:40:34 +0000 Subject: [PATCH 8/8] Adding missing fields ---
scrapy_price_monitor/price_monitor/items.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scrapy_price_monitor/price_monitor/items.py b/scrapy_price_monitor/price_monitor/items.py index a89e554..03b4a5b 100644 --- a/scrapy_price_monitor/price_monitor/items.py +++ b/scrapy_price_monitor/price_monitor/items.py @@ -8,6 +8,8 @@ class Product(Item): url = Field() title = Field() price = Field() + product_name = Field() + retailer = Field() class ProductLoader(ItemLoader):