diff --git a/.gitignore b/.gitignore
index 894a44c..ea87a12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,9 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# idea
+.idea
+
+# cached rss
+/cache
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9b22182
--- /dev/null
+++ b/README.md
@@ -0,0 +1,49 @@
+# PythonHomework
+[Introduction to Python] Homework repository for EPAM courses
+
+## How to use
+1. Install git: `apt-get install git` (required because one dependency is installed from a git repository)
+2. `pip3 install .`
+3. `rss-reader "https://www.androidpolice.com/feed/" --limit 3 --json --verbose`
+
+## Important information
+`--to-pdf` conversion is unstable
+
+## Parameters
+- **--help** (show help text)
+- **--json** (print the RSS feed in JSON format)
+- **--verbose** (print verbose log messages)
+- **--limit** (limit the number of printed entries)
+- **--date** (print cached entries for the given date, if they exist)
+- **--to-html** (convert the RSS feed to an HTML document)
+- **--to-epub** (convert the RSS feed to an EPUB document)
+- **--to-pdf** (convert the RSS feed to a PDF document)
+- **--colorize** (colorize the output)
+
+## JSON structure
+`{"feed": "rss_title", "entries": [{"title": "title", "date": "date", "link": "link", "summary": "summary", "photos": [...], "links": [...]}, ...]}`
+
+## Storage
+[Pickle](https://docs.python.org/3/library/pickle.html) is used for storage
+
+Entries are cached in `cache/date/domain.rss`
+- cache - name of the cache folder, "cache" by default
+- date - script execution date
+- domain - domain of the RSS feed
+
+Example: `cache/20191117/www.androidpolice.com.rss`
+
+## Conversion
+
+Examples:
+- `--to-html folder_name` will create "out.html" and an "images" folder in folder_name
+- `--to-epub folder_name` will create "out.epub" in folder_name
+- `--to-pdf folder_name` will create "out.pdf" in folder_name (*UNSTABLE*)
+
+## TODO
+- [x] [Iteration 1] One-shot command-line RSS reader
+- [x] [Iteration 2] Distribution
+- [x] [Iteration 3] News caching
+- [x] [Iteration 4] Format converter
+- [x] * [Iteration 5] Output colorization
+- [ ] * [Iteration 6] Web-server
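The cache format described in the Storage section above is plain pickle, so a cached day can be inspected by hand. A minimal sketch, assuming a feed was fetched on 2019-11-17 (the path mirrors the README's example; `_save_rss_in_file` in rss_reader.py below pickles a `(title, raw_entries)` tuple):

```python
import pickle
from pathlib import Path

# illustrative path: cache/<YYYYMMDD>/<domain>.rss
cache_file = Path("cache") / "20191117" / "www.androidpolice.com.rss"

with open(cache_file, "rb") as file:
    # _save_rss_in_file() stores a (title, raw_entries) tuple
    title, raw_entries = pickle.load(file)

print(title, len(raw_entries))
```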
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..121f9ba
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+feedparser>=6.0.0b1  # rss parsing
+requests  # http requests
+bs4  # for xml and html
+colorama  # colored output https://pypi.org/project/colorama/
+jinja2  # for generating html
+git+https://github.com/xhtml2pdf/xhtml2pdf.git
+ebooklib
\ No newline at end of file
diff --git a/rss_reader/__init__.py b/rss_reader/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rss_reader/__main__.py b/rss_reader/__main__.py
new file mode 100644
index 0000000..b6fdcea
--- /dev/null
+++ b/rss_reader/__main__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+
+"""Package entry point."""
+
+
+from rss_reader.rss_reader import main
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
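Because the package ships a `__main__.py`, the reader can also be launched as a module once installed: `python -m rss_reader "https://www.androidpolice.com/feed/" --limit 1` (the URL is simply the README's example feed) behaves the same as the `rss-reader` console script defined in setup.py below.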
diff --git a/rss_reader/converter.py b/rss_reader/converter.py
new file mode 100644
index 0000000..747be88
--- /dev/null
+++ b/rss_reader/converter.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+
+"""
+Converts an RSS feed to HTML/PDF/EPUB
+"""
+import copy
+import logging
+import shutil
+import random
+from pathlib import Path
+
+import requests
+from xhtml2pdf import pisa
+from jinja2 import Template
+from bs4 import BeautifulSoup
+from ebooklib import epub
+
+from rss_reader.exceptions import RSSFeedException
+
+
+class Converter:
+    """ Class for converting an RSS feed
+
+    Attributes:
+        title (str): Title of the RSS feed
+        entries (list): List of RSS news
+        out_dir (str): Directory where the output will be saved
+    """
+
+    def __init__(self, title, entries, out_dir="out", image_dir="images", temp_image_dir="_temp_images"):
+        self.title = title
+        self.entries = entries
+        self.out_dir = out_dir
+
+        self.image_dir = image_dir
+        self.temp_image_dir = temp_image_dir
+
+        self.font_path = Path(__file__).resolve().parent / 'fonts/Roboto-Regular.ttf'
+
+    def _create_directories(self, image_dir):
+        """ Create directories if they do not exist (self.out_dir and self.out_dir/image_dir) """
+        if not Path(self.out_dir).is_dir():
+            logging.info("Creating directory /%s", Path(self.out_dir))
+            Path(self.out_dir).mkdir(parents=True, exist_ok=True)
+
+        if not image_dir.is_dir():
+            logging.info("Creating directory /%s", image_dir)
+            image_dir.mkdir(parents=True, exist_ok=True)
+
+    def _download_image(self, url, image_dir):
+        """ Download an image into self.out_dir/image_dir
+
+        Returns:
+            filename: image name
+        """
+        logging.info("Starting image download")
+
+        image_dir = Path(self.out_dir) / image_dir
+
+        try:
+            self._create_directories(image_dir)
+        except OSError:
+            raise RSSFeedException(message="Cannot create directory")
+
+        filename = url.split('/')[-1]
+        response = requests.get(url, allow_redirects=True)
+
+        with open(image_dir / filename, 'wb') as handler:
+            handler.write(response.content)
+
+        return filename
+
+    def _replace_urls_to_local_path(self, entry):
+        """ Replace img URLs in entry.summary with local file paths
+
+        Args:
+            entry (dict): News dict
+        """
+        soup = BeautifulSoup(entry.summary, "html.parser")
+
+        for img in soup.findAll('img'):
+            # an empty src means there is nothing to download: use the bundled placeholder
+            if not img['src']:
+                # copy the placeholder to self.out_dir/self.image_dir
+                filename = Path(__file__).resolve().parent / 'placeholder/placeholder.jpg'
+                shutil.copyfile(filename, Path(self.out_dir) / self.image_dir / 'placeholder.jpg')
+                img['src'] = str(Path(self.image_dir) / 'placeholder.jpg')
+                entry.summary = str(soup)
+                return entry
+
+            filename = self._download_image(img['src'], self.image_dir)
+            downloaded_img_local_path = Path(self.image_dir) / filename
+
+            img['src'] = str(downloaded_img_local_path)
+            entry.summary = str(soup)
+
+        return entry
+
+    def _replace_urls_to_absolute_path(self, entry):
+        """ Replace img URLs in entry.summary with absolute local file paths
+
+        Special for xhtml2pdf (xhtml2pdf supports only absolute file paths)
+
+        Args:
+            entry (dict): News dict
+        """
+        soup = BeautifulSoup(entry.summary, "html.parser")
+
+        for img in soup.findAll('img'):
+            # an empty src means there is nothing to download: use the bundled placeholder
+            if not img['src']:
+                filename = Path(__file__).resolve().parent / 'placeholder/placeholder.jpg'
+                img['src'] = str(filename.absolute())
+                entry.summary = str(soup)
+                return entry
+
+            filename = self._download_image(img['src'], self.temp_image_dir)
+            downloaded_img_absolute_path = (Path(self.out_dir) / self.temp_image_dir / filename).absolute()
+
+            img['src'] = str(downloaded_img_absolute_path)
+            entry.summary = str(soup)
+
+        return entry
+
+    def _generate_html(self, is_cyrillic_font=False, is_absolute_path=False):
+        """ Generate HTML
+
+        Args:
+            is_cyrillic_font (bool): Should we generate HTML with a cyrillic font (to convert to PDF)?
+            is_absolute_path (bool): Should we generate HTML with absolute image paths (to convert to PDF)?
+
+        Returns:
+            html: String with HTML code
+        """
+        template = '''<!DOCTYPE html>
+        <html>
+        <head>
+            <meta charset="utf-8">
+            <title>{{title}}</title>
+            <style>
+                {% if is_cyrillic_font %}
+                @font-face {font-family: Roboto; src: url("{{font_path}}");}
+                * {font-family: Roboto;}
+                {% endif %}
+            </style>
+        </head>
+        <body>
+        {% for entry in entries %}
+            <div>
+                <h2>{{entry.title}}</h2>
+                <p>{{entry.published}}</p>
+                <a href="{{entry.link}}">{{entry.link}}</a>
+                <div>{{entry.summary}}</div>
+            </div>
+        {% endfor %}
+        </body>
+        </html>'''
+
+        # replace image urls with the paths of the downloaded images
+        temp_entries = copy.deepcopy(self.entries)
+        if is_absolute_path:
+            entries = [self._replace_urls_to_absolute_path(entry) for entry in temp_entries]
+        else:
+            entries = [self._replace_urls_to_local_path(entry) for entry in temp_entries]
+
+        html = Template(template).render(title=self.title, entries=entries,
+                                         is_cyrillic_font=is_cyrillic_font, font_path=self.font_path)
+        return html
+
+    def entries_to_html(self):
+        """ Generate an HTML file in self.out_dir """
+        html = self._generate_html()
+
+        with open(Path(self.out_dir) / 'out.html', 'w') as file_object:
+            file_object.write(html)
+
+    def entries_to_pdf(self):
+        """ Generate a PDF file in self.out_dir """
+        html = self._generate_html(is_cyrillic_font=True, is_absolute_path=True)
+
+        with open(Path(self.out_dir) / 'out.pdf', 'w+b') as file:
+            pdf = pisa.CreatePDF(html, dest=file, encoding='UTF-8')
+
+        # delete the temp folder (self.out_dir/self.temp_image_dir)
+        temp_img_dir = Path(self.out_dir) / self.temp_image_dir
+        logging.info("Cleaning up %s", temp_img_dir)
+        shutil.rmtree(temp_img_dir)
+
+        if pdf.err:
+            raise RSSFeedException(message="Error during PDF generation")
+
+    def entries_to_epub(self):
+        """ Generate an EPUB file in self.out_dir """
+        html = self._generate_html()
+
+        def add_images_to_book():
+            soup = BeautifulSoup(chapter.content, "html.parser")
+            image_urls = [img['src'] for img in soup.findAll('img') if img.has_attr('src')]
+
+            added_images = []
+            for image_url in image_urls:
+                # images can repeat, so skip the ones that were already added
+                if image_url in added_images:
+                    continue
+
+                added_images.append(image_url)
+                img_local_filename = Path(self.out_dir) / image_url
+
+                with open(img_local_filename, 'br') as file_object:
+                    epimg = epub.EpubImage()
+                    epimg.file_name = image_url
+                    epimg.set_content(file_object.read())
+
+                    book.add_item(epimg)
+
+        book = epub.EpubBook()
+
+        # set metadata
+        book.set_identifier(f'id{random.randint(100000, 999999)}')
+        book.set_title(self.title)
+        book.set_language('en, ru')
+        book.add_author('rss-reader')
+
+        # create a chapter
+        chapter = epub.EpubHtml(title='Intro', file_name='chap_01.xhtml', lang='en, ru')
+        chapter.content = html
+        # add images
+        add_images_to_book()
+        # add the chapter
+        book.add_item(chapter)
+
+        # define the table of contents
+        book.toc = (epub.Link('chap_01.xhtml', 'Introduction', 'intro'),
+                    (epub.Section(self.title),
+                     (chapter,))
+                    )
+        # add the default NCX and Nav files
+        book.add_item(epub.EpubNcx())
+        book.add_item(epub.EpubNav())
+        # define a CSS style
+        style = 'BODY {color: white;}'
+        nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
+        # add the CSS file
+        book.add_item(nav_css)
+        # basic spine
+        book.spine = ['nav', chapter]
+
+        # write the file
+        epub.write_epub(Path(self.out_dir) / 'out.epub', book, {})
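`Converter` is normally driven by `RSSFeed.convert_to` (defined in rss_reader.py below), but it can also be used on its own. A minimal sketch, assuming network access and a live feed (the URL is the README's example feed; the limit of 3 is arbitrary):

```python
import feedparser

from rss_reader.converter import Converter

# fetch some entries; Converter expects feedparser entries carrying
# title/published/link/summary attributes, as convert_to passes raw_entries
rss = feedparser.parse("https://www.androidpolice.com/feed/")

converter = Converter(title=rss['feed']['title'], entries=rss.entries[:3], out_dir="out")
converter.entries_to_html()  # writes out/out.html and downloads images into out/images/
converter.entries_to_epub()  # writes out/out.epub
```

`entries_to_pdf()` follows the same pattern but, as the README warns, PDF conversion is unstable.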
diff --git a/rss_reader/exceptions.py b/rss_reader/exceptions.py
new file mode 100644
index 0000000..8f60089
--- /dev/null
+++ b/rss_reader/exceptions.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+"""
+Exceptions for rss-reader
+"""
+
+
+class RSSFeedException(Exception):
+    """ Custom exception class for RSSFeed errors """
+
+    def __init__(self, message):
+        super(RSSFeedException, self).__init__(message)
+        self.message = message
diff --git a/rss_reader/fonts/Roboto-Regular.ttf b/rss_reader/fonts/Roboto-Regular.ttf
new file mode 100644
index 0000000..2b6392f
Binary files /dev/null and b/rss_reader/fonts/Roboto-Regular.ttf differ
diff --git a/rss_reader/placeholder/placeholder.jpg b/rss_reader/placeholder/placeholder.jpg
new file mode 100644
index 0000000..05a274f
Binary files /dev/null and b/rss_reader/placeholder/placeholder.jpg differ
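`RSSFeedException` carries its text in a `.message` attribute, which `main()` in rss_reader.py below prints. A minimal sketch of the same handling for embedding callers (the URL is illustrative; whether `get_rss` raises here depends on feedparser flagging the fetched document):

```python
from rss_reader.rss_reader import RSSFeed
from rss_reader.exceptions import RSSFeedException

try:
    feed = RSSFeed(source="https://example.com")  # an HTML page, not an RSS feed
    feed.get_rss(date=None)
except RSSFeedException as ex:
    # get_rss raises with message="Incorrect url" when feedparser sets its bozo flag
    print(ex.message)
```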
diff --git a/rss_reader/rss_reader.py b/rss_reader/rss_reader.py
new file mode 100644
index 0000000..12f01c3
--- /dev/null
+++ b/rss_reader/rss_reader.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+
+"""
+Simple RSS reader
+"""
+
+__author__ = "DiSonDS"
+__version__ = "0.5.0"
+__license__ = "MIT"
+
+import os
+import sys
+import time
+import copy
+import json
+import pickle
+import logging
+import argparse
+from pathlib import Path
+from datetime import datetime
+from urllib.parse import urlparse
+
+import feedparser
+import requests
+import bs4
+from colorama import Fore
+
+from rss_reader.exceptions import RSSFeedException
+from rss_reader.converter import Converter
+
+
+class RSSFeed:
+    """ Class for an RSS feed
+
+    Attributes:
+        source (str): URL of the RSS feed
+        title (str): Title of the RSS feed
+        entries (list): List of pretty RSS news
+        raw_entries (list): List of raw RSS news
+        cache_dir (str): Main directory of cache files
+    """
+
+    def __init__(self, source, cache_dir="cache"):
+        self.source = source
+        self.title = None
+        self.entries = None
+        self.raw_entries = None
+        self.cache_dir = Path(cache_dir)
+
+    def _save_rss_in_file(self):
+        """ Save the rss feed to cache/date/domain.rss """
+        logging.info("Saving rss feed")
+
+        date_dir = Path(datetime.now().strftime('%Y%m%d'))
+        cache_file_dir = self.cache_dir / date_dir
+        if not cache_file_dir.is_dir():
+            logging.info("Creating directory /%s", cache_file_dir)
+            os.makedirs(cache_file_dir)
+
+        uri = urlparse(self.source)
+        file_name = f"{uri.netloc}.rss"
+
+        cache_file_path = cache_file_dir / file_name
+        with open(cache_file_path, "wb") as file:
+            logging.info("Saving entries in file %s", cache_file_path)
+            pickle.dump((self.title, self.raw_entries), file)
+
+    def _load_rss_from_file(self, date):
+        """ Load the rss feed from cache/date/domain.rss """
+        logging.info("Loading rss feed")
+
+        date_dir = Path(str(date))
+        uri = urlparse(self.source)
+        file_name = f"{uri.netloc}.rss"
+
+        cache_file_path = self.cache_dir / date_dir / file_name
+        if not cache_file_path.is_file():
+            raise RSSFeedException(message=f"There are no entries for {date}")
+
+        with open(cache_file_path, "rb") as file:
+            logging.info("Loading entries from file %s", cache_file_path)
+            self.title, self.raw_entries = pickle.load(file)
+            self.entries = self._get_pretty_entries()
+
+    def _get_entries_in_json(self, entries):
+        """ Convert entries to json """
+        logging.info("Converting rss feed to json")
+        return json.dumps({"feed": self.title, "entries": entries},
+                          indent=2, ensure_ascii=False)
+
+    def _get_pretty_entries(self):
+        """ Prettify entries
+
+        Removes HTML code from the summary; parses date, photos, links
+        """
+        pretty_entries = []
+        for entry in self.raw_entries:
+            summary_html = bs4.BeautifulSoup(entry.summary, "html.parser")
+            images = [img['src'] for img in summary_html.findAll('img') if img.has_attr('src')]
+            links = [link['href'] for link in entry.links]
+
+            pretty_entries.append({
+                "title": entry.title,
+                "date": time.strftime('%Y-%m-%dT%H:%M:%SZ', entry.published_parsed),
+                "link": entry.link,
+                "summary": summary_html.text.strip(),
+                "photos": images,
+                "links": links
+            })
+        return pretty_entries
+
+    def get_rss(self, date):
+        """ Get the rss feed from the source or from the cache """
+        logging.info("Getting rss feed")
+
+        if date:
+            self._load_rss_from_file(date)
+            return
+
+        response = requests.get(self.source).text
+        rss = feedparser.parse(response)
+        if rss['bozo']:
+            raise RSSFeedException(message="Incorrect url")
+
+        self.title = rss['feed']['title']
+        self.raw_entries = rss.entries
+        self.entries = self._get_pretty_entries()
+
+        self._save_rss_in_file()
+
+    def print_rss(self, limit=None, is_json=False, colorize=False):
+        """ Print the rss feed """
+
+        if limit:
+            entries = self.entries[:limit]
+        else:
+            entries = self.entries
+
+        logging.info("Printing rss feed")
+
+        if is_json:
+            entries = self._get_entries_in_json(entries)
+            print(entries)
+        else:
+            if colorize:
+                print(f"{Fore.RED}Feed:{Fore.RESET} {self.title}\n")
+                for entry in entries:
+                    print(f"{Fore.GREEN}Title:{Fore.RESET} {entry['title']}\n"
+                          f"{Fore.MAGENTA}Date:{Fore.RESET} {entry['date']}\n"
+                          f"{Fore.BLUE}Link:{Fore.RESET} {entry['link']}\n\n"
+                          f"{entry['summary']}\n\n"
+                          f"{Fore.YELLOW}Photos:{Fore.RESET} {', '.join(entry['photos'])}\n"
+                          f"{Fore.CYAN}Links:{Fore.RESET} {', '.join(entry['links'])}\n\n")
+            else:
+                print(f"Feed: {self.title}\n")
+                for entry in entries:
+                    print(f"Title: {entry['title']}\n"
+                          f"Date: {entry['date']}\n"
+                          f"Link: {entry['link']}\n\n"
+                          f"{entry['summary']}\n\n"
+                          f"Photos: {', '.join(entry['photos'])}\n"
+                          f"Links: {', '.join(entry['links'])}\n\n")
+
+    def convert_to(self, to_html, to_pdf, to_epub, limit):
+        """ Convert the rss feed to the requested format (HTML/PDF/EPUB) """
+        converter = Converter(title=self.title, entries=copy.deepcopy(self.raw_entries[:limit]))
+        if to_html:
+            logging.info("Converting RSS to HTML")
+            converter.out_dir = to_html
+            converter.entries_to_html()
+        if to_pdf:
+            logging.info("Converting RSS to PDF")
+            converter.out_dir = to_pdf
+            converter.entries_to_pdf()
+        if to_epub:
+            logging.info("Converting RSS to EPUB")
+            converter.out_dir = to_epub
+            converter.entries_to_epub()
+
+        logging.info("Conversion successful.")
+
+
+def get_args():
+    """ Parse and return the provided args """
+    parser = argparse.ArgumentParser(description="Pure Python command-line RSS reader.")
+
+    parser.add_argument("source", help="rss url")
+    parser.add_argument(
+        "--version",
+        action="version",
+        version="%(prog)s (version {version})".format(version=__version__))
+    parser.add_argument("-j", "--json", action="store_true", help="Print result as JSON in stdout")
+    parser.add_argument(
+        "--verbose",
+        action="count",
+        default=False,
+        help="Outputs verbose status messages")
+    parser.add_argument("-l", "--limit", action="store", type=int, dest="limit",
+                        help="Limit news topics if this parameter is provided")
+    parser.add_argument("-d", "--date", action="store", type=int, dest="date",
+                        help="Try to get cached news for DATE if this parameter is provided.")
+    parser.add_argument("-c", "--colorize", action="store_true",
+                        help="Print colorized result in stdout")
+    parser.add_argument("--to-html", action="store", type=str,
+                        help="Generate TO_HTML/out.html with news")
+    parser.add_argument("--to-pdf", action="store", type=str,
+                        help="Generate TO_PDF/out.pdf with news")
+    parser.add_argument("--to-epub", action="store", type=str,
+                        help="Generate TO_EPUB/out.epub with news")
+
+    return parser.parse_args()
+
+
+def main():
+    """ Main entry point of the app """
+
+    args = get_args()
+
+    if args.verbose:
+        logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG)
+        logging.info("Verbose output.")
+    else:
+        logging.basicConfig(format="%(levelname)s: %(message)s")
+
+    try:
+        feed = RSSFeed(source=args.source)
+        feed.get_rss(date=args.date)
+        feed.print_rss(limit=args.limit, is_json=args.json, colorize=args.colorize)
+        feed.convert_to(to_html=args.to_html, to_pdf=args.to_pdf, to_epub=args.to_epub, limit=args.limit)
+    except RSSFeedException as ex:
+        print(f"{ex.message}")
+        sys.exit(0)
+    finally:
+        logging.info("Exiting")
+
+
+if __name__ == "__main__":
+    main()
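`main()` above is a thin wrapper; the same flow can be scripted against `RSSFeed` directly. A minimal sketch, assuming the README's example feed, and (for the second part) that a cache for the illustrative date `20191117` already exists:

```python
from rss_reader.rss_reader import RSSFeed

feed = RSSFeed(source="https://www.androidpolice.com/feed/")
feed.get_rss(date=None)        # fetches, prettifies and caches under cache/YYYYMMDD/
feed.print_rss(limit=3, is_json=True)

# a later run can replay that day's cache instead of hitting the network
cached = RSSFeed(source="https://www.androidpolice.com/feed/")
cached.get_rss(date=20191117)  # raises RSSFeedException if nothing is cached for that date
```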
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..5c60bcb
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,32 @@
+from os import path
+from setuptools import setup, find_packages
+
+HERE = path.abspath(path.dirname(__file__))
+with open(path.join(HERE, 'README.md'), encoding='utf-8') as f:
+    LONG_DESCRIPTION = f.read()
+
+setup(
+    name='rss-reader',
+    version='0.5.0',
+    description='A simple Python 3.8 RSS reader',
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type='text/markdown',
+    url='https://github.com/introduction-to-python-bsuir-2019/PythonHomework',
+    author='DiSonDS',
+    author_email='dison.ds@gmail.com',
+    keywords='simple rss reader',
+    packages=find_packages(),
+    package_data={'rss_reader': ['fonts/*.ttf', 'placeholder/*.jpg']},
+    python_requires='>=3.8',
+    install_requires=['feedparser>=6.0.0b1', 'requests', 'bs4', 'colorama', 'jinja2', 'ebooklib',
+                      'xhtml2pdf @ git+https://github.com/xhtml2pdf/xhtml2pdf/'],
+    entry_points={
+        'console_scripts': [
+            'rss-reader=rss_reader.__main__:main',
+        ],
+    },
+    project_urls={
+        'Bug Reports': 'https://github.com/introduction-to-python-bsuir-2019/PythonHomework/issues',
+        'Source': 'https://github.com/introduction-to-python-bsuir-2019/PythonHomework',
+    },
+)
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
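Given the `console_scripts` entry point above, `pip3 install .` (as in the README) exposes the `rss-reader` command; for development, an editable install via `pip3 install -e .` is a common alternative. Either way, `rss-reader --version` should report `rss-reader (version 0.5.0)`.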
diff --git a/test/test_converter.py b/test/test_converter.py
new file mode 100644
index 0000000..038c189
--- /dev/null
+++ b/test/test_converter.py
@@ -0,0 +1,66 @@
+from pathlib import Path
+from unittest import TestCase
+from unittest.mock import MagicMock
+
+from rss_reader.converter import Converter
+
+
+class TestConverter(TestCase):
+
+    def setUp(self):
+        title = "test"
+        entries = MagicMock()
+        out_dir = "_test"
+        self.converter = Converter(title, entries, out_dir)
+
+    # def test__create_directories(self):
+    #     image_dir = "test"
+    #     self.converter._create_directories(image_dir)
+    #     self.fail()
+
+    # def test__download_image(self):
+    #     self.fail()
+
+    def test__replace_urls_to_local_path(self):
+        self.converter._download_image = MagicMock(return_value="test.jpg")
+        mock_entry = MagicMock()
+        # any non-empty img src works here; it is replaced through the mocked _download_image
+        mock_entry.summary = 'Фото: Дарья Бурякина' \
+                             'Первый этап Кубка мира по биатлону продолжится женской спринтерской гонкой на 7,5 км.' \
+                             '<img src="https://example.com/test.jpg">'
+        mock_entry_replaced = MagicMock()
+        image_path = Path(self.converter.image_dir) / 'test.jpg'
+        mock_entry_replaced.summary = f'Фото: Дарья БурякинаПервый этап Кубка мира по биатлону продолжится женской спринтерской гонкой' \
+                                      f' на 7,5 км.<img src="{image_path}"/>'
+        entry = self.converter._replace_urls_to_local_path(mock_entry)
+        self.assertEqual(mock_entry_replaced.summary, entry.summary)
+
+    def test__replace_urls_to_absolute_path(self):
+        self.converter._download_image = MagicMock(return_value="test.jpg")
+        mock_entry = MagicMock()
+        # any non-empty img src works here; it is replaced through the mocked _download_image
+        mock_entry.summary = 'Фото: Дарья Бурякина' \
+                             'Первый этап Кубка мира по биатлону продолжится женской спринтерской гонкой на 7,5 км.' \
+                             '<img src="https://example.com/test.jpg">'
+        mock_entry_replaced = MagicMock()
+        image_path = (Path(self.converter.out_dir) / self.converter.temp_image_dir / 'test.jpg').absolute()
+        mock_entry_replaced.summary = f'Фото: Дарья БурякинаПервый этап Кубка мира по биатлону продолжится женской спринтерской гонкой' \
+                                      f' на 7,5 км.<img src="{image_path}"/>'
+        entry = self.converter._replace_urls_to_absolute_path(mock_entry)
+        self.assertEqual(mock_entry_replaced.summary, entry.summary)
+
+    # def test__generate_html(self):
+    #     self.fail()
+
+    # def test_entries_to_html(self):
+    #     self.fail()
+    #
+    # def test_entries_to_pdf(self):
+    #     self.fail()
+    #
+    # def test_entries_to_epub(self):
+    #     self.fail()
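The test package uses plain `unittest`, so from the repository root something like `python -m unittest discover` (or `python -m unittest test.test_converter`) should pick it up; the remaining `Converter` methods are, for now, covered only by the commented-out stubs above.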