diff --git a/final_task/LICENSE b/final_task/LICENSE
new file mode 100644
index 0000000..56e93f0
--- /dev/null
+++ b/final_task/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2019 The Python Packaging Authority
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/final_task/MANIFEST.in b/final_task/MANIFEST.in
new file mode 100644
index 0000000..540b720
--- /dev/null
+++ b/final_task/MANIFEST.in
@@ -0,0 +1 @@
+include requirements.txt
\ No newline at end of file
diff --git a/final_task/README.md b/final_task/README.md
new file mode 100644
index 0000000..06e5a52
--- /dev/null
+++ b/final_task/README.md
@@ -0,0 +1,44 @@
+##### JSON structure
+
+```
+{
+  "news": {
+    "feed": "Yahoo News - Latest News & Headlines",
+    "publications": [
+      {
+        "title": "Stefanik embraces spotlight at impeachment hearings",
+        "pub_date": "Fri, 15 Nov 2019 17:55:51 -0500",
+        "link": "https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html",
+        "description": "[image 2: Stefanik embraces spotlight at impeachment hearings] [2]\nThe second day of the impeachment inquiry\u2019s public hearings, on Friday, began the same way\nas the first: with an attempt by Rep. Elise Stefanik, a New York Republican, to interrupt proceedings\nwith a procedural objection.",
+        "hrefs": [
+          [
+            "https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html",
+            "link"
+          ],
+          [
+            "http://l.yimg.com/uu/api/res/1.2/NRuDo56c6EiwjZH4WOqEZg--/YXBwaWQ9eXRhY2h5b247aD04Njt3PTEzMDs-/https://media-mbst-pub-ue1.s3.amazonaws.com/creatr-uploaded-images/2019-11/7a1d0760-07d6-11ea-bef7-f17150574bb2",
+            "image",
+            "Stefanik embraces spotlight at impeachment hearings"
+          ]
+        ]
+      }
+    ]
+  }
+}
+```
+
+##### Cache description
+
+News received from a feed is cached in a database that is created locally.
+
+The database consists of a single file named "cache.db". It has the following structure:
+
+| | id | feed | title | pub_date | pub_parsed | link | description | hrefs |
+|-----|------|------|-------|----------|------------|------|-------------|-------|
+|post | .. | ... | ... | ... | ... | ... | ... | ... |
+
+All fields except "id" have text type. The "id" field serves as the post's primary key.
+
+The "hrefs" field holds all of a post's links, including image links and image descriptions.
+The regular references section and the image links section are separated by the --|-- sequence.
+Items within one section are separated by the -+- sequence, and -|- divides a link, its type
+and, for images, the description, as in the example below.
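For instance, the "hrefs" list from the JSON example above is serialized by `hrefs_to_text` (defined in cacher.py below) into the following single text value (one regular link, then one image link with its description):

```
-+-https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html-|-link--|--http://l.yimg.com/uu/api/res/1.2/NRuDo56c6EiwjZH4WOqEZg--/YXBwaWQ9eXRhY2h5b247aD04Njt3PTEzMDs-/https://media-mbst-pub-ue1.s3.amazonaws.com/creatr-uploaded-images/2019-11/7a1d0760-07d6-11ea-bef7-f17150574bb2-|-image-|-Stefanik embraces spotlight at impeachment hearings-+-
```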
diff --git a/final_task/requirements.txt b/final_task/requirements.txt
new file mode 100644
index 0000000..3f875a3
--- /dev/null
+++ b/final_task/requirements.txt
@@ -0,0 +1,4 @@
+feedparser
+bs4
+fpdf
+requests
\ No newline at end of file
diff --git a/final_task/rss_reader/__init__.py b/final_task/rss_reader/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/final_task/rss_reader/cacher.py b/final_task/rss_reader/cacher.py
new file mode 100644
index 0000000..0d6f754
--- /dev/null
+++ b/final_task/rss_reader/cacher.py
@@ -0,0 +1,93 @@
+"""
+this module provides tools for caching news
+
+it includes functions for working with the database and supporting helpers
+"""
+
+import sqlite3
+from re import match
+
+def init_database():
+    """
+    this function creates and initializes the database for caching news
+    """
+    connection_obj = sqlite3.connect('cache.db')
+    cursor_obj = connection_obj.cursor()
+    cursor_obj.execute(
+        '''CREATE TABLE IF NOT EXISTS cache (id integer primary key, feed text, title text, pub_date text, pub_parsed text, link text, description text, hrefs text)'''
+    )
+    connection_obj.commit()
+
+    return connection_obj, cursor_obj
+
+def cache_news(connection_obj, cursor_obj, news):
+    """
+    this function adds parsed news to the database, skipping posts that are already cached
+    """
+    for post in news:
+        cursor_obj.execute(
+            '''SELECT id FROM cache WHERE feed=? AND title=? AND pub_date=? AND pub_parsed=? AND link=? AND description=? AND hrefs=?''',
+            (post['feed'], post['title'], post['pub_date'], post['pub_parsed'], post['link'], post['description'], hrefs_to_text(post['hrefs']))
+        )
+        if cursor_obj.fetchone() is None:
+            cursor_obj.execute(
+                '''INSERT INTO cache (feed, title, pub_date, pub_parsed, link, description, hrefs) VALUES (?, ?, ?, ?, ?, ?, ?)''',
+                (post['feed'], post['title'], post['pub_date'], post['pub_parsed'], post['link'], post['description'], hrefs_to_text(post['hrefs']))
+            )
+    connection_obj.commit()
+
+    return
+
+def get_cached_news(cursor_obj, date):
+    """
+    this function fetches news from the database and returns them as a list
+    """
+    cursor_obj.execute('''SELECT * FROM cache WHERE pub_parsed=?''', (date, ))
+    rows = cursor_obj.fetchall()
+
+    news = []
+    for row in rows:
+        data = {}
+        data['feed'] = row[1]
+        data['title'] = row[2]
+        data['pub_date'] = row[3]
+        data['pub_parsed'] = row[4]
+        data['link'] = row[5]
+        data['description'] = row[6]
+
+        hrefs = row[7].split("--|--")
+        try:
+            data['hrefs'] = [tuple(item.split("-|-")) for item in hrefs[0].split("-+-") if item != '']
+            data['hrefs'] += [tuple(item.split("-|-")) for item in hrefs[1].split("-+-") if item != '']
+        except IndexError:
+            pass
+        news.append(data)
+
+    return news
+
+def hrefs_to_text(link_list):
+    """
+    this function converts the list of links attached to a post into text form
+    """
+    res_line = ''
+    ind = -1
+    for tpl in link_list:
+        if tpl[1] != 'image':
+            res_line += f"-+-{tpl[0]}-|-{tpl[1]}"
+        else:
+            res_line += '--|--'
+            ind = link_list.index(tpl)
+            break
+
+    if ind != -1:
+        for tpl in link_list[ind:]:
+            res_line += f"{tpl[0]}-|-{tpl[1]}-|-{tpl[2]}-+-"
+
+    return res_line
+
+def is_valid_date(line):
+    """
+    this function checks that a date parameter matches the expected YYYYMMDD format
+    """
+    date = r"^[1-2][0-9]{3}[0-1][0-9][0-3][0-9]$"
+    return match(date, line)
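A minimal round-trip through this cache might look like the sketch below; the sample post dict is hypothetical but mirrors the fields that `rss_reader.get_post_content` produces:

```python
import rss_reader.cacher as cacher

connection, cursor = cacher.init_database()

# a hypothetical post dict carrying the fields that cache_news expects
post = {
    'feed': 'Example Feed',
    'title': 'Example title',
    'pub_date': 'Fri, 15 Nov 2019 17:55:51 -0500',
    'pub_parsed': '20191115',
    'link': 'https://example.com/post',
    'description': 'Example description',
    'hrefs': [('https://example.com/post', 'link')],
}

cacher.cache_news(connection, cursor, [post])        # inserts only if not cached yet
if cacher.is_valid_date('20191115'):                 # YYYYMMDD check
    cached = cacher.get_cached_news(cursor, '20191115')
    print(cached[0]['title'])                        # -> Example title
```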
diff --git a/final_task/rss_reader/format_converter.py b/final_task/rss_reader/format_converter.py
new file mode 100644
index 0000000..0731d14
--- /dev/null
+++ b/final_task/rss_reader/format_converter.py
@@ -0,0 +1,169 @@
+"""
+this module provides tools for converting news to html and pdf
+formats
+"""
+
+import os
+import shutil
+import requests
+from fpdf import FPDF
+
+def break_lines(text):
+    """
+    this function replaces '\n' with <br> tags
+    """
+    i = 0
+    while True:
+        try:
+            while text[i] != '\n':
+                i += 1
+            text = text[:i] + "<br>" + text[i + 1:]
+            i += 4
+        except IndexError:
+            break
+
+    return text
+
+def to_html(news, filepath):
+    """
+    this function prints news in html format to file
+    """
+    with open(filepath, "w", encoding='utf-8') as f:
+        f.write('''<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<title>rss_reader</title>
+</head>
+<body>
+<h1 align="center">Actual News</h1>
+''')
+        for post in news:
+            f.write(f"<h3>Feed: {post['feed']}</h3>\n")
+            f.write(f"<p><b>Title:</b> {post['title']}<br>\n")
+            f.write(f"<b>Publication date:</b> {post['pub_date']}<br>\n")
+            f.write(f"<b>Link:</b> <a href=\"{post['link']}\">{post['link']}</a></p>\n")
+            f.write(f"<p>{break_lines(post['description'])}</p>\n")
+            f.write("<p>Links:</p>\n<ul>\n")
+            for tpl in post['hrefs']:
+                if tpl[1] != 'image':
+                    f.write(f"<li><a href=\"{tpl[0]}\">{tpl[0]}</a> (link)</li>\n")
+                else:
+                    f.write(f"<li><img src=\"{tpl[0]}\" alt=\"{tpl[2]}\"> (image)</li>\n")
+            f.write("</ul>\n<hr>\n")
+        f.write('''</body>
+</html>''')
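A minimal usage sketch, assuming a news list in the shape that `rss_reader.get_post_content` produces (the stub post and file name below are hypothetical):

```python
from rss_reader.format_converter import to_html

news = [{
    'feed': 'Example Feed',
    'title': 'Example title',
    'pub_date': 'Fri, 15 Nov 2019 17:55:51 -0500',
    'link': 'https://example.com/post',
    'description': 'first line\nsecond line',
    'hrefs': [('https://example.com/post', 'link')],
}]
to_html(news, "news.html")  # writes a standalone page headed "Actual News"
```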
+
+class user_FPDF(FPDF):
+    """
+    a small FPDF subclass that adds the page number to each page footer
+    """
+    def footer(self):
+        self.set_y(-15)
+        self.cell(0, 10, txt=f"{self.page_no()}", align='R')
+
+def download_image(url, dest_filepath):
+    """
+    this function downloads an image from a url and saves it to a file
+    """
+    with open(dest_filepath, 'wb') as f:
+        response = requests.get(url, stream=True)
+        for block in response.iter_content(1024):
+            if not block:
+                break
+            f.write(block)
+
+def to_pdf(news, filepath):
+    """
+    this function prints news in pdf format to file
+    """
+    current_directory = os.getcwd()
+    final_directory = os.path.join(current_directory, "tmp_files")
+    if not os.path.exists(final_directory):
+        os.mkdir(final_directory)
+
+    pdf_obj = user_FPDF()
+    font_dir = os.path.join(final_directory, 'DejaVuSansCondensed.ttf')
+    with open(font_dir, "wb") as f:
+        f.write(requests.get("https://raw.github.com/prague15031939/font_storage/master/DejaVuSansCondensed.ttf").content)
+    pdf_obj.add_font('DejaVu', '', font_dir, uni=True)
+    image_id = 0
+
+    for ind, post in enumerate(news):
+        pdf_obj.add_page()
+        if ind == 0:
+            pdf_obj.set_font('Arial', style='B', size=16)
+            pdf_obj.cell(200, 15, txt='ACTUAL NEWS', align='C', ln=1)
+        pdf_obj.set_font('DejaVu', '', 12)
+        pdf_obj.cell(5, 5, txt="#")
+        pdf_obj.cell(180, 5, txt=f"Feed: {post['feed']}", ln=1)
+        pdf_obj.cell(200, 5, ln=1)
+        pdf_obj.cell(5, 5)
+        pdf_obj.multi_cell(180, 5, txt=f"Title: {post['title']}")
+        pdf_obj.cell(5, 5)
+        pdf_obj.cell(200, 5, txt=f"Publication date: {post['pub_date']}", ln=1)
+        pdf_obj.cell(5, 5)
+        pdf_obj.cell(10, 5, txt='Link: ')
+        pdf_obj.set_font('Arial', style='I', size=12)
+        pdf_obj.multi_cell(180, 5, txt=f"{post['link']}")
+        pdf_obj.set_font('DejaVu', '', 12)
+        pdf_obj.cell(200, 5, ln=1)
+        pdf_obj.cell(5, 5)
+        pdf_obj.multi_cell(200, 5, txt=f"{post['description']}")
+        pdf_obj.cell(200, 5, ln=1)
+        pdf_obj.cell(5, 5)
+        pdf_obj.cell(200, 5, txt="Links:", ln=1)
+
+        for index, tpl in enumerate(post['hrefs']):
+            pdf_obj.cell(10, 5)
+            if tpl[1] != 'image':
+                pdf_obj.set_font('DejaVu', '', 12)
+                pdf_obj.cell(7, 5, txt=f"[{index + 1}] ")
+                pdf_obj.set_font('Arial', style='I', size=12)
+                pdf_obj.multi_cell(170, 5, txt=f"{tpl[0]}")
+            else:
+                pdf_obj.set_font('DejaVu', '', 12)
+                pdf_obj.multi_cell(170, 5, txt=f"[{index + 1}] {tpl[2]}")
+                try:
+                    img_dir = os.path.join(final_directory, f"{image_id}.jpeg")
+                    download_image(tpl[0], img_dir)
+                    pdf_obj.image(img_dir, x=22, y=pdf_obj.get_y()+5, link=tpl[0])
+                    image_id += 1
+                except RuntimeError:
+                    pass
+
+    pdf_obj.output(filepath)
+    shutil.rmtree(final_directory)
diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py
new file mode 100644
index 0000000..7239654
--- /dev/null
+++ b/final_task/rss_reader/rss_reader.py
@@ -0,0 +1,223 @@
+"""
+main rss_reader module
+"""
+
+import sys
+import argparse
+import logging
+import html
+import json
+import feedparser
+import requests
+from bs4 import BeautifulSoup
+import rss_reader.cacher as cacher
+import rss_reader.format_converter as format_converter
+
+def init_cli_parser():
+    """
+    this function initializes the command line parser with all necessary arguments
+    """
+    parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader.', prog='rss-reader')
+    group_news = parser.add_mutually_exclusive_group(required=True)
+    group_format = parser.add_mutually_exclusive_group()
+    group_news.add_argument("source", type=str, nargs='?', default=None, help="RSS URL")
+    parser.add_argument('--version', help="print version info", action='version', version='%(prog)s 1.4')
+    group_format.add_argument("--json", help="print result as JSON in stdout", action="store_true")
+    parser.add_argument("--verbose", help="output verbose status messages", action="store_true")
+    group_news.add_argument("--date", type=str, help="print news with provided publish date in stdout")
+    group_format.add_argument("--to-html", type=str, help="print news in a specified file in html format", dest="html", metavar="FILE")
+    group_format.add_argument("--to-pdf", type=str, help="print news in a specified file in pdf format", dest="pdf", metavar="FILE")
+    parser.add_argument("--limit", type=int, help="limit news topics if this parameter is provided")
+
+    return parser.parse_args()
+
+def init_logger():
+    """
+    this function initializes a logger connected to a log file
+    """
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    file_handler = logging.FileHandler("rss_reader_logs.txt")
+    file_handler.setFormatter(logging.Formatter('%(asctime)s -- %(levelname)s -- %(message)s'))
+    logger.addHandler(file_handler)
+
+    return logger
+
+def brush_text(line):
+    """
+    this function wraps description text for output, inserting a line break roughly every 80 characters
+    """
+    start = 80
+    while True:
+        i = start - 10
+        try:
+            while line[i] != ' ':
+                i += 1
+        except IndexError:
+            break
+        line = line[:i] + "\n" + line[i + 1:]
+        start += 80
+
+    return line
+
+def get_post_content(post, feed_title):
+    """
+    this function fetches the necessary elements of a publication from a post
+    """
+    data = {}
+    data['feed'] = feed_title
+    data['title'] = html.unescape(post.title) if html.unescape(post.title) else '*no title*'
+    data['pub_date'] = post.published if post.published else '*no date*'
+    data['pub_parsed'] = \
+        f"{post.published_parsed.tm_year}{post.published_parsed.tm_mon:02d}{post.published_parsed.tm_mday:02d}" if \
+        data['pub_date'] != '*no date*' else '42'  # zero-padded YYYYMMDD, the format cacher.is_valid_date expects
+    data['link'] = post.link
+    soup = BeautifulSoup(post.description, 'html.parser')
+    data['description'] = html.unescape(soup.text) if html.unescape(soup.text) else '*no description*'
+    data['hrefs'] = [(link['href'], 'link') for link in soup.find_all('a') if link.get('href', None)]
+    for img in soup.find_all('img'):
+        if img.get('src', 'Unknown') != '':
+            data['hrefs'] += [(img.get('src', 'Unknown'), 'image', img.get('alt', ''))]
+            data['description'] = \
+                f"[image {len(data['hrefs'])}: {img.get('alt', '')}][{len(data['hrefs'])}] " + data['description']
+    data['description'] = brush_text(data['description'])
+
+    return data
+
+def parse_news(url):
+    """
+    this function parses news from the given url and returns them as a list of post dicts
+    """
+    feed = feedparser.parse(url)
+    if feed.bozo == 1:
+        raise ValueError
+
+    news = []
+    for post in feed.entries:
+        news += [get_post_content(post, feed.feed.title)]
+
+    return news
+
+def display_news(news):
+    """
+    this function prints news in stdout
+    """
+    if not news:
+        return None
+
+    for item in news:
+        print(f"Feed: {item['feed']}\n")
+        print(f"Title: {item['title']}")
+        print(f"Publication date: {item['pub_date']}")
+        print(f"Link: {item['link']}\n")
+        print(f"{item['description']}\n")
+        print("Links:")
+        for index, tpl in enumerate(item['hrefs']):
+            print(f"[{index + 1}] {tpl[0]} ({tpl[1]})")
+        print('\n')
+
+def to_json(news):
+    """
+    this function represents news in json format
+    """
+    for item in news:
+        del item['pub_parsed']
+
+    return json.dumps({'news': news}, indent=2)
+
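For a quick interactive check of the parsing pipeline (requires network access; the feed URL is only an example):

```python
from rss_reader.rss_reader import parse_news, to_json

news = parse_news("https://news.yahoo.com/rss/")
print(news[0]['title'])       # first post title
print(news[0]['pub_parsed'])  # e.g. '20191115', the key used for --date cache lookups
print(to_json(news[:1]))      # the same post in the JSON shape shown in the README
```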
+def main():
+    """
+    an entry point for the program
+    """
+    logger = init_logger()
+    args = init_cli_parser()
+    connection, cursor = cacher.init_database()
+
+    if args.verbose:
+        logger.addHandler(logging.StreamHandler(sys.stdout))
+        logger.info("verbose notifications are turned on")
+
+    if args.limit is not None:
+        if args.limit < 1:
+            if len(logger.handlers) == 1:
+                logger.addHandler(logging.StreamHandler(sys.stdout))
+            logger.error("error: invalid limit value\n")
+            return
+
+    if args.date:
+        try:
+            logger.info("checking date..")
+            if not cacher.is_valid_date(args.date):
+                raise ValueError
+            logger.info("started fetching data from cache..")
+            news = cacher.get_cached_news(cursor, args.date)
+            if len(news) == 0:
+                raise IndexError
+            news = news[:args.limit if args.limit else len(news)]
+        except ValueError:
+            if len(logger.handlers) == 1:
+                logger.addHandler(logging.StreamHandler(sys.stdout))
+            logger.error("error: invalid date\n")
+            return
+        except IndexError:
+            if len(logger.handlers) == 1:
+                logger.addHandler(logging.StreamHandler(sys.stdout))
+            logger.error("error: no news for this date\n")
+            return
+
+    if args.source:
+        logger.info(f"started fetching data (url - {args.source})..")
+        try:
+            news = parse_news(args.source)
+            logger.info("started caching data..")
+            cacher.cache_news(connection, cursor, news)
+            news = news[:args.limit if args.limit else len(news)]
+        except ValueError:
+            if len(logger.handlers) == 1:
+                logger.addHandler(logging.StreamHandler(sys.stdout))
+            logger.error("error: not well-formed xml or no access to the Internet\n")
+            return
+
+    if args.limit:
+        logger.info(f"the limit of publications to print - {args.limit}")
+
+    if not args.json and not args.html and not args.pdf:
+        logger.info("displaying news..")
+        display_news(news)
+    elif args.json:
+        logger.info("displaying news in json format..")
+        print(to_json(news))
+    elif args.html:
+        logger.info(f"writing news in {args.html} file in html format..")
+        try:
+            format_converter.to_html(news, args.html)
+        except (OSError, FileNotFoundError):
+            if len(logger.handlers) == 1:
+                logger.addHandler(logging.StreamHandler(sys.stdout))
+            logger.error("error: invalid directory\n")
+            return
+        logger.info(f"file {args.html} was successfully written\n")
+        return
+    elif args.pdf:
+        logger.info(f"writing news in {args.pdf} file in pdf format..")
+        try:
+            format_converter.to_pdf(news, args.pdf)
+        except requests.exceptions.ConnectionError:
+            if len(logger.handlers) == 1:
+                logger.addHandler(logging.StreamHandler(sys.stdout))
+            logger.error("error: no access to the Internet\n")
+            return
+        except (OSError, FileNotFoundError):
+            if len(logger.handlers) == 1:
+                logger.addHandler(logging.StreamHandler(sys.stdout))
+            logger.error("error: invalid directory\n")
+            return
+        logger.info(f"file {args.pdf} was successfully written\n")
+        return
+
+    logger.info(f"publications were successfully shown - {len(news)}\n")
+
+    return
+
+if __name__ == "__main__":
+    main()
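Typical invocations of the installed tool, matching the argument parser above (the feed URL is only an example):

```
$ rss-reader https://news.yahoo.com/rss/ --limit 2
$ rss-reader https://news.yahoo.com/rss/ --json
$ rss-reader --date 20191115 --to-pdf news.pdf
$ rss-reader --verbose https://news.yahoo.com/rss/ --to-html news.html
```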
diff --git a/final_task/setup.py b/final_task/setup.py
new file mode 100644
index 0000000..60b6b70
--- /dev/null
+++ b/final_task/setup.py
@@ -0,0 +1,30 @@
+import setuptools
+
+with open("README.md", "r") as f:
+    long_description = f.read()
+
+setuptools.setup(
+    name="rss-reader",
+    version="1.4",
+    author="Anton Pashkevich",
+    author_email="mario.lazer@mail.ru",
+    description="Pure Python command-line RSS reader",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    license='MIT',
+    url="https://github.com/prague15031939/PythonHomework",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    packages=["rss_reader"],
+    package_dir={"rss_reader": 'rss_reader'},
+    include_package_data=True,
+    python_requires='>=3.7',
+    install_requires=['feedparser', 'bs4', 'fpdf', 'requests'],
+    entry_points={
+        'console_scripts':
+            ["rss-reader = rss_reader.rss_reader:main"]
+    }
+)
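With this setup.py, a local install and smoke test might look like this, run from the final_task directory (the version string follows from the --version action defined above):

```
$ pip install .
$ rss-reader --version
rss-reader 1.4
```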