28 commits
64066b4
Init commit
InvokerAndrey Nov 10, 2019
3d5dc1d
Added argument parser
InvokerAndrey Nov 13, 2019
b291b7f
Added rss reader class and main function
InvokerAndrey Nov 13, 2019
2872722
Implemented --json argument processing
InvokerAndrey Nov 13, 2019
247aae4
Refactored invoke methods
InvokerAndrey Nov 13, 2019
4abd526
Implemented human-readable format
InvokerAndrey Nov 15, 2019
f14e675
Added --version argument
InvokerAndrey Nov 15, 2019
f1f134d
Implemented --verbose argument
InvokerAndrey Nov 16, 2019
5cb2846
Delete rss_reader.py
InvokerAndrey Nov 16, 2019
83343d7
Create README.md
InvokerAndrey Nov 16, 2019
d7eec79
Create requirements.txt
InvokerAndrey Nov 16, 2019
934e2da
Merge branch 'final_task' of https://github.com/BntuHater/PythonHomew…
InvokerAndrey Nov 16, 2019
5d1778d
Implemented [Iteration 2] Distribution
InvokerAndrey Nov 17, 2019
71ab39b
Implemented [Iteration 3] News caching
InvokerAndrey Nov 21, 2019
0ae55a5
Refactored [Iteration 3] News caching
InvokerAndrey Nov 21, 2019
10ab7b2
Now cached news display specifically from the transmitted URL
InvokerAndrey Nov 21, 2019
c5b01e9
Implemented --to-pdf argument
InvokerAndrey Nov 25, 2019
e4cdb8b
Implemented --to-pdf argument
InvokerAndrey Nov 27, 2019
f4f457d
Refactored code
InvokerAndrey Nov 27, 2019
304ddb9
fixed rss-reader --help
InvokerAndrey Nov 27, 2019
177135c
Implemented --to-html argument
InvokerAndrey Nov 28, 2019
656376a
added --to-html argument
InvokerAndrey Nov 28, 2019
47245d4
Included fonts for pdf into setup.py
InvokerAndrey Nov 29, 2019
bfe71bc
implemented couple exceptions
InvokerAndrey Nov 30, 2019
589dc8d
RSSException, --colorize, test_RSSReader
InvokerAndrey Dec 1, 2019
12f9051
refactored args
InvokerAndrey Dec 1, 2019
b2fce55
version 0.5.0
InvokerAndrey Dec 1, 2019
e7e4e7c
exception and requests
InvokerAndrey Dec 8, 2019
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -102,3 +102,6 @@ venv.bak/

# mypy
.mypy_cache/

# IDE
.idea
19 changes: 19 additions & 0 deletions LICENSE
@@ -0,0 +1,19 @@
Copyright (c) 2018 The Python Packaging Authority

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
49 changes: 49 additions & 0 deletions README.md
@@ -0,0 +1,49 @@
# PythonHomework
[Introduction to Python] Homework Repository

# How to use
* `pip install .`
* `rss-reader "https://news.yahoo.com/rss/" --limit 2 --json --to-pdf C:\Users\User_name\Desktop`
* `--date` prints cached news that were previously parsed from the given URL.
  Parsed news are saved to a `cache` folder as JSON files, one file per
  publication date (e.g. `20191125.json`).
* For the `--to-pdf` argument, specify the path to the folder where the
  `news.pdf`/`cached_news.pdf` file will be saved. The file is overwritten
  on every run, so copy it elsewhere if you need to keep it. The same applies
  to the `--to-html` argument. Note that the HTML output references images
  from the original websites, so they will not be displayed without an
  internet connection.
* Custom fonts are bundled for the `.pdf` output to avoid encoding issues;
  they should be installed correctly by `pip install .`.
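The cache naming scheme described above can be sketched as follows (a minimal illustration; the actual reader uses `dateutil` to parse arbitrary RSS date strings, while this sketch assumes the common RFC 822 format many feeds use):

```python
from datetime import datetime

def cache_file_name(published):
    """Map an RSS publication date to its cache file name, e.g. 20191125.json."""
    # Assumes the RFC 822 date format commonly used in RSS <pubDate> fields
    date = datetime.strptime(published, '%a, %d %b %Y %H:%M:%S %z')
    return date.strftime('%Y%m%d') + '.json'

print(cache_file_name('Mon, 25 Nov 2019 10:00:00 +0000'))  # 20191125.json
```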


# Parameters
* --help (show the help message and exit)
* --limit LIMIT (limit the number of news topics if this parameter is provided)
* --json (prints the result as JSON in stdout)
* --verbose (outputs verbose status messages)
* --version (prints version info)
* --date (takes a date in YYYYmmdd format, for example: --date 20191020.
  The news from the specified day will be printed out.
  If no news is found, an error is returned.)
* --to-pdf TO_PDF (takes the path of the directory where the new PDF file will be saved)
* --to-html TO_HTML (takes the path of the directory where the new HTML file will be saved)

# JSON structure
```
feed = {
    'Title': 'feed title',
    'Published': 'date',
    'Summary': 'news description',
    'Link': 'original link to news',
    'Url': 'url of rss feed',
    'Image': 'original link to the image'
}
```

# Progress
- [x] [Iteration 1] One-shot command-line RSS reader.
- [x] [Iteration 2] Distribution
- [x] [Iteration 3] News caching
- [x] [Iteration 4] Format converter
- [x] [Iteration 5] Output colorization
- [ ] [Iteration 6] Web-server
181 changes: 181 additions & 0 deletions app/RSSReader.py
@@ -0,0 +1,181 @@
"""
Contains class RSSReader which receives arguments from cmd
and allows to parse URL with RSS feed and print it in stdout
in different formats
"""

import os
import json

import feedparser
from bs4 import BeautifulSoup
import dateutil.parser as dateparser
from colorama import init
from colorama import Fore
import requests

from app.rss_exception import RSSException


class RSSReader:
""" Reads news from RSS url and prints them """

def __init__(self, url, limit, date, logger, colorize=None):
self.url = url
self.limit = limit
self.date = date
self.logger = logger
self.colorize = colorize
init() # colorama

    def get_feed(self):
        """ Returns parsed feed entries (up to the limit) and caches them """
        response = requests.get(self.url).text
        news_feed = feedparser.parse(response)
        if not news_feed.entries:
            raise RSSException('Did not parse any news')
        for entry in news_feed.entries[:self.limit]:
            self.cache_news_json(entry)
        self.logger.info('News has been cached')
        return news_feed.entries[:self.limit]

def print_feed(self, entries):
""" Prints feed in stdout """

self.logger.info('Printing feed')

if self.colorize:
for entry in entries:
print(f'{Fore.GREEN}========================================================{Fore.RESET}')
print(f'{Fore.GREEN}Title:{Fore.RESET} {entry.title}')
print(f'{Fore.GREEN}Published:{Fore.RESET} {entry.published}')
print(f'{Fore.GREEN}Summary:{Fore.RESET} {BeautifulSoup(entry.summary, "html.parser").text}')
print(f'{Fore.GREEN}Image:{Fore.RESET} {self.get_img_url(entry.summary)}')
print(f'{Fore.GREEN}Link:{Fore.RESET} {entry.link}')
print(f'{Fore.GREEN}========================================================{Fore.RESET}')
else:
for entry in entries:
print('========================================================')
print(f'Title: {entry.title}')
print(f'Published: {entry.published}', end='\n\n')
print(f'Summary: {BeautifulSoup(entry.summary, "html.parser").text}', end='\n\n')
print(f'Image: {self.get_img_url(entry.summary)}')
print(f'Link: {entry.link}')
print('========================================================')

    def get_img_url(self, summary):
        """ Parses the image URL from <description> in the RSS feed """
        soup = BeautifulSoup(summary, 'html.parser')
        img = soup.find('img')
        return img['src'] if img else None

def print_feed_json(self, entries):
""" Prints feed in stdout in JSON format """

self.logger.info('Printing feed in JSON format')

for entry in entries:
feed = self.to_dict(entry)
if self.colorize:
print(Fore.GREEN + json.dumps(feed, indent=2, ensure_ascii=False) + Fore.RESET, end=',\n')
else:
print(json.dumps(feed, indent=2, ensure_ascii=False), end=',\n')

def to_dict(self, entry):
""" Converts entry to dict() format """

feed = dict()
feed['Title'] = entry.title
feed['Published'] = entry.published
feed['Summary'] = BeautifulSoup(entry.summary, "html.parser").text
feed['Link'] = entry.link
feed['Url'] = self.url
feed['Image'] = self.get_img_url(entry.summary)
return feed

def cache_news_json(self, entry):
""" Saves all printed news in JSON format (path = 'cache/{publication_date}.json')"""

date = dateparser.parse(entry.published, fuzzy=True).strftime('%Y%m%d')
directory_path = 'cache' + os.path.sep
if not os.path.exists(directory_path):
self.logger.info('Creating directory cache')
os.mkdir(directory_path)

file_path = directory_path + date + '.json'

feed = self.to_dict(entry)
news = list()
try:
with open(file_path, encoding='utf-8') as rf:
news = json.load(rf)
if feed in news:
# already cached
return
except FileNotFoundError:
self.logger.info('Creating new .json file')
except json.JSONDecodeError:
self.logger.info('Empty JSON file')

with open(file_path, 'w', encoding='utf-8') as wf:
news.append(feed)
json.dump(news, wf, indent=2)

def get_cached_json_news(self):
""" Returns the list of cached news with date from arguments """

file_path = 'cache' + os.path.sep + self.date + '.json'
cached_news = list()
try:
with open(file_path) as rf:
news = json.load(rf)
for new in news:
if new['Url'] == self.url:
cached_news.append(new)
if not cached_news:
# News with such url have not been found
raise FileNotFoundError
return cached_news[:self.limit]
        except (FileNotFoundError, json.JSONDecodeError):
            # No cache file for this date, an empty JSON file,
            # or no cached news for the given URL
            message = 'There are no cached news with such date by this url'
            if self.colorize:
                print(f'{Fore.RED}{message}{Fore.RESET}')
            else:
                print(message)
            return False

def print_cached_feed(self, cached_feed):
""" Prints saved news in stdout """

self.logger.info('Printing cached feed')
for new in cached_feed:
if self.colorize:
print(f'{Fore.GREEN}---------------------------------------------------------{Fore.RESET}')
for key, value in new.items():
print(f'{Fore.GREEN}{key}:{Fore.RESET} {value}')
print(f'{Fore.GREEN}---------------------------------------------------------{Fore.RESET}')
else:
print('---------------------------------------------------------')
for key, value in new.items():
print(f'{key}: {value}')
print('---------------------------------------------------------')

def print_cached_feed_json(self, cached_feed):
""" Prints saved news in stdout in JSON format """

self.logger.info('Printing cached feed in JSON format')
for new in cached_feed:
if self.colorize:
print(Fore.GREEN + json.dumps(new, indent=2) + Fore.RESET, end=',\n')
else:
print(json.dumps(new, indent=2), end=',\n')
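The `get_img_url` method above relies on BeautifulSoup; the same extraction can be sketched with only the standard library's `html.parser` (a simplified stand-in for illustration, not the project's actual code):

```python
from html.parser import HTMLParser

class ImgSrcFinder(HTMLParser):
    """Collects the src of the first <img> tag, mimicking RSSReader.get_img_url."""
    def __init__(self):
        super().__init__()
        self.img_url = None

    def handle_starttag(self, tag, attrs):
        # Remember only the first <img> encountered
        if tag == 'img' and self.img_url is None:
            self.img_url = dict(attrs).get('src')

def get_img_url(summary):
    finder = ImgSrcFinder()
    finder.feed(summary)
    return finder.img_url
```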
Empty file added app/__init__.py
Empty file.
6 changes: 6 additions & 0 deletions app/__main__.py
@@ -0,0 +1,6 @@
""" Package entry point """

from app.rss_reader import main

if __name__ == '__main__':
main()
77 changes: 77 additions & 0 deletions app/argparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Contains ArgParser class which allows parse arguments from cmd
"""

import argparse


__version__ = '0.5.0'


class ArgParser:
""" Reads arguments """

def __init__(self):
self.args = self.parse_args()

def parse_args(self):
""" Reads arguments from the cmd and returns them """

argparser = argparse.ArgumentParser(description='One-shot command-line RSS reader', prog='rss-reader')
argparser.add_argument(
'url',
type=str,
help='Input RSS url containing news'
)
argparser.add_argument(
'--limit',
type=int,
default=None,
help='Sets a limit for news output (default - no limit)'
)
argparser.add_argument(
'--json',
action='store_true',
help='Prints feed in JSON format in stdout'
)
argparser.add_argument(
'--version',
action='version',
version=f'%(prog)s version {__version__}',
default=None,
help='Prints version of program'
)
argparser.add_argument(
'--verbose',
action='store_true',
help='Prints all logs in stdout'
)
argparser.add_argument(
'--date',
type=str,
        help='It should take a date in YYYYmmdd format. For example: --date 20191020. '
             'The news from the specified day will be printed out. If no news is found, an error will be returned.'
)
argparser.add_argument(
'--to-pdf',
dest='to_pdf',
type=str,
help='It should take the path of the directory where new PDF file will be saved'
)
argparser.add_argument(
'--to-html',
dest='to_html',
type=str,
help='It should take the path of the directory where new HTML file will be saved'
)
argparser.add_argument(
'--colorize',
action='store_true',
help='Prints the result of the utility in colorized mode'
)
args = argparser.parse_args()
return args

def get_args(self):
""" Returns arguments """
return self.args
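A trimmed-down sketch of how these flags combine at parse time (an illustrative subset of the parser above, fed a sample command line instead of `sys.argv`):

```python
import argparse

parser = argparse.ArgumentParser(prog='rss-reader')
parser.add_argument('url', help='RSS feed URL')
parser.add_argument('--limit', type=int, default=None)
parser.add_argument('--json', action='store_true')
parser.add_argument('--to-pdf', dest='to_pdf', default=None)

# Parse a sample invocation; unsupplied options keep their defaults
args = parser.parse_args(['https://news.yahoo.com/rss/', '--limit', '2', '--json'])
print(args.limit, args.json, args.to_pdf)  # 2 True None
```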
Binary file added app/fonts/NotoSans-Black.cw127.pkl
Binary file not shown.
Binary file added app/fonts/NotoSans-Black.pkl
Binary file not shown.
Binary file added app/fonts/NotoSans-Black.ttf
Binary file not shown.
Binary file added app/fonts/NotoSans-Thin.cw127.pkl
Binary file not shown.
Binary file added app/fonts/NotoSans-Thin.pkl
Binary file not shown.
Binary file added app/fonts/NotoSans-Thin.ttf
Binary file not shown.