6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
"python.pythonPath": "/usr/bin/python3",
"python.linting.pylintEnabled": true,
"python.linting.enabled": true,
"python.formatting.provider": "black"
}
33 changes: 33 additions & 0 deletions bin/rss-reader
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from argparse import ArgumentParser
from sys import exit
from termcolor import colored

from core.__version__ import __version__

from core.document import Document
from core.logger import create_logger
from core.requester import request_xml

if __name__ == "__main__":
parser = ArgumentParser(prog="RSS reader", description="Pure Python command-line RSS reader.")

Collaborator:
The utility works with the link from the technical specification; however, I tried running it on other RSS feeds, and it crashes on the sources I picked:

root@de9e5548d7b0:/PythonHomework# rss-reader https://news.tut.by/rss/geonews/brest.rss
'NoneType' object is not subscriptable
root@de9e5548d7b0:/PythonHomework# rss-reader https://www.spiegel.de/international/index.rss
'NoneType' object is not subscriptable
root@de9e5548d7b0:/PythonHomework#
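
A likely culprit, judging from the message, is `Content.from_html_tree`, which subscripts the result of `tree.find("img", alt=True)`; `find` returns None when the description contains no matching `<img>` tag, and subscripting None raises exactly this error. A minimal defensive sketch (an assumed failure point, not a verified fix):

# Hypothetical guard for Content.from_html_tree; assumes the crash comes from
# descriptions without an <img alt="..."> tag.
image_tag = tree.find("img", alt=True)
image = image_tag["alt"] if image_tag is not None else ""

Similar None checks around the `element.find(...)` calls in `Article.from_xml` would harden parsing against feeds that omit optional tags.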


parser.add_argument("source", type=str, help="RSS URL")
parser.add_argument("--version", action="version", help="Print version info", version=__version__)
parser.add_argument("--json", action="store_true", help="Print result as JSON in stdout")
parser.add_argument("--verbose", action="store_true", help="Outputs verbose status messages")
parser.add_argument("--limit", type=int, default=0, help="Limit news topics if this parameter provided")

args = parser.parse_args()
logger = create_logger(args.verbose)

try:
response = request_xml(args.source, logger)
document = Document.from_xml(response, args.limit, logger)
data = document.to_json() if args.json else str(document)
print(data)
except Exception as e:
logger.error(e)
print(colored(str(e), "red"))
exit(1)
Empty file added core/__init__.py
Empty file.
2 changes: 2 additions & 0 deletions core/__version__.py
@@ -0,0 +1,2 @@
__version_info__ = (1, 0, 0)
__version__ = ".".join(map(str, __version_info__))
210 changes: 210 additions & 0 deletions core/article.py
@@ -0,0 +1,210 @@
from bs4 import BeautifulSoup
import json
from logging import Logger
from typing import List, Optional
from xml.etree import ElementTree

from .content import Content
from .logger import default_logger


def find_links(tree: BeautifulSoup, logger: Optional[Logger] = default_logger) -> List[str]:
"""
Finds all links inside the given HTML tree.

This function looks for both anchor tag `href` attributes and image tag `src` attributes that start with the
"http" prefix (and are therefore working external links).

Params:
- tree (BeautifulSoup): BeautifulSoup instance containing the HTML to search for links in
- logger (Logger): Logger instance to use for logging inside the function (optional)

Returns:
- List of strings containing all the found links
"""

links = []

logger.info("Parsing anchor tags")

# Iterate through all anchor tags which have `href` attribute
for el in tree.find_all("a", href=True):
logger.info("Got anchor element: %s", el)
# Check that the href is an external HTTP/S link (not an element ID, an email link, etc.)
if el["href"].startswith("http"):
logger.info("Got HTTP/S link: %s", el["href"])
links.append(el["href"])

logger.info("Parsing image tags")

# Iterate through all image tags which have `src` attribute
for el in tree.find_all("img", src=True):
logger.info("Got image element: %s", el)
# Check that the src is an external HTTP/S link (not an element ID, an email link, etc.)
if el["src"].startswith("http"):
logger.info("Got HTTP/S link: %s", el["src"])
links.append(el["src"])

return links
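
# A doctest-style sketch of the expected behavior (hypothetical input, not part
# of the original module):
#
#   soup = BeautifulSoup('<a href="https://example.com">x</a>'
#                        '<img src="https://example.com/i.png"/>', "html.parser")
#   find_links(soup)  # -> ["https://example.com", "https://example.com/i.png"]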


def remove_duplicates(items: List, logger: Optional[Logger] = default_logger) -> List:
"""
Returns a copy of the passed list without duplicates.

This function exists because `list(set(items))` destroys the original item order. It iterates through the whole
list in sequence and adds items to a new list only if they have not been seen earlier, so the original order is
preserved.

Params:
- items (list): List to remove the duplicates from
- logger (Logger): Logger instance to use for logging inside the function (optional)

Returns:
- New list without duplicates
"""

result = []
seen_elements = set()

logger.info("Removing duplicate items")

for element in items:
# Only add the element to the resulting list if it has not been seen earlier
if element not in seen_elements:
result.append(element)
seen_elements.add(element)
else:
logger.info("Excluding duplicate item: %s", element)

return result
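
# For illustration (made-up values): remove_duplicates([3, 1, 3, 2, 1]) returns
# [3, 1, 2], whereas list(set(items)) makes no ordering guarantee.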


class Article:
"""
Represents the data of an article as defined by the RSS standard.
"""

@classmethod
def from_xml(cls, element: ElementTree.Element, logger: Optional[Logger] = default_logger):
"""
Constructs a new Article from an XML element.

Params:
- element (ElementTree.Element): XML element to parse the article from
- logger (Logger): Logger instance to use for logging inside the method (optional)

Returns:
- Article class instance which contains the parsed data
"""

logger.info("Parsing %s article data", str(element))

# Grab the article title element
title = element.find("title")
logger.info("Got article title: %s", title)

# Grab the article date (`pubDate` tag)
date = element.find("pubDate")
logger.info("Got article date: %s", date)

# Grab the article description
description = element.find("description")
logger.info("Got article description: %s", description)

# Grab the article public link
public_link = element.find("link")
logger.info("Got article public link: %s", public_link)

# Parse the description text to HTML tree
tree = BeautifulSoup(description.text, "html.parser")
logger.info("Parsed the HTML data")

# Parse the description to human-readable format (with images)
content = Content.from_html_tree(tree, logger)

# Concatenate the array containing public link with links found in the `description` element to get complete
# article links list
all_links = [public_link.text]
all_links.extend(find_links(tree, logger))

# Remove duplicate links (article description can also contain the public link)
all_links = remove_duplicates(all_links, logger)

return cls(title.text, date.text, public_link.text, content, all_links, logger)

def __init__(self, title: str, date: str, public_link: str, content: Content, links: List[str], logger: Logger):
"""
Constructs a new Article from parsed data.

It is not recommended to call this method directly with parsed XML; use the `Article.from_xml` class method instead.

Params:
- title (str): The article title
- date (str): Article creation date
- public_link (str): Article public link from which it can be accessed within the browser
- content (Content): Parsed article content (description)
- links (List[str]): List of all article links
- logger (Logger): Logger instance to use for logging inside the class

Returns:
- The constructed Article object
"""

self.title = title
self.date = date
self.public_link = public_link
self.content = content
self.links = links
self.logger = logger

def to_string(self) -> str:
"""
Returns the string representation of the article.
"""

self.logger.info("Converting article header to string")

title = f"Title: {self.title}\n"
date = f"Date: {self.date}\n"
public_link = f"Link: {self.public_link}\n"
content = f"\n{self.content}\n"

self.logger.info("Converting article links to string")

links_title = "\n\nLinks:"
links_list = ""

for i, link in enumerate(self.links):
links_list += f"\n[{i + 1}]: {link}"

return title + date + public_link + content + links_title + links_list

def __str__(self) -> str:
"""
Overrides the standard Python string conversion behavior so that `str(article_instance)` can be called directly
to receive the correct article string representation.
"""

return self.to_string()

def to_dict(self) -> dict:
"""
Returns the dict representation of the article data.
"""

return {
"title": self.title,
"date": self.date,
"public_link": self.public_link,
"content": self.content.to_dict(),
"links": self.links,
}

def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
"""
Returns the JSON representation of the article.
"""

self.logger.info("Converting article to JSON")
return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
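
For reference, a minimal sketch of how `Article.from_xml` is meant to be consumed. The `<item>` payload below is invented, and its description deliberately includes an `<img alt='...'>` tag, since `Content.from_html_tree` currently assumes one is present:

from xml.etree import ElementTree
from core.article import Article

# Invented feed item purely for illustration
item = ElementTree.fromstring(
    "<item>"
    "<title>Example title</title>"
    "<pubDate>Sun, 24 Nov 2019 10:00:00 +0000</pubDate>"
    "<link>https://example.com/news/1</link>"
    "<description>&lt;p&gt;Example text "
    "&lt;img src='https://example.com/i.png' alt='Photo'/&gt;&lt;/p&gt;</description>"
    "</item>"
)

article = Article.from_xml(item)
print(article.to_json())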
84 changes: 84 additions & 0 deletions core/content.py
@@ -0,0 +1,84 @@
from bs4 import BeautifulSoup
import json
from logging import Logger
from typing import Dict, List, Optional

from .logger import default_logger


class Content:
"""
Represents the content of an article as defined by the RSS standard.
"""

@classmethod
def from_html_tree(cls, tree: BeautifulSoup, logger: Optional[Logger] = default_logger):
"""
Constructs a new Content instance from a description HTML tree.

Params:
- tree (BeautifulSoup): HTML tree to parse the content from
- logger (Logger): Logger instance to use for logging inside the method (optional)

Returns:
- Content class instance which contains the parsed data
"""

# Grab the description text
description = tree.text
# Grab the content image `alt` attribute
image = tree.find("img", alt=True)["alt"]

return cls(description, image, logger)

def __init__(self, description: str, image: str, logger: Logger):
"""
Constructs a new Content from parsed data.

It is not recommended to call this method directly with parsed HTML; use the
`Content.from_html_tree` class method instead.

Params:
- description (str): The content description text
- image (str): Value of the `alt` attribute of the content image
- logger (Logger): Logger instance to use for logging inside the class

Returns:
- The constructed Content object
"""

self.description = description
self.image = image
self.logger = logger

def to_string(self) -> str:
"""
Returns the string representation of the content.
"""

self.logger.info("Converting article content to string")
return f"[image: {self.image}] {self.description}"

def __str__(self) -> str:
"""
Overrides the standard Python string conversion behavior so that `str(content_instance)` can be called directly
to receive the correct content string representation.
"""

return self.to_string()

def to_dict(self) -> Dict[str, str]:
"""
Returns the dict representation of the article content.
"""

data = {"description": self.description, "image": self.image}
return data

def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
"""
Returns the JSON representation of the article content.
"""

self.logger.info("Converting article content to JSON")
return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
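
And a corresponding sketch for `Content` on its own, again with made-up values:

from core.content import Content
from core.logger import default_logger

# Made-up values purely for illustration
content = Content("Example text", "Photo", default_logger)
print(str(content))   # -> [image: Photo] Example text
print(content.to_json(sort_keys=True))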