From 0434a0463bcf0bb10f01a5e651081005386b8798 Mon Sep 17 00:00:00 2001
From: aviacore
Date: Sat, 16 Nov 2019 16:29:02 +0300
Subject: [PATCH 1/3] init: project code

---
 src/main.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 src/main.py

diff --git a/src/main.py b/src/main.py
new file mode 100644
index 0000000..e69de29

From 2ec5fc07d928de683e1bba509991da5c8da6e0d4 Mon Sep 17 00:00:00 2001
From: Sergey Kornilov
Date: Sun, 17 Nov 2019 21:23:31 +0300
Subject: [PATCH 2/3] feat: finish first iteration

---
 .vscode/settings.json           |   6 +
 src/main.py => core/__init__.py |   0
 core/__version__.py             |   2 +
 core/article.py                 | 210 ++++++++++++++++++++++++++++++++
 core/content.py                 |  84 +++++++++++++
 core/document.py                | 111 +++++++++++++++++
 core/logger.py                  |  32 +++++
 core/requester.py               |  46 +++++++
 pyproject.toml                  |   3 +
 rss_reader.py                   |  32 +++++
 setup.py                        |  18 +++
 11 files changed, 544 insertions(+)
 create mode 100644 .vscode/settings.json
 rename src/main.py => core/__init__.py (100%)
 create mode 100644 core/__version__.py
 create mode 100644 core/article.py
 create mode 100644 core/content.py
 create mode 100644 core/document.py
 create mode 100644 core/logger.py
 create mode 100644 core/requester.py
 create mode 100644 pyproject.toml
 create mode 100644 rss_reader.py
 create mode 100644 setup.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..b3ee124
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+  "python.pythonPath": "venv/bin/python",
+  "python.linting.pylintEnabled": true,
+  "python.linting.enabled": true,
+  "python.formatting.provider": "black"
+}
diff --git a/src/main.py b/core/__init__.py
similarity index 100%
rename from src/main.py
rename to core/__init__.py
diff --git a/core/__version__.py b/core/__version__.py
new file mode 100644
index 0000000..2c3ca06
--- /dev/null
+++ b/core/__version__.py
@@ -0,0 +1,2 @@
+__version_info__ = (1, 0, 0)
+__version__ = ".".join(map(str, __version_info__))
diff --git a/core/article.py b/core/article.py
new file mode 100644
index 0000000..e152124
--- /dev/null
+++ b/core/article.py
@@ -0,0 +1,210 @@
+from bs4 import BeautifulSoup
+import json
+from logging import Logger
+from typing import List, Optional
+from xml.etree import ElementTree
+
+from .content import Content
+from .logger import default_logger
+
+
+def find_links(tree: BeautifulSoup, logger: Optional[Logger] = default_logger) -> List[str]:
+    """
+    Recursively finds all links inside the given HTML tree.
+
+    This function looks at both the `href` attributes of anchor tags and the `src` attributes of image tags, keeping
+    only the values which start with the "http" prefix (and, accordingly, are working links).
+
+    Params:
+    - tree (BeautifulSoup): BeautifulSoup instance which contains the HTML template to search for links in
+    - logger (Logger): Logger instance to use for logging inside the function (optional)
+
+    Returns:
+    - List of strings which contains all the found links
+    """
+
+    links = []
+
+    logger.info("Parsing anchor tags")
+
+    # Iterate through all anchor tags which have the `href` attribute
+    for el in tree.find_all("a", href=True):
+        logger.info("Got anchor element: %s", el)
+        # Check that the href is an external HTTP/S link (not an element ID, email link, etc.)
+        if el["href"].startswith("http"):
+            logger.info("Got HTTP/S link: %s", el["href"])
+            links.append(el["href"])
+
+    logger.info("Parsing image tags")
+
+    # Iterate through all image tags which have the `src` attribute
+    for el in tree.find_all("img", src=True):
+        logger.info("Got image element: %s", el)
+        # Check that the src is an external HTTP/S link (not a relative path, data URI, etc.)
+        if el["src"].startswith("http"):
+            logger.info("Got HTTP/S link: %s", el["src"])
+            links.append(el["src"])
+
+    return links
+
+
+def remove_duplicates(items: List, logger: Optional[Logger] = default_logger) -> List:
+    """
+    Returns a copy of the passed list without duplicates.
+
+    This function exists because `list(set(items))` destroys the original order of the list items. It iterates
+    through the whole list in sequence and adds the items to a new list only if they have not been seen earlier,
+    so the original list order is preserved.
+
+    Params:
+    - items (list): List to remove the duplicates from
+    - logger (Logger): Logger instance to use for logging inside the function (optional)
+
+    Returns:
+    - New list without duplicates
+    """
+
+    result = []
+    seen_elements = set()
+
+    logger.info("Removing duplicate items")
+
+    for element in items:
+        # Only add the element to the resulting list if it has not been seen earlier
+        if element not in seen_elements:
+            result.append(element)
+            seen_elements.add(element)
+        else:
+            logger.info("Excluding duplicate item: %s", element)
+
+    return result
+
+
+class Article:
+    """
+    Represents the RSS standard article data.
+    """
+
+    @classmethod
+    def from_xml(cls, element: ElementTree.Element, logger: Optional[Logger] = default_logger):
+        """
+        Constructs a new Article from an XML element.
+
+        Params:
+        - element (ElementTree.Element): XML element to parse the article from
+        - logger (Logger): Logger instance to use for logging inside the method (optional)
+
+        Returns:
+        - Article class instance which contains the parsed data
+        """
+
+        logger.info("Parsing %s article data", str(element))
+
+        # Grab the article title element
+        title = element.find("title")
+        logger.info("Got article title: %s", title)
+
+        # Grab the article date (`pubDate` tag)
+        date = element.find("pubDate")
+        logger.info("Got article date: %s", date)
+
+        # Grab the article description
+        description = element.find("description")
+        logger.info("Got article description: %s", description)
+
+        # Grab the article public link
+        public_link = element.find("link")
+        logger.info("Got article public link: %s", public_link)
+
+        # Parse the description text into an HTML tree
+        tree = BeautifulSoup(description.text, "html.parser")
+        logger.info("Parsed the HTML data")
+
+        # Parse the description into a human-readable format (with images)
+        content = Content.from_html_tree(tree, logger)
+
+        # Concatenate the array containing the public link with the links found in the `description` element to get
+        # the complete article links list
+        all_links = [public_link.text]
+        all_links.extend(find_links(tree, logger))
+
+        # Remove duplicate links (the article description can also contain the public link)
+        all_links = remove_duplicates(all_links, logger)
+
+        return cls(title.text, date.text, public_link.text, content, all_links, logger)
+
+    def __init__(self, title: str, date: str, public_link: str, content: Content, links: List[str], logger: Logger):
+        """
+        Constructs a new Article from parsed data.
+
+        It is not recommended to call this method directly with parsed XML; use the `Article.from_xml` class method instead.
+
+        Params:
+        - title (str): The article title
+        - date (str): Article creation date
+        - public_link (str): Article public link from which it can be accessed within the browser
+        - content (Content): Parsed article content (description)
+        - links (List[str]): List of all article links
+        - logger (Logger): Logger instance to use for logging inside the class
+
+        Returns:
+        - The constructed Article object
+        """
+
+        self.title = title
+        self.date = date
+        self.public_link = public_link
+        self.content = content
+        self.links = links
+        self.logger = logger
+
+    def to_string(self) -> str:
+        """
+        Returns the string representation of an article.
+        """
+
+        self.logger.info("Converting article header to string")
+
+        title = f"Title: {self.title}\n"
+        date = f"Date: {self.date}\n"
+        public_link = f"Link: {self.public_link}\n"
+        content = f"\n{self.content}\n"
+
+        self.logger.info("Converting article links to string")
+
+        links_title = "\n\nLinks:"
+        links_list = ""
+
+        for i, l in enumerate(self.links):
+            links_list += f"\n[{i + 1}]: {l}"
+
+        return title + date + public_link + content + links_title + links_list
+
+    def __str__(self) -> str:
+        """
+        Overrides the standard Python string conversion behavior to be able to directly call `str(article_instance)`
+        and receive the correct article string representation.
+        """
+
+        return self.to_string()
+
+    def to_dict(self) -> dict:
+        """
+        Returns the dict representation of the article data.
+        """
+
+        return {
+            "title": self.title,
+            "date": self.date,
+            "public_link": self.public_link,
+            "content": self.content.to_dict(),
+            "links": self.links,
+        }
+
+    def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
+        """
+        Returns the JSON representation of an article.
+        """
+
+        self.logger.info("Converting article to JSON")
+        return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
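A minimal usage sketch for the parser above (not part of the patch; the feed item is invented for illustration). Note that `Content.from_html_tree`, shown next, expects the description HTML to contain an `<img>` tag with an `alt` attribute, so the example includes one:

    from xml.etree import ElementTree
    from core.article import Article

    item = ElementTree.fromstring(
        "<item>"
        "<title>Hello</title>"
        "<pubDate>Sun, 17 Nov 2019 12:00:00 +0300</pubDate>"
        "<link>https://example.com/post</link>"
        "<description>&lt;p&gt;Body&lt;/p&gt;"
        "&lt;img src='https://example.com/a.png' alt='diagram'/&gt;</description>"
        "</item>"
    )
    article = Article.from_xml(item)
    print(article.to_json())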
diff --git a/core/content.py b/core/content.py
new file mode 100644
index 0000000..4ae920b
--- /dev/null
+++ b/core/content.py
@@ -0,0 +1,84 @@
+from bs4 import BeautifulSoup
+import json
+from logging import Logger
+from typing import Dict, List, Optional
+
+from .logger import default_logger
+
+
+class Content:
+    """
+    Represents the RSS standard article content.
+    """
+
+    @classmethod
+    def from_html_tree(cls, tree: BeautifulSoup, logger: Optional[Logger] = default_logger):
+        """
+        Constructs a new Content instance from a description HTML tree.
+
+        Params:
+        - tree (BeautifulSoup): HTML tree to parse the content from
+        - logger (Logger): Logger instance to use for logging inside the method (optional)
+
+        Returns:
+        - Content class instance which contains the parsed data
+        """
+
+        # Grab the description text
+        description = tree.text
+        # Grab the content image `alt` attribute
+        image = tree.find("img", alt=True)["alt"]
+
+        return cls(description, image, logger)
+
+    def __init__(self, description: str, image: str, logger: Logger):
+        """
+        Constructs a new Content from parsed data.
+
+        It is not recommended to call this method directly with parsed HTML; use the
+        `Content.from_html_tree` class method instead.
+
+        Params:
+        - description (str): The content description text
+        - image (str): Value of the `alt` attribute of the content image
+        - logger (Logger): Logger instance to use for logging inside the class
+
+        Returns:
+        - The constructed Content object
+        """
+
+        self.description = description
+        self.image = image
+        self.logger = logger
+
+    def to_string(self) -> str:
+        """
+        Returns the string representation of the content.
+        """
+
+        self.logger.info("Converting article content to string")
+        return f"[image: {self.image}] {self.description}"
+
+    def __str__(self) -> str:
+        """
+        Overrides the standard Python string conversion behavior to be able to directly call `str(content_instance)`
+        and receive the correct content string representation.
+        """
+
+        return self.to_string()
+
+    def to_dict(self) -> Dict[str, str]:
+        """
+        Returns the dict representation of the article content.
+        """
+
+        data = {"description": self.description, "image": self.image}
+        return data
+
+    def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
+        """
+        Returns the JSON representation of the article content.
+        """
+
+        self.logger.info("Converting article content to JSON")
+        return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
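One caveat in `Content.from_html_tree` above: `tree.find("img", alt=True)` returns None when the description contains no image, so the `["alt"]` lookup raises a TypeError. A defensive variant (a sketch, not part of the patch) could read:

    # Fall back to an empty string when the description has no <img alt="..."> tag
    img = tree.find("img", alt=True)
    image = img["alt"] if img is not None else ""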
diff --git a/core/document.py b/core/document.py
new file mode 100644
index 0000000..1026cfe
--- /dev/null
+++ b/core/document.py
@@ -0,0 +1,111 @@
+import json
+from logging import Logger
+from typing import List, Optional
+from xml.etree import ElementTree
+
+from .article import Article
+from .logger import default_logger
+
+
+class Document:
+    """
+    Represents the RSS document data.
+    """
+
+    @classmethod
+    def from_xml(
+        cls, document: ElementTree.Element, limit: Optional[int] = 0, logger: Optional[Logger] = default_logger
+    ):
+        """
+        Constructs a new RSS Document from an XML element.
+
+        Params:
+        - document (ElementTree.Element): XML element to parse the document from
+        - limit (Optional[int]): Maximum amount of articles to parse (defaults to 0, which means "all articles")
+        - logger (Logger): Logger instance to use for logging inside the class
+
+        Returns:
+        - Document class instance which contains the parsed data
+        """
+
+        # Grab the `channel` element from the RSS document (it contains all the required content)
+        channel = document.find("channel")
+        logger.info("Got document channel element: %s", str(channel))
+
+        # Grab the whole blog title (it is usually the first `title` element in an RSS document)
+        feed = channel.find("title")
+        logger.info("Got document title element: %s", feed.text)
+
+        # Grab all article items
+        all_items = channel.findall("item")
+        logger.info("Got %d article elements", len(all_items))
+
+        # Use the provided limit value to cap the article iterations
+        end_of_parsing = limit if limit else len(all_items)
+        logger.info("Parsing document articles until reaching the limit: %d", end_of_parsing)
+
+        # Construct articles from the RSS document
+        articles = []
+        for item in all_items[:end_of_parsing]:
+            articles.append(Article.from_xml(item, logger))
+
+        return cls(feed.text, articles, logger)
+
+    def __init__(self, feed: str, articles: List[Article], logger: Logger):
+        """
+        Constructs a new Document from parsed data.
+
+        It is not recommended to call this method directly with parsed XML; use the `Document.from_xml` class method instead.
+
+        Params:
+        - feed (str): The main blog title
+        - articles (List[Article]): The parsed blog articles list
+        - logger (Logger): Logger instance to use for logging inside the class
+
+        Returns:
+        - The constructed Document object
+        """
+
+        self.feed = feed
+        self.articles = articles
+        self.logger = logger
+
+    def to_string(self) -> str:
+        """
+        Returns the string representation of a Document.
+        """
+
+        self.logger.info("Converting document header to string")
+
+        feed = f"\nFeed: {self.feed}\n\n"
+        articles = ""
+
+        self.logger.info("Converting document articles to string")
+
+        for a in self.articles:
+            articles += "=" * 64 + "\n" * 2
+            articles += a.to_string() + "\n" * 2
+
+        return feed + articles
+
+    def __str__(self) -> str:
+        """
+        Overrides the standard Python string conversion behavior to be able to directly call `str(document_instance)`
+        and receive the correct document string representation.
+        """
+
+        return self.to_string()
+
+    def to_dict(self) -> dict:
+        articles = []
+        for a in self.articles:
+            articles.append(a.to_dict())
+        return {"feed": self.feed, "articles": articles}
+
+    def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
+        """
+        Returns the JSON representation of a document.
+        """
+
+        self.logger.info("Converting document to JSON")
+        return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
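A short usage sketch for the limit handling above; `xml_root` stands for an already-fetched RSS root element (see core/requester.py below), and the names are illustrative only:

    # limit=0 (the CLI default) means "parse all articles"
    document = Document.from_xml(xml_root, limit=0)
    # limit=3 caps parsing at the first three items
    top_three = Document.from_xml(xml_root, limit=3)
    print(top_three)            # human-readable text via __str__
    print(top_three.to_json())  # JSON via to_dict() and json.dumps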
diff --git a/core/logger.py b/core/logger.py
new file mode 100644
index 0000000..2548191
--- /dev/null
+++ b/core/logger.py
@@ -0,0 +1,32 @@
+import logging
+from typing import Optional
+
+default_logger = logging.getLogger(__name__)
+default_logger.setLevel(logging.DEBUG)
+
+
+def create_logger(verbose: Optional[bool] = False) -> logging.Logger:
+    """
+    Creates a new logger instance which outputs messages to the console, based on the CLI verbose argument.
+
+    Params:
+    - verbose (bool): Whether to enable verbose console output (optional, defaults to False)
+
+    Returns:
+    - Logger instance to use for application logging purposes
+    """
+
+    logger = logging.getLogger(__name__)
+    format_pattern = "%(asctime)s %(levelname)s %(message)s"
+    formatter = logging.Formatter(format_pattern)
+
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(formatter)
+    logger.addHandler(stream_handler)
+
+    if verbose:
+        logger.setLevel(logging.INFO)
+    else:
+        logger.setLevel(logging.FATAL)
+
+    return logger
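Note that `create_logger` asks for the same logger name as `default_logger` (both call `logging.getLogger(__name__)` inside core/logger.py), so it returns the same underlying logger object, and every call attaches one more StreamHandler; repeated calls would duplicate each log line. A guard sketch (not part of the patch):

    # Attach the handler only once, no matter how often create_logger runs
    if not logger.handlers:
        logger.addHandler(stream_handler)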
diff --git a/core/requester.py b/core/requester.py
new file mode 100644
index 0000000..5f1296c
--- /dev/null
+++ b/core/requester.py
@@ -0,0 +1,46 @@
+from logging import Logger
+from requests import get
+from sys import exit
+from typing import Optional
+from xml.etree import ElementTree
+
+from .logger import default_logger
+
+
+def request_xml(url: str, logger: Optional[Logger] = default_logger) -> ElementTree.Element:
+    """
+    Uses the HTTP GET method to request the content of the given URL and returns the XML parsed from it.
+
+    Params:
+    - url (str): The URL to query XML from
+
+    Returns:
+    - ElementTree.Element instance which can be used to interact with the queried XML
+    """
+
+    logger.info("Sending GET request to %s", url)
+    response = get(url)
+    logger.info("%s responded with %s", url, str(response))
+
+    # Validate that the server responded without errors (i.e., with the "200" status code)
+    if response.status_code != 200:
+        msg = f"Request to {url} failed with code: {response.status_code}"
+        logger.error(msg)
+        raise Exception(msg)
+
+    # Additionally, check that the response is "ok" (for some unexpected cases)
+    if not response.ok:
+        msg = f"Failed to send the request to {url}"
+        logger.error(msg)
+        raise Exception(msg)
+
+    logger.info("Parsing XML from response text")
+
+    try:
+        xml = ElementTree.fromstring(response.text)
+        logger.info("Parsed XML: %s", str(xml))
+        return xml
+    except Exception as e:
+        msg = f"Failed to parse the XML: {e}"
+        logger.error(msg)
+        raise Exception(msg)
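A usage sketch for the requester (the feed URL is a placeholder):

    from core.requester import request_xml

    try:
        root = request_xml("https://example.com/rss.xml")
    except Exception as error:
        print(f"Fetch failed: {error}")
    else:
        print(root.tag)  # "rss" for a well-formed RSS 2.0 feed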
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4e42573
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[tool.black]
+line-length = 120
+target-version = ['py38']
diff --git a/rss_reader.py b/rss_reader.py
new file mode 100644
index 0000000..eb6d12a
--- /dev/null
+++ b/rss_reader.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from argparse import ArgumentParser
+from sys import exit
+from termcolor import colored
+
+from core.__version__ import __version__
+
+from core.document import Document
+from core.logger import create_logger
+from core.requester import request_xml
+
+parser = ArgumentParser(prog="RSS reader", description="Pure Python command-line RSS reader.")
+
+parser.add_argument("source", type=str, help="RSS URL")
+parser.add_argument("--version", action="version", help="Print version info", version=__version__)
+parser.add_argument("--json", action="store_true", help="Print result as JSON to stdout")
+parser.add_argument("--verbose", action="store_true", help="Output verbose status messages")
+parser.add_argument("--limit", type=int, default=0, help="Limit news topics if this parameter is provided")
+
+args = parser.parse_args()
+logger = create_logger(args.verbose)
+
+try:
+    response = request_xml(args.source, logger)
+    document = Document.from_xml(response, args.limit, logger)
+    data = document.to_json() if args.json else str(document)
+    print(data)
+except Exception as e:
+    logger.error(e)
+    print(colored(str(e), "red"))
+    exit(1)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..89a889e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from os import path
+from setuptools import find_packages, setup
+
+from core.__version__ import __version__
+
+setup(
+    name="rss_reader",
+    version=__version__,
+    description="Pure Python command-line RSS reader.",
+    url="https://github.com/aviacore/PythonHomework",
+    author="Sergey Kornilov",
+    author_email="info@ksn.by",
+    packages=find_packages(),
+    requires=["argparse", "bs4", "requests", "termcolor"],
+    scripts=["rss_reader.py"],
+)
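The `requires=` keyword in the setup() call above is the legacy distutils metadata field; pip ignores it, so the declared dependencies are never installed. Patch 3 below switches to the setuptools `install_requires=` field, which pip actually resolves. A minimal corrected sketch (note that argparse ships with the Python 3 standard library, so listing it is arguably redundant):

    from setuptools import find_packages, setup

    setup(
        name="rss_reader",
        version="1.0.0",
        packages=find_packages(),
        # pip installs these; the legacy requires= field is ignored
        install_requires=["bs4", "requests", "termcolor"],
    )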
From 7b58523937c23f43ba9b41983b341a63ba060254 Mon Sep 17 00:00:00 2001
From: Sergey Kornilov
Date: Sun, 17 Nov 2019 21:50:29 +0300
Subject: [PATCH 3/3] fix: setuptools cli and deps

---
 .vscode/settings.json |  2 +-
 bin/rss-reader        | 33 +++++++++++++++++++++++++++++++++
 rss_reader.py         | 32 --------------------------------
 setup.py              |  6 +++---
 4 files changed, 37 insertions(+), 36 deletions(-)
 create mode 100644 bin/rss-reader
 delete mode 100644 rss_reader.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
index b3ee124..d37719c 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,5 +1,5 @@
 {
-  "python.pythonPath": "venv/bin/python",
+  "python.pythonPath": "/usr/bin/python3",
   "python.linting.pylintEnabled": true,
   "python.linting.enabled": true,
   "python.formatting.provider": "black"
 }
diff --git a/bin/rss-reader b/bin/rss-reader
new file mode 100644
index 0000000..ce111c5
--- /dev/null
+++ b/bin/rss-reader
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from argparse import ArgumentParser
+from sys import exit
+from termcolor import colored
+
+from core.__version__ import __version__
+
+from core.document import Document
+from core.logger import create_logger
+from core.requester import request_xml
+
+if __name__ == "__main__":
+    parser = ArgumentParser(prog="RSS reader", description="Pure Python command-line RSS reader.")
+
+    parser.add_argument("source", type=str, help="RSS URL")
+    parser.add_argument("--version", action="version", help="Print version info", version=__version__)
+    parser.add_argument("--json", action="store_true", help="Print result as JSON to stdout")
+    parser.add_argument("--verbose", action="store_true", help="Output verbose status messages")
+    parser.add_argument("--limit", type=int, default=0, help="Limit news topics if this parameter is provided")
+
+    args = parser.parse_args()
+    logger = create_logger(args.verbose)
+
+    try:
+        response = request_xml(args.source, logger)
+        document = Document.from_xml(response, args.limit, logger)
+        data = document.to_json() if args.json else str(document)
+        print(data)
+    except Exception as e:
+        logger.error(e)
+        print(colored(str(e), "red"))
+        exit(1)
diff --git a/rss_reader.py b/rss_reader.py
deleted file mode 100644
index eb6d12a..0000000
--- a/rss_reader.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-from argparse import ArgumentParser
-from sys import exit
-from termcolor import colored
-
-from core.__version__ import __version__
-
-from core.document import Document
-from core.logger import create_logger
-from core.requester import request_xml
-
-parser = ArgumentParser(prog="RSS reader", description="Pure Python command-line RSS reader.")
-
-parser.add_argument("source", type=str, help="RSS URL")
-parser.add_argument("--version", action="version", help="Print version info", version=__version__)
-parser.add_argument("--json", action="store_true", help="Print result as JSON to stdout")
-parser.add_argument("--verbose", action="store_true", help="Output verbose status messages")
-parser.add_argument("--limit", type=int, default=0, help="Limit news topics if this parameter is provided")
-
-args = parser.parse_args()
-logger = create_logger(args.verbose)
-
-try:
-    response = request_xml(args.source, logger)
-    document = Document.from_xml(response, args.limit, logger)
-    data = document.to_json() if args.json else str(document)
-    print(data)
-except Exception as e:
-    logger.error(e)
-    print(colored(str(e), "red"))
-    exit(1)
diff --git a/setup.py b/setup.py
index 89a889e..dad2928 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 from os import path
 from setuptools import find_packages, setup
@@ -13,6 +13,6 @@
     author="Sergey Kornilov",
     author_email="info@ksn.by",
     packages=find_packages(),
-    requires=["argparse", "bs4", "requests", "termcolor"],
-    scripts=["rss_reader.py"],
+    install_requires=["argparse", "bs4", "requests", "termcolor"],
"requests", "termcolor"], + scripts=["bin/rss-reader"], )