From 0434a0463bcf0bb10f01a5e651081005386b8798 Mon Sep 17 00:00:00 2001
From: aviacore
Date: Sat, 16 Nov 2019 16:29:02 +0300
Subject: [PATCH 1/3] init: project code

---
 src/main.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 src/main.py

diff --git a/src/main.py b/src/main.py
new file mode 100644
index 0000000..e69de29

From 2ec5fc07d928de683e1bba509991da5c8da6e0d4 Mon Sep 17 00:00:00 2001
From: Sergey Kornilov
Date: Sun, 17 Nov 2019 21:23:31 +0300
Subject: [PATCH 2/3] feat: finish first iteration

---
 .vscode/settings.json           |   6 +
 src/main.py => core/__init__.py |   0
 core/__version__.py             |   2 +
 core/article.py                 | 210 ++++++++++++++++++++++++++++++++
 core/content.py                 |  84 +++++++++++++
 core/document.py                | 111 +++++++++++++++++
 core/logger.py                  |  32 +++++
 core/requester.py               |  46 +++++++
 pyproject.toml                  |   3 +
 rss_reader.py                   |  32 +++++
 setup.py                        |  18 +++
 11 files changed, 544 insertions(+)
 create mode 100644 .vscode/settings.json
 rename src/main.py => core/__init__.py (100%)
 create mode 100644 core/__version__.py
 create mode 100644 core/article.py
 create mode 100644 core/content.py
 create mode 100644 core/document.py
 create mode 100644 core/logger.py
 create mode 100644 core/requester.py
 create mode 100644 pyproject.toml
 create mode 100644 rss_reader.py
 create mode 100644 setup.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..b3ee124
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+  "python.pythonPath": "venv/bin/python",
+  "python.linting.pylintEnabled": true,
+  "python.linting.enabled": true,
+  "python.formatting.provider": "black"
+}
diff --git a/src/main.py b/core/__init__.py
similarity index 100%
rename from src/main.py
rename to core/__init__.py
diff --git a/core/__version__.py b/core/__version__.py
new file mode 100644
index 0000000..2c3ca06
--- /dev/null
+++ b/core/__version__.py
@@ -0,0 +1,2 @@
+__version_info__ = (1, 0, 0)
+__version__ = ".".join(map(str, __version_info__))
diff --git a/core/article.py b/core/article.py
new file mode 100644
index 0000000..e152124
--- /dev/null
+++ b/core/article.py
@@ -0,0 +1,210 @@
+from bs4 import BeautifulSoup
+import json
+from logging import Logger
+from typing import List, Optional
+from xml.etree import ElementTree
+
+from .content import Content
+from .logger import default_logger
+
+
+def find_links(tree: BeautifulSoup, logger: Optional[Logger] = default_logger) -> List[str]:
+    """
+    Recursively finds all links inside the given HTML tree.
+
+    This function looks at both the `href` attributes of anchor tags and the `src` attributes of image tags, keeping
+    only the values which start with the "http" prefix (and, accordingly, are working links).
+
+    Params:
+    - tree (BeautifulSoup): BeautifulSoup instance which contains the HTML template to search for links in
+    - logger (Logger): Logger instance to use for logging inside the function (optional)
+
+    Returns:
+    - List of strings which contains all the found links
+    """
+
+    links = []
+
+    logger.info("Parsing anchor tags")
+
+    # Iterate through all anchor tags which have the `href` attribute
+    for el in tree.find_all("a", href=True):
+        logger.info("Got anchor element: %s", el)
+        # Check that the href is an external HTTP/S link (not an element ID, email link, etc.)
+        if el["href"].startswith("http"):
+            logger.info("Got HTTP/S link: %s", el["href"])
+            links.append(el["href"])
+
+    logger.info("Parsing image tags")
+
+    # Iterate through all image tags which have the `src` attribute
+    for el in tree.find_all("img", src=True):
+        logger.info("Got image element: %s", el)
+        # Check that the src is an external HTTP/S link (not a relative path, data URI, etc.)
+        if el["src"].startswith("http"):
+            logger.info("Got HTTP/S link: %s", el["src"])
+            links.append(el["src"])
+
+    return links
+
+
+def remove_duplicates(items: List, logger: Optional[Logger] = default_logger) -> List:
+    """
+    Returns a copy of the passed list without duplicates.
+
+    This function exists because `list(set(items))` destroys the original order of the list items. It iterates
+    through the whole list in sequence and adds the items to a new list only if they have not been seen earlier,
+    so the original list order is preserved.
+
+    Params:
+    - items (list): List to remove the duplicates from
+    - logger (Logger): Logger instance to use for logging inside the function (optional)
+
+    Returns:
+    - New list without duplicates
+    """
+
+    result = []
+    seen_elements = set()
+
+    logger.info("Removing duplicate items")
+
+    for element in items:
+        # Only add the element to the resulting list if it has not been seen earlier
+        if element not in seen_elements:
+            result.append(element)
+            seen_elements.add(element)
+        else:
+            logger.info("Excluding duplicate item: %s", element)
+
+    return result
+
+
+class Article:
+    """
+    Represents the RSS standard article data.
+    """
+
+    @classmethod
+    def from_xml(cls, element: ElementTree.Element, logger: Optional[Logger] = default_logger):
+        """
+        Constructs a new Article from an XML element.
+
+        Params:
+        - element (ElementTree.Element): XML element to parse the article from
+        - logger (Logger): Logger instance to use for logging inside the method (optional)
+
+        Returns:
+        - Article class instance which contains the parsed data
+        """
+
+        logger.info("Parsing %s article data", str(element))
+
+        # Grab the article title element
+        title = element.find("title")
+        logger.info("Got article title: %s", title)
+
+        # Grab the article date (`pubDate` tag)
+        date = element.find("pubDate")
+        logger.info("Got article date: %s", date)
+
+        # Grab the article description
+        description = element.find("description")
+        logger.info("Got article description: %s", description)
+
+        # Grab the article public link
+        public_link = element.find("link")
+        logger.info("Got article public link: %s", public_link)
+
+        # Parse the description text into an HTML tree
+        tree = BeautifulSoup(description.text, "html.parser")
+        logger.info("Parsed the HTML data")
+
+        # Parse the description into a human-readable format (with images)
+        content = Content.from_html_tree(tree, logger)
+
+        # Concatenate the array containing the public link with the links found in the `description` element to get
+        # the complete article links list
+        all_links = [public_link.text]
+        all_links.extend(find_links(tree, logger))
+
+        # Remove duplicate links (the article description can also contain the public link)
+        all_links = remove_duplicates(all_links, logger)
+
+        return cls(title.text, date.text, public_link.text, content, all_links, logger)
+
+    def __init__(self, title: str, date: str, public_link: str, content: Content, links: List[str], logger: Logger):
+        """
+        Constructs a new Article from parsed data.
+
+        It is not recommended to call this method directly with parsed XML; use the `Article.from_xml` class method instead.
+
+        Params:
+        - title (str): The article title
+        - date (str): Article creation date
+        - public_link (str): Article public link from which it can be accessed within the browser
+        - content (Content): Parsed article content (description)
+        - links (List[str]): List of all article links
+        - logger (Logger): Logger instance to use for logging inside the class
+
+        Returns:
+        - The constructed Article object
+        """
+
+        self.title = title
+        self.date = date
+        self.public_link = public_link
+        self.content = content
+        self.links = links
+        self.logger = logger
+
+    def to_string(self) -> str:
+        """
+        Returns the string representation of an article.
+        """
+
+        self.logger.info("Converting article header to string")
+
+        title = f"Title: {self.title}\n"
+        date = f"Date: {self.date}\n"
+        public_link = f"Link: {self.public_link}\n"
+        content = f"\n{self.content}\n"
+
+        self.logger.info("Converting article links to string")
+
+        links_title = "\n\nLinks:"
+        links_list = ""
+
+        for i, l in enumerate(self.links):
+            links_list += f"\n[{i + 1}]: {l}"
+
+        return title + date + public_link + content + links_title + links_list
+
+    def __str__(self) -> str:
+        """
+        Overrides the standard Python string conversion behavior to be able to directly call `str(article_instance)`
+        and receive the correct article string representation.
+        """
+
+        return self.to_string()
+
+    def to_dict(self) -> dict:
+        """
+        Returns the dict representation of the article data.
+        """
+
+        return {
+            "title": self.title,
+            "date": self.date,
+            "public_link": self.public_link,
+            "content": self.content.to_dict(),
+            "links": self.links,
+        }
+
+    def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
+        """
+        Returns the JSON representation of an article.
+        """
+
+        self.logger.info("Converting article to JSON")
+        return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
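A minimal usage sketch for the parser above (not part of the patch; the feed item is invented for illustration). Note that `Content.from_html_tree`, shown next, expects the description HTML to contain an `<img>` tag with an `alt` attribute, so the example includes one:

    from xml.etree import ElementTree
    from core.article import Article

    item = ElementTree.fromstring(
        "<item>"
        "<title>Hello</title>"
        "<pubDate>Sun, 17 Nov 2019 12:00:00 +0300</pubDate>"
        "<link>https://example.com/post</link>"
        "<description>&lt;p&gt;Body&lt;/p&gt;"
        "&lt;img src='https://example.com/a.png' alt='diagram'/&gt;</description>"
        "</item>"
    )
    article = Article.from_xml(item)
    print(article.to_json())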
diff --git a/core/content.py b/core/content.py
new file mode 100644
index 0000000..4ae920b
--- /dev/null
+++ b/core/content.py
@@ -0,0 +1,84 @@
+from bs4 import BeautifulSoup
+import json
+from logging import Logger
+from typing import Dict, List, Optional
+
+from .logger import default_logger
+
+
+class Content:
+    """
+    Represents the RSS standard article content.
+    """
+
+    @classmethod
+    def from_html_tree(cls, tree: BeautifulSoup, logger: Optional[Logger] = default_logger):
+        """
+        Constructs a new Content instance from a description HTML tree.
+
+        Params:
+        - tree (BeautifulSoup): HTML tree to parse the content from
+        - logger (Logger): Logger instance to use for logging inside the method (optional)
+
+        Returns:
+        - Content class instance which contains the parsed data
+        """
+
+        # Grab the description text
+        description = tree.text
+        # Grab the content image `alt` attribute
+        image = tree.find("img", alt=True)["alt"]
+
+        return cls(description, image, logger)
+
+    def __init__(self, description: str, image: str, logger: Logger):
+        """
+        Constructs a new Content from parsed data.
+
+        It is not recommended to call this method directly with parsed HTML; use the
+        `Content.from_html_tree` class method instead.
+
+        Params:
+        - description (str): The content description text
+        - image (str): Value of the `alt` attribute of the content image
+        - logger (Logger): Logger instance to use for logging inside the class
+
+        Returns:
+        - The constructed Content object
+        """
+
+        self.description = description
+        self.image = image
+        self.logger = logger
+
+    def to_string(self) -> str:
+        """
+        Returns the string representation of the content.
+        """
+
+        self.logger.info("Converting article content to string")
+        return f"[image: {self.image}] {self.description}"
+
+    def __str__(self) -> str:
+        """
+        Overrides the standard Python string conversion behavior to be able to directly call `str(content_instance)`
+        and receive the correct content string representation.
+        """
+
+        return self.to_string()
+
+    def to_dict(self) -> Dict[str, str]:
+        """
+        Returns the dict representation of the article content.
+        """
+
+        data = {"description": self.description, "image": self.image}
+        return data
+
+    def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
+        """
+        Returns the JSON representation of the article content.
+        """
+
+        self.logger.info("Converting article content to JSON")
+        return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
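One caveat in `Content.from_html_tree` above: `tree.find("img", alt=True)` returns None when the description contains no image, so the `["alt"]` lookup raises a TypeError. A defensive variant (a sketch, not part of the patch) could read:

    # Fall back to an empty string when the description has no <img alt="..."> tag
    img = tree.find("img", alt=True)
    image = img["alt"] if img is not None else ""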
diff --git a/core/document.py b/core/document.py
new file mode 100644
index 0000000..1026cfe
--- /dev/null
+++ b/core/document.py
@@ -0,0 +1,111 @@
+import json
+from logging import Logger
+from typing import List, Optional
+from xml.etree import ElementTree
+
+from .article import Article
+from .logger import default_logger
+
+
+class Document:
+    """
+    Represents the RSS document data.
+    """
+
+    @classmethod
+    def from_xml(
+        cls, document: ElementTree.Element, limit: Optional[int] = 0, logger: Optional[Logger] = default_logger
+    ):
+        """
+        Constructs a new RSS Document from an XML element.
+
+        Params:
+        - document (ElementTree.Element): XML element to parse the document from
+        - limit (Optional[int]): Maximum amount of articles to parse (defaults to 0, which means "all articles")
+        - logger (Logger): Logger instance to use for logging inside the class
+
+        Returns:
+        - Document class instance which contains the parsed data
+        """
+
+        # Grab the `channel` element from the RSS document (it contains all the required content)
+        channel = document.find("channel")
+        logger.info("Got document channel element: %s", str(channel))
+
+        # Grab the whole blog title (it is usually the first `title` element in an RSS document)
+        feed = channel.find("title")
+        logger.info("Got document title element: %s", feed.text)
+
+        # Grab all article items
+        all_items = channel.findall("item")
+        logger.info("Got %d article elements", len(all_items))
+
+        # Use the provided limit value to cap the article iterations
+        end_of_parsing = limit if limit else len(all_items)
+        logger.info("Parsing document articles until reaching the limit: %d", end_of_parsing)
+
+        # Construct articles from the RSS document
+        articles = []
+        for item in all_items[:end_of_parsing]:
+            articles.append(Article.from_xml(item, logger))
+
+        return cls(feed.text, articles, logger)
+
+    def __init__(self, feed: str, articles: List[Article], logger: Logger):
+        """
+        Constructs a new Document from parsed data.
+
+        It is not recommended to call this method directly with parsed XML; use the `Document.from_xml` class method instead.
+
+        Params:
+        - feed (str): The main blog title
+        - articles (List[Article]): The parsed blog articles list
+        - logger (Logger): Logger instance to use for logging inside the class
+
+        Returns:
+        - The constructed Document object
+        """
+
+        self.feed = feed
+        self.articles = articles
+        self.logger = logger
+
+    def to_string(self) -> str:
+        """
+        Returns the string representation of a Document.
+        """
+
+        self.logger.info("Converting document header to string")
+
+        feed = f"\nFeed: {self.feed}\n\n"
+        articles = ""
+
+        self.logger.info("Converting document articles to string")
+
+        for a in self.articles:
+            articles += "=" * 64 + "\n" * 2
+            articles += a.to_string() + "\n" * 2
+
+        return feed + articles
+
+    def __str__(self) -> str:
+        """
+        Overrides the standard Python string conversion behavior to be able to directly call `str(document_instance)`
+        and receive the correct document string representation.
+        """
+
+        return self.to_string()
+
+    def to_dict(self) -> dict:
+        articles = []
+        for a in self.articles:
+            articles.append(a.to_dict())
+        return {"feed": self.feed, "articles": articles}
+
+    def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
+        """
+        Returns the JSON representation of a document.
+        """
+
+        self.logger.info("Converting document to JSON")
+        return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
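A short usage sketch for the limit handling above; `xml_root` stands for an already-fetched RSS root element (see core/requester.py below), and the names are illustrative only:

    # limit=0 (the CLI default) means "parse all articles"
    document = Document.from_xml(xml_root, limit=0)
    # limit=3 caps parsing at the first three items
    top_three = Document.from_xml(xml_root, limit=3)
    print(top_three)            # human-readable text via __str__
    print(top_three.to_json())  # JSON via to_dict() and json.dumps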
diff --git a/core/logger.py b/core/logger.py
new file mode 100644
index 0000000..2548191
--- /dev/null
+++ b/core/logger.py
@@ -0,0 +1,32 @@
+import logging
+from typing import Optional
+
+default_logger = logging.getLogger(__name__)
+default_logger.setLevel(logging.DEBUG)
+
+
+def create_logger(verbose: Optional[bool] = False) -> logging.Logger:
+    """
+    Creates a new logger instance which outputs messages to the console, based on the CLI verbose argument.
+
+    Params:
+    - verbose (bool): Whether to enable verbose console output (optional, defaults to False)
+
+    Returns:
+    - Logger instance to use for application logging purposes
+    """
+
+    logger = logging.getLogger(__name__)
+    format_pattern = "%(asctime)s %(levelname)s %(message)s"
+    formatter = logging.Formatter(format_pattern)
+
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(formatter)
+    logger.addHandler(stream_handler)
+
+    if verbose:
+        logger.setLevel(logging.INFO)
+    else:
+        logger.setLevel(logging.FATAL)
+
+    return logger
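Note that `create_logger` asks for the same logger name as `default_logger` (both call `logging.getLogger(__name__)` inside core/logger.py), so it returns the same underlying logger object, and every call attaches one more StreamHandler; repeated calls would duplicate each log line. A guard sketch (not part of the patch):

    # Attach the handler only once, no matter how often create_logger runs
    if not logger.handlers:
        logger.addHandler(stream_handler)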
diff --git a/core/requester.py b/core/requester.py
new file mode 100644
index 0000000..5f1296c
--- /dev/null
+++ b/core/requester.py
@@ -0,0 +1,46 @@
+from logging import Logger
+from requests import get
+from sys import exit
+from typing import Optional
+from xml.etree import ElementTree
+
+from .logger import default_logger
+
+
+def request_xml(url: str, logger: Optional[Logger] = default_logger) -> ElementTree.Element:
+    """
+    Uses the HTTP GET method to request the content of the given URL and returns the XML parsed from it.
+
+    Params:
+    - url (str): The URL to query XML from
+
+    Returns:
+    - ElementTree.Element instance which can be used to interact with the queried XML
+    """
+
+    logger.info("Sending GET request to %s", url)
+    response = get(url)
+    logger.info("%s responded with %s", url, str(response))
+
+    # Validate that the server responded without errors (i.e., with the "200" status code)
+    if response.status_code != 200:
+        msg = f"Request to {url} failed with code: {response.status_code}"
+        logger.error(msg)
+        raise Exception(msg)
+
+    # Additionally, check that the response is "ok" (for some unexpected cases)
+    if not response.ok:
+        msg = f"Failed to send the request to {url}"
+        logger.error(msg)
+        raise Exception(msg)
+
+    logger.info("Parsing XML from response text")
+
+    try:
+        xml = ElementTree.fromstring(response.text)
+        logger.info("Parsed XML: %s", str(xml))
+        return xml
+    except Exception as e:
+        msg = f"Failed to parse the XML: {e}"
+        logger.error(msg)
+        raise Exception(msg)
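A usage sketch for the requester (the feed URL is a placeholder):

    from core.requester import request_xml

    try:
        root = request_xml("https://example.com/rss.xml")
    except Exception as error:
        print(f"Fetch failed: {error}")
    else:
        print(root.tag)  # "rss" for a well-formed RSS 2.0 feed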
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4e42573
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[tool.black]
+line-length = 120
+target-version = ['py38']
diff --git a/rss_reader.py b/rss_reader.py
new file mode 100644
index 0000000..eb6d12a
--- /dev/null
+++ b/rss_reader.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from argparse import ArgumentParser
+from sys import exit
+from termcolor import colored
+
+from core.__version__ import __version__
+
+from core.document import Document
+from core.logger import create_logger
+from core.requester import request_xml
+
+parser = ArgumentParser(prog="RSS reader", description="Pure Python command-line RSS reader.")
+
+parser.add_argument("source", type=str, help="RSS URL")
+parser.add_argument("--version", action="version", help="Print version info", version=__version__)
+parser.add_argument("--json", action="store_true", help="Print result as JSON to stdout")
+parser.add_argument("--verbose", action="store_true", help="Output verbose status messages")
+parser.add_argument("--limit", type=int, default=0, help="Limit news topics if this parameter is provided")
+
+args = parser.parse_args()
+logger = create_logger(args.verbose)
+
+try:
+    response = request_xml(args.source, logger)
+    document = Document.from_xml(response, args.limit, logger)
+    data = document.to_json() if args.json else str(document)
+    print(data)
+except Exception as e:
+    logger.error(e)
+    print(colored(str(e), "red"))
+    exit(1)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..89a889e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from os import path
+from setuptools import find_packages, setup
+
+from core.__version__ import __version__
+
+setup(
+    name="rss_reader",
+    version=__version__,
+    description="Pure Python command-line RSS reader.",
+    url="https://github.com/aviacore/PythonHomework",
+    author="Sergey Kornilov",
+    author_email="info@ksn.by",
+    packages=find_packages(),
+    requires=["argparse", "bs4", "requests", "termcolor"],
+    scripts=["rss_reader.py"],
+)
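The `requires=` keyword in the setup() call above is the legacy distutils metadata field; pip ignores it, so the declared dependencies are never installed. Patch 3 below switches to the setuptools `install_requires=` field, which pip actually resolves. A minimal corrected sketch (note that argparse ships with the Python 3 standard library, so listing it is arguably redundant):

    from setuptools import find_packages, setup

    setup(
        name="rss_reader",
        version="1.0.0",
        packages=find_packages(),
        # pip installs these; the legacy requires= field is ignored
        install_requires=["bs4", "requests", "termcolor"],
    )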
From 7b58523937c23f43ba9b41983b341a63ba060254 Mon Sep 17 00:00:00 2001
From: Sergey Kornilov
Date: Sun, 17 Nov 2019 21:50:29 +0300
Subject: [PATCH 3/3] fix: setuptools cli and deps

---
 .vscode/settings.json |  2 +-
 bin/rss-reader        | 33 +++++++++++++++++++++++++++++++++
 rss_reader.py         | 32 --------------------------------
 setup.py              |  6 +++---
 4 files changed, 37 insertions(+), 36 deletions(-)
 create mode 100644 bin/rss-reader
 delete mode 100644 rss_reader.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
index b3ee124..d37719c 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,5 +1,5 @@
 {
-  "python.pythonPath": "venv/bin/python",
+  "python.pythonPath": "/usr/bin/python3",
   "python.linting.pylintEnabled": true,
   "python.linting.enabled": true,
   "python.formatting.provider": "black"
 }
diff --git a/bin/rss-reader b/bin/rss-reader
new file mode 100644
index 0000000..ce111c5
--- /dev/null
+++ b/bin/rss-reader
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from argparse import ArgumentParser
+from sys import exit
+from termcolor import colored
+
+from core.__version__ import __version__
+
+from core.document import Document
+from core.logger import create_logger
+from core.requester import request_xml
+
+if __name__ == "__main__":
+    parser = ArgumentParser(prog="RSS reader", description="Pure Python command-line RSS reader.")
+
+    parser.add_argument("source", type=str, help="RSS URL")
+    parser.add_argument("--version", action="version", help="Print version info", version=__version__)
+    parser.add_argument("--json", action="store_true", help="Print result as JSON to stdout")
+    parser.add_argument("--verbose", action="store_true", help="Output verbose status messages")
+    parser.add_argument("--limit", type=int, default=0, help="Limit news topics if this parameter is provided")
+
+    args = parser.parse_args()
+    logger = create_logger(args.verbose)
+
+    try:
+        response = request_xml(args.source, logger)
+        document = Document.from_xml(response, args.limit, logger)
+        data = document.to_json() if args.json else str(document)
+        print(data)
+    except Exception as e:
+        logger.error(e)
+        print(colored(str(e), "red"))
+        exit(1)
diff --git a/rss_reader.py b/rss_reader.py
deleted file mode 100644
index eb6d12a..0000000
--- a/rss_reader.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-from argparse import ArgumentParser
-from sys import exit
-from termcolor import colored
-
-from core.__version__ import __version__
-
-from core.document import Document
-from core.logger import create_logger
-from core.requester import request_xml
-
-parser = ArgumentParser(prog="RSS reader", description="Pure Python command-line RSS reader.")
-
-parser.add_argument("source", type=str, help="RSS URL")
-parser.add_argument("--version", action="version", help="Print version info", version=__version__)
-parser.add_argument("--json", action="store_true", help="Print result as JSON to stdout")
-parser.add_argument("--verbose", action="store_true", help="Output verbose status messages")
-parser.add_argument("--limit", type=int, default=0, help="Limit news topics if this parameter is provided")
-
-args = parser.parse_args()
-logger = create_logger(args.verbose)
-
-try:
-    response = request_xml(args.source, logger)
-    document = Document.from_xml(response, args.limit, logger)
-    data = document.to_json() if args.json else str(document)
-    print(data)
-except Exception as e:
-    logger.error(e)
-    print(colored(str(e), "red"))
-    exit(1)
diff --git a/setup.py b/setup.py
index 89a889e..dad2928 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 from os import path
 from setuptools import find_packages, setup
@@ -13,6 +13,6 @@
     author="Sergey Kornilov",
     author_email="info@ksn.by",
     packages=find_packages(),
-    requires=["argparse", "bs4", "requests", "termcolor"],
-    scripts=["rss_reader.py"],
+    install_requires=["argparse", "bs4", "requests", "termcolor"],
"requests", "termcolor"], + scripts=["bin/rss-reader"], )