6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
"python.pythonPath": "/usr/bin/python3",
"python.linting.pylintEnabled": true,
"python.linting.enabled": true,
"python.formatting.provider": "black"
}
33 changes: 33 additions & 0 deletions bin/rss-reader
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from argparse import ArgumentParser
from sys import exit
from termcolor import colored

from core.__version__ import __version__

from core.document import Document
from core.logger import create_logger
from core.requester import request_xml

if __name__ == "__main__":
parser = ArgumentParser(prog="RSS reader", description="Pure Python command-line RSS reader.")

Collaborator:
The utility works with the link from the technical specification; however, I tried running it on other RSS feeds, and it crashes on the sources I picked:

root@de9e5548d7b0:/PythonHomework# rss-reader https://news.tut.by/rss/geonews/brest.rss
'NoneType' object is not subscriptable
root@de9e5548d7b0:/PythonHomework# rss-reader https://www.spiegel.de/international/index.rss
'NoneType' object is not subscriptable
root@de9e5548d7b0:/PythonHomework#
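
A likely culprit, judging from the message, is `Content.from_html_tree`, which subscripts the result of `tree.find("img", alt=True)`; `find` returns None when the description contains no matching `<img>` tag, and subscripting None raises exactly this error. A minimal defensive sketch (an assumed failure point, not a verified fix):

# Hypothetical guard for Content.from_html_tree; assumes the crash comes from
# descriptions without an <img alt="..."> tag.
image_tag = tree.find("img", alt=True)
image = image_tag["alt"] if image_tag is not None else ""

Similar None checks around the `element.find(...)` calls in `Article.from_xml` would harden parsing against feeds that omit optional tags.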


parser.add_argument("source", type=str, help="RSS URL")
parser.add_argument("--version", action="version", help="Print version info", version=__version__)
parser.add_argument("--json", action="store_true", help="Print result as JSON in stdout")
parser.add_argument("--verbose", action="store_true", help="Outputs verbose status messages")
parser.add_argument("--limit", type=int, default=0, help="Limit news topics if this parameter provided")

args = parser.parse_args()
logger = create_logger(args.verbose)

try:
response = request_xml(args.source, logger)
document = Document.from_xml(response, args.limit, logger)
data = document.to_json() if args.json else str(document)
print(data)
except Exception as e:
logger.error(e)
print(colored(str(e), "red"))
exit(1)
Empty file added core/__init__.py
Empty file.
2 changes: 2 additions & 0 deletions core/__version__.py
@@ -0,0 +1,2 @@
__version_info__ = (1, 0, 0)
__version__ = ".".join(map(str, __version_info__))
210 changes: 210 additions & 0 deletions core/article.py
@@ -0,0 +1,210 @@
from bs4 import BeautifulSoup
import json
from logging import Logger
from typing import List, Optional
from xml.etree import ElementTree

from .content import Content
from .logger import default_logger


def find_links(tree: BeautifulSoup, logger: Optional[Logger] = default_logger) -> List[str]:
"""
Finds all links inside the given HTML tree.

This function looks for both anchor tag `href` attributes and image tag `src` attributes that start with the
"http" prefix (and are therefore working external links).

Params:
- tree (BeautifulSoup): BeautifulSoup instance containing the HTML to search for links in
- logger (Logger): Logger instance to use for logging inside the function (optional)

Returns:
- List of strings containing all the found links
"""

links = []

logger.info("Parsing anchor tags")

# Iterate through all anchor tags which have `href` attribute
for el in tree.find_all("a", href=True):
logger.info("Got anchor element: %s", el)
# Check that the href is an external HTTP/S link (not an element ID, an email link, etc.)
if el["href"].startswith("http"):
logger.info("Got HTTP/S link: %s", el["href"])
links.append(el["href"])

logger.info("Parsing image tags")

# Iterate through all image tags which have `src` attribute
for el in tree.find_all("img", src=True):
logger.info("Got image element: %s", el)
# Check that the src is an external HTTP/S link (not an element ID, an email link, etc.)
if el["src"].startswith("http"):
logger.info("Got HTTP/S link: %s", el["src"])
links.append(el["src"])

return links
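
# A doctest-style sketch of the expected behavior (hypothetical input, not part
# of the original module):
#
#   soup = BeautifulSoup('<a href="https://example.com">x</a>'
#                        '<img src="https://example.com/i.png"/>', "html.parser")
#   find_links(soup)  # -> ["https://example.com", "https://example.com/i.png"]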


def remove_duplicates(items: List, logger: Optional[Logger] = default_logger) -> List:
"""
Returns a copy of the passed list without duplicates.

This function exists because `list(set(items))` destroys the original item order. It iterates through the whole
list in sequence and adds items to a new list only if they have not been seen earlier, so the original order is
preserved.

Params:
- items (list): List to remove the duplicates from
- logger (Logger): Logger instance to use for logging inside the function (optional)

Returns:
- New list without duplicates
"""

result = []
seen_elements = set()

logger.info("Removing duplicate items")

for element in items:
# Only add the element to the resulting list if it has not been seen earlier
if element not in seen_elements:
result.append(element)
seen_elements.add(element)
else:
logger.info("Excluding duplicate item: %s", element)

return result
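
# For illustration (made-up values): remove_duplicates([3, 1, 3, 2, 1]) returns
# [3, 1, 2], whereas list(set(items)) makes no ordering guarantee.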


class Article:
"""
Represents the data of an article as defined by the RSS standard.
"""

@classmethod
def from_xml(cls, element: ElementTree.Element, logger: Optional[Logger] = default_logger):
"""
Constructs a new Article from an XML element.

Params:
- element (ElementTree.Element): XML element to parse the article from
- logger (Logger): Logger instance to use for logging inside the method (optional)

Returns:
- Article class instance which contains the parsed data
"""

logger.info("Parsing %s article data", str(element))

# Grab the article title element
title = element.find("title")
logger.info("Got article title: %s", title)

# Grab the article date (`pubDate` tag)
date = element.find("pubDate")
logger.info("Got article date: %s", date)

# Grab the article description
description = element.find("description")
logger.info("Got article description: %s", description)

# Grab the article public link
public_link = element.find("link")
logger.info("Got article public link: %s", public_link)

# Parse the description text to HTML tree
tree = BeautifulSoup(description.text, "html.parser")
logger.info("Parsed the HTML data")

# Parse the description to human-readable format (with images)
content = Content.from_html_tree(tree, logger)

# Concatenate the array containing public link with links found in the `description` element to get complete
# article links list
all_links = [public_link.text]
all_links.extend(find_links(tree, logger))

# Remove duplicate links (article description can also contain the public link)
all_links = remove_duplicates(all_links, logger)

return cls(title.text, date.text, public_link.text, content, all_links, logger)

def __init__(self, title: str, date: str, public_link: str, content: Content, links: List[str], logger: Logger):
"""
Constructs a new Article from parsed data.

It is not recommended to call this method directly with parsed XML; use the `Article.from_xml` class method instead.

Params:
- title (str): The article title
- date (str): Article creation date
- public_link (str): Article public link from which it can be accessed within the browser
- content (Content): Parsed article content (description)
- links (List[str]): List of all article links
- logger (Logger): Logger instance to use for logging inside the class

Returns:
- The constructed Article object
"""

self.title = title
self.date = date
self.public_link = public_link
self.content = content
self.links = links
self.logger = logger

def to_string(self) -> str:
"""
Returns the string representation of the article.
"""

self.logger.info("Converting article header to string")

title = f"Title: {self.title}\n"
date = f"Date: {self.date}\n"
public_link = f"Link: {self.public_link}\n"
content = f"\n{self.content}\n"

self.logger.info("Converting article links to string")

links_title = "\n\nLinks:"
links_list = ""

for i, link in enumerate(self.links):
links_list += f"\n[{i + 1}]: {link}"

return title + date + public_link + content + links_title + links_list

def __str__(self) -> str:
"""
Overrides the standard Python string conversion behavior so that `str(article_instance)` can be called directly
to receive the correct article string representation.
"""

return self.to_string()

def to_dict(self) -> dict:
"""
Returns the dict representation of the article data.
"""

return {
"title": self.title,
"date": self.date,
"public_link": self.public_link,
"content": self.content.to_dict(),
"links": self.links,
}

def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
"""
Returns the JSON representation of the article.
"""

self.logger.info("Converting article to JSON")
return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
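
For reference, a minimal sketch of how `Article.from_xml` is meant to be consumed. The `<item>` payload below is invented, and its description deliberately includes an `<img alt='...'>` tag, since `Content.from_html_tree` currently assumes one is present:

from xml.etree import ElementTree
from core.article import Article

# Invented feed item purely for illustration
item = ElementTree.fromstring(
    "<item>"
    "<title>Example title</title>"
    "<pubDate>Sun, 24 Nov 2019 10:00:00 +0000</pubDate>"
    "<link>https://example.com/news/1</link>"
    "<description>&lt;p&gt;Example text "
    "&lt;img src='https://example.com/i.png' alt='Photo'/&gt;&lt;/p&gt;</description>"
    "</item>"
)

article = Article.from_xml(item)
print(article.to_json())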
84 changes: 84 additions & 0 deletions core/content.py
@@ -0,0 +1,84 @@
from bs4 import BeautifulSoup
import json
from logging import Logger
from typing import Dict, List, Optional

from .logger import default_logger


class Content:
"""
Represents the content of an article as defined by the RSS standard.
"""

@classmethod
def from_html_tree(cls, tree: BeautifulSoup, logger: Optional[Logger] = default_logger):
"""
Constructs a new Content instance from a description HTML tree.

Params:
- tree (BeautifulSoup): HTML tree to parse the content from
- logger (Logger): Logger instance to use for logging inside the method (optional)

Returns:
- Content class instance which contains the parsed data
"""

# Grab the description text
description = tree.text
# Grab the content image `alt` attribute
image = tree.find("img", alt=True)["alt"]

return cls(description, image, logger)

def __init__(self, description: str, image: str, logger: Logger):
"""
Constructs a new Content from parsed data.

It is not recommended to call this method directly with parsed HTML; use the
`Content.from_html_tree` class method instead.

Params:
- description (str): The content description text
- image (str): Value of the `alt` attribute of the content image
- logger (Logger): Logger instance to use for logging inside the class

Returns:
- The constructed Content object
"""

self.description = description
self.image = image
self.logger = logger

def to_string(self) -> str:
"""
Returns the string representation of the content.
"""

self.logger.info("Converting article content to string")
return f"[image: {self.image}] {self.description}"

def __str__(self) -> str:
"""
Overrides the standard Python string conversion behavior so that `str(content_instance)` can be called directly
to receive the correct content string representation.
"""

return self.to_string()

def to_dict(self) -> Dict[str, str]:
"""
Returns the dict representation of the article content.
"""

data = {"description": self.description, "image": self.image}
return data

def to_json(self, indent: Optional[int] = 2, sort_keys: Optional[bool] = False) -> str:
"""
Returns the JSON representation of the article content.
"""

self.logger.info("Converting article content to JSON")
return json.dumps(self.to_dict(), indent=indent, sort_keys=sort_keys)
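
And a corresponding sketch for `Content` on its own, again with made-up values:

from core.content import Content
from core.logger import default_logger

# Made-up values purely for illustration
content = Content("Example text", "Photo", default_logger)
print(str(content))   # -> [image: Photo] Example text
print(content.to_json(sort_keys=True))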