Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
a23b9ba
initial commit
DiSonDS Nov 10, 2019
1501ada
feat: pretty-printing of rss
DiSonDS Nov 10, 2019
d12703c
feat: limit parameter for printing
DiSonDS Nov 10, 2019
c12a740
feat: json parameter for printing rss in json
DiSonDS Nov 10, 2019
82f498e
feat: verbose parameter for output verbose log messages
DiSonDS Nov 10, 2019
0bc60c0
Create README.md
DiSonDS Nov 10, 2019
0c18404
feat: python3 shebang
DiSonDS Nov 13, 2019
aeaad8a
feat: setup.py setup script
DiSonDS Nov 13, 2019
ae819c7
fix: html tags in pretty-printing of rss
DiSonDS Nov 13, 2019
736d777
chore: rename "rss-reader.py" to "rss_reader.py"
DiSonDS Nov 17, 2019
343405e
feat: custom exception class "RSSFeedException"
DiSonDS Nov 17, 2019
7536708
feat: package export CLI utility named "rss-reader"
DiSonDS Nov 17, 2019
682992c
fix: pretty-printing in json
DiSonDS Nov 17, 2019
098f40c
fix: cyrillic letters in pretty-printing of json
DiSonDS Nov 17, 2019
b481630
chore: version increase (0.2.0)
DiSonDS Nov 17, 2019
d5162bd
feat: date parameter for reading cached entries
DiSonDS Nov 17, 2019
eb7a297
chore: update code style according to "pep 8"
DiSonDS Nov 17, 2019
f02dd33
chore: version increase (0.3.0)
DiSonDS Nov 17, 2019
996f0b6
feat: colorize parameter for colorized output
DiSonDS Nov 21, 2019
58d6eec
feat: parameters for converting to html/pdf; cache raw_entries
DiSonDS Nov 28, 2019
3c0dc41
feat: replaced package for generating pdf
DiSonDS Nov 29, 2019
8bbfcfd
chore: version increase (0.5.0)
DiSonDS Nov 30, 2019
c1e850d
feat: --to-epub parameter for convertation in epub
DiSonDS Nov 30, 2019
62bbcfa
fix: requirements (setup.py)
DiSonDS Dec 1, 2019
f886394
fix: README.md layout
DiSonDS Dec 1, 2019
7c9bb90
feat: photos , links in rss feed pretty-printing
DiSonDS Dec 1, 2019
bea5142
fix: converter crash when image "src" is None
DiSonDS Dec 1, 2019
8da41ee
feat: unittest (test/test_converter)
DiSonDS Dec 1, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,9 @@ venv.bak/

# mypy
.mypy_cache/

# idea
.idea

# cached rss
/cache
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# PythonHomework
[Introduction to Python] Homework Repository for EPAM courses

## How to use
1. install git `apt-get install git` (necessary to get a package with git)
2. `pip3 install .`
3. `rss-reader "https://www.androidpolice.com/feed/" --limit 3 --json --verbose --date`

## Important information
"--to-pdf" conversion is unstable

## Parameters
- **--help** (help text)
- **--json** (print rss feed in json format)
- **--verbose** (print verbose log messages)
- **--limit** (limit printed entries)
- **--date** (print cached entries if they exist)
- **--to-html** (convert rss feed to html document)
- **--to-epub** (convert rss feed to epub document)
- **--to-pdf** (convert rss feed to pdf document)
- **--colorize** (colorize output)

## JSON structure
`{"feed": "rss_title", "entries": [{"title": "title", "date": "date", "link": "link", "summary": "summary", "photos": [...], "links": [...]}, ...]}`

## Storage
Used [Pickle](https://docs.python.org/3/library/pickle.html) for storage

Entries are cached in `cache/date/domain.rss`
- cache - name of cache folder, default "cache"
- date - script execution date
- domain - domain of rss feed

Example: `cache/20191117/www.androidpolice.com.rss`

## Conversion

Examples:
- `--to-html folder_name` will create "out.html" and "images" folder in folder_name
- `--to-epub folder_name` will create "out.epub" in folder_name
- `--to-pdf folder_name` will create "out.pdf" in folder_name (*UNSTABLE*)

## TODO
- [x] [Iteration 1] One-shot command-line RSS reader.
- [x] [Iteration 2] Distribution
- [x] [Iteration 3] News caching
- [x] [Iteration 4] Format converter
- [x] * [Iteration 5] Output colorization
- [ ] * [Iteration 6] Web-server
7 changes: 7 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
feedparser>=6.0.0b1 # rss parsing
requests # http requests
bs4 # for xml and html
colorama # colored output https://pypi.org/project/colorama/
jinja2 # for generating html
git+https://github.com/xhtml2pdf/xhtml2pdf.git
ebooklib
Empty file added rss_reader/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions rss_reader/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env python

"""Package entry point."""


from rss_reader.rss_reader import main


# Run the CLI only when the package is executed directly
# (e.g. ``python -m rss_reader``), never on import.
if __name__ == "__main__":  # pragma: no cover
    main()
262 changes: 262 additions & 0 deletions rss_reader/converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
#!/usr/bin/env python3

"""
Convert RSS feed to HTML/PDF
"""
import copy
import logging
import shutil
import random
from pathlib import Path

import requests
from xhtml2pdf import pisa
from jinja2 import Template
from bs4 import BeautifulSoup
from ebooklib import epub

from rss_reader.exceptions import RSSFeedException


class Converter:
    """ Class for conversion of an RSS feed to HTML, PDF or EPUB

    Attributes:
        title (str): Title of RSS feed
        entries (list): List of RSS news; each entry is read through the
            attributes ``title``, ``published``, ``link`` and ``summary``
        out_dir (str): Directory where output will be saved
        image_dir (str): Sub-directory of out_dir holding images referenced
            by the HTML/EPUB output
        temp_image_dir (str): Sub-directory of out_dir holding images that are
            downloaded only for PDF rendering and removed afterwards
        font_path (Path): Path to the bundled Roboto font embedded in the
            HTML that is rendered to PDF
    """

    # Seconds to wait for an image server; without a timeout a stalled
    # server would block the conversion forever.
    DOWNLOAD_TIMEOUT = 30

    # Name used when a URL has no usable last path segment.
    DEFAULT_IMAGE_NAME = 'image'

    def __init__(self, title, entries, out_dir="out", image_dir="images", temp_image_dir="_temp_images"):
        self.title = title
        self.entries = entries
        self.out_dir = out_dir

        self.image_dir = image_dir
        self.temp_image_dir = temp_image_dir

        self.font_path = Path(__file__).resolve().parent / 'fonts/Roboto-Regular.ttf'

    def _create_directories(self, image_dir):
        """ Create directories if not exist (self.out_dir and image_dir)

        Args:
            image_dir (Path): Directory to create in addition to self.out_dir

        Raises:
            OSError: If a directory can not be created
        """
        if not Path(self.out_dir).is_dir():
            logging.info("Creating directory /%s", Path(self.out_dir))
            Path(self.out_dir).mkdir(parents=True, exist_ok=True)

        if not image_dir.is_dir():
            logging.info("Creating directory /%s", image_dir)
            image_dir.mkdir(parents=True, exist_ok=True)

    def _prepare_directory(self, directory):
        """ Make sure self.out_dir and directory exist

        Args:
            directory (Path): Directory that is about to be written to

        Raises:
            RSSFeedException: If a directory can not be created
        """
        try:
            self._create_directories(directory)
        except OSError as err:
            raise RSSFeedException(message="Can not create directory") from err

    @staticmethod
    def _placeholder_path():
        """ Return the path of the placeholder image shipped with the package """
        return Path(__file__).resolve().parent / 'placeholder/placeholder.jpg'

    @classmethod
    def _filename_from_url(cls, url):
        """ Derive a local file name from an image URL

        Only the last segment of the URL path is used, so a query string or
        fragment ("photo.jpg?w=100#x") never ends up in the file name.

        Args:
            url (str): Image URL

        Returns:
            str: File name, or DEFAULT_IMAGE_NAME if the path has no last segment
        """
        from urllib.parse import urlsplit

        filename = urlsplit(url).path.rstrip('/').rsplit('/', 1)[-1]
        return filename or cls.DEFAULT_IMAGE_NAME

    def _download_image(self, url, image_dir):
        """ Download image in self.out_dir/image_dir

        Args:
            url (str): Image URL
            image_dir (str): Sub-directory of self.out_dir to save the image in

        Returns:
            filename: image name

        Raises:
            RSSFeedException: If the target directory can not be created
            requests.RequestException: If the request fails or times out
        """
        logging.info("Starting image download")

        target_dir = Path(self.out_dir) / image_dir
        self._prepare_directory(target_dir)

        filename = self._filename_from_url(url)
        response = requests.get(url, allow_redirects=True, timeout=self.DOWNLOAD_TIMEOUT)

        with open(target_dir / filename, 'wb') as handler:
            handler.write(response.content)

        return filename

    def _replace_urls_to_local_path(self, entry):
        """ Replace img URLs in entry.summary to local file path

        Every <img> is processed; an image without a usable "src" is pointed
        at a copy of the bundled placeholder instead of being downloaded.

        Args:
            entry: News entry whose ``summary`` attribute holds HTML

        Returns:
            The same entry, with ``summary`` rewritten if it contained images
        """
        soup = BeautifulSoup(entry.summary, "html.parser")
        images = soup.find_all('img')

        # leave the summary untouched when there is nothing to rewrite
        if not images:
            return entry

        for img in images:
            # .get() instead of [] so a missing attribute does not raise
            if img.get('src'):
                filename = self._download_image(img['src'], self.image_dir)
            else:
                # copy placeholder to self.out_dir/self.image_dir
                filename = 'placeholder.jpg'
                target_dir = Path(self.out_dir) / self.image_dir
                self._prepare_directory(target_dir)
                shutil.copyfile(self._placeholder_path(), target_dir / filename)

            img['src'] = str(Path(self.image_dir) / filename)

        entry.summary = str(soup)
        return entry

    def _replace_urls_to_absolute_path(self, entry):
        """ Replace img URLs in entry.summary to local absolute file path

        Special for xhtml2pdf (xhtml2pdf support only absolute file path)

        Every <img> is processed; an image without a usable "src" is pointed
        at the bundled placeholder instead of being downloaded.

        Args:
            entry: News entry whose ``summary`` attribute holds HTML

        Returns:
            The same entry, with ``summary`` rewritten if it contained images
        """
        soup = BeautifulSoup(entry.summary, "html.parser")
        images = soup.find_all('img')

        # leave the summary untouched when there is nothing to rewrite
        if not images:
            return entry

        for img in images:
            # .get() instead of [] so a missing attribute does not raise
            if img.get('src'):
                filename = self._download_image(img['src'], self.temp_image_dir)
                image_path = (Path(self.out_dir) / self.temp_image_dir / filename).absolute()
            else:
                image_path = self._placeholder_path().absolute()

            img['src'] = str(image_path)

        entry.summary = str(soup)
        return entry

    def _generate_html(self, is_cyrillic_font=False, is_absolute_path=False):
        """ Generate HTML

        Images referenced by the entries are downloaded as a side effect.
        self.entries is not modified: the rewriting is done on a deep copy.

        Args:
            is_cyrillic_font (bool) Should we generate HTML with cyrillic_font (to convert to PDF)?
            is_absolute_path (bool): Should we generate HTML with absolute image PATH (to convert to PDF)?

        Returns:
            html: String with HTML code
        """
        # NOTE(review): "src: url(...), ;" in @font-face has a stray comma;
        # confirm how xhtml2pdf parses it before changing the template.
        template = '''<html>
<head>
<meta charset="utf-8">
<title>{{title}}</title>

<style type=text/css>
{% if is_cyrillic_font %}
@font-face { font-family: Roboto; src: url({{font_path}}), ; }
{% endif %}
body{
font-family: Roboto;
}
div
{
{% if is_cyrillic_font %}
margin: 2px;
font-size: 15px;
{% else %}
margin: 20px;
font-size: 18px;
{% endif %}
}
</style>
</head>
<body>
{% for entry in entries %}
<div class='entry'>
<h2 class='title'>{{entry.title}}</h2>
<p><span class='date'>{{entry.published}}</span></p>
<p><a class='link' href='{{entry.link}}'>{{entry.link}}</a></p>
<div class='description'>{{entry.summary}}</div>
</div>
{% endfor %}
</body>
</html>'''

        # replacing image url to downloaded image path
        if is_absolute_path:
            replace_urls = self._replace_urls_to_absolute_path
        else:
            replace_urls = self._replace_urls_to_local_path
        entries = [replace_urls(entry) for entry in copy.deepcopy(self.entries)]

        return Template(template).render(title=self.title, entries=entries,
                                         is_cyrillic_font=is_cyrillic_font, font_path=self.font_path)

    def entries_to_html(self):
        """ Generate HTML file ("out.html") in self.out_dir

        Raises:
            RSSFeedException: If the output directory can not be created
        """
        html = self._generate_html()

        # a feed without images never triggers directory creation
        self._prepare_directory(Path(self.out_dir))

        # the template declares charset utf-8, so write the file as utf-8
        with open(Path(self.out_dir) / 'out.html', 'w', encoding='utf-8') as file_object:
            file_object.write(html)

    def entries_to_pdf(self):
        """ Generate PDF file ("out.pdf") in self.out_dir

        Raises:
            RSSFeedException: If the output directory can not be created or
                xhtml2pdf reports an error
        """
        temp_img_dir = Path(self.out_dir) / self.temp_image_dir

        try:
            html = self._generate_html(is_cyrillic_font=True, is_absolute_path=True)
            self._prepare_directory(Path(self.out_dir))

            with open(Path(self.out_dir) / 'out.pdf', 'w+b') as file:
                pdf = pisa.CreatePDF(html, dest=file, encoding='UTF-8')
        finally:
            # Delete temp folder (self.out_dir/self.temp_image_dir); it only
            # exists when at least one image was downloaded
            if temp_img_dir.is_dir():
                logging.info("Cleaning up %s", temp_img_dir)
                shutil.rmtree(temp_img_dir)

        if pdf.err:
            raise RSSFeedException(message="Error during PDF generation")

    def _add_images_to_book(self, book, html):
        """ Add every distinct local image referenced by html to book

        Args:
            book (epub.EpubBook): Book to add the images to
            html (str): Chapter HTML whose <img> "src" values are paths
                relative to self.out_dir
        """
        soup = BeautifulSoup(html, "html.parser")

        added_images = set()
        for img in soup.find_all('img'):
            image_url = img.get('src')
            # Images can repeat, check
            if not image_url or image_url in added_images:
                continue
            added_images.add(image_url)

            with open(Path(self.out_dir) / image_url, 'rb') as file_object:
                epimg = epub.EpubImage()
                epimg.file_name = image_url
                epimg.set_content(file_object.read())

            book.add_item(epimg)

    def entries_to_epub(self):
        """ Generate EPUB file ("out.epub") in self.out_dir

        Raises:
            RSSFeedException: If the output directory can not be created
        """
        html = self._generate_html()

        book = epub.EpubBook()

        # set metadata
        book.set_identifier(f'id{random.randint(100000, 999999)}')
        book.set_title(self.title)
        book.set_language('en, ru')
        book.add_author('rss-reader')

        # create chapter
        chapter = epub.EpubHtml(title='Intro', file_name='chap_01.xhtml', lang='en, ru')
        chapter.content = html
        # add images
        self._add_images_to_book(book, html)
        # add chapter
        book.add_item(chapter)

        # define Table Of Contents
        book.toc = (epub.Link('chap_01.xhtml', 'Introduction', 'intro'),
                    (epub.Section(self.title),
                     (chapter,))
                    )
        # add default NCX and Nav file
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        # define CSS style
        style = 'BODY {color: white;}'
        nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
        # add CSS file
        book.add_item(nav_css)
        # basic spine
        book.spine = ['nav', chapter]

        # a feed without images never triggers directory creation
        self._prepare_directory(Path(self.out_dir))

        # write to the file
        epub.write_epub(Path(self.out_dir) / 'out.epub', book, {})
13 changes: 13 additions & 0 deletions rss_reader/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python3

"""
Exceptions for rss-reader
"""


class RSSFeedException(Exception):
    """ Error raised by rss-reader when a feed can not be processed

    Attributes:
        message (str): Human-readable description of the failure; it is also
            passed to Exception, so str(error) returns the same text
    """

    def __init__(self, message):
        self.message = message
        super().__init__(message)
Binary file added rss_reader/fonts/Roboto-Regular.ttf
Binary file not shown.
Binary file added rss_reader/placeholder/placeholder.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading