diff --git a/.gitignore b/.gitignore
index 894a44c..903c535 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,8 @@ venv.bak/
 # mypy
 .mypy_cache/
+*.html
+data/
+*.pdf
+*.jpg
+loglist.txt
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ef18989
--- /dev/null
+++ b/README.md
@@ -0,0 +1,29 @@
+Cached news is written to the file data\datafeed.txt and read back line by line.
+
+HTML scheme of a converted news item:
+
+    <html>
+    <head>
+        <title>TITLE</title>
+    </head>
+    <body>
+    <p>Title: TITLE</p>
+    <p>Date: DATE</p>
+    <p>Link: LINK</p>
+    <p>Feed: FEED</p>
+    <img src="IMAGES LINKS">
+    </body>
+    </html>
+
+## PDF convert:
+- can't convert Cyrillic symbols
+- can't convert symbols from "bad" fonts
+- can't print images in unsupported formats
+
+If a feed item runs into one of these exceptions, the reader simply skips it and does not write that item to the PDF.
+
+## About parser
+Checked on Yahoo, NASA and BBC feeds.
+Works well on any RSS 2.0 feed, but RSS 1.0 uses a different format that this reader does not support.
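+
+## Usage
+Example invocations (the URL and output path below are illustrative, not defaults):
+
+    # print the two latest items of a feed as JSON
+    python -m rssreader https://news.yahoo.com/rss --json --limit 2
+
+    # re-read cached news for 25 Nov 2019 and also save them as PDF
+    python -m rssreader https://news.yahoo.com/rss --date 20191125 --pdf pdfdir\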

diff --git a/jsonStandart.md b/jsonStandart.md
new file mode 100644
index 0000000..b770ed1
--- /dev/null
+++ b/jsonStandart.md
@@ -0,0 +1,25 @@
+    {"item": {
+        "link":   // URL of the feed item
+        "body": {
+            "title":   // title text
+            "date":    // publication date
+            "images":  // image links
+            "feed":    // feed text
+        }
+    }}
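+
+For illustration, one item as actually printed with `--json` (field values here are made up):
+
+    {
+        "item": {
+            "link": "https://example.com/news/1",
+            "body": {
+                "title": "Example title",
+                "date": "Mon, 25 Nov 2019 10:00:00 GMT",
+                "images": "['https://example.com/img.jpg']",
+                "feed": "Example feed text"
+            }
+        }
+    }
+
+Note that `images` is one string holding a Python-style list of links, because the reader stores the whole links list as a single string.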
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..328d144
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4==4.8.1
+bs4==0.0.1
+fpdf==1.7.2
+logger==1.4
+lxml==4.4.1
+urllib3==1.25.7
+argparse==1.4.0
\ No newline at end of file
diff --git a/rssreader/__init__.py b/rssreader/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rssreader/__main__.py b/rssreader/__main__.py
new file mode 100644
index 0000000..6304bf9
--- /dev/null
+++ b/rssreader/__main__.py
@@ -0,0 +1,2 @@
+from rssreader.rssreader import main
+main()
diff --git a/rssreader/rssreader.py b/rssreader/rssreader.py
new file mode 100644
index 0000000..161d25d
--- /dev/null
+++ b/rssreader/rssreader.py
@@ -0,0 +1,291 @@
+import argparse
+import logging
+import urllib3
+from bs4 import BeautifulSoup
+import urllib.request
+import sys
+import json
+from fpdf import FPDF
+import os
+
+
+def argsparsing():
+    """Create and check command line arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("source", help="RSS URL", type=str)
+    parser.add_argument("--version", action='version', version='%(prog)s v 2.0', help="Print version info")
+    parser.add_argument("--json", help="Print result as JSON in stdout", action="store_true")
+    parser.add_argument("--verbose", help="Output verbose status messages", action="store_true")
+    parser.add_argument("--limit", type=int, help="Limit news topics if this parameter is provided")
+    parser.add_argument("--date", type=int, help="Read cached news by date in the format YYYYMMDD")
+    parser.add_argument('--html', type=str, help="Convert news to html and save as .html files. Path in format smth\\")
+    parser.add_argument('--pdf', type=str, help="Convert news to pdf and save as .pdf files. Path in format smth\\")
+    return parser.parse_args()
+
+
+def making_log(operation, message, file='loglist.log'):
+    """Do one of two operations: write message to the log if operation is 1, print the log if 0."""
+    if bool(operation):
+        logging.basicConfig(filename=file, format='%(name)s - %(levelname)s - %(message)s-%(asctime)s',
+                            level=logging.INFO)
+        logging.info(message)
+    else:
+        with open(file, 'r') as log:
+            print(log.read())
+
+
+def spliting_items(lst, pdf, tag):
+    """Split a long string into ~120-character chunks and write them as pdf cells."""
+    try:
+        line_list = ''
+        split_list = str(tag + str(lst)).split(" ")
+        for index in range(len(split_list)):
+            if len(str(line_list)) < 120:
+                line_list = line_list + " %s" % str(split_list[index])
+            else:
+                pdf.cell(250, 10, line_list, ln=1, align="C")
+                line_list = ''
+        pdf.cell(250, 10, line_list, ln=1, align="C")
+    except Exception:
+        making_log(1, "Can't save part of a feed to pdf.")
+        print("Can't save news as pdf ;(")
+
+
+class NewsRss:
+    """Class with all parts of the rss news and methods to work with them."""
+    def __init__(self):
+        self.arguments = argsparsing()
+        self.title = []
+        self.pubDate = []
+        self.link = []
+        self.desc = []
+        self.links = []
+        self.datalist = []
+
+    def feed_find(self):
+        """Find rss news by url and save them to memory."""
+        try:
+            urllib.request.urlopen(self.arguments.source)
+        except Exception:
+            print("Error. URL is incorrect")
+            sys.exit(1)
+        soup = BeautifulSoup(urllib.request.urlopen(self.arguments.source), "xml")
+        making_log(1, "Opened URL for news reading, URL: %s" % self.arguments.source)
+        try:
+            items = soup.find_all("item")
+        except Exception:
+            print("Error. Can't find <item> tags in URL. Try to use another URL for RSS parsing.")
+            sys.exit(1)
+        making_log(1, "Found all <item> tags in feed.")
+        making_log(1, "Limit is: (%s)" % (str(self.arguments.limit)))
+        for cout, feed in enumerate(items):
+            if cout == self.arguments.limit:
+                # stop once the requested number of items has been collected
+                break
+            making_log(1, "Opened feed on %s link." % feed.link.text)
+            strmedia = str(feed.find_all("media:content"))
+            llink = []
+            # collect every url="..." attribute of the media tags
+            start = 0
+            for i in range(strmedia.count('url="')):
+                start = strmedia.find('url="', start) + 5
+                llink.append(strmedia[start:strmedia.find('"', start)])
+            self.links.append(str(llink))
+            self.link.append(feed.link.text)
+            # unescape the html entities left in the description and title text
+            tempstring = str(feed.description.text).replace("&#39;", "'").replace("&quot;", "'")
+            self.title.append(str(feed.title.text).replace("&#39;", "'").replace("&quot;", "'"))
+            self.pubDate.append(feed.pubDate.text)
+            # keep only the plain text between the closing </a> tag and the next tag
+            self.desc.append(tempstring[(tempstring.find('a>') + 2):tempstring.find('<', tempstring.find('a>') + 2)])
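+    # Illustration of the media link extraction above (hypothetical input):
+    # for strmedia == '[<media:content url="https://e.com/a.jpg"/>]' the loop
+    # collects 'https://e.com/a.jpg', and self.links then stores the whole
+    # list as one string: "['https://e.com/a.jpg']".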

+    def convert_to_html(self):
+        """Convert news to html files, one file per item, named after the title."""
+        for index in range(len(self.title)):
+            try:
+                string = str(self.title[index])[:-2].replace("?", " ").replace(":", " ")
+                filename = "%s%s.html" % (str(self.arguments.html), string)
+                image = str(self.links[index])[2:-2]
+                htmltext = """<html>
+<head>
+    <title>%s</title>
+</head>
+<body>
+<p>Title: %s</p>
+<p>Date: %s</p>
+<p>Link: %s</p>
+<p>Feed: %s</p>
+<img src="%s">
+</body>
+</html>""" % (str(self.title[index]), str(self.title[index]), str(self.pubDate[index]), str(self.link[index]), str(self.desc[index]), image)
+                with open(filename, "w") as fp:
+                    fp.write(htmltext)
+            except Exception:
+                making_log(1, "Error. Some news were not converted to html.")
+            else:
+                making_log(1, "All news converted to html.(all_goods)")
+

+    def convert_to_pdf(self):
+        """Convert news to pdf files, one file per item, named after the title."""
+        for index in range(len(self.title)):
+            try:
+                string = str(self.title[index])[:-2].replace("?", " ")
+                string = string.replace(":", " ")
+                filename = "%s%s.pdf" % (str(self.arguments.pdf), string)
+                http = urllib3.PoolManager()
+                pdf = FPDF(orientation="L")
+                pdf.add_page()
+                pdf.set_font("Arial", size=12)
+                spliting_items(self.title[index], pdf, "Title: ")
+                spliting_items(self.pubDate[index], pdf, "Date: ")
+                spliting_items(self.link[index], pdf, "Link: ")
+                spliting_items(self.desc[index], pdf, "Feed: ")
+                image = str(self.links[index])[2:-2]
+                # download the image and embed it into the pdf
+                r = http.request('GET', image)
+                fileimage = "%s%s.jpg" % (str(self.arguments.pdf), string)
+                with open(fileimage, "w+b") as fp:
+                    fp.write(r.data)
+                try:
+                    pdf.image(fileimage, w=50)
+                except Exception:
+                    making_log(1, "Feed with index %s has a bad img format." % index)
+                pdf.output(filename)
+            except Exception:
+                making_log(1, "Feed with index %s can't be converted to pdf." % index)
+
+    def print_news(self):
+        """Print news to stdout."""
+        making_log(1, "Printing news to stdout.")
+        try:
+            for index in range(len(self.title)):
+                if self.arguments.json:
+                    print(json.dumps({"item": {"link": self.link[index],
+                                               "body": {"title": self.title[index],
+                                                        "date": self.pubDate[index],
+                                                        "images": self.links[index],
+                                                        "feed": self.desc[index]}}}, indent=4))
+                    print("\n\n\n")
+                else:
+                    print("Title: " + self.title[index],
+                          "\nDate: " + self.pubDate[index],
+                          "\nLink: " + self.link[index])
+                    print("Feed: " + self.desc[index])
+                    if self.links[index]:
+                        print("Images: \n" + self.links[index])
+                    print("\n\n\n")
+        except Exception:
+            print("Error. Can't print news. Something went wrong ;(")
+            making_log(1, "Error. Can't print news. Something went wrong ;(")
+        else:
+            making_log(1, "All news were printed.(all_goods)")
+
+    def date_check(self):
+        """Check the date argument length (expected format YYYYMMDD)."""
+        if len(str(self.arguments.date)) != 8:
+            print("Error in date input")
+            return False
+        return True
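+    # Cache file layout (one item = five consecutive lines):
+    #   1) publication date  2) title  3) link  4) description  5) image links
+    # fileread() below relies on this fixed order when scanning data\datafeed.txt.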
+    def filewrite(self):
+        """Write news to the cache .txt file."""
+        making_log(1, "Writing news to file. News saved in data\\datafeed.txt")
+        for index in range(len(self.title)):
+            with open("data\\datafeed.txt", "a") as fp:
+                try:
+                    fp.write(str(self.pubDate[index]))
+                    fp.write("\n")
+                    fp.write(str(self.title[index]))
+                    fp.write("\n")
+                    fp.write(str(self.link[index]))
+                    fp.write("\n")
+                    fp.write(str(self.desc[index]))
+                    fp.write("\n")
+                    fp.write(str(self.links[index]))
+                    fp.write("\n")
+                except Exception:
+                    making_log(1, "Error. Can't write feed with index=%s to file." % index)
+                else:
+                    making_log(1, "Feed written to file.(all_goods)")
+
+    def fileread(self):
+        """Read news from the cache .txt file, filtered by the --date argument."""
+        months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
+                  'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
+        with open("data\\datafeed.txt", "r") as fp:
+            flag = True
+            for line in fp:
+                # pull day, month name and year out of a date line
+                # such as "Mon, 25 Nov 2019 10:00:00 GMT"
+                day = line[(line.find(", ") + 2):line.find(" ", line.find(", ") + 2)]
+                month1 = line[line.find(" ", line.find(", ") + 2):line.find(" ", line.find(", ") + 5)]
+                month1 = month1[1:]
+                year = line[(line.rfind(month1) + 4):(line.rfind(month1) + 8)]
+                cachedate = year + months.get(month1, month1) + day
+                if str(cachedate) == str(self.arguments.date):
+                    linefortitlecheck = line
+                    self.pubDate.append(linefortitlecheck)
+                    checkline = fp.readline()
+                    control = False
+                    for cout in range(len(self.title)):
+                        if checkline == self.title[cout]:
+                            # duplicate item, drop the date that was just stored
+                            control = True
+                            self.pubDate.remove(linefortitlecheck)
+                            break
+                    if not control:
+                        flag = False
+                        self.title.append(checkline)
+                        self.link.append(fp.readline())
+                        self.desc.append(fp.readline())
+                        self.links.append(fp.readline())
+        if flag:
+            print("No news on this date :(")
+
+    def create_dir(self):
+        """Check that the output directories exist and create them if not."""
+        if self.arguments.pdf:
+            if not os.path.exists(self.arguments.pdf):
+                os.mkdir(self.arguments.pdf)
+        if self.arguments.html:
+            if not os.path.exists(self.arguments.html):
+                os.mkdir(self.arguments.html)
+        if not os.path.exists("data"):
+            os.mkdir("data")
+
+
+def main():
+    """Main func."""
+    if not os.path.exists("data"):
+        os.mkdir("data")
+    news = NewsRss()
+    if news.arguments.date:
+        if news.date_check():
+            news.fileread()
+            news.print_news()
+            if news.arguments.html:
+                news.create_dir()
+                news.convert_to_html()
+            if news.arguments.pdf:
+                news.create_dir()
+                news.convert_to_pdf()
+            if news.arguments.verbose:
+                making_log(0, '')
+    else:
+        news.feed_find()
+        news.print_news()
+        news.filewrite()
+        if news.arguments.pdf:
+            news.create_dir()
+            news.convert_to_pdf()
+        if news.arguments.html:
+            news.create_dir()
+            news.convert_to_html()
+        if news.arguments.verbose:
+            making_log(0, '')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..104ca43
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,26 @@
+from setuptools import setup
+
+
+setup(
+    name="rssreader",
+    version="1.3",
+    description="Utility to read rss",
+    long_description="CLI utility for rss reading",
+    author="Vladislav Bakhmat",
+    author_email="uservice589@gmail.com",
+    packages=["rssreader"],
+    install_requires=[
+        "argparse==1.4.0",
+        "bs4==0.0.1",
+        "urllib3==1.25.7",
+        "logger==1.4",
+        "feedparser==5.2.1",
+        "fpdf==1.7.2",
+        "lxml==4.4.1"
+    ],
+    python_requires='>=3.7',
+    entry_points={
+        'console_scripts':
+            ['rss-reader = rssreader.rssreader:main']
+    }
+)
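+
+# Illustrative local install and run of the console script (not part of the package):
+#   pip install .
+#   rss-reader https://news.yahoo.com/rss --limit 1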