5 changes: 1 addition & 4 deletions readme.md
@@ -8,14 +8,11 @@ fresh and interesting. The main idea is that the script downloads
any JPEG- or PNG-formatted image it finds listed in the specified
subreddit and saves it to a folder.


# Requirements:

* Python 2 (Python3 might be supported over 2to3, but see for
yourself and report back).
* Python 3
* Optional requirements: listed in setup.py under extras_require.


# Usage:

See `./redditdl.py --help` for up-to-date details.
2 changes: 1 addition & 1 deletion redditdownload/__init__.py
@@ -1 +1 @@
from redditdownload import *
from .redditdownload import *
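Python 3 dropped implicit relative imports (PEP 328), which is why the package `__init__` needs the explicit dot here. A minimal sketch of the distinction, assuming the layout stays `redditdownload/redditdownload.py`:

```python
# redditdownload/__init__.py
# On Python 2, "from redditdownload import *" silently resolved to the sibling
# module inside the package. On Python 3 it is treated as an absolute import,
# so the submodule has to be named relative to the package:
from .redditdownload import *
```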
2 changes: 1 addition & 1 deletion redditdownload/deviantart.py
@@ -1,6 +1,6 @@
"""module to parse deviantart page."""
try: # py2
from urllib2 import urlopen
from urllib.request import urlopen
except ImportError: # py3
from urllib.request import urlopen

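Note that after this change both branches of the try/except import the same name, so the py2 fallback is now dead code. If Python 2 support is being dropped entirely (as the readme change suggests), the whole guard can go; a minimal sketch:

```python
"""module to parse deviantart page."""
from urllib.request import urlopen  # Python 3 only; no py2 fallback needed
```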
16 changes: 8 additions & 8 deletions redditdownload/gfycat.py
@@ -23,14 +23,14 @@ def __init__(self):
super(gfycat, self).__init__()

def __fetch(self, url, param):
import urllib2
import urllib.request, urllib.error, urllib.parse
import json
try:
# added a simple User-Agent string to avoid CloudFlare blocking this request
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(url+param, None, headers)
connection = urllib2.urlopen(req).read()
except urllib2.HTTPError, err:
req = urllib.request.Request(url+param, None, headers)
connection = urllib.request.urlopen(req).read()
except urllib.error.HTTPError as err:
raise ValueError(err.read())
result = namedtuple("result", "raw json")
return result(raw=connection, json=json.loads(connection))
@@ -117,22 +117,22 @@ def get(self, what):
return ("Sorry, can't find %s" % error)

def download(self, location):
import urllib2
import urllib.request, urllib.error, urllib.parse
if not location.endswith(".mp4"):
location = location + self.get("gfyName") + ".mp4"
try:
# added a simple User-Agent string to avoid CloudFlare blocking this request
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(self.get("mp4Url"), None, headers)
file = urllib2.urlopen(req)
req = urllib.request.Request(self.get("mp4Url"), None, headers)
file = urllib.request.urlopen(req)
# make sure that the status code is 200, and the content type is mp4
if int(file.code) != 200 or file.headers["content-type"] != "video/mp4":
raise ValueError("Problem downloading the file. Status code is %s or the content-type %s is not video/mp4"
% (file.code, file.headers["content-type"]))
data = file.read()
with open(location, "wb") as mp4:
mp4.write(data)
except urllib2.HTTPError, err:
except urllib.error.HTTPError as err:
raise ValueError(err.read())

def formated(self, ignoreNull=False):
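Both hunks above follow the same Python 3 pattern: build a `urllib.request.Request` with a browser-like User-Agent, then catch `urllib.error.HTTPError`. A standalone sketch of that pattern, using a hypothetical URL rather than the module's own endpoints:

```python
import json
import urllib.error
import urllib.request


def fetch_json(url):
    """Fetch a URL with a browser-like User-Agent and decode the JSON body."""
    headers = {'User-Agent': 'Mozilla/5.0'}  # keeps CloudFlare from rejecting bare urllib requests
    req = urllib.request.Request(url, None, headers)
    try:
        raw = urllib.request.urlopen(req).read()  # bytes on Python 3
    except urllib.error.HTTPError as err:
        raise ValueError(err.read())  # err.read() is the error body, as bytes
    return json.loads(raw)  # json.loads accepts bytes on Python 3.6+


# fetch_json('https://example.com/api/gfycats/some-id')  # hypothetical endpoint
```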
22 changes: 11 additions & 11 deletions redditdownload/img_scrap_stuff.py
@@ -10,11 +10,11 @@
import re
import json
import logging
import urlparse
import urllib.parse
import traceback

from PIL import Image
from cStringIO import StringIO
from io import StringIO
import lxml
import html5lib # Heavily recommended for bs4 (apparently)
import bs4
@@ -52,7 +52,7 @@ def indexall_re(topstr, substr_re):
def walker(text, opening='{', closing='}'):
""" A near-useless experiment that was intended for `get_all_objects` """
stack = []
for pos in xrange(len(text)):
for pos in range(len(text)):
if text[pos:pos + len(opening)] == opening:
stack.append(pos)
continue
@@ -88,7 +88,7 @@ def get_all_objects(text, beginning=r'{', debug=False):
"""

def _dbg_actual(st, *ar):
print "D: ", st % ar
print("D: ", st % ar)

_dbg = _dbg_actual if debug else (lambda *ar: None)

@@ -106,9 +106,9 @@ def __getitem__(self, key):
class TheLoader(yaml.SafeLoader):
ESCAPE_REPLACEMENTS = ddd(yaml.SafeLoader.ESCAPE_REPLACEMENTS)

from cStringIO import StringIO
from io import StringIO
# optimised slicing
if isinstance(text, unicode):
if isinstance(text, str):
_dbg("encoding")
text = text.encode('utf-8')
_dbg("Length: %r", len(text))
@@ -214,13 +214,13 @@ def get_get_get(url, **kwa):

def get_get(*ar, **kwa):
retries = kwa.pop('_xretries', 5)
for retry in xrange(retries):
for retry in range(retries):
try:
return get_get_get(*ar, **kwa)
except Exception as exc:
traceback.print_exc()
ee = exc
print "On retry #%r (%s)" % (retry, repr(exc)[:30])
print("On retry #%r (%s)" % (retry, repr(exc)[:30]))
raise GetError(ee)


@@ -244,7 +244,7 @@ def get(url, cache_file=None, req_params=None, bs=True, response=False, undecode
for chunk in resp.iter_content(chunk_size=16384):
data += chunk
if len(data) > _max_len:
print "Too large"
print("Too large")
break
data = bytes(data) ## Have to, alas.
data_bytes = data
@@ -274,7 +274,7 @@ def _filter(l):


def _url_abs(l, base_url):
return (urlparse.urljoin(base_url, v) for v in l)
return (urllib.parse.urljoin(base_url, v) for v in l)


def _preprocess_bs_links(bs, links):
@@ -413,7 +413,7 @@ def _pp(lst):
for val in lst
if val.startswith('http') or val.startswith('/')]
# (urljoin should be done already though)
return [urlparse.urljoin(url, val) for val in res]
return [urllib.parse.urljoin(url, val) for val in res]

imgs, links = bs2img(bs), bs2lnk(bs)
to_check = imgs + links
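One caveat with the mechanical cStringIO → io.StringIO swap in this file: `io.StringIO` only accepts `str`, while the image payloads this module hands to PIL arrive as `bytes` on Python 3. If the wrapped data is raw response bytes, `io.BytesIO` is likely the right class; a small sketch under that assumption:

```python
from io import BytesIO

from PIL import Image


def open_image_from_bytes(data):
    """Wrap raw image bytes in a file-like object so PIL can open them (Python 3)."""
    # io.StringIO(data) would raise TypeError here, because data is bytes.
    return Image.open(BytesIO(data))
```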
13 changes: 7 additions & 6 deletions redditdownload/reddit.py
@@ -2,8 +2,9 @@
"""Return list of items from a sub-reddit of reddit.com."""

import sys
import HTMLParser
from urllib2 import urlopen, Request, HTTPError
import html.parser
from urllib.request import urlopen, Request
from urllib.error import HTTPError
from json import JSONDecoder


@@ -33,15 +34,15 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
if '/m/' not in subreddit:
warning = ('That doesn\'t look like a multireddit. Are you sure '
'you need that multireddit flag?')
print warning
print(warning)
sys.exit(1)
url = 'http://www.reddit.com/user/%s.json' % subreddit
if not multireddit:
if '/m/' in subreddit:
warning = ('It looks like you are trying to fetch a multireddit. \n'
'Check the multireddit flag. '
'Call --help for more info')
print warning
print(warning)
sys.exit(1)
# no sorting needed
if reddit_sort is None:
@@ -95,7 +96,7 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):

try:
req = Request(url, headers=hdr)
json = urlopen(req).read()
json = urlopen(req).read().decode('utf-8')
data = JSONDecoder().decode(json)
if isinstance(data, dict):
items = [x['data'] for x in data['data']['children']]
@@ -119,7 +120,7 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
# returns `url` values html-escaped, whereas we normally need them
# in the way they are meant to be downloaded (i.e. urlquoted at
# most).
htmlparser = HTMLParser.HTMLParser()
htmlparser = html.parser.HTMLParser()
for item in items:
if item.get('url'):
item['url'] = htmlparser.unescape(item['url'])
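One follow-up worth flagging: `HTMLParser.unescape()` (called on the `htmlparser` instance a few lines below this hunk) was deprecated in Python 3.4 and removed in 3.9, so the `html.parser.HTMLParser()` swap only buys time. The module-level `html.unescape()` function is the durable replacement; a minimal sketch:

```python
import html


def unescape_url(url):
    """Undo the HTML escaping reddit applies to 'url' fields (e.g. &amp; -> &)."""
    return html.unescape(url)


# unescape_url('http://example.com/a.jpg?x=1&amp;y=2')  # -> 'http://example.com/a.jpg?x=1&y=2'
```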
17 changes: 8 additions & 9 deletions redditdownload/redditdownload.py
@@ -1,23 +1,22 @@
#!/usr/bin/env python2
"""Download images from a reddit.com subreddit."""

from __future__ import print_function


import os
import re
import StringIO
import io
import sys
import logging
from urllib2 import urlopen, HTTPError, URLError
from httplib import InvalidURL
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from http.client import InvalidURL
from argparse import ArgumentParser
from os.path import (
exists as pathexists, join as pathjoin, basename as pathbasename,
splitext as pathsplitext)
from os import mkdir, getcwd
import time

from .gfycat import gfycat
from .reddit import getitems
from .deviantart import process_deviant_url

@@ -29,7 +28,7 @@ def request(url, *ar, **kwa):
_retries = kwa.pop('_retries', 4)
_retry_pause = kwa.pop('_retry_pause', 0)
res = None
for _try in xrange(_retries):
for _try in range(_retries):
try:
res = urlopen(url, *ar, **kwa)
except Exception as exc:
@@ -83,7 +82,7 @@ def extract_imgur_album_urls(album_url):
match = re.compile(r'\"hash\":\"(.[^\"]*)\",\"title\"')
items = []

memfile = StringIO.StringIO(filedata)
memfile = io.StringIO(filedata)

for line in memfile.readlines():
results = re.findall(match, line)
@@ -130,7 +129,7 @@ def download_from_url(url, dest_file):
raise HTTPError(actual_url, 404, "Imgur suggests the image was removed", None, None)

# Work out file type either from the response or the url.
if 'content-type' in info.keys():
if 'content-type' in info:
filetype = info['content-type']
elif url.endswith('.jpg') or url.endswith('.jpeg'):
filetype = 'image/jpeg'
@@ -232,7 +231,7 @@ def slugify(value):
# with some modification
import unicodedata
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
value = unicode(re.sub(r'[^\w\s-]', '', value).strip())
value = re.sub(r'[^\w\s-]', '', value.decode('ascii')).strip()
# value = re.sub(r'[-\s]+', '-', value) # not replacing space with hypen
return value

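Two loose ends in this hunk: the shebang still points at `python2`, and on Python 3 `urlopen(...).read()` returns `bytes`, so passing it straight to `io.StringIO` (or to `str` regex patterns) raises a TypeError. A hedged sketch of the decode-then-wrap step used by `extract_imgur_album_urls`, assuming the imgur page is UTF-8:

```python
import io
from urllib.request import urlopen


def read_text(url, encoding='utf-8'):
    """Fetch a page and return a str-backed file-like object (Python 3)."""
    raw = urlopen(url).read()  # bytes on Python 3
    return io.StringIO(raw.decode(encoding, errors='replace'))


# memfile = read_text(album_url)  # album_url as in extract_imgur_album_urls
```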
8 changes: 4 additions & 4 deletions redditdownload/scrap_wrongies.py
@@ -3,7 +3,7 @@
""" Scrap stuff from "wrong data type" pages of RedditImageGrab """

try:
from redditdownload import _WRONGDATA_LOGFILE
from .redditdownload import _WRONGDATA_LOGFILE
except ImportError:
_WRONGDATA_LOGFILE = '.wrong_type_pages.jsl'

@@ -30,8 +30,8 @@
from atomicfile import AtomicFile
import magic

import img_scrap_stuff
from img_scrap_stuff import GetError
from . import img_scrap_stuff
from .img_scrap_stuff import GetError


_log = logging.getLogger(__name__)
@@ -108,7 +108,7 @@ def consecutive_filename(filename):
filebase, fileext = fileparts[0], None
else:
filebase, fileext = fileparts
for i in xrange(1, 9000):
for i in range(1, 9000):
filetry = '%s__%02d' % (filebase, i)
if fileext is not None:
filetry = '%s.%s' % (filetry, fileext)
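For context, `consecutive_filename` above probes `name__01.ext`, `name__02.ext`, and so on until it hits an unused name; the xrange → range swap is the only porting work it needs. The exact split/exists logic lives outside this hunk, so the following is only a rough sketch of the idea:

```python
import os


def consecutive_filename_sketch(filename):
    """Rough sketch: append __NN before the extension until the name is free."""
    base, ext = os.path.splitext(filename)
    for i in range(1, 9000):
        candidate = '%s__%02d%s' % (base, i, ext)
        if not os.path.exists(candidate):
            return candidate
    raise RuntimeError('no free filename found')
```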