Skip to content

Commit 2e1de8d

Browse files
committed
add mangahere scraper; numerous misc fixes
remove debugging imports, add more tests to mangasee scraper, add support for multi-volume/multi-season titles, fix 404 detection on mangasee scraper, change beautifulsoup element parsing to find() instead of find_all()
1 parent 9d4325c commit 2e1de8d

File tree

5 files changed

+409
-8
lines changed

5 files changed

+409
-8
lines changed

cum/scrapers/base.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,11 @@ def filename(self):
195195
elif match(r'[0-9]*\.[0-9]*$', self.chapter):
196196
number, decimal = self.chapter.split('.')
197197
chapter = 'c{:0>3} x{}'.format(number, decimal)
198+
# Individually numbered chapter with double-decimal (e.g. '2.164.5').
199+
# Used by titles with multiple volumes/seasons and special chapters.
200+
elif match(r'[0-9]*(\.[0-9]*){2}$', self.chapter):
201+
volume, number, decimal = self.chapter.split('.')
202+
chapter = 'c{:0>3} x{:0>3}.{}'.format(volume, number, decimal)
198203
# Failing all else, e.g. 'Special'. Becomes 'c000 [Special]'.
199204
else:
200205
chapter = 'c000 [{}]'.format(self.chapter)

cum/scrapers/mangahere.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
from bs4 import BeautifulSoup
2+
from cum import config, exceptions
3+
from cum.scrapers.base import BaseChapter, BaseSeries, download_pool
4+
from functools import partial
5+
import concurrent.futures
6+
import re
7+
import requests
8+
9+
10+
class MangahereSeries(BaseSeries):
    """Series scraper for mangahere.cc (accepts desktop and mobile URLs)."""

    url_re = re.compile(r'https?://((www|m)\.)?mangahere\.cc/manga/.+')

    def __init__(self, url, **kwargs):
        super().__init__(url, **kwargs)
        # Convert a mobile link to its desktop equivalent before fetching.
        spage = requests.get(url.replace("m.", "www."))
        if spage.status_code == 404:
            raise exceptions.ScrapingError
        self.soup = BeautifulSoup(spage.text, config.get().html_parser)
        self.chapters = self.get_chapters()

    def get_chapters(self):
        """Parse the chapter list and return a list of MangahereChapter.

        Raises:
            exceptions.ScrapingError: if the expected chapter-list markup
                is missing (layout change or error page).
        """
        try:
            rows = self.soup.find("ul", class_="detail-main-list")\
                .find_all("li")
        except AttributeError:
            raise exceptions.ScrapingError()
        chapters = []
        for row in rows:
            # Hoist the single <a> lookup: it is needed both for the
            # chapter number and for the chapter URL below.
            link = row.find("a")["href"]
            # e.g. '/manga/title/v01/c086.5/1.html' -> 'v01c086.5'
            chap_num = re.match((r"/manga/[^/]+((/v[0-9]+)?"
                                 r"/c[0-9\.]+)/[0-9]+\.html$"),
                                link).groups()[0].replace("/", "")
            if "v" in chap_num:
                # Multi-volume chapter: 'v01c086.5' -> '01.086.5'
                chap_num = chap_num.replace("v", "").replace("c", ".")
            else:
                chap_num = chap_num.replace("c", "")
            if chap_num == "000":
                chap_num = "0"
            else:
                chap_num = chap_num.lstrip("0")
            # Convert a mobile link to its desktop equivalent.
            chap_url = "https://www.mangahere.cc" + \
                link.replace("/roll_manga/", "/manga/")
            chap_name = row.find("p", class_="title3").text
            chap_date = row.find("p", class_="title2").text
            result = MangahereChapter(name=self.name,
                                      alias=self.alias,
                                      chapter=chap_num,
                                      url=chap_url,
                                      title=chap_name,
                                      groups=[],
                                      upload_date=chap_date)
            chapters.append(result)
        return chapters

    @property
    def name(self):
        """Series title, extracted from the page <title> tag.

        Raises:
            exceptions.ScrapingError: if the title does not match the
                expected MangaHere pattern.
        """
        try:
            return re.match(r".+ - Read (.+) Online at MangaHere$",
                            self.soup.find("title").text).groups()[0]
        except AttributeError:
            raise exceptions.ScrapingError
64+
65+
66+
class MangahereChapter(BaseChapter):
    """Chapter scraper/downloader for mangahere.cc."""

    url_re = re.compile((r'https?://((www|m)\.)?mangahere\.cc'
                         r'/(roll_)?manga(/v[0-9]+)?/c[0-9\.]+/[0-9]+\.html$'))
    upload_date = None
    uses_pages = True

    def download(self):
        """Download every page of the chapter and bundle them into a zip.

        Fetches the mobile reader ('m.' host, '/roll_manga/' path) because
        it serves the whole page list in one document.
        """
        if not getattr(self, "cpage", None):
            self.cpage = requests.get(self.url.replace("www.", "m.")
                                      .replace("/manga/", "/roll_manga/"))
        if not getattr(self, "soup", None):
            self.soup = BeautifulSoup(self.cpage.text,
                                      config.get().html_parser)

        image_list = self.soup.find("div", class_="mangaread-img")\
            .find_all("img")
        pages = [image["data-original"].replace("http://", "https://")
                 for image in image_list]

        futures = []
        files = [None] * len(pages)
        req_session = requests.Session()
        with self.progress_bar(pages) as bar:
            for i, page in enumerate(pages):
                r = None
                # Retry transient connection failures up to 10 times.
                for _ in range(10):
                    try:
                        r = req_session.get(page, stream=True)
                        break
                    except requests.exceptions.ConnectionError:
                        continue
                if r is None:
                    # BUG FIX: previously, exhausting all retries fell
                    # through and crashed with an unbound-local NameError
                    # on the status-code check below.
                    raise exceptions.ScrapingError
                if r.status_code != 200:
                    r.close()
                    raise ValueError
                fut = download_pool.submit(self.page_download_task, i, r)
                fut.add_done_callback(partial(self.page_download_finish,
                                              bar, files))
                futures.append(fut)
            concurrent.futures.wait(futures)
            self.create_zip(files)

    def from_url(url):
        """Build a MangahereChapter from a reader URL.

        Scrapes the parent series page and returns the chapter whose
        number matches the one embedded in *url*, or None if absent.
        """
        # e.g. '.../manga/title/v01/c086.5/1.html' -> 'v01c086.5'
        chap_num = re.match((r"https?://((www|m)\.)?mangahere\.cc/(roll_)?"
                             r"manga/[^/]+((/v[0-9]+)?/c[0-9\.]+)"
                             r"/[0-9]+\.html"), url)\
            .groups()[3].replace("/", "")
        if "v" in chap_num:
            # Multi-volume chapter: 'v01c086.5' -> '01.086.5'
            chap_num = chap_num.replace("v", "").replace("c", ".")
        else:
            chap_num = chap_num.replace("c", "")
        if chap_num == "000":
            chap_num = "0"
        else:
            chap_num = chap_num.lstrip("0")
        parent_url = re.match((r"(https?://((www|m)\.)?mangahere\.cc/(roll_)?"
                               r"manga/[^/]+)(/v[0-9]+)?/"
                               r"c[0-9\.]+/[0-9]+\.html"),
                              url).groups()[0]
        series = MangahereSeries(parent_url)
        for chapter in series.chapters:
            if chapter.chapter == chap_num:
                return chapter
        return None

cum/scrapers/mangasee.py

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,32 @@
66
import json
77
import re
88
import requests
9-
import traceback
109

1110

1211
class MangaseeSeries(BaseSeries):
1312
url_re = re.compile(r'https?://mangaseeonline\.us/manga/.+')
13+
multi_season_regex = re.compile((r"(https?://mangaseeonline\.us)"
14+
r"?/read-online/"
15+
r".+-chapter-[0-9\.]+-index-"
16+
r"([0-9]+)-page-[0-9]+\.html"))
1417

1518
def __init__(self, url, **kwargs):
    """Fetch the series page and build the chapter list.

    Raises:
        exceptions.ScrapingError: if the series page returns HTTP 404.
    """
    super().__init__(url, **kwargs)
    spage = requests.get(url)
    # A 404 means the series does not exist (fix for 404 detection).
    if spage.status_code == 404:
        raise exceptions.ScrapingError
    self.soup = BeautifulSoup(spage.text, config.get().html_parser)
    self.chapters = self.get_chapters()
2025

26+
def _get_chapnum_multiseason_series(self, url, chap_num):
    """Return an 'SS.CCC' chapter identifier for multi-season titles.

    A reader URL without an '-index-N-' component belongs to season 1;
    otherwise N (second group of ``multi_season_regex``) is the season.

    :param url: chapter reader URL.
    :param chap_num: plain chapter number string, e.g. '5' or '3.5'.
    :returns: zero-padded 'season.chapter' string, e.g. '02.005'.
    """
    # Match once instead of twice (the original matched the same regex
    # against the same URL in both branches).
    season_match = re.match(self.multi_season_regex, url)
    if not season_match:
        # chapter is from season 1
        return "01." + chap_num.zfill(3)
    # chapter is from season >1
    season = season_match.groups()[1]
    return season.zfill(2) + "." + chap_num.zfill(3)
34+
2135
def get_chapters(self):
2236
try:
2337
rows = self.soup.find_all("a", class_="list-group-item")
@@ -27,9 +41,12 @@ def get_chapters(self):
2741
for i, row in enumerate(rows):
2842
chap_num = re.match(r"Read .+ Chapter ([0-9\.]+) For Free Online",
2943
row["title"]).groups()[0]
44+
if not hasattr(self, "is_multi_season"):
45+
if re.match(self.multi_season_regex, row["href"]):
46+
self.is_multi_season = True
3047
chap_url = "https://mangaseeonline.us" + row["href"]
31-
chap_name = row.find_all("span")[0].text
32-
chap_date = row.find_all("time")[0].text
48+
chap_name = row.find("span").text
49+
chap_date = row.find("time").text
3350
result = MangaseeChapter(name=self.name,
3451
alias=self.alias,
3552
chapter=chap_num,
@@ -38,15 +55,24 @@ def get_chapters(self):
3855
groups=[],
3956
upload_date=chap_date)
4057
chapters.append(result)
58+
# the chapters in the first season of a multi-season title
59+
# are indistinguishable from a non-multi-season title. thus
60+
# we must retroactively reanalyze all chapters and adjust
61+
# chapter numbers if *any* are multi-season
62+
if hasattr(self, "is_multi_season"):
63+
for chapter in chapters:
64+
chapter.chapter = self.\
65+
_get_chapnum_multiseason_series(chapter.url,
66+
chapter.chapter)
67+
4168
return chapters
4269

4370
@property
def name(self):
    """Series title, parsed out of the page <title> tag.

    Raises:
        exceptions.ScrapingError: if the title does not match the
            expected MangaSee pattern.
    """
    try:
        return re.match(r"Read (.+) Man[a-z]+ For Free \| MangaSee",
                        self.soup.find("title").text).groups()[0]
    except AttributeError:
        raise exceptions.ScrapingError
5177

5278

@@ -106,10 +132,10 @@ def download(self):
106132
def from_url(url):
    """Resolve a reader-page URL to its MangaseeChapter.

    Scrapes the series index linked from the reader page, then matches
    by full chapter URL — the raw chapter number is ambiguous for
    multi-season titles, whose numbers are rewritten to 'SS.CCC'.
    Returns None when no chapter matches.
    """
    cpage = requests.get(url)
    soup = BeautifulSoup(cpage.text, config.get().html_parser)
    iname = soup.find("a", class_="list-link")["href"]
    series = MangaseeSeries("https://mangaseeonline.us" + iname)
    for chapter in series.chapters:
        if chapter.url == url:
            return chapter
    return None

0 commit comments

Comments
 (0)