Commit f04f80e

Add Mangasee scraper
Adds support for Mangasee (mangaseeonline.us)
1 parent 8bec439 commit f04f80e
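
In practice the new classes can be exercised directly; a minimal sketch, assuming cum's configuration has been initialized the way the CLI normally does it (the URLs are illustrative):

    from cum.scrapers.mangasee import MangaseeChapter, MangaseeSeries

    # Scrape series metadata and its chapter list (fetched in __init__).
    series = MangaseeSeries('https://mangaseeonline.us/manga/Aria')
    print(series.name, len(series.chapters))

    # Resolve a reader-page URL to a chapter object and download it as a zip.
    chapter = MangaseeChapter.from_url(
        'https://mangaseeonline.us/read-online/Aria-chapter-1-page-1.html')
    chapter.download()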

4 files changed: 258 additions & 0 deletions

cum/db.py

Lines changed: 3 additions & 0 deletions
@@ -242,6 +242,9 @@ def to_object(self):
         if parse.netloc == 'www.yuri-ism.net':
             from cum.scrapers.yuriism import YuriismChapter
             return YuriismChapter(**kwargs)
+        if parse.netloc == 'mangaseeonline.us':
+            from cum.scrapers.mangasee import MangaseeChapter
+            return MangaseeChapter(**kwargs)
 
 
 class Group(Base):
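
This hunk extends to_object, which maps a stored chapter URL back to a scraper object by dispatching on the parsed netloc. A minimal standalone sketch of that pattern, with a hypothetical kwargs dict standing in for the database row:

    from urllib.parse import urlparse

    def chapter_for_url(url, **kwargs):
        # Dispatch on netloc, as to_object() does; imports stay local so a
        # scraper module is only loaded when its site actually matches.
        parse = urlparse(url)
        if parse.netloc == 'mangaseeonline.us':
            from cum.scrapers.mangasee import MangaseeChapter
            return MangaseeChapter(**kwargs)
        return None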

cum/scrapers/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -2,19 +2,22 @@
 from cum.scrapers.dynastyscans import DynastyScansChapter, DynastyScansSeries
 from cum.scrapers.madokami import MadokamiChapter, MadokamiSeries
 from cum.scrapers.mangadex import MangadexSeries, MangadexChapter
+from cum.scrapers.mangasee import MangaseeSeries, MangaseeChapter
 from cum.scrapers.yuriism import YuriismChapter, YuriismSeries
 
 series_scrapers = [
     DokiReaderSeries,
     DynastyScansSeries,
     MadokamiSeries,
     MangadexSeries,
+    MangaseeSeries,
     YuriismSeries,
 ]
 chapter_scrapers = [
     DokiReaderChapter,
     DynastyScansChapter,
     MadokamiChapter,
     MangadexChapter,
+    MangaseeChapter,
     YuriismChapter,
 ]
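
With the registry entries in place, callers can pick a scraper by matching a URL against each class's url_re. A minimal sketch of that lookup, assuming only the url_re attribute each scraper class defines:

    from cum.scrapers import series_scrapers

    def find_series_scraper(url):
        # Return the first registered scraper whose URL regex matches.
        for scraper_class in series_scrapers:
            if scraper_class.url_re.match(url):
                return scraper_class
        return None

    find_series_scraper('https://mangaseeonline.us/manga/Aria')  # MangaseeSeries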

cum/scrapers/mangasee.py

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
from bs4 import BeautifulSoup
from cum import config, exceptions
from cum.scrapers.base import BaseChapter, BaseSeries, download_pool
from functools import partial
import concurrent.futures
import json
import re
import requests
import traceback


class MangaseeSeries(BaseSeries):
    url_re = re.compile(r'https?://mangaseeonline\.us/manga/.+')

    def __init__(self, url, **kwargs):
        super().__init__(url, **kwargs)
        spage = requests.get(url)
        self.soup = BeautifulSoup(spage.text, config.get().html_parser)
        self.chapters = self.get_chapters()

    def get_chapters(self):
        try:
            rows = self.soup.find_all("a", class_="list-group-item")
        except AttributeError:
            raise exceptions.ScrapingError()
        chapters = []
        for i, row in enumerate(rows):
            chap_num = re.match(r"Read .+ Chapter ([0-9\.]+) For Free Online",
                                row["title"]).groups()[0]
            chap_url = "https://mangaseeonline.us" + row["href"]
            chap_name = row.find_all("span")[0].text
            chap_date = row.find_all("time")[0].text
            result = MangaseeChapter(name=self.name,
                                     alias=self.alias,
                                     chapter=chap_num,
                                     url=chap_url,
                                     title=chap_name,
                                     groups=[],
                                     upload_date=chap_date)
            chapters.append(result)
        return chapters

    @property
    def name(self):
        try:
            return re.match(r"Read (.+) Man[a-z]+ For Free \| MangaSee",
                            self.soup.find_all("title")[0].text).groups()[0]
        except AttributeError:
            print(traceback.format_exc())
            raise exceptions.ScrapingError


class MangaseeChapter(BaseChapter):
    url_re = re.compile((r'https?://mangaseeonline\.us/'
                         r'read-online/.+-chapter-[0-9\.]+-page-[0-9]+\.html'))
    upload_date = None
    uses_pages = True

    def download(self):
        if not getattr(self, "cpage", None):
            self.cpage = requests.get(self.url)
        if not getattr(self, "soup", None):
            self.soup = BeautifulSoup(self.cpage.text,
                                      config.get().html_parser)

        for script in self.soup.find_all("script"):
            if re.match("\n\tChapterArr=.+", script.text):
                image_list = script.text
                continue

        image_list = re.sub("\n\tChapterArr=", "", image_list)
        image_list = re.sub(";\n\t?", "", image_list)
        image_list = re.sub("PageArr=", ",", image_list)
        image_list = "[" + image_list + "]"
        image_list = json.loads(image_list)[1]
        pages = []
        for image in image_list:
            if image != "CurPage":
                if re.match(".+blogspot.+", image_list[image]):
                    image_list[image] = image_list[image].\
                        replace("http://", "https://")
                pages.append(image_list[image])

        futures = []
        files = [None] * len(pages)
        with self.progress_bar(pages) as bar:
            for i, page in enumerate(pages):
                retries = 0
                while retries < 3:
                    try:
                        r = requests.get(page, stream=True)
                        break
                    except requests.exceptions.ConnectionError:
                        retries += 1
                if r.status_code != 200:
                    r.close()
                    raise ValueError
                fut = download_pool.submit(self.page_download_task, i, r)
                fut.add_done_callback(partial(self.page_download_finish,
                                              bar, files))
                futures.append(fut)
            concurrent.futures.wait(futures)
            self.create_zip(files)

    def from_url(url):
        cpage = requests.get(url)
        soup = BeautifulSoup(cpage.text, config.get().html_parser)
        chap_num = soup.find_all("span", class_="CurChapter")[0].text
        iname = soup.find_all("a", class_="list-link")[0]["href"]
        series = MangaseeSeries("https://mangaseeonline.us" + iname)
        for chapter in series.chapters:
            if chapter.chapter == str(chap_num):
                return chapter
        return None
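
The least obvious step in MangaseeChapter.download is the regex pipeline that turns the page's inline ChapterArr/PageArr JavaScript into parseable JSON. A standalone sketch of that transformation, run against a hypothetical script payload (the field names follow the scraper's regexes; the image URL is made up):

    import json
    import re

    script_text = ('\n\tChapterArr={"CurChapter":"18"};'
                   '\n\tPageArr={"CurPage":"1",'
                   '"1":"http://1.bp.blogspot.com/example/0001.png"};\n\t')

    s = re.sub("\n\tChapterArr=", "", script_text)  # strip the ChapterArr label
    s = re.sub(";\n\t?", "", s)                     # drop statement terminators
    s = re.sub("PageArr=", ",", s)                  # join the two JS objects
    pages = json.loads("[" + s + "]")[1]            # index 1 is the PageArr dict

    # Skip the CurPage bookkeeping key and force HTTPS on Blogspot hosts,
    # mirroring the loop in download() above.
    urls = [v.replace("http://", "https://") if "blogspot" in v else v
            for k, v in pages.items() if k != "CurPage"]
    print(urls)  # ['https://1.bp.blogspot.com/example/0001.png']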

tests/test_scraper_mangasee.py

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
from bs4 import BeautifulSoup
from cum import config, exceptions
from nose.tools import nottest
from urllib.parse import urljoin
import cumtest
import os
import requests
import unittest
import zipfile


class TestMangasee(cumtest.CumTest):
    MANGASEE_URL = 'https://mangaseeonline.us/'

    def setUp(self):
        super().setUp()
        global mangasee
        from cum.scrapers import mangasee

    def tearDown(self):
        self.directory.cleanup()

    def get_five_latest_releases(self):
        r = requests.get(self.MANGASEE_URL)
        soup = BeautifulSoup(r.text, config.get().html_parser)
        chapters = soup.find_all("a", class_="latestSeries")
        links = [urljoin(self.MANGASEE_URL, x.get("href")) for x in chapters]
        return links[:5]

    @nottest
    def series_information_tester(self, data):
        series = mangasee.MangaseeSeries(data['url'])
        self.assertEqual(series.name, data['name'])
        self.assertEqual(series.alias, data['alias'])
        self.assertEqual(series.url, data['url'])
        self.assertIs(series.directory, None)
        self.assertEqual(len(series.chapters), len(data['chapters']))
        for chapter in series.chapters:
            self.assertEqual(chapter.name, data['name'])
            self.assertEqual(chapter.alias, data['alias'])
            self.assertIn(chapter.chapter, data['chapters'])
            data['chapters'].remove(chapter.chapter)
            self.assertIs(chapter.directory, None)
        self.assertEqual(len(data['chapters']), 0)

    def test_chapter_download_latest(self):
        latest_releases = self.get_five_latest_releases()
        for release in latest_releases:
            try:
                chapter = mangasee.MangaseeChapter.from_url(release)
            except exceptions.ScrapingError as e:
                print('scraping error for {} - {}'.format(release, e))
                continue
            else:
                chapter.get(use_db=False)

    def test_chapter_filename_decimal(self):
        URL = "https://mangaseeonline.us/read-online/" + \
              "Citrus-S-A-B-U-R-O-Uta-chapter-20.5-page-1.html"
        chapter = mangasee.MangaseeChapter.from_url(URL)
        path = os.path.join(self.directory.name, 'Citrus SABURO Uta',
                            'Citrus SABURO Uta - c020 x5 [Unknown].zip')
        self.assertEqual(chapter.chapter, '20.5')
        self.assertEqual(chapter.filename, path)

    def test_chapter_information_normal(self):
        URL = "https://mangaseeonline.us/read-online/" + \
              "Ramen-Daisuki-Koizumi-San-chapter-18-page-1.html"
        chapter = mangasee.MangaseeChapter.from_url(URL)
        self.assertEqual(chapter.alias, 'ramen-daisuki-koizumi-san')
        self.assertTrue(chapter.available())
        self.assertEqual(chapter.chapter, '18')
        self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi-san')
        self.assertEqual(chapter.title, 'Chapter 18')
        path = os.path.join(self.directory.name,
                            'Ramen Daisuki Koizumi-san',
                            'Ramen Daisuki Koizumi-san - c018 [Unknown].zip')
        self.assertEqual(chapter.filename, path)
        chapter.download()
        self.assertTrue(os.path.isfile(path))
        with zipfile.ZipFile(path) as chapter_zip:
            files = chapter_zip.infolist()
            self.assertEqual(len(files), 8)

    def test_chapter_information_chapterzero(self):
        URL = "https://mangaseeonline.us/read-online/" + \
              "Inu-To-Hasami-Wa-Tsukaiyou-chapter-0-page-1.html"
        chapter = mangasee.MangaseeChapter.from_url(URL)
        self.assertEqual(chapter.alias, 'inu-to-hasami-wa-tsukaiyou')
        self.assertEqual(chapter.chapter, '0')
        self.assertEqual(chapter.name, 'Inu to Hasami wa Tsukaiyou')
        self.assertEqual(chapter.title, 'Chapter 0')
        path = os.path.join(
            self.directory.name, 'Inu to Hasami wa Tsukaiyou',
            'Inu to Hasami wa Tsukaiyou - c000 [Unknown].zip')
        self.assertEqual(chapter.filename, path)
        chapter.download()
        self.assertTrue(os.path.isfile(path))
        with zipfile.ZipFile(path) as chapter_zip:
            files = chapter_zip.infolist()
            self.assertEqual(len(files), 51)

    def test_chapter_unavailable(self):
        URL = "https://mangaseeonline.us/read-online/" + \
              "Oyasumi-Punpun-chapter-999-page-1.html"
        chapter = mangasee.MangaseeChapter(url=URL)
        self.assertFalse(chapter.available())

    def test_series_oneword(self):
        data = {'alias': 'aria',
                'chapters': ['1', '2', '3', '4', '5', '6', '7', '8',
                             '9', '10', '10.5', '11', '12', '13', '14', '15',
                             '16', '17', '18', '19', '20', '21', '22', '23',
                             '24', '25', '26', '27', '28', '29', '30', '30.5',
                             '31', '32', '33', '34', '35', '35.5', '36',
                             '37', '37.5', '38', '39', '40', '41', '42', '43',
                             '44', '45', '45.5', '46', '47', '48', '49',
                             '50', '50.5', '51', '52', '53', '54', '55', '56',
                             '57', '57.5', '58', '59', '60', '60.5'],
                'name': 'Aria',
                'url': 'https://mangaseeonline.us/manga/Aria'}
        self.series_information_tester(data)

    def test_series_multiplewords(self):
        data = {'alias': 'prunus-girl',
                'chapters': ['1', '2', '3', '4', '5', '6', '7', '8',
                             '9', '10', '11', '12', '13', '14', '15',
                             '16', '17', '18', '19', '20', '21', '22',
                             '23', '24', '25', '26', '27', '28', '29', '30',
                             '31', '32', '32.5', '33', '34', '35', '36', '37',
                             '38', '39', '40', '41', '42', '43'],
                'name': 'Prunus Girl',
                'url': 'https://mangaseeonline.us/manga/Prunus-Girl'}
        self.series_information_tester(data)


if __name__ == '__main__':
    unittest.main()
