Commit 7f77961

Merge pull request #68 from RyouMon/feature-fast-crawl
Shut down spiders early when the favorites collection has not been updated. Add an ID field to ComicBookInfo. Change the filename format of nhentai comics. Closes #54.
2 parents: feca3cb + 696a075

30 files changed: +589 −120 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
+tests/.trial_temp/

 # Translations
 *.mo

src/favorites_crawler/commands/crawl.py

Lines changed: 14 additions & 8 deletions
@@ -10,7 +10,7 @@

 from favorites_crawler.constants.domains import LMMPIC_DOMAIN, NHENTAI_DOMAIN, TWITTER_DOMAIN
 from favorites_crawler.utils.config import load_config, overwrite_spider_settings
-from favorites_crawler.constants.path import DEFAULT_FAVORS_HOME
+from favorites_crawler.utils.common import get_favors_home
 from favorites_crawler.utils.auth import refresh_pixiv_token
 from favorites_crawler.utils.cookies import load_cookie

@@ -30,15 +30,19 @@ def crawl_yandere():
 @app.command('pixiv')
 def crawl_pixiv():
     """Crawl your favorite illustrations from pixiv."""
-    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
-    access_token = refresh_pixiv_token(favors_home)
+    favors_home = get_favors_home()
+    try:
+        access_token = refresh_pixiv_token(favors_home)
+    except Exception as e:
+        print(e)
+        exit(1)
     crawl('pixiv', access_token=access_token)


 @app.command('nhentai')
 def crawl_nhentai():
     """Crawl your favorite comics from nhentai."""
-    favors_home = os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME))
+    favors_home = get_favors_home()
     cookies = load_cookie(NHENTAI_DOMAIN, favors_home)
     crawl('nhentai', cookies=cookies)

@@ -47,15 +51,15 @@ def crawl_nhentai():
 @app.command('twitter')
 def crawl_twitter():
     """Crawl your favorite pictures from twitter."""
-    favors_home = os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME))
+    favors_home = get_favors_home()
     cookies = load_cookie(TWITTER_DOMAIN, favors_home)
     crawl('twitter', cookies=cookies)


 @app.command('lemon')
 def crawl_lemon(id_list: list[str] = typer.Option([], '--id', '-i')):
     """Crawl your favorite photo albums from lemon."""
-    favors_home = os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME))
+    favors_home = get_favors_home()
     cookies = load_cookie(LMMPIC_DOMAIN, favors_home)
     crawl('lemon', id_list=id_list, cookies=cookies)

@@ -66,7 +70,9 @@ def spider_closed(spider):
     print('Dumping Scrapy stats:', stats)
     if spider.name == 'yandere_vote':
         return
-    if not (stats.get('item_scraped_count', 0) + stats.get('item_dropped_count', 0)):
+    if stats.get('finish_reason') == 'fastly-finished':
+        return
+    elif not (stats.get('item_scraped_count', 0) + stats.get('item_dropped_count', 0)):
         print(Panel(
             '[red]Nothing was crawled, your cookies or token may have expired.',
             border_style="red",
@@ -82,7 +88,7 @@ def crawl(name, **kwargs):
     :param kwargs: kwargs passed to spider's __init__ method
     """
     spider = spider_loader.load(name)
-    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
+    favors_home = get_favors_home()
     overwrite_spider_settings(spider, favors_home, load_config(favors_home))
     process = CrawlerProcess(scrapy_settings)
     process.crawl(spider, **kwargs)
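
The commands in crawl.py (and login.py below) now resolve the favors home through a single helper, favorites_crawler.utils.common.get_favors_home, instead of repeating the environment lookup inline. The helper's own diff is not part of this excerpt; the following is a minimal sketch of what it presumably does, inferred from the code it replaces (the name and module come from the import, the body is an assumption):

import os

from favorites_crawler.constants.path import DEFAULT_FAVORS_HOME


def get_favors_home() -> str:
    # Assumed body: the env-var lookup and ~ expansion previously inlined
    # in each command, now in one place.
    return os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME))

Centralizing this also removes an inconsistency visible in the old code: crawl_pixiv and crawl() skipped os.path.expanduser, while the other commands applied it.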

src/favorites_crawler/commands/login.py

Lines changed: 6 additions & 6 deletions
@@ -1,12 +1,11 @@
-import os
 import shutil
 from typing import Optional

 import typer

 from favorites_crawler.utils.auth import CustomGetPixivToken, parse_twitter_likes_url, parser_twitter_likes_features
 from favorites_crawler.utils.config import dump_config, load_config
-from favorites_crawler.constants.path import DEFAULT_FAVORS_HOME
+from favorites_crawler.utils.common import get_favors_home


 app = typer.Typer(help='Prepare auth information for crawling.', no_args_is_help=True)
@@ -33,10 +32,11 @@ def login_pixiv(

     If you do not provide your username and password, you will login manually on the web page
     """
-    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
+    favors_home = get_favors_home()
     config = load_config(favors_home)
     token_getter = CustomGetPixivToken()
     try:
+        print('Launching chrome...')
         login_info = token_getter.login(username=username, password=password)
     except Exception as e:
         print(f'Failed to login. {e!r}')
@@ -65,7 +65,7 @@ def login_yandere(
     """
     Login to yandere.
     """
-    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
+    favors_home = get_favors_home()
     config = load_config(favors_home)
     yandere_config = config.setdefault('yandere', {})
     yandere_config['USERNAME'] = username
@@ -104,7 +104,7 @@ def login_twitter(
     6. Copy Authorization, X-Csrf-Token and RequestURL from request(Likes?variables...) input on terminal.\n
     7. Use "Get cookies.txt" browser extension download cookie file.
     """
-    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
+    favors_home = get_favors_home()
     config = load_config(favors_home)
     twitter_config = config.setdefault('twitter', {})
     try:
@@ -140,7 +140,7 @@ def login_nhentai(
     4. Copy user-agent from any request.\n
     5. Use "Get cookies.txt" browser extension download cookie file.
     """
-    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
+    favors_home = get_favors_home()
     config = load_config(favors_home)
     nhentai_config = config.setdefault('nhentai', {})
     try:

src/favorites_crawler/exceptions.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+class LoginFailed(Exception):
+    pass
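
No call sites for LoginFailed appear in this excerpt; presumably the auth utilities raise it so callers can catch a specific failure instead of a bare Exception. Hypothetical usage:

from favorites_crawler.exceptions import LoginFailed

# Hypothetical: raised by a login helper when credentials are rejected.
raise LoginFailed('Could not refresh the pixiv token.')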

src/favorites_crawler/itemloaders.py

Lines changed: 2 additions & 6 deletions
@@ -4,11 +4,7 @@
 from itemloaders.processors import Compose, MapCompose

 from favorites_crawler import items
-from favorites_crawler.processors import take_first, identity, get_nhentai_id, wrap_credits, \
-    original_url_from_nhentai_thumb_url, select_best_nhentai_title, clean_nhentai_title, \
-    get_year_from_iso_format, get_month_from_iso_format, get_series_from_title, get_volume_from_title, \
-    clean_parodies, get_lemon_page, get_pixiv_tags, get_yandere_tags, get_twitter_tags, fix_tweet_media_url, \
-    tweet_time_2_datetime
+from favorites_crawler.processors import *
 from favorites_crawler.utils.text import convert_to_ascii


@@ -46,7 +42,7 @@ class NHentaiGalleryItemLoader(BaseItemLoader):
     series_out = Compose(take_first, get_series_from_title)
     volume_out = Compose(take_first, get_volume_from_title)
     title_out = Compose(select_best_nhentai_title, clean_nhentai_title)
-    sort_title_out = Compose(select_best_nhentai_title, clean_nhentai_title)
+    sort_title_out = join_nhentai_title
     file_urls_out = MapCompose(original_url_from_nhentai_thumb_url)
     credits_out = wrap_credits
     publicationYear_out = Compose(take_first, get_year_from_iso_format)
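
sort_title_out can drop the Compose wrapper because an itemloaders output processor is any callable that receives the list of values collected for the field, and the new join_nhentai_title (defined in processors.py below) consumes that list directly. Its behavior, with illustrative values:

from favorites_crawler.processors import join_nhentai_title

join_nhentai_title(['(C99) [Artist]', ' Example Title ', '(Series)'])
# -> '(C99) [Artist] Example Title (Series)'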

src/favorites_crawler/items.py

Lines changed: 27 additions & 19 deletions
@@ -1,6 +1,7 @@
-import json
-import datetime
+from __future__ import annotations
+
 import os.path
+from datetime import datetime, date
 from dataclasses import dataclass, field, fields
 from urllib.parse import unquote, urlparse

@@ -15,7 +16,7 @@ class BaseItem:
     file_urls: list = field(default=None)
     tags: list = field(default=None)
     referer: str = field(default=None)
-    created_time: datetime.datetime = field(default=None)
+    created_time: datetime = field(default=None)

     def get_filepath(self, url, spider):
         folder_name = self.get_folder_name(spider)
@@ -30,12 +31,13 @@ def get_filename(self, url, spider):
     def get_folder_name(self, spider):
         name = self.title
         if not name:
-            name = str(datetime.date.today())
+            name = str(date.today())
         return drop_illegal_characters(name)


 @dataclass
 class ComicBookInfoItem:
+    id: int = field(default=None, metadata={'is_ext_comic_info': True})
     title: str = field(default=None, metadata={'is_comic_info': True})
     series: str = field(default=None, metadata={'is_comic_info': True})
     publisher: str = field(default=None, metadata={'is_comic_info': True})
@@ -53,21 +55,26 @@ class ComicBookInfoItem:
     tags: list = field(default=None, metadata={'is_comic_info': True})
     comments: str = field(default=None, metadata={'is_comic_info': True})

-    def get_comic_info(self):
-        comic_book_info = {}
-        for f in fields(self):
-            if not f.metadata.get('is_comic_info', False):
-                continue
-            val = getattr(self, f.name)
-            if not val:
-                continue
-            comic_book_info[f.name] = val
-
-        return json.dumps({
+    def get_comic_info(self) -> dict:
+        metadata = {
             'appID': f'FavoritesCrawler',
-            'lastModified': str(datetime.datetime.now()),
-            'ComicBookInfo/1.0': comic_book_info,
-        }, ensure_ascii=False)
+            'lastModified': str(datetime.now()),
+            'ComicBookInfo/1.0': {},
+            'x-FavoritesCrawler': {},
+        }
+        comic_book_info = metadata['ComicBookInfo/1.0']
+        ext_info = metadata['x-FavoritesCrawler']
+        for field_ in fields(self):
+            if field_.metadata.get('is_comic_info', False):
+                value = getattr(self, field_.name)
+                if value:
+                    comic_book_info[field_.name] = value
+            elif field_.metadata.get('is_ext_comic_info', False):
+                value = getattr(self, field_.name)
+                if value:
+                    ext_info[field_.name] = value
+
+        return metadata


 @dataclass
@@ -120,11 +127,12 @@ def get_filename(self, url, spider):

 @dataclass
 class NHentaiGalleryItem(BaseItem, ComicBookInfoItem):
+    id: int = field(default=None, metadata={'is_ext_comic_info': True})
     title: str = field(default=None, metadata={'is_comic_info': True})
     tags: list = field(default=None, metadata={'is_comic_info': True})
     parodies: str = field(default=None)
     characters: list = field(default=None)
     sort_title: str = field(default=None)

     def get_folder_name(self, _):
-        return drop_illegal_characters(self.sort_title)
+        return drop_illegal_characters(self.sort_title) + f' ({self.id})'
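
get_comic_info now returns a plain dict instead of a JSON string, and the new is_ext_comic_info metadata flag routes the gallery id into a separate x-FavoritesCrawler block, keeping the ComicBookInfo/1.0 section limited to standard fields. Appending the ID in get_folder_name also means two galleries with the same title no longer collide on disk. An illustrative return value (field values are made up):

{
    'appID': 'FavoritesCrawler',
    'lastModified': '2024-01-01 12:00:00.000000',
    'ComicBookInfo/1.0': {'title': 'Some Title', 'tags': ['tag1', 'tag2']},
    'x-FavoritesCrawler': {'id': 123456},
}

The corresponding folder name would be 'Some Title (123456)'.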

src/favorites_crawler/pipelines.py

Lines changed: 7 additions & 8 deletions
@@ -102,24 +102,23 @@ class ComicPipeline(BasePipeline):
     def __init__(self, store_uri, **kwargs):
         super().__init__(store_uri, **kwargs)
         self.files_path = Path(store_uri).resolve()
-        self.comic_comments = {}
+        self.comics = {}

     def close_spider(self, spider):
-        for title, comment in self.comic_comments.items():
+        for title, comic_info in self.comics.items():
             folder = self.files_path / title
             if not folder.exists():
                 continue
             try:
-                create_comic_archive(folder, comment=comment)
-            except FileNotFoundError:
-                pass
+                create_comic_archive(folder, comic_info=comic_info)
+            except Exception as e:
+                spider.logger.error('Failed to create cbz file: %r', e)

     def process_item(self, item, spider):
         if hasattr(item, 'get_comic_info'):
             title = item.get_folder_name(spider)
             if (self.files_path / f'{title}.cbz').exists():
-                raise DropItem(f'Comic file of "{title}" already exist, stop download this comic.')
-            comment = item.get_comic_info()
-            self.comic_comments[title] = bytes(comment, encoding='utf-8')
+                raise DropItem(f'Comic "{title}" already exist, stop downloading this comic.')
+            self.comics[title] = item.get_comic_info()

         return super().process_item(item, spider)
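
The pipeline now hands create_comic_archive the metadata dict itself (comic_info=...) rather than pre-encoded bytes, so the JSON encoding presumably moved into that helper, whose diff is not shown here. A sketch of what it plausibly does under that assumption, given that the ComicBookInfo/1.0 convention stores its JSON in the zip archive comment:

import json
import zipfile
from pathlib import Path


def create_comic_archive(folder: Path, comic_info: dict) -> Path:
    # Assumed implementation: pack every downloaded page into <title>.cbz
    # and embed the metadata as the archive comment, per ComicBookInfo.
    archive = folder.parent / f'{folder.name}.cbz'
    with zipfile.ZipFile(archive, 'w') as zf:
        for page in sorted(folder.iterdir()):
            zf.write(page, arcname=page.name)
        zf.comment = json.dumps(comic_info, ensure_ascii=False).encode('utf-8')
    return archive

Broadening the except clause also changes failure behavior: an error on one gallery is now logged via spider.logger instead of either passing silently (FileNotFoundError) or aborting close_spider entirely (anything else).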

src/favorites_crawler/processors.py

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import re
 from datetime import datetime

@@ -163,3 +165,7 @@ def fix_tweet_media_url(url):

 def tweet_time_2_datetime(tweet_time):
     return datetime.strptime(tweet_time, '%a %b %d %H:%M:%S %z %Y')
+
+
+def join_nhentai_title(parts: list[str]) -> str:
+    return ' '.join(map(lambda s: s.strip(), parts))
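
The from __future__ import annotations line is what lets join_nhentai_title annotate its parameter as list[str] on interpreters older than Python 3.9 (assuming the project still supports them): under PEP 563 the annotation is stored as a string and never evaluated, so subscripting the builtin list cannot fail at definition time. A minimal illustration:

from __future__ import annotations


def join(parts: list[str]) -> str:  # annotation not evaluated, safe on 3.8
    return ' '.join(parts)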

src/favorites_crawler/spiders/__init__.py

Lines changed: 39 additions & 0 deletions
@@ -5,8 +5,47 @@

 from abc import ABCMeta

+from scrapy.exceptions import CloseSpider
 from scrapy.spiders import CrawlSpider

+from favorites_crawler.utils.common import get_favors_home
+from favorites_crawler.utils.config import load_config, dump_config
+

 class BaseSpider(CrawlSpider, metaclass=ABCMeta):
     custom_settings = {}
+    cookies = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.last_bookmark_id = self.custom_settings.get('LAST_BOOKMARK_ID')
+        self.last_bookmark_id_updated = False
+
+    def close_spider_when_bookmark_not_updated(self, response, **kwargs):
+        """Close spider when bookmark not updated"""
+        last_bookmark_id = self.get_last_bookmark_id(response, **kwargs)
+        self._close_spider_when_bookmark_not_updated(last_bookmark_id)
+        self.update_last_bookmark_id(last_bookmark_id)
+
+    def get_last_bookmark_id(self, response, **kwargs):
+        """Get last bookmark id from start_url response"""
+        raise NotImplementedError()
+
+    def _close_spider_when_bookmark_not_updated(self, bookmark_id):
+        """Close spider when current bookmark id equals to last bookmark id."""
+        if self.last_bookmark_id and (self.last_bookmark_id == bookmark_id):
+            self.logger.info('Bookmark not updated, closing spider.')
+            raise CloseSpider('fastly-finished')
+
+    def update_last_bookmark_id(self, bookmark_id):
+        """Update last bookmark id"""
+        if not bookmark_id or self.last_bookmark_id_updated:
+            return
+        self.last_bookmark_id = bookmark_id
+        self.last_bookmark_id_updated = True
+        favors_home = get_favors_home()
+        config = load_config(favors_home)
+        spider_config = config.setdefault(self.name, {})
+        spider_config['LAST_BOOKMARK_ID'] = bookmark_id
+        dump_config(config, favors_home)
+        self.logger.info('Updated LAST_BOOKMARK_ID: %s', bookmark_id)
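
This is the core of the fast-crawl feature: a subclass supplies get_last_bookmark_id and calls close_spider_when_bookmark_not_updated from its first-page callback; the base class raises CloseSpider('fastly-finished') when the newest bookmark matches the LAST_BOOKMARK_ID persisted in the config, and otherwise stores the new ID for the next run. Scrapy records the CloseSpider reason in the finish_reason stat, which is exactly what the spider_closed handler in crawl.py checks to skip the "nothing was crawled" warning. A hypothetical subclass (the concrete spiders' diffs are not in this excerpt, and the selector is made up):

from favorites_crawler.spiders import BaseSpider


class ExampleFavoritesSpider(BaseSpider):
    name = 'example'

    def parse_start_url(self, response, **kwargs):
        # Closes the spider with reason 'fastly-finished' if the newest
        # bookmark is the same one seen on the previous run.
        self.close_spider_when_bookmark_not_updated(response)
        return super().parse_start_url(response, **kwargs)

    def get_last_bookmark_id(self, response, **kwargs):
        # Made-up selector: the ID of the newest favorite on page one.
        return response.css('.favorite::attr(data-id)').get()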
