
Commit 894dabc

refactor: restructure data storage and split the storage implementations by type
1 parent e31aebb commit 894dabc

37 files changed, +1427 −864 lines changed

base/base_crawler.py

Lines changed: 10 additions & 0 deletions
@@ -39,3 +39,13 @@ async def login_by_mobile(self):
     @abstractmethod
     async def login_by_cookies(self):
         pass
+
+
+class AbstractStore(ABC):
+    @abstractmethod
+    async def store_content(self, content_item: Dict):
+        pass
+
+    @abstractmethod
+    async def store_comment(self, comment_item: Dict):
+        pass

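Note: AbstractStore is the new contract that each storage backend (csv, db, json) is expected to implement. A minimal sketch of one possible concrete backend follows; the CsvStoreExample class and its file layout are illustrative assumptions, only AbstractStore and its two abstract methods come from this commit.

import csv
import pathlib
from typing import Dict

from base.base_crawler import AbstractStore


class CsvStoreExample(AbstractStore):
    # Hypothetical CSV backend used only to show the shape of an implementation.
    def __init__(self, platform: str, data_dir: str = "data"):
        self.platform = platform
        self.data_dir = pathlib.Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def _write_row(self, file_name: str, item: Dict) -> None:
        # Append one dict as a CSV row, writing the header when the file is new.
        file_path = self.data_dir / file_name
        write_header = not file_path.exists()
        with open(file_path, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(item.keys()))
            if write_header:
                writer.writeheader()
            writer.writerow(item)

    async def store_content(self, content_item: Dict):
        self._write_row(f"{self.platform}_contents.csv", content_item)

    async def store_comment(self, comment_item: Dict):
        self._write_row(f"{self.platform}_comments.csv", comment_item)
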
config/base_config.py

Lines changed: 7 additions & 1 deletion
@@ -20,6 +20,9 @@
 # Whether to persist the login state
 SAVE_LOGIN_STATE = True
 
+# Data storage option; three types are supported: csv, db, json
+SAVE_DATA_OPTION = "csv"  # csv or db or json
+
 # Browser user-data directory configuration
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
 
@@ -54,7 +57,10 @@
 ]
 
 # List of Kuaishou video IDs to crawl
-KS_SPECIFIED_ID_LIST = []
+KS_SPECIFIED_ID_LIST = [
+    "3xf8enb8dbj6uig",
+    "3x6zz972bchmvqe"
+]
 
 # List of Bilibili video bvids to crawl
 BILI_SPECIFIED_ID_LIST = [

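Note: SAVE_DATA_OPTION is what the crawlers consult when choosing a storage backend. A hedged sketch of how such an option could be dispatched to an AbstractStore implementation; the stub classes and the create_store_example factory below are assumptions for illustration, not code from this commit.

from typing import Dict, Type

from base.base_crawler import AbstractStore


class _CsvStoreStub(AbstractStore):
    async def store_content(self, content_item: Dict): ...
    async def store_comment(self, comment_item: Dict): ...


class _DbStoreStub(AbstractStore):
    async def store_content(self, content_item: Dict): ...
    async def store_comment(self, comment_item: Dict): ...


class _JsonStoreStub(AbstractStore):
    async def store_content(self, content_item: Dict): ...
    async def store_comment(self, comment_item: Dict): ...


def create_store_example(save_data_option: str) -> AbstractStore:
    # Map each supported option to a backend class; fail early on typos.
    registry: Dict[str, Type[AbstractStore]] = {
        "csv": _CsvStoreStub,
        "db": _DbStoreStub,
        "json": _JsonStoreStub,
    }
    if save_data_option not in registry:
        raise ValueError(f"Unsupported SAVE_DATA_OPTION: {save_data_option}")
    return registry[save_data_option]()

For example, create_store_example(config.SAVE_DATA_OPTION) would return the CSV stub under the default configuration above.
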
config/db_config.py

Lines changed: 0 additions & 3 deletions
@@ -7,6 +7,3 @@
 # mysql config
 RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")  # your relation db password
 RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler"
-
-# save data to database option
-IS_SAVED_DATABASED = False  # if you want to save data to database, set True

db.py

Lines changed: 8 additions & 2 deletions
@@ -1,14 +1,20 @@
+from typing import List
+
 from tortoise import Tortoise, run_async
 
 from config.db_config import *
 from tools import utils
 
 
+def get_platform_models() -> List[str]:
+    models = ["store.xhs", "store.douyin", "store.bilibili", "store.kuaishou", "store.weibo"]
+    return models
+
+
 async def init_db(create_db: bool = False) -> None:
     await Tortoise.init(
         db_url=RELATION_DB_URL,
-        modules={'models': ['models']},
-        # modules={'models': ['models.kuaishou']},  # generate special table
+        modules={'models': get_platform_models()},
         _create_db=create_db
     )
 

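Note: get_platform_models() now points Tortoise ORM at one model module per platform under the new store package. A short usage sketch, assuming the documented Tortoise API; generate_schemas and close_connections come from Tortoise itself, not from this commit.

import asyncio

from tortoise import Tortoise

import db


async def init_example() -> None:
    await db.init_db(create_db=True)   # registers store.xhs, store.douyin, ... via get_platform_models()
    await Tortoise.generate_schemas()  # create the tables for every registered model
    await Tortoise.close_connections()


if __name__ == "__main__":
    asyncio.run(init_example())
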
main.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ async def main():
                         choices=["search", "detail"], default=config.CRAWLER_TYPE)
 
     # init db
-    if config.IS_SAVED_DATABASED:
+    if config.SAVE_DATA_OPTION == "db":
         await db.init_db()
 
     args = parser.parse_args()

media_platform/bilibili/core.py

Lines changed: 8 additions & 9 deletions
@@ -6,19 +6,18 @@
 import asyncio
 import os
 import random
-import time
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple
 
 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import bilibili
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import bilibili as bilibili_store
 from tools import utils
-from var import comment_tasks_var, crawler_type_var
+from var import crawler_type_var
 
 from .client import BilibiliClient
 from .exception import DataFetchError
@@ -88,7 +87,6 @@ async def start(self):
             pass
         utils.logger.info("[BilibiliCrawler.start] Bilibili Crawler finished ...")
 
-
     async def search(self):
         """
         search bilibili video with keywords
@@ -118,7 +116,7 @@ async def search(self):
                 for video_item in video_items:
                     if video_item:
                         video_id_list.append(video_item.get("View").get("aid"))
-                        await bilibili.update_bilibili_video(video_item)
+                        await bilibili_store.update_bilibili_video(video_item)
 
                 page += 1
                 await self.batch_get_video_comments(video_id_list)
@@ -150,7 +148,7 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
                 await self.bili_client.get_video_all_comments(
                     video_id=video_id,
                     crawl_interval=random.random(),
-                    callback=bilibili.batch_update_bilibili_video_comments
+                    callback=bilibili_store.batch_update_bilibili_video_comments
                 )
 
             except DataFetchError as ex:
@@ -176,7 +174,7 @@ async def get_specified_videos(self):
             video_aid: str = video_item_view.get("aid")
             if video_aid:
                 video_aids_list.append(video_aid)
-            await bilibili.update_bilibili_video(video_detail)
+            await bilibili_store.update_bilibili_video(video_detail)
         await self.batch_get_video_comments(video_aids_list)
 
     async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -195,7 +193,8 @@ async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Sema
             utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
             return None
         except KeyError as ex:
-            utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
+            utils.logger.error(
+                f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
             return None
 
     async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:

media_platform/bilibili/login.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # @Author : [email protected]
 # @Time : 2023/12/2 18:44
-# @Desc : bilibili login class implementation
+# @Desc : bilibili login implementation class
 
 import asyncio
 import functools

media_platform/douyin/core.py

Lines changed: 4 additions & 4 deletions
@@ -8,8 +8,8 @@
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import douyin
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import douyin as douyin_store
 from tools import utils
 from var import crawler_type_var
 
@@ -99,7 +99,7 @@ async def search(self) -> None:
                     except TypeError:
                         continue
                     aweme_list.append(aweme_info.get("aweme_id", ""))
-                    await douyin.update_douyin_aweme(aweme_item=aweme_info)
+                    await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
             utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
 
@@ -112,7 +112,7 @@ async def get_specified_awemes(self):
         aweme_details = await asyncio.gather(*task_list)
         for aweme_detail in aweme_details:
             if aweme_detail is not None:
-                await douyin.update_douyin_aweme(aweme_detail)
+                await douyin_store.update_douyin_aweme(aweme_detail)
         await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
 
     async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
@@ -146,7 +146,7 @@ async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_co
                    keywords=config.COMMENT_KEYWORDS  # keyword list
                )
                # the comments returned here are already filtered by keyword
-                await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
+                await douyin_store.batch_update_dy_aweme_comments(aweme_id, comments)
                utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
            except DataFetchError as e:
                utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")

media_platform/kuaishou/client.py

Lines changed: 12 additions & 4 deletions
@@ -10,7 +10,7 @@
 import config
 from tools import utils
 
-from .exception import DataFetchError, IPBlockError
+from .exception import DataFetchError
 from .graphql import KuaiShouGraphQL
 
 
@@ -56,13 +56,21 @@ async def post(self, uri: str, data: dict) -> Dict:
         return await self.request(method="POST", url=f"{self._host}{uri}",
                                   data=json_str, headers=self.headers)
 
-    @staticmethod
-    async def pong() -> bool:
+    async def pong(self) -> bool:
         """get a note to check if login state is ok"""
         utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
         ping_flag = False
         try:
-            pass
+            post_data = {
+                "operationName": "visionProfileUserList",
+                "variables": {
+                    "ftype": 1,
+                },
+                "query": self.graphql.get("vision_profile")
+            }
+            res = await self.post("", post_data)
+            if res.get("visionProfileUserList", {}).get("result") == 1:
+                ping_flag = True
         except Exception as e:
             utils.logger.error(f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again...")
             ping_flag = False

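Note: pong() now performs a real session check by posting the visionProfileUserList GraphQL query and treating result == 1 as a valid login. A hedged sketch of how a caller might use it; the ks_login.begin() and update_cookies() calls are assumptions drawn from the crawler's usual flow, not from this hunk.

async def ensure_kuaishou_login_example(ks_client, ks_login, browser_context) -> None:
    # Re-login only when the GraphQL ping reports an invalid session.
    if not await ks_client.pong():
        await ks_login.begin()
        await ks_client.update_cookies(browser_context=browser_context)
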
media_platform/kuaishou/core.py

Lines changed: 4 additions & 4 deletions
@@ -10,8 +10,8 @@
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import kuaishou
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import kuaishou as kuaishou_store
 from tools import utils
 from var import comment_tasks_var, crawler_type_var
 
@@ -106,7 +106,7 @@ async def search(self):
 
                 for video_detail in vision_search_photo.get("feeds"):
                     video_id_list.append(video_detail.get("photo", {}).get("id"))
-                    await kuaishou.update_kuaishou_video(video_item=video_detail)
+                    await kuaishou_store.update_kuaishou_video(video_item=video_detail)
 
                 # batch fetch video comments
                 page += 1
@@ -121,7 +121,7 @@ async def get_specified_videos(self):
         video_details = await asyncio.gather(*task_list)
         for video_detail in video_details:
             if video_detail is not None:
-                await kuaishou.update_kuaishou_video(video_detail)
+                await kuaishou_store.update_kuaishou_video(video_detail)
         await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)
 
     async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -167,7 +167,7 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
                 await self.ks_client.get_video_all_comments(
                     photo_id=video_id,
                     crawl_interval=random.random(),
-                    callback=kuaishou.batch_update_ks_video_comments
+                    callback=kuaishou_store.batch_update_ks_video_comments
                 )
             except DataFetchError as ex:
                 utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}")

media_platform/kuaishou/graphql.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ def __init__(self):
         self.load_graphql_queries()
 
     def load_graphql_queries(self):
-        graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql"]
+        graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql"]
 
         for file in graphql_files:
            with open(self.graphql_dir + file, mode="r") as f:
vision_profile.graphql (new file)

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+query visionProfileUserList($pcursor: String, $ftype: Int) {
+  visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
+    result
+    fols {
+      user_name
+      headurl
+      user_text
+      isFollowing
+      user_id
+      __typename
+    }
+    hostName
+    pcursor
+    __typename
+  }
+}

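Note: the new vision_profile.graphql query is picked up by load_graphql_queries and looked up by name in pong() via self.graphql.get("vision_profile"). A minimal sketch of that load-by-filename pattern, assuming a queries dict keyed on the filename without its extension; the real class is KuaiShouGraphQL in media_platform/kuaishou/graphql.py.

import os


class GraphQLRegistryExample:
    def __init__(self, graphql_dir: str):
        self.graphql_dir = graphql_dir
        self.graphql_queries = {}
        self.load_graphql_queries()

    def load_graphql_queries(self):
        graphql_files = ["search_query.graphql", "video_detail.graphql",
                         "comment_list.graphql", "vision_profile.graphql"]
        for file in graphql_files:
            with open(os.path.join(self.graphql_dir, file), mode="r") as f:
                # "vision_profile.graphql" is stored under the key "vision_profile".
                self.graphql_queries[file.replace(".graphql", "")] = f.read()

    def get(self, query_name: str) -> str:
        return self.graphql_queries.get(query_name, "Query not found")
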
media_platform/weibo/core.py

Lines changed: 5 additions & 27 deletions
@@ -15,8 +15,8 @@
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import weibo
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import weibo as weibo_store
 from tools import utils
 from var import crawler_type_var
 
@@ -120,7 +120,7 @@ async def search(self):
                     if note_item:
                         mblog: Dict = note_item.get("mblog")
                         note_id_list.append(mblog.get("id"))
-                        await weibo.update_weibo_note(note_item)
+                        await weibo_store.update_weibo_note(note_item)
 
                 page += 1
                 await self.batch_get_notes_comments(note_id_list)
@@ -138,7 +138,7 @@ async def get_specified_notes(self):
         video_details = await asyncio.gather(*task_list)
         for note_item in video_details:
             if note_item:
-                await weibo.update_weibo_note(note_item)
+                await weibo_store.update_weibo_note(note_item)
         await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST)
 
     async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -184,33 +184,11 @@ async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
         async with semaphore:
             try:
                 utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
-
-                # Read keyword and quantity from config
-                keywords = config.COMMENT_KEYWORDS
-                max_comments = config.MAX_COMMENTS_PER_POST
-
-                # Download comments
-                all_comments = await self.wb_client.get_note_all_comments(
+                await self.wb_client.get_note_all_comments(
                     note_id=note_id,
                     crawl_interval=random.randint(1,10),  # Weibo rate-limits its API aggressively, so use a longer delay
+                    callback=weibo_store.batch_update_weibo_note_comments
                 )
-
-                # Filter comments by keyword
-                if keywords:
-                    filtered_comments = [
-                        comment for comment in all_comments if
-                        any(keyword in comment["content"]["message"] for keyword in keywords)
-                    ]
-                else:
-                    filtered_comments = all_comments
-
-                # Limit the number of comments
-                if max_comments > 0:
-                    filtered_comments = filtered_comments[:max_comments]
-
-                # Update weibo note comments
-                await weibo.batch_update_weibo_note_comments(note_id, filtered_comments)
-
             except DataFetchError as ex:
                 utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
             except Exception as e:

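Note: the Weibo comment path no longer collects, filters, and saves comments inside the crawler; get_note_all_comments now receives weibo_store.batch_update_weibo_note_comments as a callback, so each page is persisted as it is fetched. A hedged sketch of that callback-driven paging pattern; fetch_page and its response shape are hypothetical, not the real WeiboClient API.

import asyncio
from typing import Awaitable, Callable, Dict, List, Optional


async def get_all_comments_example(
    note_id: str,
    fetch_page: Callable[[str, str], Awaitable[Dict]],
    callback: Optional[Callable[[str, List[Dict]], Awaitable[None]]] = None,
    crawl_interval: float = 1.0,
) -> List[Dict]:
    result: List[Dict] = []
    cursor = ""
    while True:
        page = await fetch_page(note_id, cursor)   # hypothetical page fetcher
        comments = page.get("comments", [])
        if callback:
            await callback(note_id, comments)      # persist each page immediately
        result.extend(comments)
        cursor = page.get("next_cursor", "")
        if not comments or not cursor:
            break
        await asyncio.sleep(crawl_interval)        # keep a polite crawl interval
    return result
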
media_platform/xhs/core.py

Lines changed: 4 additions & 4 deletions
@@ -9,8 +9,8 @@
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import xiaohongshu as xhs_model
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import xhs as xhs_store
 from tools import utils
 from var import crawler_type_var
 
@@ -112,7 +112,7 @@ async def search(self) -> None:
                 note_details = await asyncio.gather(*task_list)
                 for note_detail in note_details:
                     if note_detail is not None:
-                        await xhs_model.update_xhs_note(note_detail)
+                        await xhs_store.update_xhs_note(note_detail)
                         note_id_list.append(note_detail.get("note_id"))
                 page += 1
                 utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
@@ -127,7 +127,7 @@ async def get_specified_notes(self):
         note_details = await asyncio.gather(*task_list)
         for note_detail in note_details:
             if note_detail is not None:
-                await xhs_model.update_xhs_note(note_detail)
+                await xhs_store.update_xhs_note(note_detail)
         await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
 
     async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -174,7 +174,7 @@ async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
 
             # Update or save the filtered comments
             for comment in filtered_comments:
-                await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
+                await xhs_store.update_xhs_note_comment(note_id=note_id, comment_item=comment)
 
     @staticmethod
     def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:

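Note: the per-crawler change is uniform across platforms: save calls now go through a store facade module (store.xhs, store.douyin, store.bilibili, store.kuaishou, store.weibo) instead of the old models package. A minimal sketch of what such a facade might look like; only the function names update_xhs_note and update_xhs_note_comment appear in this diff, everything else (the placeholder backend and the field handling) is an assumption.

from typing import Dict

from base.base_crawler import AbstractStore


class _NullStore(AbstractStore):
    # Stand-in backend so the sketch runs on its own; a real facade would pick
    # the csv/db/json implementation based on config.SAVE_DATA_OPTION.
    async def store_content(self, content_item: Dict):
        print("store content:", content_item)

    async def store_comment(self, comment_item: Dict):
        print("store comment:", comment_item)


def _get_store() -> AbstractStore:
    return _NullStore()


async def update_xhs_note(note_item: Dict):
    await _get_store().store_content(note_item)


async def update_xhs_note_comment(note_id: str, comment_item: Dict):
    await _get_store().store_comment({**comment_item, "note_id": note_id})
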
models/__init__.py

Lines changed: 0 additions & 5 deletions
This file was deleted.
