Skip to content

Commit e757074

Browse files
committed
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中
1 parent 745e59c commit e757074

File tree

20 files changed

+339
-169
lines changed

20 files changed

+339
-169
lines changed

README.md

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -21,20 +21,22 @@
2121
- [x] 抖音登录(二维码、手机号、cookies)
2222
- [x] 抖音滑块(模拟滑动实现,准确率不太OK)
2323
- [x] 支持登录成功后的上下文浏览器环境保留
24+
- [x] 数据持久化到硬盘(关系型数据库)
2425

25-
## 待实现
26-
27-
- [ ] 数据持久化到硬盘
2826

2927
## 使用方法
3028

3129
1. 安装依赖库
3230
`pip install -r requirements.txt`
3331
2. 安装playwright浏览器驱动
3432
`playwright install`
35-
3. 运行爬虫程序
33+
3. 是否选择开启保存数据到DB中
34+
如果选择开启,则需要配置数据库连接信息,即 `config/db_config.py` 中的 `IS_SAVED_DATABASED` 和 `RELATION_DB_URL` 变量
35+
<br>再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构
36+
4. 运行爬虫程序
3637
`python main.py --platform xhs --lt qrcode`
37-
4. 打开小红书扫二维码登录
38+
5. 打开对应APP扫二维码登录
39+
3840

3941
## 项目代码结构
4042

@@ -67,24 +69,16 @@ MediaCrawler
6769
│ ├── help.py # 辅助函数
6870
│ └── login.py # 登录实现
6971
├── modles
70-
│ ├── douyin
71-
│ │ └── m_douyin.py
72-
│ └── xhs
73-
│ └── m_xhs.py
72+
│ ├── douyin.py # 抖音数据模型
73+
│ └── xiaohongshu.py # 小红书数据模型
7474
├── tools
7575
│ └── utils.py # 工具函数
7676
├── main.py # 程序入口
7777
└── recv_sms_notification.py # 短信转发器的HTTP SERVER接口
7878
```
79+
## 数据持久化
7980

80-
## 小红书运行截图
81-
82-
![小红书运行截图](https://s2.loli.net/2023/06/09/PVBe3X5vf4yncrd.gif)
83-
84-
## 抖音运行截图
85-
86-
- ![抖音运行截图](https://s2.loli.net/2023/06/25/GXfkeLhpTyNiAqH.gif)
87-
81+
![数据持久化](https://s2.loli.net/2023/07/24/ZTcGWz8jPAy7b5M.png)
8882

8983
## 支持一下
9084

config/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from .base_config import *
22
from .account_config import *
3+
from .db_config import *

config/base_config.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,7 @@
22
PLATFORM = "xhs"
33
KEYWORDS = "健身,旅游"
44
LOGIN_TYPE = "qrcode" # qrcode or phone or cookies
5-
COOKIES = "web_session=xxxxcfed1566xxxxxxxxxxxxxxxxxxx;" # if platform is xhs, pleas set only web_session cookie attr
6-
7-
# redis config
8-
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host
9-
REDIS_DB_PWD = "123456" # your redis password
5+
COOKIES = "web_session=xxxxcfed1566xxxxxxxxxxxxxxxxxxx;" # if platform is xhs, pleas set only web_session cookie attr
106

117
# enable ip proxy
128
ENABLE_IP_PROXY = False
@@ -18,7 +14,7 @@
1814
HEADLESS = True
1915

2016
# save login state
21-
SAVE_LOGIN_STATE = False
17+
SAVE_LOGIN_STATE = True
2218

2319
# save user data dir
2420
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name

config/db_config.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# redis config
2+
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host
3+
REDIS_DB_PWD = "123456" # your redis password
4+
5+
# mysql config
6+
RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler"
7+
8+
# save data to database option
9+
IS_SAVED_DATABASED = True # if you want to save data to database, set True

db.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from tortoise import Tortoise
2+
from tortoise import run_async
3+
4+
from config.db_config import *
5+
6+
from tools import utils
7+
8+
9+
async def init_db(create_db: bool = False) -> None:
10+
await Tortoise.init(
11+
db_url=RELATION_DB_URL,
12+
modules={'models': ['models']},
13+
_create_db=create_db
14+
)
15+
16+
17+
async def init():
18+
await init_db(create_db=True)
19+
await Tortoise.generate_schemas()
20+
utils.logger.info("Init DB Success!")
21+
22+
23+
if __name__ == '__main__':
24+
run_async(init())

images/douyin.gif

-2.37 MB
Binary file not shown.

images/xiaoshongshu.gif

-2.88 MB
Binary file not shown.

main.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
import asyncio
33
import argparse
44

5+
import db
56
import config
6-
from tools import utils
77
from base import proxy_account_pool
88
from media_platform.douyin import DouYinCrawler
99
from media_platform.xhs import XiaoHongShuCrawler
@@ -29,6 +29,10 @@ async def main():
2929
# init account pool
3030
account_pool = proxy_account_pool.create_account_pool()
3131

32+
# init db
33+
if config.IS_SAVED_DATABASED:
34+
await db.init_db()
35+
3236
args = parser.parse_args()
3337
crawler = CrawlerFactory().create_crawler(platform=args.platform)
3438
crawler.init_config(

media_platform/douyin/core.py

Lines changed: 26 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@ class DouYinCrawler(AbstractCrawler):
2323
dy_client: DOUYINClient
2424

2525
def __init__(self) -> None:
26-
self.browser_context: Optional[BrowserContext] = None # type: ignore
27-
self.context_page: Optional[Page] = None # type: ignore
26+
self.browser_context: Optional[BrowserContext] = None # type: ignore
27+
self.context_page: Optional[Page] = None # type: ignore
2828
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
2929
self.index_url = "https://www.douyin.com"
30-
self.command_args: Optional[Namespace] = None # type: ignore
31-
self.account_pool: Optional[AccountPool] = None # type: ignore
30+
self.command_args: Optional[Namespace] = None # type: ignore
31+
self.account_pool: Optional[AccountPool] = None # type: ignore
3232

3333
def init_config(self, **kwargs):
3434
for key, value in kwargs.items():
@@ -53,7 +53,7 @@ async def start(self) -> None:
5353
self.dy_client = await self.create_douyin_client(httpx_proxy)
5454
if not await self.dy_client.ping(browser_context=self.browser_context):
5555
login_obj = DouYinLogin(
56-
login_type=self.command_args.lt, # type: ignore
56+
login_type=self.command_args.lt, # type: ignore
5757
login_phone=account_phone,
5858
browser_context=self.browser_context,
5959
context_page=self.context_page,
@@ -88,35 +88,37 @@ async def search_posts(self) -> None:
8888
post_item.get("aweme_mix_info", {}).get("mix_items")[0]
8989
except TypeError:
9090
continue
91-
aweme_list.append(aweme_info.get("aweme_id",""))
91+
aweme_list.append(aweme_info.get("aweme_id", ""))
9292
await douyin.update_douyin_aweme(aweme_item=aweme_info)
9393
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
94-
# await self.batch_get_note_comments(aweme_list)
94+
await self.batch_get_note_comments(aweme_list)
9595

9696
async def batch_get_note_comments(self, aweme_list: List[str]):
9797
task_list: List[Task] = []
98+
_semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
9899
for aweme_id in aweme_list:
99-
task = asyncio.create_task(self.get_comments(aweme_id), name=aweme_id)
100+
task = asyncio.create_task(self.get_comments(aweme_id, _semaphore), name=aweme_id)
100101
task_list.append(task)
101102
await asyncio.wait(task_list)
102103

103-
async def get_comments(self, aweme_id: str):
104-
try:
105-
await self.dy_client.get_aweme_all_comments(
106-
aweme_id=aweme_id,
107-
callback=douyin.batch_update_dy_aweme_comments
108-
)
109-
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
110-
except DataFetchError as e:
111-
utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
104+
async def get_comments(self, aweme_id: str, semaphore: "asyncio.Semaphore"):
105+
async with semaphore:
106+
try:
107+
await self.dy_client.get_aweme_all_comments(
108+
aweme_id=aweme_id,
109+
callback=douyin.batch_update_dy_aweme_comments
110+
)
111+
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
112+
except DataFetchError as e:
113+
utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
112114

113115
def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
114116
"""Create proxy info for playwright and httpx"""
115117
if not config.ENABLE_IP_PROXY:
116118
return None, None, None
117119

118120
# phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
119-
phone, ip_proxy = self.account_pool.get_account() # type: ignore
121+
phone, ip_proxy = self.account_pool.get_account() # type: ignore
120122
playwright_proxy = {
121123
"server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
122124
"username": config.IP_PROXY_USER,
@@ -127,7 +129,7 @@ def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str
127129

128130
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
129131
"""Create douyin client"""
130-
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
132+
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
131133
douyin_client = DOUYINClient(
132134
proxies=httpx_proxy,
133135
headers={
@@ -152,18 +154,19 @@ async def launch_browser(
152154
) -> BrowserContext:
153155
"""Launch browser and create browser context"""
154156
if config.SAVE_LOGIN_STATE:
155-
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform) # type: ignore
157+
user_data_dir = os.path.join(os.getcwd(), "browser_data",
158+
config.USER_DATA_DIR % self.command_args.platform) # type: ignore
156159
browser_context = await chromium.launch_persistent_context(
157160
user_data_dir=user_data_dir,
158161
accept_downloads=True,
159162
headless=headless,
160-
proxy=playwright_proxy, # type: ignore
163+
proxy=playwright_proxy, # type: ignore
161164
viewport={"width": 1920, "height": 1080},
162165
user_agent=user_agent
163-
) # type: ignore
166+
) # type: ignore
164167
return browser_context
165168
else:
166-
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
169+
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
167170
browser_context = await browser.new_context(
168171
viewport={"width": 1920, "height": 1080},
169172
user_agent=user_agent

media_platform/xhs/client.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,15 @@ async def post(self, uri: str, data: dict) -> Dict:
8383
async def ping(self) -> bool:
8484
"""get a note to check if login state is ok"""
8585
utils.logger.info("begin to ping xhs...")
86-
note_id = "5e5cb38a000000000100185e"
86+
ping_flag = False
8787
try:
88-
note_card: Dict = await self.get_note_by_id(note_id)
89-
return note_card.get("note_id") == note_id
90-
except Exception:
91-
return False
88+
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
89+
if note_card.get("items"):
90+
ping_flag = True
91+
except Exception as e:
92+
utils.logger.error(f"ping xhs failed: {e}")
93+
ping_flag = False
94+
return ping_flag
9295

9396
async def update_cookies(self, browser_context: BrowserContext):
9497
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())

media_platform/xhs/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from .exception import *
1616
from .login import XHSLogin
1717
from .client import XHSClient
18-
from models import xhs as xhs_model
18+
from models import xiaohongshu as xhs_model
1919
from base.base_crawler import AbstractCrawler
2020
from base.proxy_account_pool import AccountPool
2121

models/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .douyin import *
2+
from .xiaohongshu import *
3+

0 commit comments

Comments
 (0)