
Commit 894dabc

refactor: restructure data storage and split the storage implementations by type
1 parent e31aebb commit 894dabc

37 files changed, +1427 −864 lines changed

base/base_crawler.py

Lines changed: 10 additions & 0 deletions
@@ -39,3 +39,13 @@ async def login_by_mobile(self):
     @abstractmethod
     async def login_by_cookies(self):
         pass
+
+
+class AbstractStore(ABC):
+    @abstractmethod
+    async def store_content(self, content_item: Dict):
+        pass
+
+    @abstractmethod
+    async def store_comment(self, comment_item: Dict):
+        pass

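Note: AbstractStore is the new contract that each storage backend (csv, db, json) is expected to implement. A minimal sketch of one possible concrete backend follows; the CsvStoreExample class and its file layout are illustrative assumptions, only AbstractStore and its two abstract methods come from this commit.

import csv
import pathlib
from typing import Dict

from base.base_crawler import AbstractStore


class CsvStoreExample(AbstractStore):
    # Hypothetical CSV backend used only to show the shape of an implementation.
    def __init__(self, platform: str, data_dir: str = "data"):
        self.platform = platform
        self.data_dir = pathlib.Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def _write_row(self, file_name: str, item: Dict) -> None:
        # Append one dict as a CSV row, writing the header when the file is new.
        file_path = self.data_dir / file_name
        write_header = not file_path.exists()
        with open(file_path, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(item.keys()))
            if write_header:
                writer.writeheader()
            writer.writerow(item)

    async def store_content(self, content_item: Dict):
        self._write_row(f"{self.platform}_contents.csv", content_item)

    async def store_comment(self, comment_item: Dict):
        self._write_row(f"{self.platform}_comments.csv", comment_item)
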
config/base_config.py

Lines changed: 7 additions & 1 deletion
@@ -20,6 +20,9 @@
 # Whether to persist the login state
 SAVE_LOGIN_STATE = True
 
+# Data storage option; three types are supported: csv, db, json
+SAVE_DATA_OPTION = "csv"  # csv or db or json
+
 # Browser user-data directory configuration
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
 
@@ -54,7 +57,10 @@
 ]
 
 # List of Kuaishou video IDs to crawl
-KS_SPECIFIED_ID_LIST = []
+KS_SPECIFIED_ID_LIST = [
+    "3xf8enb8dbj6uig",
+    "3x6zz972bchmvqe"
+]
 
 # List of Bilibili video bvids to crawl
 BILI_SPECIFIED_ID_LIST = [

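Note: SAVE_DATA_OPTION is what the crawlers consult when choosing a storage backend. A hedged sketch of how such an option could be dispatched to an AbstractStore implementation; the stub classes and the create_store_example factory below are assumptions for illustration, not code from this commit.

from typing import Dict, Type

from base.base_crawler import AbstractStore


class _CsvStoreStub(AbstractStore):
    async def store_content(self, content_item: Dict): ...
    async def store_comment(self, comment_item: Dict): ...


class _DbStoreStub(AbstractStore):
    async def store_content(self, content_item: Dict): ...
    async def store_comment(self, comment_item: Dict): ...


class _JsonStoreStub(AbstractStore):
    async def store_content(self, content_item: Dict): ...
    async def store_comment(self, comment_item: Dict): ...


def create_store_example(save_data_option: str) -> AbstractStore:
    # Map each supported option to a backend class; fail early on typos.
    registry: Dict[str, Type[AbstractStore]] = {
        "csv": _CsvStoreStub,
        "db": _DbStoreStub,
        "json": _JsonStoreStub,
    }
    if save_data_option not in registry:
        raise ValueError(f"Unsupported SAVE_DATA_OPTION: {save_data_option}")
    return registry[save_data_option]()

For example, create_store_example(config.SAVE_DATA_OPTION) would return the CSV stub under the default configuration above.
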
config/db_config.py

Lines changed: 0 additions & 3 deletions
@@ -7,6 +7,3 @@
 # mysql config
 RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")  # your relation db password
 RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler"
-
-# save data to database option
-IS_SAVED_DATABASED = False  # if you want to save data to database, set True

db.py

Lines changed: 8 additions & 2 deletions
@@ -1,14 +1,20 @@
+from typing import List
+
 from tortoise import Tortoise, run_async
 
 from config.db_config import *
 from tools import utils
 
 
+def get_platform_models() -> List[str]:
+    models = ["store.xhs", "store.douyin", "store.bilibili", "store.kuaishou", "store.weibo"]
+    return models
+
+
 async def init_db(create_db: bool = False) -> None:
     await Tortoise.init(
         db_url=RELATION_DB_URL,
-        modules={'models': ['models']},
-        # modules={'models': ['models.kuaishou']},  # generate special table
+        modules={'models': get_platform_models()},
         _create_db=create_db
     )
 

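Note: get_platform_models() now points Tortoise ORM at one model module per platform under the new store package. A short usage sketch, assuming the documented Tortoise API; generate_schemas and close_connections come from Tortoise itself, not from this commit.

import asyncio

from tortoise import Tortoise

import db


async def init_example() -> None:
    await db.init_db(create_db=True)   # registers store.xhs, store.douyin, ... via get_platform_models()
    await Tortoise.generate_schemas()  # create the tables for every registered model
    await Tortoise.close_connections()


if __name__ == "__main__":
    asyncio.run(init_example())
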
main.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ async def main():
                         choices=["search", "detail"], default=config.CRAWLER_TYPE)
 
     # init db
-    if config.IS_SAVED_DATABASED:
+    if config.SAVE_DATA_OPTION == "db":
         await db.init_db()
 
     args = parser.parse_args()

media_platform/bilibili/core.py

Lines changed: 8 additions & 9 deletions
@@ -6,19 +6,18 @@
 import asyncio
 import os
 import random
-import time
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple
 
 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import bilibili
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import bilibili as bilibili_store
 from tools import utils
-from var import comment_tasks_var, crawler_type_var
+from var import crawler_type_var
 
 from .client import BilibiliClient
 from .exception import DataFetchError
@@ -88,7 +87,6 @@ async def start(self):
             pass
         utils.logger.info("[BilibiliCrawler.start] Bilibili Crawler finished ...")
 
-
     async def search(self):
         """
         search bilibili video with keywords
@@ -118,7 +116,7 @@ async def search(self):
                 for video_item in video_items:
                     if video_item:
                         video_id_list.append(video_item.get("View").get("aid"))
-                        await bilibili.update_bilibili_video(video_item)
+                        await bilibili_store.update_bilibili_video(video_item)
 
                 page += 1
                 await self.batch_get_video_comments(video_id_list)
@@ -150,7 +148,7 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
                 await self.bili_client.get_video_all_comments(
                     video_id=video_id,
                     crawl_interval=random.random(),
-                    callback=bilibili.batch_update_bilibili_video_comments
+                    callback=bilibili_store.batch_update_bilibili_video_comments
                 )
 
             except DataFetchError as ex:
@@ -176,7 +174,7 @@ async def get_specified_videos(self):
             video_aid: str = video_item_view.get("aid")
             if video_aid:
                 video_aids_list.append(video_aid)
-            await bilibili.update_bilibili_video(video_detail)
+            await bilibili_store.update_bilibili_video(video_detail)
         await self.batch_get_video_comments(video_aids_list)
 
     async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -195,7 +193,8 @@ async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Sema
             utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
             return None
         except KeyError as ex:
-            utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
+            utils.logger.error(
+                f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
             return None
 
     async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:

media_platform/bilibili/login.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # @Author : [email protected]
 # @Time : 2023/12/2 18:44
-# @Desc : bilibili login class implementation
+# @Desc : bilibili login implementation class
 
 import asyncio
 import functools

media_platform/douyin/core.py

Lines changed: 4 additions & 4 deletions
@@ -8,8 +8,8 @@
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import douyin
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import douyin as douyin_store
 from tools import utils
 from var import crawler_type_var
 
@@ -99,7 +99,7 @@ async def search(self) -> None:
                     except TypeError:
                         continue
                     aweme_list.append(aweme_info.get("aweme_id", ""))
-                    await douyin.update_douyin_aweme(aweme_item=aweme_info)
+                    await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
             utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
 
@@ -112,7 +112,7 @@ async def get_specified_awemes(self):
         aweme_details = await asyncio.gather(*task_list)
         for aweme_detail in aweme_details:
             if aweme_detail is not None:
-                await douyin.update_douyin_aweme(aweme_detail)
+                await douyin_store.update_douyin_aweme(aweme_detail)
         await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
 
     async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
@@ -146,7 +146,7 @@ async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_co
                    keywords=config.COMMENT_KEYWORDS  # keyword list
                )
                # the comments returned here are already filtered by keyword
-                await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
+                await douyin_store.batch_update_dy_aweme_comments(aweme_id, comments)
                utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
            except DataFetchError as e:
                utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")

media_platform/kuaishou/client.py

Lines changed: 12 additions & 4 deletions
@@ -10,7 +10,7 @@
 import config
 from tools import utils
 
-from .exception import DataFetchError, IPBlockError
+from .exception import DataFetchError
 from .graphql import KuaiShouGraphQL
 
 
@@ -56,13 +56,21 @@ async def post(self, uri: str, data: dict) -> Dict:
         return await self.request(method="POST", url=f"{self._host}{uri}",
                                   data=json_str, headers=self.headers)
 
-    @staticmethod
-    async def pong() -> bool:
+    async def pong(self) -> bool:
         """get a note to check if login state is ok"""
         utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
         ping_flag = False
         try:
-            pass
+            post_data = {
+                "operationName": "visionProfileUserList",
+                "variables": {
+                    "ftype": 1,
+                },
+                "query": self.graphql.get("vision_profile")
+            }
+            res = await self.post("", post_data)
+            if res.get("visionProfileUserList", {}).get("result") == 1:
+                ping_flag = True
         except Exception as e:
             utils.logger.error(f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again...")
             ping_flag = False

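Note: pong() now performs a real session check by posting the visionProfileUserList GraphQL query and treating result == 1 as a valid login. A hedged sketch of how a caller might use it; the ks_login.begin() and update_cookies() calls are assumptions drawn from the crawler's usual flow, not from this hunk.

async def ensure_kuaishou_login_example(ks_client, ks_login, browser_context) -> None:
    # Re-login only when the GraphQL ping reports an invalid session.
    if not await ks_client.pong():
        await ks_login.begin()
        await ks_client.update_cookies(browser_context=browser_context)
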
media_platform/kuaishou/core.py

Lines changed: 4 additions & 4 deletions
@@ -10,8 +10,8 @@
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import kuaishou
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import kuaishou as kuaishou_store
 from tools import utils
 from var import comment_tasks_var, crawler_type_var
 
@@ -106,7 +106,7 @@ async def search(self):
 
                 for video_detail in vision_search_photo.get("feeds"):
                     video_id_list.append(video_detail.get("photo", {}).get("id"))
-                    await kuaishou.update_kuaishou_video(video_item=video_detail)
+                    await kuaishou_store.update_kuaishou_video(video_item=video_detail)
 
                 # batch fetch video comments
                 page += 1
@@ -121,7 +121,7 @@ async def get_specified_videos(self):
         video_details = await asyncio.gather(*task_list)
         for video_detail in video_details:
             if video_detail is not None:
-                await kuaishou.update_kuaishou_video(video_detail)
+                await kuaishou_store.update_kuaishou_video(video_detail)
         await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)
 
     async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -167,7 +167,7 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
                 await self.ks_client.get_video_all_comments(
                     photo_id=video_id,
                     crawl_interval=random.random(),
-                    callback=kuaishou.batch_update_ks_video_comments
+                    callback=kuaishou_store.batch_update_ks_video_comments
                 )
             except DataFetchError as ex:
                 utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}")

media_platform/kuaishou/graphql.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ def __init__(self):
         self.load_graphql_queries()
 
     def load_graphql_queries(self):
-        graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql"]
+        graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql"]
 
         for file in graphql_files:
            with open(self.graphql_dir + file, mode="r") as f:
vision_profile.graphql (new file)

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+query visionProfileUserList($pcursor: String, $ftype: Int) {
+  visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
+    result
+    fols {
+      user_name
+      headurl
+      user_text
+      isFollowing
+      user_id
+      __typename
+    }
+    hostName
+    pcursor
+    __typename
+  }
+}

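Note: the new vision_profile.graphql query is picked up by load_graphql_queries and looked up by name in pong() via self.graphql.get("vision_profile"). A minimal sketch of that load-by-filename pattern, assuming a queries dict keyed on the filename without its extension; the real class is KuaiShouGraphQL in media_platform/kuaishou/graphql.py.

import os


class GraphQLRegistryExample:
    def __init__(self, graphql_dir: str):
        self.graphql_dir = graphql_dir
        self.graphql_queries = {}
        self.load_graphql_queries()

    def load_graphql_queries(self):
        graphql_files = ["search_query.graphql", "video_detail.graphql",
                         "comment_list.graphql", "vision_profile.graphql"]
        for file in graphql_files:
            with open(os.path.join(self.graphql_dir, file), mode="r") as f:
                # "vision_profile.graphql" is stored under the key "vision_profile".
                self.graphql_queries[file.replace(".graphql", "")] = f.read()

    def get(self, query_name: str) -> str:
        return self.graphql_queries.get(query_name, "Query not found")
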
media_platform/weibo/core.py

Lines changed: 5 additions & 27 deletions
@@ -15,8 +15,8 @@
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import weibo
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import weibo as weibo_store
 from tools import utils
 from var import crawler_type_var
 
@@ -120,7 +120,7 @@ async def search(self):
                     if note_item:
                         mblog: Dict = note_item.get("mblog")
                         note_id_list.append(mblog.get("id"))
-                        await weibo.update_weibo_note(note_item)
+                        await weibo_store.update_weibo_note(note_item)
 
                 page += 1
                 await self.batch_get_notes_comments(note_id_list)
@@ -138,7 +138,7 @@ async def get_specified_notes(self):
         video_details = await asyncio.gather(*task_list)
         for note_item in video_details:
             if note_item:
-                await weibo.update_weibo_note(note_item)
+                await weibo_store.update_weibo_note(note_item)
         await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST)
 
     async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -184,33 +184,11 @@ async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
         async with semaphore:
             try:
                 utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
-
-                # Read keyword and quantity from config
-                keywords = config.COMMENT_KEYWORDS
-                max_comments = config.MAX_COMMENTS_PER_POST
-
-                # Download comments
-                all_comments = await self.wb_client.get_note_all_comments(
+                await self.wb_client.get_note_all_comments(
                     note_id=note_id,
                     crawl_interval=random.randint(1,10),  # Weibo rate-limits its API aggressively, so use a longer delay
+                    callback=weibo_store.batch_update_weibo_note_comments
                 )
-
-                # Filter comments by keyword
-                if keywords:
-                    filtered_comments = [
-                        comment for comment in all_comments if
-                        any(keyword in comment["content"]["message"] for keyword in keywords)
-                    ]
-                else:
-                    filtered_comments = all_comments
-
-                # Limit the number of comments
-                if max_comments > 0:
-                    filtered_comments = filtered_comments[:max_comments]
-
-                # Update weibo note comments
-                await weibo.batch_update_weibo_note_comments(note_id, filtered_comments)
-
             except DataFetchError as ex:
                 utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
             except Exception as e:

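Note: the Weibo comment path no longer collects, filters, and saves comments inside the crawler; get_note_all_comments now receives weibo_store.batch_update_weibo_note_comments as a callback, so each page is persisted as it is fetched. A hedged sketch of that callback-driven paging pattern; fetch_page and its response shape are hypothetical, not the real WeiboClient API.

import asyncio
from typing import Awaitable, Callable, Dict, List, Optional


async def get_all_comments_example(
    note_id: str,
    fetch_page: Callable[[str, str], Awaitable[Dict]],
    callback: Optional[Callable[[str, List[Dict]], Awaitable[None]]] = None,
    crawl_interval: float = 1.0,
) -> List[Dict]:
    result: List[Dict] = []
    cursor = ""
    while True:
        page = await fetch_page(note_id, cursor)   # hypothetical page fetcher
        comments = page.get("comments", [])
        if callback:
            await callback(note_id, comments)      # persist each page immediately
        result.extend(comments)
        cursor = page.get("next_cursor", "")
        if not comments or not cursor:
            break
        await asyncio.sleep(crawl_interval)        # keep a polite crawl interval
    return result
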
media_platform/xhs/core.py

Lines changed: 4 additions & 4 deletions
@@ -9,8 +9,8 @@
 
 import config
 from base.base_crawler import AbstractCrawler
-from models import xiaohongshu as xhs_model
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import xhs as xhs_store
 from tools import utils
 from var import crawler_type_var
 
@@ -112,7 +112,7 @@ async def search(self) -> None:
                 note_details = await asyncio.gather(*task_list)
                 for note_detail in note_details:
                     if note_detail is not None:
-                        await xhs_model.update_xhs_note(note_detail)
+                        await xhs_store.update_xhs_note(note_detail)
                         note_id_list.append(note_detail.get("note_id"))
                 page += 1
                 utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
@@ -127,7 +127,7 @@ async def get_specified_notes(self):
         note_details = await asyncio.gather(*task_list)
         for note_detail in note_details:
             if note_detail is not None:
-                await xhs_model.update_xhs_note(note_detail)
+                await xhs_store.update_xhs_note(note_detail)
         await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
 
     async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -174,7 +174,7 @@ async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
 
             # Update or save the filtered comments
             for comment in filtered_comments:
-                await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
+                await xhs_store.update_xhs_note_comment(note_id=note_id, comment_item=comment)
 
     @staticmethod
     def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:

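Note: the per-crawler change is uniform across platforms: save calls now go through a store facade module (store.xhs, store.douyin, store.bilibili, store.kuaishou, store.weibo) instead of the old models package. A minimal sketch of what such a facade might look like; only the function names update_xhs_note and update_xhs_note_comment appear in this diff, everything else (the placeholder backend and the field handling) is an assumption.

from typing import Dict

from base.base_crawler import AbstractStore


class _NullStore(AbstractStore):
    # Stand-in backend so the sketch runs on its own; a real facade would pick
    # the csv/db/json implementation based on config.SAVE_DATA_OPTION.
    async def store_content(self, content_item: Dict):
        print("store content:", content_item)

    async def store_comment(self, comment_item: Dict):
        print("store comment:", comment_item)


def _get_store() -> AbstractStore:
    return _NullStore()


async def update_xhs_note(note_item: Dict):
    await _get_store().store_content(note_item)


async def update_xhs_note_comment(note_id: str, comment_item: Dict):
    await _get_store().store_comment({**comment_item, "note_id": note_id})
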
models/__init__.py

Lines changed: 0 additions & 5 deletions
This file was deleted.
