Commit 1c7c47a

Add a task to clean up unreliable proxies
1 parent bb0db36 commit 1c7c47a

File tree

7 files changed (+44, -5 lines)


setting.py

Lines changed: 5 additions & 0 deletions

@@ -27,6 +27,11 @@
     'interval': 180  # validation interval (seconds)
 }
 
+# Cleanup config for unavailable proxies
+EXPIRATION_VALIDATOR = {
+    'interval': 60 * 30
+}
+
 # Database config
 DB = {
     'db_name': 'proxy.db',
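
For reference, the new EXPIRATION_VALIDATOR interval of 60 * 30 resolves to 1800 seconds (30 minutes). A minimal sketch of reading that value, mirroring the import style used in src/runner.py:

# Sketch only: reading the new cleanup interval from setting.py,
# the same way runner.py imports the other *_VALIDATOR dicts.
from setting import EXPIRATION_VALIDATOR

interval = EXPIRATION_VALIDATOR['interval']  # 60 * 30 = 1800 seconds (30 minutes)
print(f'cleanup runs every {interval // 60} minutes')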

src/database/abs_database.py

Lines changed: 3 additions & 0 deletions

@@ -21,5 +21,8 @@ def update_anonymity(self, url, value):
     def remove(self, key):
         raise NotImplementedError
 
+    def remove_all_zero_reliability(self):
+        raise NotImplementedError
+
     def init_db(self):
         return

src/database/sqlite_opt.py

Lines changed: 15 additions & 1 deletion

@@ -128,7 +128,6 @@ def init_db(self):
             """)
         except sqlite3.OperationalError as e:
             logger.warn(e)
-            # logger.exception(e)
         finally:
             cursor.close()
             conn.close()
@@ -163,6 +162,21 @@ def get_all_in_page(self):
             session.close()
         return None
 
+    def remove_all_zero_reliability(self):
+        conn = self._get_connect()
+        cursor = conn.cursor()
+        try:
+            cursor.execute(f"""
+                DELETE FROM {DB["table_name"]}
+                WHERE reliability = 0
+            """)
+            conn.commit()
+        except sqlite3.OperationalError as e:
+            logger.warn(e)
+        finally:
+            cursor.close()
+            conn.close()
+
     @staticmethod
     def _get_connect():
         return sqlite3.connect(DB['db_name'])
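
As a standalone illustration of what the new remove_all_zero_reliability() does, here is a sketch against an in-memory SQLite database. The table name "proxies" and its columns are hypothetical stand-ins for DB["table_name"] and the project's real schema.

# Sketch only: the DELETE issued by remove_all_zero_reliability(), run against
# an in-memory database with a placeholder schema.
import sqlite3

conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute('CREATE TABLE proxies (url TEXT, reliability INTEGER)')
cursor.executemany('INSERT INTO proxies VALUES (?, ?)',
                   [('1.2.3.4:8080', 0), ('5.6.7.8:3128', 3)])

# Same predicate as the new method: drop every proxy whose reliability is 0.
cursor.execute('DELETE FROM proxies WHERE reliability = 0')
conn.commit()

print(cursor.execute('SELECT * FROM proxies').fetchall())  # only the reliable row remains
cursor.close()
conn.close()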

src/runner.py

Lines changed: 3 additions & 1 deletion

@@ -7,7 +7,8 @@
 from src.entity.proxy_entity import ProxyEntity
 from src.log.logger import logger
 from src.spider.spiders import spider_collection
-from setting import WEB_SERVER, VALIDATOR, SPIDER, ANONYMITY_VALIDATOR
+from setting import WEB_SERVER, VALIDATOR, SPIDER, ANONYMITY_VALIDATOR, EXPIRATION_VALIDATOR
+from src.validator.expiration_validator import expiration_validator
 from src.validator.validator import validator
 from src.validator.anonymity_validator import anonymity_validator
 from src.web.web_flask import app
@@ -41,5 +42,6 @@ def run():
     scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'])
     scheduler.add_job(validator.run, 'interval', seconds=VALIDATOR['validate_interval'])
     scheduler.add_job(anonymity_validator.run, 'interval', seconds=ANONYMITY_VALIDATOR['interval'])
+    scheduler.add_job(expiration_validator.run, 'interval', seconds=EXPIRATION_VALIDATOR['interval'])
     scheduler.start()
     app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port'])
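
The new job follows the same pattern as the existing ones: an APScheduler interval trigger that calls expiration_validator.run every EXPIRATION_VALIDATOR['interval'] seconds. A minimal, self-contained sketch of that pattern (assuming the apscheduler package; the job body is a placeholder, not the project's validator):

# Sketch only: the interval-job pattern used in runner.py, with a placeholder
# job body standing in for expiration_validator.run.
import time
from apscheduler.schedulers.background import BackgroundScheduler

def cleanup_job():
    print('removing proxies with reliability = 0 ...')

scheduler = BackgroundScheduler()
scheduler.add_job(cleanup_job, 'interval', seconds=60 * 30)  # every 30 minutes
scheduler.start()

try:
    while True:
        time.sleep(1)
except (KeyboardInterrupt, SystemExit):
    scheduler.shutdown()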

src/spider/abs_spider.py

Lines changed: 2 additions & 3 deletions

@@ -12,13 +12,12 @@ class AbsSpider(object):
 
     def __init__(self, name='unknown') -> None:
         self._name = name
-        self.urls = self.get_urls()
+        self._urls = self.get_urls()
 
     async def crawl(self):
         logger.info(f'{self._name} starts crawling...')
-        urls = self.get_urls()
         res = []
-        for url in urls:
+        for url in self._urls:
             try:
                 for page in self.get_page_range():
                     async with aiohttp.ClientSession() as session:

src/validator/expiration_validator.py

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+from src.database.sqlite_opt import sqlite_opt
+from src.log.logger import logger
+
+
+class ExpirationValidator(object):
+
+    def run(self):
+        logger.info('Start removing unavailable proxies')
+        sqlite_opt.remove_all_zero_reliability()
+        logger.info('Finished removing unavailable proxies')
+
+
+expiration_validator = ExpirationValidator()
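
Because the module exposes a module-level instance, the cleanup can also be triggered by hand outside the scheduler, e.g. for a one-off purge:

# Sketch only: invoking the new validator manually instead of via the scheduler.
from src.validator.expiration_validator import expiration_validator

expiration_validator.run()  # logs start/finish and deletes rows with reliability = 0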

test/database/test_sqlite_opt.py

Lines changed: 3 additions & 0 deletions

@@ -21,3 +21,6 @@ def test_add_proxy(self):
     def test_get_all_proxies(self):
         proxy_list = self._opt.get_all_proxies()
         assert len(proxy_list) > 0
+
+    def test_remove_all_zero_reliability(self):
+        self._opt.remove_all_zero_reliability()
