Commit dd9f22f
committed

Added: anonymity validation

1 parent b4e5c06 commit dd9f22f

9 files changed: +185 −50 lines changed

main.py

Lines changed: 2 additions & 42 deletions

@@ -1,44 +1,4 @@
-import asyncio
-import typing
-
-from apscheduler.schedulers.background import BackgroundScheduler
-
-from src.database.sqlite_opt import sqlite_opt
-from src.entity.proxy_entity import ProxyEntity
-from src.log.logger import logger
-from src.spider.spiders import spider_collection
-from setting import WEB_SERVER, VALIDATOR, SPIDER
-from src.validator.validator import validator
-from src.web.web_flask import app
-
-
-def crawl():
-    proxies = []
-    tasks = []
-    for spider_name in SPIDER['list']:
-        tasks.append(spider_collection[spider_name].crawl())
-        # proxies.extend(spider_collection[spider_name].crawl())
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    results = loop.run_until_complete(asyncio.gather(*tasks))
-    loop.close()
-    for proxies_list in results:
-        proxies.extend(proxies_list)
-    # proxies = loop.run_until_complete(asyncio.gather(*tasks))
-    # persist
-    save(proxies)
-
-
-def save(proxies: typing.List[ProxyEntity]):
-    for proxy in proxies:
-        sqlite_opt.add_proxy(proxy)
-
+from src.runner import run
 
 if __name__ == '__main__':
-    logger.info('Initializing sqlite database...')
-    sqlite_opt.init_db()
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'])
-    scheduler.add_job(validator.run, 'interval', seconds=VALIDATOR['validate_interval'])
-    scheduler.start()
-    app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port'])
+    run()

setting.py

Lines changed: 11 additions & 3 deletions

@@ -1,6 +1,6 @@
 # proxy spider configuration
 SPIDER = {
-    'crawl_interval': 60,  # interval between proxy crawls (seconds)
+    'crawl_interval': 120,  # interval between proxy crawls (seconds)
     'list': [  # proxy spiders in use (class names)
         'Spider66Ip',
         'SpiderQuanWangIp',
@@ -14,9 +14,17 @@
 
 # validator configuration
 VALIDATOR = {
-    'test_url': 'http://www.baidu.com',
+    'test_url': 'http://www.baidu.com',  # availability-check url
     'request_timeout': 4,  # validation timeout
-    'validate_interval': 30
+    'validate_interval': 60  # validation interval (seconds)
+}
+
+# anonymity validator configuration
+ANONYMITY_VALIDATOR = {
+    'http_test_url': 'http://httpbin.org/get',  # anonymity-check url
+    'https_test_url': 'https://httpbin.org/get',
+    'request_timeout': 4,  # maximum validation timeout
+    'interval': 180  # validation interval (seconds)
 }
 
 # database configuration
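
The two new test URLs point at httpbin.org/get, which echoes a request back as JSON; the anonymity check added in this commit classifies a proxy from the 'origin' and 'headers' fields of that echo. A rough sketch of the payload shape in Python, with illustrative values that are not from the commit:

# Illustrative httpbin.org/get echo as seen through a proxy; all values
# below are made up (RFC 5737 documentation addresses).
sample_echo = {
    'args': {},
    'headers': {
        'Host': 'httpbin.org',
        'User-Agent': 'Mozilla/5.0 ...',
        # Survives when the proxy forwards hop-by-hop headers; the new
        # validator reads its presence as "normal anonymity".
        'Proxy-Connection': 'keep-alive',
    },
    # A transparent proxy appends the real client IP, so more than one
    # address can appear here.
    'origin': '203.0.113.7, 198.51.100.23',
    'url': 'http://httpbin.org/get',
}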

src/database/abs_database.py

Lines changed: 6 additions & 0 deletions

@@ -6,12 +6,18 @@ def add_proxy(self, proxy):
     def get_all_proxies(self):
         raise NotImplementedError
 
+    def get_unknown_anonymity_proxies(self):
+        raise NotImplementedError
+
     def increase_reliability(self, url):
         raise NotImplementedError
 
     def reduce_reliability(self, url):
         raise NotImplementedError
 
+    def update_anonymity(self, url, value):
+        raise NotImplementedError
+
     def remove(self, key):
         raise NotImplementedError
 

src/database/sqlite_opt.py

Lines changed: 29 additions & 1 deletion

@@ -5,6 +5,7 @@
 from sqlalchemy import create_engine, desc
 from sqlalchemy.orm import sessionmaker
 from src.entity.proxy_entity import ProxyEntity
+from src.enum.common import ProxyCoverEnum
 from src.log.logger import logger
 import sqlite3
 
@@ -35,7 +36,19 @@ def get_all_proxies(self):
             return session.query(ProxyEntity).all()
         except Exception as e:
             logger.exception(e)
-            pass
+        finally:
+            session.close()
+        return []
+
+    def get_unknown_anonymity_proxies(self):
+        session = self._DBSession()
+        try:
+            return (session.query(ProxyEntity)
+                    .filter(ProxyEntity.reliability > 0)
+                    .filter(ProxyEntity.proxy_cover == ProxyCoverEnum.UNKNOWN.value)
+                    .all())
+        except Exception as e:
+            logger.exception(e)
         finally:
             session.close()
         return []
@@ -79,6 +92,21 @@ def reduce_reliability(self, url):
     def remove(self, key):
         return super().remove(key)
 
+    def update_anonymity(self, url, value):
+        conn = self._get_connect()
+        cursor = conn.cursor()
+        try:
+            cursor.execute(f"""
+                UPDATE {DB["table_name"]} SET proxy_cover = {value}
+                WHERE url='{url}'
+            """)
+            conn.commit()
+        except Exception as e:
+            logger.exception(e)
+        finally:
+            cursor.close()
+            conn.close()
+
     def init_db(self):
         conn = self._get_connect()
         cursor = conn.cursor()
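
A note on the new update_anonymity: it interpolates both value and url straight into the SQL text. sqlite3 also supports parameter binding, which sidesteps quoting problems (and injection) if a proxy URL ever contains a quote; only the table name, which cannot be bound, still has to come from config. A minimal alternative sketch, not part of the commit:

    def update_anonymity(self, url, value):
        # Sketch: same effect as the committed version, but with sqlite3
        # parameter binding instead of string interpolation.
        conn = self._get_connect()
        cursor = conn.cursor()
        try:
            # Identifiers cannot be bound, so the table name stays an f-string.
            cursor.execute(f'UPDATE {DB["table_name"]} SET proxy_cover = ? WHERE url = ?',
                           (value, url))
            conn.commit()
        except Exception as e:
            logger.exception(e)
        finally:
            cursor.close()
            conn.close()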

src/entity/proxy_entity.py

Lines changed: 2 additions & 1 deletion

@@ -1,11 +1,12 @@
 from src.enum.common import ProxyTypeEnum, ProxyCoverEnum
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy import Column, Integer, String
+from setting import DB
 Base = declarative_base()
 
 
 class ProxyEntity(Base):
-    __tablename__ = 'proxy'
+    __tablename__ = DB['table_name']
     url = Column(String(36), primary_key=True)
     # ip = Column(String(20))
     # port = Column(String(5))

src/runner.py

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+import asyncio
+import typing
+
+from apscheduler.schedulers.background import BackgroundScheduler
+
+from src.database.sqlite_opt import sqlite_opt
+from src.entity.proxy_entity import ProxyEntity
+from src.log.logger import logger
+from src.spider.spiders import spider_collection
+from setting import WEB_SERVER, VALIDATOR, SPIDER, ANONYMITY_VALIDATOR
+from src.validator.validator import validator
+from src.validator.anonymity_validator import anonymity_validator
+from src.web.web_flask import app
+
+
+def crawl():
+    proxies = []
+    tasks = []
+    for spider_name in SPIDER['list']:
+        tasks.append(spider_collection[spider_name].crawl())
+        # proxies.extend(spider_collection[spider_name].crawl())
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    results = loop.run_until_complete(asyncio.gather(*tasks))
+    loop.close()
+    for proxies_list in results:
+        proxies.extend(proxies_list)
+    # proxies = loop.run_until_complete(asyncio.gather(*tasks))
+    # persist
+    save(proxies)
+
+
+def save(proxies: typing.List[ProxyEntity]):
+    for proxy in proxies:
+        sqlite_opt.add_proxy(proxy)
+
+
+def run():
+    logger.info('Initializing sqlite database...')
+    sqlite_opt.init_db()
+    scheduler = BackgroundScheduler()
+    scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'])
+    scheduler.add_job(validator.run, 'interval', seconds=VALIDATOR['validate_interval'])
+    scheduler.add_job(anonymity_validator.run, 'interval', seconds=ANONYMITY_VALIDATOR['interval'])
+    scheduler.start()
+    app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port'])
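
Side note: crawl() builds and tears down its own event loop because APScheduler runs each job in a worker thread that has no loop of its own. Since Python 3.7, asyncio.run does the same create-run-close cycle in one call; a minimal equivalent sketch, not part of the commit:

def crawl():
    async def gather_all():
        # One crawl task per configured spider, run concurrently.
        tasks = [spider_collection[name].crawl() for name in SPIDER['list']]
        return await asyncio.gather(*tasks)

    proxies = []
    # asyncio.run creates a fresh loop, runs the coroutine to completion
    # and closes the loop, matching the manual handling above.
    for proxies_list in asyncio.run(gather_all()):
        proxies.extend(proxies_list)
    save(proxies)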

src/spider/abs_spider.py

Lines changed: 2 additions & 3 deletions

@@ -10,13 +10,12 @@ def __init__(self, name='unknown') -> None:
         self._name = name
 
     async def crawl(self):
-        res = []
         logger.info(f'{self._name} starts crawling...')
         try:
-            res.extend(await self.do_crawl())
+            return await self.do_crawl()
         except Exception as e:
             logger.exception(f'{self._name} crawl failed: e:{e}')
-        return res
+        return []
 
     async def do_crawl(self) -> List[ProxyEntity]:
         raise NotImplementedError

src/validator/anonymity_validator.py

Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
+import asyncio
+import json
+
+import aiohttp
+
+from setting import ANONYMITY_VALIDATOR, HEADERS
+from src.database.sqlite_opt import sqlite_opt
+from src.enum.common import ProxyCoverEnum, ProxyTypeEnum
+from src.log.logger import logger
+
+
+class AnonymityValidator(object):
+
+    urls = {
+        ProxyTypeEnum.UNKNOWN.value: ANONYMITY_VALIDATOR['http_test_url'],
+        ProxyTypeEnum.HTTP.value: ANONYMITY_VALIDATOR['http_test_url'],
+        ProxyTypeEnum.HTTPS.value: ANONYMITY_VALIDATOR['https_test_url'],
+        ProxyTypeEnum.HTTP_AND_HTTPS.value: ANONYMITY_VALIDATOR['https_test_url'],
+    }
+
+    def run(self):
+        # fetch the proxies whose anonymity is still unknown
+        proxy_list = sqlite_opt.get_unknown_anonymity_proxies()
+        if len(proxy_list) > 0:
+            tasks = [self.valid_proxy(proxy.url, proxy.proxy_type) for proxy in proxy_list]
+            asyncio.run(asyncio.wait(tasks))
+
+    async def valid_proxy(self, proxy_url, proxy_type):
+        async with aiohttp.ClientSession() as session:
+            try:
+                async with session.get(self.urls[proxy_type],
+                                       proxy=proxy_url,
+                                       headers=HEADERS,
+                                       timeout=ANONYMITY_VALIDATOR['request_timeout']) as resp:
+                    if resp.status == 200:
+                        # check the proxy's anonymity
+                        r_dict = json.loads(await resp.text())
+                        headers = r_dict.get('headers', '')
+                        ip = r_dict.get('origin')
+                        proxy_connection = headers.get('Proxy-Connection', None)
+                        flag = True
+                        if ',' in ip:
+                            ips = str.split(ip, ',')
+                            first = ips[0]
+                            for p in ips:
+                                if first != p.lstrip():
+                                    proxy_cover = ProxyCoverEnum.TRANSPARENT.value  # transparent
+                                    flag = False
+                                    break
+                        if flag:
+                            if proxy_connection:
+                                proxy_cover = ProxyCoverEnum.NORMAL_COVER.value  # normal anonymity
+                            else:
+                                proxy_cover = ProxyCoverEnum.HIGH_COVER.value  # high anonymity
+                        # update the recorded anonymity
+                        sqlite_opt.update_anonymity(proxy_url, proxy_cover)
+                        logger.info(f'anonymity check succeeded: url:{proxy_url}, coverValue:{proxy_cover}')
+                    else:
+                        logger.warn(f'anonymity check failed, proxy_url:{proxy_url}, status code:{resp.status}')
+            except asyncio.TimeoutError:
+                logger.warn(f'anonymity check timed out, proxy_url:{proxy_url}')
+            except ConnectionRefusedError:
+                logger.warn(f'anonymity check connection refused, proxy_url:{proxy_url}')
+            except Exception as e:
+                # logger.exception(e)
+                logger.warn(f'anonymity check failed, proxy_url:{proxy_url}, e:{e}')
+
+
+anonymity_validator = AnonymityValidator()
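
The decision rule inside valid_proxy reads as: if the echoed origin lists more than one distinct IP, the proxy leaked the client address (transparent); otherwise a surviving Proxy-Connection header marks the proxy as detectable (normal anonymity), and its absence as high anonymity. The same rule factored into a standalone helper; classify_cover is hypothetical and not part of the commit:

from src.enum.common import ProxyCoverEnum


def classify_cover(origin: str, headers: dict) -> int:
    # Hypothetical helper mirroring AnonymityValidator.valid_proxy.
    ips = [p.strip() for p in origin.split(',')]
    if any(ip != ips[0] for ip in ips):
        return ProxyCoverEnum.TRANSPARENT.value   # client IP leaked
    if headers.get('Proxy-Connection'):
        return ProxyCoverEnum.NORMAL_COVER.value  # proxy reveals itself
    return ProxyCoverEnum.HIGH_COVER.value        # no proxy traces
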
Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+import unittest
+
+from src.database.sqlite_opt import sqlite_opt
+from src.validator.anonymity_validator import anonymity_validator
+
+
+class TestAnonymityValidator(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self._opt = sqlite_opt
+        self._validator = anonymity_validator
+
+        # self._opt.clean()
+
+    def test_valid_proxy(self):
+        self._validator.run()
+        pass
+
