
Commit 6adf3be

add more spider & async

1 parent 277e444 commit 6adf3be

11 files changed: +287 −50 lines

main.py

Lines changed: 4 additions & 13 deletions
@@ -1,7 +1,6 @@
 import asyncio
 import typing

-from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.schedulers.background import BackgroundScheduler

 from src.database.sqlite_opt import sqlite_opt
@@ -34,19 +33,11 @@ def save(proxies: typing.List[ProxyEntity]):
         sqlite_opt.add_proxy(proxy)


-def init_db():
-    sqlite_opt.init_db()
-
-
-def check():
-    validator.run()
-
-
 if __name__ == '__main__':
-    init_db()
+    sqlite_opt.init_db()
     scheduler = BackgroundScheduler()
-    # scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'])
-    scheduler.add_job(crawl, 'interval', seconds=10)
-    # scheduler.add_job(check, 'interval', seconds=VALIDATOR['validate_interval'])
+    scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'])
+    # scheduler.add_job(crawl, 'interval', seconds=60)
+    scheduler.add_job(validator.run, 'interval', seconds=VALIDATOR['validate_interval'])
     scheduler.start()
     app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port'])
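The `crawl` job scheduled above is defined elsewhere in main.py and not shown in this hunk. A minimal sketch of how a scheduler-friendly `crawl()` could drive the new async spiders, assuming a registry populated by `@spider_register` (the name `all_spiders` is hypothetical) and the `save()` helper from the hunk context:

import asyncio
import typing

from src.spider.spiders import all_spiders  # assumption: list of spider classes filled by @spider_register
from src.entity.proxy_entity import ProxyEntity


async def _crawl_all() -> typing.List[ProxyEntity]:
    # Run every registered spider concurrently and flatten the result batches.
    batches = await asyncio.gather(*(spider().crawl() for spider in all_spiders))
    return [proxy for batch in batches for proxy in batch]


def crawl():
    # BackgroundScheduler expects a plain callable, so bridge into asyncio here.
    save(asyncio.run(_crawl_all()))  # save() is defined earlier in main.py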

setting.py

Lines changed: 14 additions & 8 deletions
@@ -1,25 +1,31 @@
+# Proxy spider configuration
 SPIDER = {
-    'crawl_interval': 75,  # interval (seconds) between proxy crawls
-    'list': [
-        # 'Spider66Ip',
-        # 'SpiderQuanWangIp',
+    'crawl_interval': 60,  # interval (seconds) between proxy crawls
+    'list': [  # proxy spiders to use (class names)
+        'Spider66Ip',
+        'SpiderQuanWangIp',
         'SpiderXiciIp',
-        # 'SpiderKuaiDaiLiIp'
+        'SpiderKuaiDaiLiIp',
+        'SpiderYunDaiLiIp',
+        'SpiderIpHaiIp',
+        'SpiderMianFeiDaiLiIp'
     ]
 }

+# Validator configuration
 VALIDATOR = {
     'test_url': 'http://www.baidu.com',
-    'request_timeout': 4,
-    'validate_interval': 60
+    'request_timeout': 4,  # validation timeout (seconds)
+    'validate_interval': 30
 }

-# sqlite
+# Database configuration
 DB = {
     'db_name': 'test.db',
     'table_name': 'proxy'
 }

+# Web server (Flask) configuration
 WEB_SERVER = {
     'host': 'localhost',
     'port': '8080'
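For reference, a sketch of how the class names in `SPIDER['list']` might be resolved to spider instances; the lookup-by-`getattr` convention is an assumption, not code from this commit:

from setting import SPIDER
from src.spider import spiders as spider_module

# Resolve each configured class name to a class defined in src/spider/spiders.py
# and instantiate it; an unknown name would raise AttributeError here.
enabled_spiders = [getattr(spider_module, name)() for name in SPIDER['list']]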

src/spider/abs_spider.py

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@ async def crawl(self):
         print(f'{self._name}开始爬取...')
         try:
             res.extend(await self.do_crawl())
-            # print(f'{self._name}爬取完毕!共:{len(res)}个代理')
         except Exception as e:
             print(f'{self._name}爬取失败:e:{e}')
         return res
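For context, a sketch of the surrounding AbsSpider template method reconstructed from the context lines above; the default spider name and the NotImplementedError hook are assumptions:

from typing import List

from src.entity.proxy_entity import ProxyEntity


class AbsSpider:
    """Base spider: crawl() wraps the subclass hook do_crawl() with logging and error handling."""

    def __init__(self, name: str = 'unnamed spider') -> None:  # default name is assumed
        self._name = name

    async def crawl(self) -> List[ProxyEntity]:
        res = []
        print(f'{self._name}开始爬取...')
        try:
            res.extend(await self.do_crawl())
        except Exception as e:
            print(f'{self._name}爬取失败:e:{e}')
        return res

    async def do_crawl(self) -> List[ProxyEntity]:
        # Subclasses (e.g. SpiderKuaiDaiLiIp) override this coroutine.
        raise NotImplementedError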

src/spider/spiders.py

Lines changed: 216 additions & 21 deletions
@@ -1,8 +1,7 @@
-import time
+import asyncio
 from typing import List

 import aiohttp
-import requests

 from setting import HEADERS
 from src.entity.proxy_entity import ProxyEntity
@@ -211,27 +210,31 @@ def __init__(self) -> None:
             'https://www.kuaidaili.com/free/intr'  # transparent
         ]

-    def do_crawl(self) -> List[ProxyEntity]:
+    async def do_crawl(self) -> List[ProxyEntity]:
         result = []
         for base_url in self._base_urls:
-            for page in range(1, 4):
-                res = requests.get(f'{base_url}/{page}', headers=HEADERS)
-                soup = BeautifulSoup(res.text, 'lxml')
-                trs = soup.find('table').find('tbody').find_all('tr')
-                for tr in trs:
-                    tds = tr.find_all('td')
-                    ip = tds[0].text
-                    port = tds[1].text
-                    proxy_cover = tds[2].text
-                    proxy_type = tds[3].text
-                    region = tds[4].text
-                    result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
-                                              # ip, port, protocol=proxy_type.lower(),
-                                              source=self._name,
-                                              proxy_type=self._judge_proxy_type(proxy_type),
-                                              proxy_cover=self._judge_proxy_cover(proxy_cover),
-                                              region=region))
-                time.sleep(3)
+            for page in range(1, 3):
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(f'{base_url}/{page}', headers=HEADERS) as resp:
+
+                        # res = requests.get(f'{base_url}/{page}', headers=HEADERS)
+                        soup = BeautifulSoup(await resp.text(), 'lxml')
+                        trs = soup.find('table').find('tbody').find_all('tr')
+                        for tr in trs:
+                            tds = tr.find_all('td')
+                            ip = tds[0].text
+                            port = tds[1].text
+                            proxy_cover = tds[2].text
+                            proxy_type = tds[3].text
+                            region = tds[4].text
+                            result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
+                                                      # ip, port, protocol=proxy_type.lower(),
+                                                      source=self._name,
+                                                      proxy_type=self._judge_proxy_type(proxy_type),
+                                                      proxy_cover=self._judge_proxy_cover(proxy_cover),
+                                                      region=region))
+                # crawling too fast gets the IP banned
+                await asyncio.sleep(3)
         return result

     def _judge_proxy_type(self, type_str: str):
@@ -250,3 +253,195 @@ def _judge_proxy_cover(self, cover_str: str):
             return ProxyCoverEnum.HIGH_COVER.value
         else:
             return ProxyCoverEnum.UNKNOWN.value
+
+
+@spider_register
+class SpiderYunDaiLiIp(AbsSpider):
+    """
+    Yun proxy (云代理) IPs. Refresh rate: fast
+    http://www.ip3366.net/free
+    """
+    def __init__(self) -> None:
+        super().__init__('云代理IP爬虫')
+        self._base_urls = [
+            'http://www.ip3366.net/free/?stype=1',  # high-anonymity
+            'http://www.ip3366.net/free/?stype=2'   # transparent or anonymous
+        ]
+
+    async def do_crawl(self) -> List[ProxyEntity]:
+        result = []
+        for base_url in self._base_urls:
+            for page in range(1, 3):
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(f'{base_url}&page={page}', headers=HEADERS) as resp:
+
+                        # res = requests.get(f'{base_url}/{page}', headers=HEADERS)
+                        soup = BeautifulSoup(await resp.text(), 'lxml')
+                        trs = soup.find('table').find('tbody').find_all('tr')
+                        for tr in trs:
+                            tds = tr.find_all('td')
+                            ip = tds[0].text
+                            port = tds[1].text
+                            proxy_cover = tds[2].text
+                            proxy_type = tds[3].text
+                            region = tds[4].text
+                            result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
+                                                      # ip, port, protocol=proxy_type.lower(),
+                                                      source=self._name,
+                                                      proxy_type=self._judge_proxy_type(proxy_type),
+                                                      proxy_cover=self._judge_proxy_cover(proxy_cover),
+                                                      region=region))
+        return result
+
+    def _judge_proxy_type(self, type_str: str):
+        type_low = type_str.lower()
+        if type_low == 'http':
+            return ProxyTypeEnum.HTTP.value
+        elif type_low == 'https':
+            return ProxyTypeEnum.HTTPS.value
+        else:
+            return ProxyTypeEnum.UNKNOWN.value
+
+    def _judge_proxy_cover(self, cover_str: str):
+        if cover_str == '透明代理IP':
+            return ProxyCoverEnum.TRANSPARENT.value
+        elif cover_str == '高匿代理IP':
+            return ProxyCoverEnum.HIGH_COVER.value
+        elif cover_str == '普通代理IP':
+            return ProxyCoverEnum.NORMAL_COVER.value
+        else:
+            return ProxyCoverEnum.UNKNOWN.value
+
+
+@spider_register
+class SpiderIpHaiIp(AbsSpider):
+    """
+    IPHai (IP海) proxy IPs. Refresh rate: 1 every 8 minutes
+    Sometimes unreachable
+    http://www.iphai.com
+    """
+    def __init__(self) -> None:
+        super().__init__('IP海代理IP爬虫')
+        self._base_urls = [
+            'http://www.iphai.com/free/ng',  # domestic high-anonymity
+            'http://www.iphai.com/free/np',  # domestic ordinary
+            'http://www.iphai.com/free/wg',  # foreign high-anonymity
+            'http://www.iphai.com/free/wp',  # foreign ordinary
+        ]
+
+    async def do_crawl(self) -> List[ProxyEntity]:
+        result = []
+        for base_url in self._base_urls:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(base_url, headers=HEADERS) as resp:
+                    soup = BeautifulSoup(await resp.text(), 'lxml')
+                    table = soup.find('table')
+                    if table is None:
+                        continue
+                    tbody = soup.find('tbody')
+                    if tbody is None:
+                        continue
+                    trs = tbody.find_all('tr')
+                    for i, tr in enumerate(trs):
+                        if i == 0:
+                            continue
+                        tds = tr.find_all('td')
+                        ip = tds[0].text
+                        port = tds[1].text
+                        proxy_cover = tds[2].text
+                        proxy_type = tds[3].text if tds[3].text != '' else 'http'
+                        region = tds[4].text
+                        result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
+                                                  # ip, port, protocol=proxy_type.lower(),
+                                                  source=self._name,
+                                                  proxy_type=self._judge_proxy_type(proxy_type),
+                                                  proxy_cover=self._judge_proxy_cover(proxy_cover),
+                                                  region=region))
+            await asyncio.sleep(2)
+        return result
+
+    @staticmethod
+    def _judge_proxy_type(type_str: str):
+        type_low = type_str.lower()
+        if type_low == 'http':
+            return ProxyTypeEnum.HTTP.value
+        elif type_low == 'https':
+            return ProxyTypeEnum.HTTPS.value
+        else:
+            return ProxyTypeEnum.UNKNOWN.value
+
+    @staticmethod
+    def _judge_proxy_cover(cover_str: str):
+        if cover_str == '透明':
+            return ProxyCoverEnum.TRANSPARENT.value
+        elif cover_str == '高匿':
+            return ProxyCoverEnum.HIGH_COVER.value
+        elif cover_str == '普匿':
+            return ProxyCoverEnum.NORMAL_COVER.value
+        else:
+            return ProxyCoverEnum.UNKNOWN.value
+
+
+@spider_register
+class SpiderMianFeiDaiLiIp(AbsSpider):
+    """
+    Free proxy IP library (免费代理IP库)
+    http://ip.jiangxianli.com/
+    """
+    def __init__(self) -> None:
+        super().__init__('免费代理IP爬虫')
+        self._base_url = 'http://ip.jiangxianli.com/?page={}'
+
+    async def do_crawl(self) -> List[ProxyEntity]:
+        result = []
+        for page in range(1, 4):
+            async with aiohttp.ClientSession() as session:
+                async with session.get(self._base_url.format(page), headers=HEADERS) as resp:
+                    soup = BeautifulSoup(await resp.text(), 'lxml')
+                    table = soup.find('table')
+                    if table is None:
+                        continue
+                    tbody = soup.find('tbody')
+                    if tbody is None:
+                        continue
+                    trs = tbody.find_all('tr')
+                    for i, tr in enumerate(trs):
+                        if i == 0:
+                            continue
+                        tds = tr.find_all('td')
+                        ip = tds[1].text
+                        port = tds[2].text
+                        proxy_cover = tds[3].text
+                        proxy_type = tds[4].text if tds[3].text != '' else 'http'
+                        region = tds[5].text
+                        supplier = tds[6].text
+                        result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
+                                                  # ip, port, protocol=proxy_type.lower(),
+                                                  source=self._name,
+                                                  supplier=supplier,
+                                                  proxy_type=self._judge_proxy_type(proxy_type),
+                                                  proxy_cover=self._judge_proxy_cover(proxy_cover),
+                                                  region=region))
+            await asyncio.sleep(2)
+        return result
+
+    @staticmethod
+    def _judge_proxy_type(type_str: str):
+        type_low = type_str.lower()
+        if type_low == 'http':
+            return ProxyTypeEnum.HTTP.value
+        elif type_low == 'https':
+            return ProxyTypeEnum.HTTPS.value
+        else:
+            return ProxyTypeEnum.UNKNOWN.value
+
+    @staticmethod
+    def _judge_proxy_cover(cover_str: str):
+        if cover_str == '透明':
+            return ProxyCoverEnum.TRANSPARENT.value
+        elif cover_str == '高匿':
+            return ProxyCoverEnum.HIGH_COVER.value
+        elif cover_str == '普匿':
+            return ProxyCoverEnum.NORMAL_COVER.value
+        else:
+            return ProxyCoverEnum.UNKNOWN.value
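All of the new spiders share the same fetch-and-parse shape. A standalone sketch of that pattern follows; the URL and table layout are illustrative, and only aiohttp, BeautifulSoup, and setting.HEADERS are taken from the diff above:

import asyncio

import aiohttp
from bs4 import BeautifulSoup

from setting import HEADERS


async def fetch_rows(url: str):
    # One short-lived session per request, mirroring the spiders above;
    # a shared session per spider would also work and reuse connections.
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=HEADERS) as resp:
            soup = BeautifulSoup(await resp.text(), 'lxml')
    table = soup.find('table')
    return [] if table is None else table.find_all('tr')


if __name__ == '__main__':
    # Example: count rows on one of the pages crawled by SpiderIpHaiIp.
    print(len(asyncio.run(fetch_rows('http://www.iphai.com/free/ng'))))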

src/web/web_flask.py

Lines changed: 3 additions & 2 deletions
@@ -10,8 +10,9 @@ def index():
     """Home page
     """
     return '''
-    <h1>Welcome to Home Page😄</h1>
-    <h2>APIS:</h2>
+    <h1>😘Welcome to Home Page😄</h1>
+    <h1>🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️</h1>
+    <h2>APIs:</h2>
     <h3>Get a usable proxy:</h3>
     <p>/get</p>
     <h3>Get all usable proxies:</h3>

test/spider/test_spider_66_ip.py

Lines changed: 0 additions & 2 deletions
@@ -10,8 +10,6 @@ def setUp(self) -> None:
         self._spider = Spider66Ip()

     def test_crawl(self):
-        # async def dodo():
-        #     return await
         result = asyncio.run(self._spider.crawl())
         assert result
         assert len(result) > 0

test/spider/test_spider_ip_hai_ip.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+import asyncio
+import unittest
+
+from src.spider.spiders import SpiderIpHaiIp
+
+
+class TestSpiderXiciIp(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self._spider = SpiderIpHaiIp()
+
+    def test_crawl(self):
+        result = asyncio.run(self._spider.crawl())
+        assert result
+        assert len(result) > 0

test/spider/test_spider_kuai_dai_li_ip.py

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
+import asyncio
 import unittest

 from src.spider.spiders import SpiderKuaiDaiLiIp
@@ -9,6 +10,6 @@ def setUp(self) -> None:
         self._spider = SpiderKuaiDaiLiIp()

     def test_crawl(self):
-        result = self._spider.crawl()
+        result = asyncio.run(self._spider.crawl())
         assert result
         assert len(result) > 0
