- import time
+ import asyncio
from typing import List

import aiohttp
- import requests

from setting import HEADERS
from src.entity.proxy_entity import ProxyEntity
@@ -211,27 +210,31 @@ def __init__(self) -> None:
            'https://www.kuaidaili.com/free/intr'  # transparent
        ]

-     def do_crawl(self) -> List[ProxyEntity]:
+     async def do_crawl(self) -> List[ProxyEntity]:
        result = []
        for base_url in self._base_urls:
-             for page in range(1, 4):
-                 res = requests.get(f'{base_url}/{page}', headers=HEADERS)
-                 soup = BeautifulSoup(res.text, 'lxml')
-                 trs = soup.find('table').find('tbody').find_all('tr')
-                 for tr in trs:
-                     tds = tr.find_all('td')
-                     ip = tds[0].text
-                     port = tds[1].text
-                     proxy_cover = tds[2].text
-                     proxy_type = tds[3].text
-                     region = tds[4].text
-                     result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
-                                               # ip, port, protocol=proxy_type.lower(),
-                                               source=self._name,
-                                               proxy_type=self._judge_proxy_type(proxy_type),
-                                               proxy_cover=self._judge_proxy_cover(proxy_cover),
-                                               region=region))
-                 time.sleep(3)
+             for page in range(1, 3):
+                 async with aiohttp.ClientSession() as session:
+                     async with session.get(f'{base_url}/{page}', headers=HEADERS) as resp:
+
+                         # res = requests.get(f'{base_url}/{page}', headers=HEADERS)
+                         soup = BeautifulSoup(await resp.text(), 'lxml')
+                         trs = soup.find('table').find('tbody').find_all('tr')
+                         for tr in trs:
+                             tds = tr.find_all('td')
+                             ip = tds[0].text
+                             port = tds[1].text
+                             proxy_cover = tds[2].text
+                             proxy_type = tds[3].text
+                             region = tds[4].text
+                             result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
+                                                       # ip, port, protocol=proxy_type.lower(),
+                                                       source=self._name,
+                                                       proxy_type=self._judge_proxy_type(proxy_type),
+                                                       proxy_cover=self._judge_proxy_cover(proxy_cover),
+                                                       region=region))
+                 # crawling too fast gets this IP banned
+                 await asyncio.sleep(3)
        return result

    def _judge_proxy_type(self, type_str: str):
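With do_crawl now declared async, callers can no longer invoke it like a plain method; it has to be awaited from a running event loop. A minimal sketch of driving the spider above, assuming the class is named SpiderKuaiDaiLiIp (the class name sits outside this hunk, so that name is a guess):

import asyncio

async def main():
    # the crawl itself pauses with await asyncio.sleep(3) between pages
    proxies = await SpiderKuaiDaiLiIp().do_crawl()
    print(f'got {len(proxies)} proxies')

asyncio.run(main())  # Python 3.7+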
@@ -250,3 +253,195 @@ def _judge_proxy_cover(self, cover_str: str):
            return ProxyCoverEnum.HIGH_COVER.value
        else:
            return ProxyCoverEnum.UNKNOWN.value
+
+
+ @spider_register
+ class SpiderYunDaiLiIp(AbsSpider):
+     """
+     云代理IP  refresh rate: fast
+     http://www.ip3366.net/free
+     """
+     def __init__(self) -> None:
+         super().__init__('云代理IP爬虫')
+         self._base_urls = [
+             'http://www.ip3366.net/free/?stype=1',  # high-anonymity
+             'http://www.ip3366.net/free/?stype=2'   # transparent or normal-anonymity
+         ]
+
+     async def do_crawl(self) -> List[ProxyEntity]:
+         result = []
+         for base_url in self._base_urls:
+             for page in range(1, 3):
+                 async with aiohttp.ClientSession() as session:
+                     async with session.get(f'{base_url}&page={page}', headers=HEADERS) as resp:
+
+                         # res = requests.get(f'{base_url}/{page}', headers=HEADERS)
+                         soup = BeautifulSoup(await resp.text(), 'lxml')
+                         trs = soup.find('table').find('tbody').find_all('tr')
+                         for tr in trs:
+                             tds = tr.find_all('td')
+                             ip = tds[0].text
+                             port = tds[1].text
+                             proxy_cover = tds[2].text
+                             proxy_type = tds[3].text
+                             region = tds[4].text
+                             result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
+                                                       # ip, port, protocol=proxy_type.lower(),
+                                                       source=self._name,
+                                                       proxy_type=self._judge_proxy_type(proxy_type),
+                                                       proxy_cover=self._judge_proxy_cover(proxy_cover),
+                                                       region=region))
+         return result
+
+     def _judge_proxy_type(self, type_str: str):
+         type_low = type_str.lower()
+         if type_low == 'http':
+             return ProxyTypeEnum.HTTP.value
+         elif type_low == 'https':
+             return ProxyTypeEnum.HTTPS.value
+         else:
+             return ProxyTypeEnum.UNKNOWN.value
+
+     def _judge_proxy_cover(self, cover_str: str):
+         if cover_str == '透明代理IP':
+             return ProxyCoverEnum.TRANSPARENT.value
+         elif cover_str == '高匿代理IP':
+             return ProxyCoverEnum.HIGH_COVER.value
+         elif cover_str == '普通代理IP':
+             return ProxyCoverEnum.NORMAL_COVER.value
+         else:
+             return ProxyCoverEnum.UNKNOWN.value
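A note on the pattern above: each page opens and closes its own aiohttp.ClientSession. That works, but aiohttp sessions are meant to be created once and reused, so opening one per request rebuilds the connection pool every time. A rough standalone sketch of fetching several pages over one shared session (fetch_pages and its parameters are illustrative names, not part of this project):

import asyncio
from typing import List

import aiohttp

async def fetch_pages(urls: List[str], headers: dict) -> List[str]:
    # one session for the whole batch instead of one per request
    async with aiohttp.ClientSession() as session:
        pages = []
        for url in urls:
            async with session.get(url, headers=headers) as resp:
                pages.append(await resp.text())
            await asyncio.sleep(1)  # stay polite to the free-proxy site
        return pages

# html_pages = asyncio.run(fetch_pages(['http://www.ip3366.net/free/?stype=1&page=1'], {}))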
+
+
+ @spider_register
+ class SpiderIpHaiIp(AbsSpider):
+     """
+     IP海代理IP  refresh rate: about one new proxy every 8 minutes
+     the site is sometimes unreachable
+     http://www.iphai.com
+     """
+     def __init__(self) -> None:
+         super().__init__('IP海代理IP爬虫')
+         self._base_urls = [
+             'http://www.iphai.com/free/ng',  # domestic, high-anonymity
+             'http://www.iphai.com/free/np',  # domestic, normal
+             'http://www.iphai.com/free/wg',  # foreign, high-anonymity
+             'http://www.iphai.com/free/wp',  # foreign, normal
+         ]
+
+     async def do_crawl(self) -> List[ProxyEntity]:
+         result = []
+         for base_url in self._base_urls:
+             async with aiohttp.ClientSession() as session:
+                 async with session.get(base_url, headers=HEADERS) as resp:
+                     soup = BeautifulSoup(await resp.text(), 'lxml')
+                     table = soup.find('table')
+                     if table is None:
+                         continue
+                     tbody = soup.find('tbody')
+                     if tbody is None:
+                         continue
+                     trs = tbody.find_all('tr')
+                     for i, tr in enumerate(trs):
+                         if i == 0:
+                             continue
+                         tds = tr.find_all('td')
+                         ip = tds[0].text
+                         port = tds[1].text
+                         proxy_cover = tds[2].text
+                         proxy_type = tds[3].text if tds[3].text != '' else 'http'
+                         region = tds[4].text
+                         result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
+                                                   # ip, port, protocol=proxy_type.lower(),
+                                                   source=self._name,
+                                                   proxy_type=self._judge_proxy_type(proxy_type),
+                                                   proxy_cover=self._judge_proxy_cover(proxy_cover),
+                                                   region=region))
+             await asyncio.sleep(2)
+         return result
+
+     @staticmethod
+     def _judge_proxy_type(type_str: str):
+         type_low = type_str.lower()
+         if type_low == 'http':
+             return ProxyTypeEnum.HTTP.value
+         elif type_low == 'https':
+             return ProxyTypeEnum.HTTPS.value
+         else:
+             return ProxyTypeEnum.UNKNOWN.value
+
+     @staticmethod
+     def _judge_proxy_cover(cover_str: str):
+         if cover_str == '透明':
+             return ProxyCoverEnum.TRANSPARENT.value
+         elif cover_str == '高匿':
+             return ProxyCoverEnum.HIGH_COVER.value
+         elif cover_str == '普匿':
+             return ProxyCoverEnum.NORMAL_COVER.value
+         else:
+             return ProxyCoverEnum.UNKNOWN.value
+
+
+ @spider_register
+ class SpiderMianFeiDaiLiIp(AbsSpider):
+     """
+     Free proxy IP library
+     http://ip.jiangxianli.com/
+     """
+     def __init__(self) -> None:
+         super().__init__('免费代理IP爬虫')
+         self._base_url = 'http://ip.jiangxianli.com/?page={}'
+
+     async def do_crawl(self) -> List[ProxyEntity]:
+         result = []
+         for page in range(1, 4):
+             async with aiohttp.ClientSession() as session:
+                 async with session.get(self._base_url.format(page), headers=HEADERS) as resp:
+                     soup = BeautifulSoup(await resp.text(), 'lxml')
+                     table = soup.find('table')
+                     if table is None:
+                         continue
+                     tbody = soup.find('tbody')
+                     if tbody is None:
+                         continue
+                     trs = tbody.find_all('tr')
+                     for i, tr in enumerate(trs):
+                         if i == 0:
+                             continue
+                         tds = tr.find_all('td')
+                         ip = tds[1].text
+                         port = tds[2].text
+                         proxy_cover = tds[3].text
+                         # fall back to http when the type cell is empty
+                         proxy_type = tds[4].text if tds[4].text != '' else 'http'
+                         region = tds[5].text
+                         supplier = tds[6].text
+                         result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
+                                                   # ip, port, protocol=proxy_type.lower(),
+                                                   source=self._name,
+                                                   supplier=supplier,
+                                                   proxy_type=self._judge_proxy_type(proxy_type),
+                                                   proxy_cover=self._judge_proxy_cover(proxy_cover),
+                                                   region=region))
+             await asyncio.sleep(2)
+         return result
+
+     @staticmethod
+     def _judge_proxy_type(type_str: str):
+         type_low = type_str.lower()
+         if type_low == 'http':
+             return ProxyTypeEnum.HTTP.value
+         elif type_low == 'https':
+             return ProxyTypeEnum.HTTPS.value
+         else:
+             return ProxyTypeEnum.UNKNOWN.value
+
+     @staticmethod
+     def _judge_proxy_cover(cover_str: str):
+         if cover_str == '透明':
+             return ProxyCoverEnum.TRANSPARENT.value
+         elif cover_str == '高匿':
+             return ProxyCoverEnum.HIGH_COVER.value
+         elif cover_str == '普匿':
+             return ProxyCoverEnum.NORMAL_COVER.value
+         else:
+             return ProxyCoverEnum.UNKNOWN.value
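Because every spider now exposes an awaitable do_crawl, the registered spiders can be crawled concurrently instead of strictly one after another. A minimal sketch, assuming spider_register collects the spider instances into some iterable (here called spiders, a hypothetical name; the real registry lives elsewhere in the project):

import asyncio

async def crawl_all(spiders):
    # start every spider's do_crawl at once and wait for all of them
    groups = await asyncio.gather(*(spider.do_crawl() for spider in spiders))
    # flatten the per-spider lists into one list of ProxyEntity
    return [proxy for group in groups for proxy in group]

# proxies = asyncio.run(crawl_all(spiders))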