2019-04-06 | Web Scraping

A Simple Proxy Pool for Python Web Scraping
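
This post builds a minimal proxy pool with requests, BeautifulSoup, pymongo, and multiprocessing. The script scrapes free proxies from nimadaili.com into a MongoDB staging collection, validates each one by making test requests through it, and finally returns the working proxies sorted by measured latency. The full script follows.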


# coding: utf-8
import requests
from bs4 import BeautifulSoup
import pymongo
from multiprocessing import Pool

to_extract = ["http://www.nimadaili.com/gaoni/{}/",
              "http://www.nimadaili.com/http/{}/",
              "http://www.nimadaili.com/https/{}/"]
# connect=False defers the connection so the client is safe to share with worker processes
client = pymongo.MongoClient('localhost', 27017, connect=False)
nimaproxy = client['nimaproxy']
extract_ip = nimaproxy['extract_ip']  # staging collection for freshly scraped proxies
useful_ip = nimaproxy['useful_ip']    # proxies that passed validation

# Build the list of listing-page URLs to crawl: num pages per category
def get_disturl(num):
    return [ori.format(i) for i in range(1, num + 1) for ori in to_extract]


# Scrape one listing page and store every proxy row in the staging collection
def pageparsing(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ips = soup.select('body > div > div:nth-child(2) > div.mt-0.mb-2.table-responsive > table > tbody > tr > td:nth-child(1)')
    protocols = soup.select('body > div > div:nth-child(2) > div.mt-0.mb-2.table-responsive > table > tbody > tr > td:nth-child(2)')
    anns = soup.select('body > div > div:nth-child(2) > div.mt-0.mb-2.table-responsive > table > tbody > tr > td:nth-child(3)')
    locations = soup.select('body > div > div:nth-child(2) > div.mt-0.mb-2.table-responsive > table > tbody > tr > td:nth-child(4)')
    for ip, protocol, ann, location in zip(ips, protocols, anns, locations):
        data = {
            'ip': ip.get_text(),
            # e.g. 'HTTP,HTTPS代理' -> ['HTTP', 'HTTPS']
            'protocols': protocol.get_text().strip('代理').split(','),
            'ann': ann.get_text(),  # anonymity level
            'location': location.get_text()
        }
        print(data)
        extract_ip.insert_one(data)


# Run num validation rounds over the staging collection; a proxy survives
# only if it passes every round, then move the survivors into useful_ip
def washurl(num):
    for i in range(num):
        pool = Pool(processes=6)
        pool.map(_extract_ip_test, extract_ip.find())
        pool.close()
        pool.join()

    survivors = list(extract_ip.find({}, {'_id': 0}))
    if survivors:  # insert_many raises on an empty list
        useful_ip.insert_many(survivors)
    extract_ip.delete_many({})

# Check that a proxy can complete a request at all; delete it from the
# staging collection on any failure (timeout, refused connection, ...)
def _extract_ip_test(info):
    proxy = info.get('protocols')[0].lower() + '://' + info.get('ip')
    proxies = {
        'http': proxy,
        'https': proxy
    }
    try:
        data = requests.get('https://www.baidu.com', timeout=3, proxies=proxies)
        print(proxy + ':' + str(data.status_code))
    except Exception as e:
        extract_ip.delete_one(info)
        print(e)

# Measure a proxy's latency and record it; delete proxies that have died
def _useful_ip_test(info):
    proxy = info.get('protocols')[0].lower() + '://' + info.get('ip')
    proxies = {
        'http': proxy,
        'https': proxy
    }
    try:
        data = requests.get('https://www.baidu.com', timeout=3, proxies=proxies)
        # elapsed measures the time from sending the request to receiving the response headers
        useful_ip.update_one(info, {'$set': {'speed': data.elapsed.total_seconds()}})
    except Exception as e:
        useful_ip.delete_one(info)
        print(e)

# Re-test every proxy in useful_ip, then return them as
# 'protocol://ip' strings sorted by latency, fastest first
def geturl():
    pool = Pool(processes=6)
    pool.map(_useful_ip_test, useful_ip.find())
    pool.close()
    pool.join()
    proxies = [info.get('protocols')[0].lower() + '://' + info.get('ip')
               for info in sorted(useful_ip.find(), key=lambda item: item['speed'])]
    print(proxies)
    return proxies

# Entry point of the pipeline: crawl num pages per category in parallel
def extract(num):
    pool = Pool(processes=6)
    pool.map(pageparsing, get_disturl(num))
    pool.close()
    pool.join()

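The functions above form a three-stage pipeline: extract scrapes the listing pages, washurl discards dead proxies, and geturl ranks the survivors by speed. A minimal driver might look like the sketch below; the page count of 5, the single validation round, and the httpbin.org test URL are arbitrary choices for illustration, and the __main__ guard is required because the script spawns worker processes.

# A minimal, illustrative driver (page count and test URL are arbitrary choices)
if __name__ == '__main__':
    extract(5)           # 1. scrape 5 listing pages per category into extract_ip
    washurl(1)           # 2. one validation round; survivors move to useful_ip
    proxies = geturl()   # 3. re-test, sort by latency, return 'protocol://ip' strings

    # Route a request through the fastest proxy
    if proxies:
        fastest = proxies[0]
        resp = requests.get('https://httpbin.org/ip', timeout=5,
                            proxies={'http': fastest, 'https': fastest})
        print(resp.text)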