# -*- coding: utf-8 -*-
# Python 3 strings are Unicode by default, so the old
# `reload(sys); sys.setdefaultencoding('utf8')` hack is not needed.
import time
from multiprocessing import Pool

import requests
import pymongo
from bs4 import BeautifulSoup
# Page templates for the three proxy lists (high-anonymity, HTTP, HTTPS)
to_extract = [
    "http://www.nimadaili.com/gaoni/{}/",
    "http://www.nimadaili.com/http/{}/",
    "http://www.nimadaili.com/https/{}/",
]

# connect=False defers the connection so each forked worker opens its own socket
client = pymongo.MongoClient('localhost', 27017, connect=False)
nimaproxy = client['nimaproxy']
extract_ip = nimaproxy['extract_ip']  # freshly scraped, unverified proxies
useful_ip = nimaproxy['useful_ip']    # proxies that survived verification
def get_disturl(num):
    """Build the list-page URLs for pages 1..num of every template."""
    return [ori.format(i) for i in range(1, num + 1) for ori in to_extract]
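# For example, get_disturl(1) returns the first page of each template, in order:
# ['http://www.nimadaili.com/gaoni/1/',
#  'http://www.nimadaili.com/http/1/',
#  'http://www.nimadaili.com/https/1/']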
def pageparsing(url):
    """Scrape one list page and store every table row into extract_ip."""
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # The four columns share one selector, differing only in td:nth-child(n)
    base = ('body > div > div:nth-child(2) > div.mt-0.mb-2.table-responsive'
            ' > table > tbody > tr > td:nth-child({})')
    ips = soup.select(base.format(1))        # ip:port
    protocols = soup.select(base.format(2))  # e.g. "HTTP代理" / "HTTPS代理"
    anns = soup.select(base.format(3))       # anonymity level
    locations = soup.select(base.format(4))  # geographic location
    for ip, protocol, ann, location in zip(ips, protocols, anns, locations):
        data = {
            'ip': ip.get_text(),
            # strip the trailing "代理" ("proxy") and split multi-protocol cells
            'protocols': protocol.get_text().strip('代理').split(','),
            'ann': ann.get_text(),
            'location': location.get_text(),
        }
        print(data)
        extract_ip.insert_one(data)
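# A hypothetical example of one stored document (field values invented for
# illustration; real rows come from the scraped table):
# {'ip': '1.2.3.4:8080', 'protocols': ['HTTP'], 'ann': '高匿', 'location': '北京'}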
def washurl(num):
    """Verify every scraped proxy over num rounds, then promote the survivors."""
    for i in range(num):
        pool = Pool(processes=6)
        pool.map(_extract_ip_test, extract_ip.find())
        pool.close()
        pool.join()  # wait for all workers rather than sleeping a fixed 3 seconds
    # whatever survived repeated testing is moved into the useful_ip collection
    survivors = list(extract_ip.find({}, {'_id': 0}))
    if survivors:  # insert_many raises InvalidOperation on an empty list
        useful_ip.insert_many(survivors)
    extract_ip.delete_many({})
def _extract_ip_test(info):
    """First-pass check: drop the proxy from extract_ip if it cannot reach Baidu."""
    proxy = info.get('protocols')[0].lower() + '://' + info.get('ip')
    proxies = {'http': proxy, 'https': proxy}
    try:
        data = requests.get('https://www.baidu.com', timeout=3, proxies=proxies)
        print(proxy + ':' + str(data.status_code))
    except Exception as e:
        extract_ip.delete_one(info)
        print(e)
def _useful_ip_test(info):
    """Second-pass check: record the response time, or drop the proxy on failure."""
    proxy = info.get('protocols')[0].lower() + '://' + info.get('ip')
    proxies = {'http': proxy, 'https': proxy}
    try:
        data = requests.get('https://www.baidu.com', timeout=3, proxies=proxies)
        useful_ip.update_one(info, {'$set': {'speed': data.elapsed.total_seconds()}})
    except Exception as e:
        useful_ip.delete_one(info)
        print(e)
def geturl(num):
    """Re-test the useful_ip pool and return proxy URLs sorted fastest first."""
    # note: num is accepted but not used by this step
    pool = Pool(processes=6)
    pool.map(_useful_ip_test, useful_ip.find())
    pool.close()
    pool.join()
    proxies = [info.get('protocols')[0].lower() + '://' + info.get('ip')
               for info in sorted(useful_ip.find(), key=lambda item: item['speed'])]
    print(proxies)
    return proxies
def extract(num):
    """Scrape pages 1..num of every list template in parallel."""
    pool = Pool(processes=6)
    pool.map(pageparsing, get_disturl(num))
    pool.close()
    pool.join()
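# A minimal driver sketch, assuming the pipeline runs scrape -> verify -> rank;
# the page/round count of 3 is an arbitrary example value. The __main__ guard
# also keeps multiprocessing safe on platforms that spawn worker processes.
if __name__ == '__main__':
    extract(3)           # scrape pages 1-3 of each list into extract_ip
    washurl(3)           # verify each proxy over 3 rounds, promote survivors
    proxies = geturl(3)  # measure latency and return proxies fastest-first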