Crawl proxy IPs and verify their availability
Crawling proxy IPs
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 17:30:36 2020
@author: Administrator
"""
#Generate usable proxy IPs. Note: the script below uses Python 3 (print() calls, http.client), not Python 2.7 as originally noted.
import sys
import time
import random
import re
import requests
from bs4 import BeautifulSoup as bs
from lxml import etree
from fake_useragent import UserAgent
#Location of the cached user-agent pool file
#https://pan.baidu.com/s/1_Qv1LGBSjO2bnF4ocMqhwQ  extraction code: 2hpu
import tempfile
print(tempfile.gettempdir() + '\\fake_useragent_0.1.11.json')
# Instantiate the UserAgent class
# If this raises an error, copy the JSON file named above into the temp folder printed above
ua = UserAgent()
# User-Agent strings for specific browsers
#print(ua.ie)
#print(ua.opera)
#print(ua.chrome)
#print(ua.firefox)
#print(ua.safari)
# Return a random User-Agent string (recommended)
print(ua.random)
#reload(sys)                        # Python 2 legacy; not needed under Python 3
#sys.setdefaultencoding('utf-8')
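# Sketch (an addition, not in the original script): UserAgent() raises if its
# cached JSON is missing and cannot be re-downloaded; a fixed User-Agent string
# keeps the crawler usable in that case. The string below is only an example value.
def safe_random_ua():
    try:
        return UserAgent().random
    except Exception:
        return ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0 Safari/537.36')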
# Sites where the proxy list can be collected with a single regex
PROXY_SITES_BY_REGX = {
    'urls': [
        'http://ab57.ru/downloads/proxyold.txt',
        'http://www.proxylists.net/http_highanon.txt',
        'http://www.atomintersoft.com/high_anonymity_elite_proxy_list',
        'http://www.atomintersoft.com/transparent_proxy_list',
        'http://www.atomintersoft.com/anonymous_proxy_list',
        'http://www.proxy4free.info/',
        'http://tools.rosinstrument.com/proxy/plab100.xml',
        'https://www.rmccurdy.com/scripts/proxy/good.txt',
        'http://proxy.ipcn.org/proxylist2.html',
        'http://best-proxy.ru/feed',
        'http://www.proxylists.net/?HTTP',
        'http://uks.pl.ua/script/getproxy.php?last'
    ],
    'proxy_regx': r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,4}"
}
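# Illustration (an addition): the regex above pulls every "ip:port" token out of
# a plain-text page, e.g.
#   re.findall(PROXY_SITES_BY_REGX['proxy_regx'], 'node 1.2.3.4:8080 up')
#   -> ['1.2.3.4:8080']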
#//*[@id="services"]/div/div[2]/div/div/div/table/tbody/tr[1]/td[1]  # stray XPath; kept as a comment so the module parses
# Sites that need XPath to locate the proxy IPs
PROXY_SITES_BY_XPATH = [
    {
        'urls': ['http://www.66ip.cn/%s.html' % page for page in ['index'] + list(range(2, 11))],
        'ip_xpath': ".//*[@id='main']/div/div[1]/table/tr[position()>1]/td[1]/text()",
        'port_xpath': ".//*[@id='main']/div/div[1]/table/tr[position()>1]/td[2]/text()"
    },
    {
        'urls': ['http://www.mimiip.com/gngao/%s' % page for page in range(2, 10)],
        'ip_xpath': ".//table[@class='list']/tbody/tr/td[1]/text()",
        'port_xpath': ".//table[@class='list']/tbody/tr/td[2]/text()"
    },
    {
        'urls': ['http://www.ip181.com/daili/%s.html' % page for page in range(1, 8)],
        'ip_xpath': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]/td[1]/text()",
        'port_xpath': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]/td[2]/text()"
    }
]
#http://www.goubanjia.com/
#res = [i.xpath('./td/*/text()') for i in selector.xpath('.//*[@class="table table-hover"]/tbody//tr')]
#[[''.join(i[:-7])+':'+i[-7],]+i[-6:] for i in res]  # result comes out wrong
# Scrape proxy IPs and ports
def get_proxy(inFile):
    headers = {'User-Agent': str(UserAgent().random)}
    fp = open(inFile, 'a+')
    # Sites where the proxy list can be collected with a single regex
    pattern = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,4}")
    for url in PROXY_SITES_BY_REGX['urls']:
        try:
            response = requests.get(url, headers=headers).text
            #response.split('\r\n')
            proxy_list = pattern.findall(response)
            # Trailing newline so blocks from different sites stay on separate lines
            fp.write('\n'.join(proxy_list) + '\n')
            #fp.writelines([ip+'\n' for ip in proxy_list])
            print('+++Success:', url)
        except Exception as e:
            print('---Failure:', url)
            print(e)
    # Sites that need XPath to locate the proxy IPs
    print('*'*30)
    for i in range(len(PROXY_SITES_BY_XPATH)):
        proxy_sites = PROXY_SITES_BY_XPATH[i]
        #pattern = proxy_sites['ip_xpath'].strip('/td[1]/text()')  # pitfall: strip() removes characters, not a substring
        pattern = proxy_sites['ip_xpath'].replace('/td[1]/text()', '')
        for url in proxy_sites['urls']:
            try:
                response = requests.get(url, headers=headers).text
                selector = etree.HTML(response)
                proxy_list = [
                    ':'.join(i.xpath('./td/text()')[:2])
                    for i in selector.xpath(pattern)
                ]
                #fp.write('\n'.join(proxy_list))
                fp.writelines([ip + '\n' for ip in proxy_list])
                print('+++Success:', url)
            except Exception as e:
                print('---Failure:', url)
                print(e)
    fp.close()
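# Optional helper (an addition, not in the original script): several of the
# source sites overlap, so the collected file usually contains duplicates.
# A minimal sketch that rewrites inFile with one unique "ip:port" per line.
def dedup_proxies(inFile):
    with open(inFile, 'r') as fp:
        proxies = set(line.strip() for line in fp if line.strip())
    with open(inFile, 'w') as fp:
        fp.write('\n'.join(sorted(proxies)) + '\n')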
# Build a random User-Agent header from a local file; the fake_useragent package above can replace this
def Header_get(agentFile):
    agents = []
    for line in open(agentFile, "r"):
        agents.append(line.strip('\n\r')[1:-1])  # [1:-1] drops the surrounding quote characters
    fakeheader = {}
    fakeheader['User-agent'] = agents[random.randint(0, len(agents)-1)]
    return fakeheader
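# Example usage (the path is a placeholder):
#   headers = Header_get(r'C:\Users\Administrator\Desktop\user_agents.txt')
#   response = requests.get(url, headers=headers)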
# The code above does not scrape every proxy site listed; you can also copy IPs from those pages into the input file by hand, then test which ones work on your current network. Baidu's home page is used as the test URL here.
def inspect_ip(inFile, outFile):
    import http.client
    import threading
    # requestHeaders = {
    #     'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
    # }
    requestHeaders = {'User-Agent': str(UserAgent().random)}
    requestUrl = 'http://www.baidu.com/'
    f_in = open(inFile, 'r')
    f_out = open(outFile, 'w')
    # The lock only matters if several threads share these file handles
    lock = threading.Lock()
    while True:
        lock.acquire()
        ll = f_in.readline().strip()
        lock.release()
        if len(ll) == 0: break
        line = ll.strip().split(':')
        ip = line[0]
        port = line[1]
        try:
            # Connect through http://ip:port; use http.client.HTTPSConnection for HTTPS proxies
            conn = http.client.HTTPConnection(ip, int(port), timeout=5.0)
            conn.request(method='GET', url=requestUrl, headers=requestHeaders)
            res = conn.getresponse()
            lock.acquire()
            print("+++Success:" + ip + ":" + port)
            f_out.write(ll + "\n")
            lock.release()
        except Exception:
            print("---Failure:" + ip + ":" + port)
    f_in.close()
    f_out.close()
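# Alternative check (an addition, not in the original script): requests can route
# the test request through each proxy via its `proxies` argument. The 5-second
# timeout and the Baidu test URL mirror the assumptions made in inspect_ip().
def inspect_ip_requests(inFile, outFile):
    headers = {'User-Agent': str(UserAgent().random)}
    with open(inFile, 'r') as f_in, open(outFile, 'w') as f_out:
        for line in f_in:
            proxy = line.strip()
            if not proxy:
                continue
            try:
                r = requests.get('http://www.baidu.com/', headers=headers,
                                 proxies={'http': 'http://' + proxy}, timeout=5.0)
                if r.status_code == 200:
                    print("+++Success:" + proxy)
                    f_out.write(proxy + "\n")
            except Exception:
                print("---Failure:" + proxy)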
if __name__ == '__main__':
    inFile = r'C:\Users\Administrator\Desktop\proxy.txt'
    outFile = r'C:\Users\Administrator\Desktop\verified.txt'
    #OUTPUT_FILE = "proxy_list.txt"
    #AgentFile = r'C:\Users\Administrator\Desktop\user_agents.txt'
    #get_proxy(inFile)  # scrape the proxy IPs
    inspect_ip(inFile, outFile)