import requests
from lxml import etree
import time
import random
import csv
def test_ip(ip_address):
    """Probe each proxy and save the ones that work.

    Each element of *ip_address* is a dict shaped for requests'
    ``proxies=`` argument, e.g. ``{'http': 'http://1.2.3.4:8080'}``.

    :param ip_address: iterable of candidate proxy dicts
    """
    url = 'http://icanhazip.com/'
    headers = {
        # Plain desktop UA so the probe looks like a normal browser request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    ip_pool = []
    for ip_test in ip_address:
        try:
            response = requests.get(url=url, headers=headers,
                                    proxies=ip_test, timeout=5)
        except requests.RequestException:
            # Proxy is dead or unreachable — skip it and keep probing the rest.
            # (The original bare `except Exception: pass` also hid programming
            # errors; only network/proxy failures should be tolerated here.)
            continue
        if response.status_code == 200:
            ip_pool.append(ip_test)
        # Throttle between probes so the echo service does not rate-limit us.
        time.sleep(random.randint(2, 8))
    print(ip_pool)
    files_save(ip_pool)
def files_save(ip_list):
'''
将可用代理ip保存
:param ip_list:代理ip
:return:
'''
with open('./代理ip.csv','a+',encoding='utf-8')as f:
write = csv.writer(f)
write.writerow(ip_list)
pass
def get_page_data(nums):
    """Scrape *nums* listing pages of xicidaili.com and verify the proxies found.

    Each table row yields a dict like ``{'http': 'http://ip:port'}``; the
    collected list is handed to ``test_ip`` once all pages are scraped.

    :param nums: number of listing pages to fetch (pages 1..nums)
    :return: None
    """
    ip_list = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    for page in range(1, nums + 1):
        url = "https://www.xicidaili.com/nn/{}".format(page)
        response = requests.get(url=url, headers=headers)
        page_data = etree.HTML(response.text)
        # Proxy rows alternate between class 'odd' and class ''; the union
        # of the two selectors covers every entry in the table.
        page_infos = page_data.xpath(".//tr[@class='odd']|.//tr[@class='']")
        for info in page_infos:
            ip_address = info.xpath(".//td[2]/text()")[0]
            ip_port = info.xpath(".//td[3]/text()")[0]
            ip_type = info.xpath(".//td[6]/text()")[0].lower()  # 'http' / 'https'
            ip_list.append({ip_type: ip_type + '://' + ip_address + ':' + ip_port})
    # Bug fix: verify once, after all pages are collected. The original called
    # test_ip inside the page loop with the *cumulative* list, re-probing every
    # previously seen proxy (with a 2-8 s sleep each) once per extra page.
    test_ip(ip_list)
if __name__ == '__main__':
    # Notes on harvesting proxy IPs:
    #   - every scraped proxy must be verified before use
    #   - keep the crawl rate low to avoid being blocked
    # URL pattern of the listing pages:
    #   page 1: https://www.xicidaili.com/nn/
    #   page 2: https://www.xicidaili.com/nn/2
    #   page 3: https://www.xicidaili.com/nn/3
    # (The original kept these notes in a triple-quoted string, which is a
    # no-op expression evaluated at runtime — comments are the right tool.)
    # nums = int(input("请输入爬取页数>>"))
    nums = 2
    get_page_data(nums)