#!/usr/local/bin/python3.7
"""
@File : xicidaili.py
@Time : 2020/06/02
@Author : Mozili
"""
import urllib.request
import urllib.parse
from lxml import etree
import random
import time
def handler_request(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds passed to ``urlopen`` as the limit for blocking
            network operations. Defaults to 10.

    Returns:
        The response body decoded with the charset declared in the response's
        Content-Type header, or UTF-8 when no charset is declared.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (``HTTPError`` for error status codes is a subclass).
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # Request headers: present the crawler as a desktop Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
    # Build the request object.
    req = urllib.request.Request(url=url, headers=headers)
    # Send the request; the context manager closes the response even when
    # reading or decoding raises, so the connection is not leaked.
    with urllib.request.urlopen(req, timeout=timeout) as res:
        # Prefer the server-declared charset; fall back to UTF-8, which is
        # what a bare ``bytes.decode()`` would use.
        charset = res.headers.get_content_charset() or 'utf-8'
        return res.read().decode(charset)
def preserve_data(ips, ports, types, path='Reptile/daili.txt'):
    """Append one ``"<type> <ip>:<port>"`` line per proxy to a text file.

    The three sequences are paired by position; pairing stops at the end of
    the shortest one, so surplus entries in the longer sequences are ignored.
    The input sequences are not modified.

    Args:
        ips: Sequence of IP address strings.
        ports: Sequence of port strings, parallel to ``ips``.
        types: Sequence of proxy type strings, parallel to ``ips``.
        path: File to append to (opened in append mode, UTF-8). Defaults to
            ``'Reptile/daili.txt'``, resolved against the current working
            directory.

    Raises:
        OSError: Propagated from ``open`` when the file cannot be opened
            (for example, the parent directory does not exist).
    """
    lines = [
        proxy_type + ' ' + ip + ':' + port + '\n'
        for ip, port, proxy_type in zip(ips, ports, types)
    ]
    # Nothing to record: do not touch (or create) the output file.
    if not lines:
        return
    # Open the file once for the whole batch instead of once per row.
    with open(path, 'a', encoding='utf-8') as fp:
        fp.writelines(lines)
def download_content(tree):
    """Extract proxy entries from a parsed page and hand them to ``preserve_data``.

    For every ``<tr class="odd">`` row, the text of the 2nd, 3rd and 6th
    ``<td>`` cells is taken as the IP, port and proxy type respectively.
    Fields are read row by row so that a row with an empty cell cannot shift
    the values of the rows after it; such incomplete rows are skipped.

    Args:
        tree: Parsed document supporting ``.xpath()`` (the caller passes the
            result of ``etree.HTML``). ``None`` is treated as an empty page.
    """
    # A missing tree has nothing to extract; return instead of failing on
    # the attribute lookup below.
    if tree is None:
        return

    ips, ports, types = [], [], []
    for row in tree.xpath("//tr[@class='odd']"):
        # IP address, port and proxy type cells of this row.
        ip = row.xpath("td[2]/text()")
        port = row.xpath("td[3]/text()")
        proxy_type = row.xpath("td[6]/text()")
        # Skip rows missing any field so the three lists stay aligned.
        if not (ip and port and proxy_type):
            continue
        ips.append(ip[0])
        ports.append(port[0])
        types.append(proxy_type[0])

    # Save the collected entries to the text file.
    preserve_data(ips, ports, types)
if __name__ == "__main__":
    # Ask for the inclusive range of page numbers to crawl.
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))

    # Base URLs of the listing sections; the page number is appended to each.
    url_list = [
        'https://www.xicidaili.com/nn/',
        'https://www.xicidaili.com/nt/',
        'https://www.xicidaili.com/wn/',
        'https://www.xicidaili.com/wt/',
        'https://www.xicidaili.com/qq/'
    ]

    # Every page URL in crawl order: all requested pages of the first
    # section, then all pages of the next one, and so on.
    page_urls = [
        f'{base_url}{page_number}'
        for base_url in url_list
        for page_number in range(start_page, end_page + 1)
    ]

    for page_url in page_urls:
        # Fetch the page, then pause for a second before doing anything else.
        html = handler_request(page_url)
        time.sleep(1)
        # Parse the HTML into an element tree and extract/save its entries.
        download_content(etree.HTML(html))