import requests
from lxml import etree


def request_header():
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"  # Chrome UA string
    }
    return headers


'''
Two lists to hold the proxy IPs
'''
all_ip_list = []     # IPs scraped from the site
usable_ip_list = []  # IPs that passed the availability check

def send_request():
    # Scrape 7 pages; adjust the range as needed
    for i in range(1, 8):
        print(f'Scraping page {i}...')
        response = requests.get(url=f'http://www.ip3366.net/free/?page={i}', headers=request_header())
        response.encoding = 'gbk'  # the page is GBK-encoded; decode it correctly
        # print(response.text)
        # Parse with XPath and pull out the IP and port
        html = etree.HTML(response.text)
        tr_list = html.xpath('/html/body/div[2]/div/div[2]/table/tbody/tr')
        for tr in tr_list:
            ip_ = tr.xpath('./td[1]/text()')[0]    # IP
            port_ = tr.xpath('./td[2]/text()')[0]  # port
            proxy = ip_ + ':' + port_              # e.g. 115.218.5.5:9000
            all_ip_list.append(proxy)
            test_ip(proxy)  # check whether the scraped IP is usable
    print('Scraping finished!')
    # print(f'Number of IPs scraped: {len(all_ip_list)}')
    print(f'Number of usable IPs: {len(usable_ip_list)}')
    print('They are:\n', usable_ip_list)

# Check whether a proxy IP is usable
def test_ip(proxy):
    # Build the proxy mapping; the same HTTP endpoint handles both schemes
    proxies = {
        "http": "http://" + proxy,
        "https": "http://" + proxy,
    }
    try:
        # timeout=1 gives each proxy at most 1 s to respond
        response = requests.get(url='https://www.baidu.com/', headers=request_header(), proxies=proxies, timeout=1)
        response.close()
        if response.status_code == 200:
            usable_ip_list.append(proxy)
            print(proxy, '\033[31musable\033[0m')  # highlighted in red
            # the rest of the program goes here.............................
        else:
            print(proxy, 'not usable')
    except requests.RequestException:
        print(proxy, 'request failed')

if __name__ == '__main__':
    send_request()
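
    # A minimal usage sketch (an addition, not in the original script): pick one
    # of the verified proxies at random and route a follow-up request through it.
    # http://httpbin.org/ip is an assumed example endpoint that echoes the
    # caller's IP; any test URL would work.
    if usable_ip_list:
        import random
        proxy = random.choice(usable_ip_list)
        response = requests.get('http://httpbin.org/ip',
                                headers=request_header(),
                                proxies={'http': 'http://' + proxy, 'https': 'http://' + proxy},
                                timeout=5)
        print('Egress IP as seen by the server:', response.text)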