# -*- coding: utf-8 -*-

__author__ = 'litao'

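# Scrape HTTP proxies from the xicidaili.com listing pages, check each one
# concurrently with gevent by fetching a test page through it, and append the
# scraped and the working proxies to proxy_first.txt and proxy_end.txt.
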
# gevent's monkey patching should happen before the networking modules are
# used, so that urllib's blocking sockets cooperate with the greenlets.
from gevent import monkey
monkey.patch_all()

import random
import socket
import time
import urllib.error
import urllib.request

import gevent
from bs4 import BeautifulSoup

home = "http://www.xicidaili.com/wt/"   # proxy list pages: /wt/1, /wt/2, ...
first_proxy_list = []   # "ip:port" strings scraped from the listing pages
end_proxy_list = []     # proxies that responded successfully in test_proxy()
# proxy_support = urllib.request.ProxyHandler({"http": "http://10.10.1.10:3128", "https": "http://10.10.1.10:1080"})
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36"
}
def test_proxy(proxy_key):
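    """Try to fetch a test page through the given "ip:port" HTTP proxy.

    Proxies that answer with HTTP 200 within the timeout are appended to the
    shared end_proxy_list; failures are only printed.
    """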
    print(proxy_key)
    proxy = {"http": proxy_key}
    # Use a plain http:// test URL: the ProxyHandler is only configured for the
    # "http" scheme, so an https:// request would bypass the proxy entirely.
    url = "http://www.baidu.com/"

    # Build a private opener instead of install_opener(): the greenlets run
    # concurrently, and a globally installed opener would be overwritten by
    # whichever greenlet ran last.
    proxy_support = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxy_support)
    req = urllib.request.Request(url=url, headers=headers)
    try:
        response = opener.open(req, timeout=5)
        if response.status == 200:
            end_proxy_list.append(proxy_key)
    except socket.timeout:
        print("This proxy hit socket.timeout")
    except urllib.error.URLError as e:
        print("This proxy timed out or was refused:", e)
    except Exception as e:
        print("error:", e)

def get_proxy_list():
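    """Scrape the first 20 pages of the xicidaili HTTP proxy list.

    Each row of the #ip_list table is turned into an "ip:port" string and
    appended to first_proxy_list; a long random pause between pages keeps
    the crawler from hammering the site.
    """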
    for page in range(20):
        url = home + str(page + 1)
        print(url)
        # proxy_support = urllib.request.ProxyHandler({"http":"123.125.5.100:3128"})
        # opener = urllib.request.build_opener(proxy_support)
        # urllib.request.install_opener(opener)
        req = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(req, timeout=20).read().decode()
        soup = BeautifulSoup(response, 'html.parser')
        # print(response)  # uncomment to dump the raw page when debugging
        # Skip the header row; the second and third cells of each row hold the
        # IP address and the port.
        content = soup.find_all("table", attrs={"id": "ip_list"})[0].find_all('tr')[1:]
        for row in content:
            result = row.find_all('td')
            proxy_enum = result[1].text + ":" + result[2].text
            print(proxy_enum)
            first_proxy_list.append(proxy_enum)
        # Wait 2-4 minutes between pages to stay under the site's rate limits.
        time.sleep(random.randint(120, 240))

def join_gevent(first_proxy_list, gevent_list):
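    """Spawn one greenlet per scraped proxy and collect them in gevent_list."""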
    for proxy in first_proxy_list:
        gevent_list.append(gevent.spawn(test_proxy, proxy))

def main():
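    """Scrape the proxy list, save it, test every proxy concurrently, then save the working ones."""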
    gevent_list = []
    get_proxy_list()
    with open("proxy_first.txt", 'a', encoding='utf-8') as f:
        for item in first_proxy_list:
            f.write(item + '\n')
    join_gevent(first_proxy_list, gevent_list)
    gevent.joinall(gevent_list)
    print(end_proxy_list)
    with open("proxy_end.txt", 'a', encoding='utf-8') as f:
        for item in end_proxy_list:
            f.write(item + '\n')

if __name__ == "__main__":
    main()
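
# External dependencies: gevent and beautifulsoup4 (imported above as bs4).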