import requests
from pyquery import PyQuery as pq
import json
import jsonpath
from lxml import etree
import os
import re
import time

# Sample markup, kept for the PyQuery sketch below
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''

# html = requests.get('http://news.4399.com/gonglue/lscs/kptj/').content.decode('gbk')
num = 0
# def pq方法(url):
#     global num
#     html = requests.get(url).content.decode('gbk')
#     doc = pq(html)
#     items = doc('#dq_list > li').items()
#     # print(doc)
#     # print(type(doc))
#     for item in items:
#         url = item.find('img').attr('lz_src')
#         num += 1
#         print(str(num), url)
#         url_content = requests.get(url).content
#         name = item.find('.kp-name').text()
#
#         with open('e:/py3/002/' + '{:0>4}'.format(str(num)) + name + '.jpg', 'wb') as file:
#             file.write(url_content)
#         # print(url, name)

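# Minimal runnable PyQuery sketch against the inline `html` sample above, just to show the
# selector / .items() / .find() / .attr() calls the commented-out pq方法 relies on
# (pyquery_demo is an illustrative name, not part of the original script, and is never called):
def pyquery_demo():
    doc = pq(html)
    for item in doc('li').items():
        link = item.find('a')
        # .attr() returns None for items without an <a>, e.g. "first item"
        print(item.text(), link.attr('href'))

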
def transformCodec(re_data):  # decode gbk bytes to str, dropping illegal byte ranges
    try:
        re_data = re_data.decode('gbk')
    except Exception as error:
        print(error)
        print('delete illegal string, try again...')
        # The gbk codec reports bad spans as "... can't decode bytes in position X-Y: illegal ...";
        # strip spaces from the message and pull out the inclusive X-Y range.
        pos = re.findall(r'decodebytesinposition([\d]+)-([\d]+):illegal', str(error).replace(' ', ''))
        if len(pos) == 1:
            # Cut the reported bytes out (Y is inclusive, hence the +1) and retry recursively.
            re_data = re_data[0:int(pos[0][0])] + re_data[int(pos[0][1]) + 1:]
            re_data = transformCodec(re_data)
            return re_data
    return re_data

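# Hedged sketch of how transformCodec is meant to be used: fetch a gbk page (the 4399 URL from
# the commented-out pq方法 above) and decode it through the helper. fetch_gbk_page is an
# illustrative name, not part of the original script, and is never called below.
def fetch_gbk_page(url='http://news.4399.com/gonglue/lscs/kptj/'):
    raw = requests.get(url).content   # raw gbk-encoded bytes
    return transformCodec(raw)        # str on success; the original bytes if no range could be stripped

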
def lxml方法(url):
    global num
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2473.400'}
    content = requests.get(url, headers=header).content
    html = content.decode('utf-8')
    r = etree.HTML(html)
    # items = r.xpath("//div[@class='box10-content']//ul[@id='dq_list']/li/a/img/@lz_src")  # old image-scrape xpath
    items = r.xpath("//div[@id='list']/table//tr")
    for item in items:
        dl_ip = item.xpath("./td[1]/text()")
        dl_port = item.xpath("./td[2]/text()")
        dl_name = item.xpath("./td[5]/text()")
        num += 1
        # The header row has <th> cells, so these lookups come back empty there; fall back to ''.
        dl_ip = dl_ip[0] + ":" if len(dl_ip) >= 1 else ''
        dl_port = dl_port[0] + "#" if len(dl_port) >= 1 else ''
        dl_name = dl_name[0] if len(dl_name) >= 1 else ''
        # Append one "ip:port#label" line per table row.
        with open("proxy.txt", 'a', encoding='utf-8') as file:
            file.write('{}{}{}\n'.format(dl_ip, dl_port, dl_name))


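# Hedged sketch of how the "ip:port#label" lines written above could be consumed later.
# proxy.txt and its line format come from this script; read_proxies and the requests-style
# proxies dict are just one plausible way to use the file, not part of the original.
def read_proxies(path='proxy.txt'):
    proxies = []
    with open(path, encoding='utf-8') as file:
        for line in file:
            if ':' in line and '#' in line:        # skip the date header and malformed rows
                ip_port = line.split('#', 1)[0]    # "ip:port"
                proxies.append({'http': 'http://' + ip_port, 'https': 'http://' + ip_port})
    return proxies

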
if __name__ == '__main__':
    # Start proxy.txt fresh with a date header, then append rows page by page.
    with open("proxy.txt", 'w', encoding='utf-8') as file:
        file.write(str(time.localtime()[0]) + '_' + str(time.localtime()[1]) + '_' + str(time.localtime()[2]) + '_collected:\n')
    # url = 'https://www.kuaidaili.com/free/inha/1/'
    for i in range(1, 11):
        print('Page ' + str(i) + ':\n')
        url2 = r'https://www.kuaidaili.com/free/inha/' + str(i) + r'/'
        print(url2)
        lxml方法(url2)
        time.sleep(5)   # be polite between page requests

    # pq方法()
    print(str(num) + ' ok!')


# Create the e:\py3\001 ... e:\py3\099 target directories (disabled)
'''
for dirnum in range(1, 100):
    dirnum2 = '{:0>3}'.format(str(dirnum))
    mkpath = "e:\\py3\\{}\\".format(dirnum2)
    print(mkpath)
    print('already exists!') if os.path.exists(mkpath) else os.makedirs(mkpath)
'''
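# Hedged, slightly more idiomatic take on the block above: os.makedirs(exist_ok=True) replaces
# the manual os.path.exists check; same e:\py3\NNN layout, kept commented out like the original.
# for dirnum in range(1, 100):
#     os.makedirs('e:\\py3\\{:0>3}\\'.format(dirnum), exist_ok=True)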