Python 【爬取淘宝】

import json
import re

import requests
 4 
 5 
 6 def getHTMLText(url):
 7     try:
 8         r = requests.get(url,timeout=30)
 9         r.raise_for_status()
10         r.encoding = r.apparent_encoding
11         html = r.text
12         return html
13     except:
14         return ""
15 
def parsePage(ilt, html):
    """Extract price/title pairs from a result page and append them to *ilt*.

    The page is searched for ``"view_price":"..."`` and ``"raw_title":"..."``
    fields; the n-th price is paired with the n-th title.  If one field
    occurs more often than the other, the surplus entries are ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry per item.
            Both values are strings.
        html: Page text to scan.  A falsy value (``""`` or ``None``) is
            treated as an empty page.

    Returns:
        None. *ilt* is modified in place.
    """
    if not html:
        return

    # Capture groups pull the values out directly, so no eval() is needed
    # and a ":" inside a title no longer truncates it.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # The title pattern skips over backslash escapes so that an escaped
    # quote (\") does not end the match early.
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode escapes such as \u0026 or \" the safe way.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as found.
            title = raw_title
        ilt.append([price, title])
27 
def printGoodsList(ilt):
    """Print the collected goods as a tab-separated table.

    A header row is printed first, followed by one row per entry with a
    running number that starts at 1.

    Args:
        ilt: Sequence of entries; element 0 of each entry is printed in
            the price column and element 1 in the title column.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    print(row_format.format("序号", "价格", "商品名称"))
    for number, item in enumerate(ilt, start=1):
        print(row_format.format(number, item[0], item[1]))
37     
def main():
    """Fetch the first pages of a Taobao search and print the results.

    Requests ``depth`` result pages for the hard-coded search term, feeds
    each page to ``parsePage`` and finally prints everything collected
    with ``printGoodsList``.
    """
    goods = "书包"
    depth = 2
    # The ``s`` query parameter advances in steps of 44 from page to page;
    # presumably that is the number of results per page (TODO confirm).
    page_step = 44
    start_url = "https://s.taobao.com/search?q=" + goods
    infoList = []
    for page in range(depth):
        url = start_url + "&s=" + str(page_step * page)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best effort: a page that cannot be processed is skipped.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            continue
    printGoodsList(infoList)


if __name__ == "__main__":
    main()
56      
57     

 类似的模板:

 1 import re
 2 import requests
 3 
 4 
 5 def getHtml(url):
 6     headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) Apple\
 7                WebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
 8     try:
 9         r = requests.get(url,headers=headers)
10         r.raise_for_status()
11         r.encoding = r.apparent_encoding
12         html = r.text
13         return html
14     except:
15 
16         print("错误!")
17 
18         
def parseHtml(list, html):
    """Extract list entries from *html* and append them to *list*.

    Each ``<li><a href=...><img src=... />...<h3>...</h3></a></li>`` element
    found in the page yields one ``[href, img_src, h3_text]`` entry, in
    that order.

    NOTE(review): ``printInfo`` labels these three columns as name, image
    address and download address, which does not obviously match
    href / img src / h3 text -- confirm the intended order.

    Args:
        list: List that receives the entries (modified in place).  The
            name shadows the builtin but is kept for keyword callers.
        html: Page text to scan.  If it is not a string (for example
            ``None`` from a failed download) "错误2!" is printed and
            nothing is appended.

    Returns:
        None.
    """
    pattern = r'<li><a href="(.*?)".*?><img src="(.*?)" />.*?<h3>(.*?)</h3></a></li>'
    try:
        matches = re.findall(pattern, html)
    except TypeError:
        # re.findall rejects non-string input such as None.
        print("错误2!")
        return

    for href, img_src, heading in matches:
        list.append([href, img_src, heading])
32 
33 
34 
def printInfo(list):
    """Print the collected entries as a tab-separated table.

    A header row is printed first, followed by one row per entry with a
    running number that starts at 1.

    Args:
        list: Sequence of entries; elements 0, 1 and 2 of each entry are
            printed in the three columns after the running number.
    """
    row_format = "{:4}\t{:8}\t{:8}\t{:4}"
    print(row_format.format("序号", "名称", "图片地址", "下载地址"))
    for number, entry in enumerate(list, start=1):
        print(row_format.format(number, entry[0], entry[1], entry[2]))
44     
def main():
    """Fetch the ``index-<n>.html`` pages and print the parsed entries.

    For each of the ``depth`` pages below ``start_url`` the page is
    downloaded with ``getHtml``, parsed with ``parseHtml`` and the running
    number of collected entries is printed.  The complete table is printed
    at the end with ``printInfo``.

    ``start_url`` is left empty in this template and has to be filled in
    before the requests can succeed.
    """
    start_url = ""
    infoList = []
    depth = 2
    # The former ``if i > 1: continue`` guard was removed: with depth == 2
    # the index only takes the values 0 and 1, so it could never trigger.
    for i in range(depth):
        url = start_url + "index-" + str(i) + ".html"
        try:
            html = getHtml(url)
            parseHtml(infoList, html)
            print(len(infoList))
        except Exception:
            # Best effort: a page that cannot be processed is skipped.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            continue
    printInfo(infoList)


if __name__ == "__main__":
    main()
模板

 

posted @ 2018-08-12 18:52  Justice-V  阅读(146)  评论(0)    收藏  举报