Python 【爬取淘宝】
import json
import re


def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, invalid URL or non-2xx status).
    """
    # Imported here rather than at module level so the parsing and printing
    # helpers stay usable without the third-party HTTP dependency installed.
    import requests

    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        return ""
    # Use the encoding detected from the body instead of the header default.
    r.encoding = r.apparent_encoding
    return r.text


def _decode_json_string(raw):
    """Decode the inside of a JSON string literal; return *raw* if invalid."""
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw


def parsePage(ilt, html):
    """Extract ``[price, title]`` pairs from *html* and append them to *ilt*.

    Prices are taken from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
    title. If the two lists differ in length the surplus entries are ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry per item
            (both values are strings). Mutated in place.
        html: Page text to scan. Empty or non-string input adds nothing.
    """
    if not isinstance(html, str) or not html:
        return

    # NOTE(security): the previous implementation ran eval() on text scraped
    # from the network and split each match on ":", which broke on any title
    # containing a colon. Capture groups plus JSON string decoding yield the
    # same values without executing page content.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    for price, title in zip(prices, titles):
        ilt.append([price, _decode_json_string(title)])


def printGoodsList(ilt):
    """Print *ilt* as a tab-separated table with a running index column.

    Args:
        ilt: Iterable of ``[price, title]`` entries as built by ``parsePage``.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    for count, g in enumerate(ilt, start=1):
        print(tplt.format(count, g[0], g[1]))


def main():
    """Fetch ``depth`` search result pages for ``goods`` and print the items."""
    goods = "书包"
    depth = 2
    start_url = "https://s.taobao.com/search?q=" + goods
    infoList = []
    for i in range(depth):
        # The ``s`` query parameter advances by 44 per page (presumably the
        # number of results per page -- confirm against the site).
        url = start_url + "&s=" + str(44 * i)
        # A failed fetch yields "" and parsePage then adds nothing, so one
        # bad page does not stop the remaining pages.
        html = getHTMLText(url)
        parsePage(infoList, html)
    printGoodsList(infoList)


if __name__ == "__main__":
    main()
类似的模板:
import re


def getHtml(url):
    """Fetch *url* with a browser-like User-Agent and return the page text.

    Args:
        url: Address to request.

    Returns:
        The decoded page text, or an empty string when the request fails
        (an error message is printed in that case).
    """
    # Imported here rather than at module level so the parsing and printing
    # helpers stay usable without the third-party HTTP dependency installed.
    import requests

    # Built from adjacent literals: a backslash continuation inside a string
    # literal would embed the next line's indentation in the header value.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/63.0.3239.132 Safari/537.36"
        )
    }
    try:
        # A timeout keeps an unresponsive server from blocking forever.
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        print("错误!")
        return ""
    # Use the encoding detected from the body instead of the header default.
    r.encoding = r.apparent_encoding
    return r.text


def parseHtml(list, html):
    """Extract ``[name, picture_url, download_url]`` rows from *html*.

    Each ``<li><a href=...><img src=... />...<h3>...</h3></a></li>`` element
    contributes one row: the ``<h3>`` text as the name, the ``<img>`` ``src``
    as the picture address and the ``<a>`` ``href`` as the download address,
    matching the column headers printed by ``printInfo``.

    Args:
        list: List that receives the rows; mutated in place. (The name
            shadows the builtin and is kept only for compatibility.)
        html: Page text to scan. Non-string input prints an error message
            and adds nothing.
    """
    if not isinstance(html, str):
        print("错误2!")
        return

    pattern = r'<li><a href="(.*?)".*?><img src="(.*?)" />.*?<h3>(.*?)</h3></a></li>'
    for href, img_src, title in re.findall(pattern, html):
        list.append([title, img_src, href])


def printInfo(list):
    """Print the collected rows as a tab-separated table with an index column.

    Args:
        list: Iterable of ``[name, picture_url, download_url]`` rows.
    """
    tplt = "{:4}\t{:8}\t{:8}\t{:4}"
    print(tplt.format("序号", "名称", "图片地址", "下载地址"))
    for count, g in enumerate(list, start=1):
        print(tplt.format(count, g[0], g[1], g[2]))


def main():
    """Fetch ``depth`` index pages under ``start_url`` and print the rows."""
    # Placeholder: set this to the base URL of the site to scrape.
    start_url = ""
    infoList = []
    depth = 2
    for i in range(depth):
        url = start_url + "index-" + str(i) + ".html"
        # A failed fetch yields "" and parseHtml then adds nothing, so one
        # bad page does not stop the remaining pages.
        html = getHtml(url)
        parseHtml(infoList, html)
        # Running total of rows collected so far.
        print(len(infoList))
    printInfo(infoList)


if __name__ == "__main__":
    main()
一个二次元的生物

浙公网安备 33010602011771号