手动爬虫之京东笔记本栏(python3)

 1 import urllib.request as ur
 2 import urllib.error as ue
 3 import re
# Target listing URL: JD.com laptop category (cat=670,671,672).
url = 'https://list.jd.com/list.html?cat=670,671,672'
# Local directory where downloaded images are written.
# NOTE(review): directory must already exist -- urlretrieve will not create it.
save_path = 'E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo1/images/'
# HTTP proxy server address as "ip:port".
proxy_add = '115.174.66.148:8118'
10 
def get_JD_pictures(url, save_path, proxy_add, page):
    """Download the product thumbnails from one page of a JD.com listing.

    Each image is saved as ``<page>_<index>.jpg`` under *save_path*.

    Args:
        url: Base listing URL; the ``&page=N`` query is appended here.
        save_path: Directory prefix the image files are written into.
        proxy_add: HTTP proxy server as an ``"ip:port"`` string.
        page: 1-based page number to fetch (also used in the file names).
    """
    # Build the page-specific URL.
    page_url = url + "&page=" + str(page)

    # Spoof a browser User-Agent so the site does not reject the request.
    req = ur.Request(page_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0')

    # Install the proxy-aware opener globally so both urlopen() and
    # urlretrieve() below go through the proxy.
    proxy = ur.ProxyHandler({'http': proxy_add})
    opener = ur.build_opener(proxy, ur.HTTPHandler)
    ur.install_opener(opener)

    # Fetch and decode the page. Decoding (rather than str(bytes)) gives
    # real text; errors='replace' tolerates stray non-UTF-8 bytes.
    info = ur.urlopen(req).read().decode('utf-8', errors='replace')

    # Narrow to the product-list container. re.S lets '.' cross newlines,
    # which the old str(bytes) trick only achieved by accident.
    container = re.search(r'<div id="plist".+?<div class="page clearfix">', info, re.S)
    if container is None:
        # Layout changed or the fetch returned an unexpected page --
        # nothing to download (the old code raised IndexError here).
        return
    plist = container.group(0)

    # Collect every 220x220 product thumbnail URL (protocol-relative).
    image_list = re.findall(r'<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">', plist)
    for x, image_url in enumerate(image_list, start=1):
        image_name = save_path + str(page) + "_" + str(x) + ".jpg"
        full_url = "http://" + image_url
        try:
            ur.urlretrieve(full_url, filename=image_name)
        except ue.URLError as e:
            # HTTPError is a subclass of URLError, so one handler covers
            # both; report whichever attributes the error carries and
            # continue with the next image.
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
49 
50 get_JD_pictures(url, save_path, proxy_add, 1)

 

posted on 2017-07-02 23:10  小明在线  阅读(328)  评论(0编辑  收藏  举报

导航