import urllib.request as ur
import urllib.error as ue
import re
# Target URL (a JD.com product listing page)
url = 'https://list.jd.com/list.html?cat=670,671,672'
# Directory where the downloaded images are stored
save_path = 'E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo1/images/'
# Proxy server IP and port
proxy_add = '115.174.66.148:8118'
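
# Make sure the target directory exists before downloading (a convenience
# addition, not part of the original listing).
import os
os.makedirs(save_path, exist_ok=True)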

def get_JD_pictures(url, save_path, proxy_add, page):
    # Build the URL for the requested page number
    url = url + "&page=" + str(page)
    # Add a User-Agent header so the request looks like a normal browser
    req = ur.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0')

    # Set up the proxy; the target URL is https, so register the proxy for
    # both schemes. install_opener() makes it global, which also covers the
    # urlretrieve() calls below.
    proxy = ur.ProxyHandler({'http': proxy_add, 'https': proxy_add})
    opener = ur.build_opener(proxy, ur.HTTPHandler)
    ur.install_opener(opener)
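    # (A sketch of an alternative, not from the original listing: to keep the
    # proxy scoped instead of global, skip install_opener() and call the opener
    # directly, e.g. info = opener.open(req).read(). urlretrieve() has no
    # per-opener variant, which is why the global install is used here.)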

    # Fetch the page and decode the bytes to text
    info = ur.urlopen(req).read().decode('utf-8', errors='ignore')
    # Narrow the HTML down to the product list block; re.S lets '.' span newlines
    pattern_1 = '<div id="plist".+? <div class="page clearfix">'
    blocks = re.compile(pattern_1, re.S).findall(info)
    if not blocks:
        print('No product list found on page', page)
        return
    info = blocks[0]
    # Extract the protocol-relative image URLs from the block
    pattern_2 = r'<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">'
    image_list = re.compile(pattern_2).findall(info)
    x = 1
    for image_url in image_list:
        image_name = save_path + str(page) + "_" + str(x) + ".jpg"
        image_url = "http://" + image_url
        try:
            ur.urlretrieve(image_url, filename=image_name)
        except ue.HTTPError as e:  # HTTP-level failures (4xx/5xx)
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
        except ue.URLError as e:  # network-level failures (DNS, refused connection, ...)
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
        x += 1

get_JD_pictures(url, save_path, proxy_add, 1)
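
# A multi-page variant (a sketch; the 3-page limit and the 2-second pause are
# assumptions, not part of the original example) would look like:
#
#   import time
#   for p in range(1, 4):
#       get_JD_pictures(url, save_path, proxy_add, p)
#       time.sleep(2)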