urlretrieve下载图片--爬虫

 1 from lxml import etree
 2 import requests
 3 from urllib import request
 4 
 5 url = 'http://www.haoduanzi.com/'
 6 headers = {
 7     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
 8 }
 9 url_content = requests.get(url, headers=headers).text
10 
11 tree = etree.HTML(url_content)
12 
13 div_list = tree.xpath('//div[@id="main"]/div')[2:-1]
14 
15 i = 0
16 for div in div_list:
17     img_url = div.xpath('./div/img/@src')[0]
18     img_content = requests.get(url=img_url, headers=headers).content
19     request.urlretrieve(url=img_url, filename='img' + str(i) + '.jpg')
20     i += 1

不建议在循环里手动做文件 IO（with open 写文件）：既容易出错，也比直接用 urlretrieve 落盘繁琐。下面是采用手动 IO 的对照（错误）代码：

 1 from lxml import etree
 2 import requests
 3 from uuid import uuid4
 4 import time
 5 from urllib import request
 6 
 7 url = 'http://www.haoduanzi.com/'
 8 headers = {
 9     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
10 }
11 url_content = requests.get(url, headers=headers).text
12 
13 tree = etree.HTML(url_content)
14 
15 div_list = tree.xpath('//div[@id="main"]/div')[2:-1]
16 filename = uuid4()
17 # i = 0
18 for div in div_list:
19     img_url = div.xpath('./div/img/@src')[0]
20     img_content = requests.get(url=img_url, headers=headers).content
21     # request.urlretrieve(url=img_url, filename='img' + str(i) + '.jpg')
22     # i += 1
23     time.sleep(2)
24     with open(r'C:\jupyter\day02\%s.jpg' % filename, 'wb') as f:
25         f.write(img_content)

 

posted @ 2018-12-11 14:20  python杰  阅读(1527)  评论(0编辑  收藏  举报