1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # author:Momo time:2018/6/29
4
5
6 import urllib.request
7 import os
8 import urllib
9 import re
10 from lxml import etree
11
def get_html_code(url):
    """Fetch *url* and return the raw response body as ``bytes``.

    The response object is used as a context manager so the underlying
    connection is always closed, even if ``read()`` raises — the original
    version leaked the open response.
    """
    with urllib.request.urlopen(url) as html_page:
        return html_page.read()
17
18 # 通过url获取每个帖子链接
# Collect the link of every thread listed on a forum index page.
def getArticleLinks(url):
    """Return absolute URLs for every thread on the Tieba forum page *url*.

    The page is parsed with lxml; each matched ``@href`` is a site-relative
    suffix (e.g. ``/p/123456``), so the Tieba host is prepended to each.
    """
    selector = etree.HTML(get_html_code(url))
    # XPath: thread links live under the "threadlist_lz clearfix" container.
    suffixes = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
    # Build absolute URLs in one pass instead of mutating the list in place.
    return ['http://tieba.baidu.com' + suffix for suffix in suffixes]
27
28 # 通过所给帖子链接,下载帖子中所有图片
# Download every image found in the thread at the given link.
def get_img(url):
    """Download all images of the thread at *url* into the current directory.

    Images are the ``src`` attributes of elements with class ``BDE_Image``
    (Tieba's inline post images); they are saved as ``pic_0.jpg``,
    ``pic_1.jpg``, ... in the current working directory.
    """
    selector = etree.HTML(get_html_code(url))
    img_url_list = selector.xpath('//*[@class="BDE_Image"]/@src')
    # enumerate() replaces the hand-rolled pic_name counter.
    for pic_name, img_url in enumerate(img_url_list):
        urllib.request.urlretrieve(img_url, 'pic_%s.jpg' % pic_name)
36
37 # 为每个帖子创建独立文件夹,并下载图片
# Create a separate folder per thread and download its images.
def download_img(url_list, page):
    """Download images for the first *page* threads of *url_list*.

    Creates a ``downloads/`` directory, then one sub-directory per thread
    (named from the URL with the host prefix and slashes stripped) and
    saves that thread's images into it.

    Fixes over the original:
    - *page* is clamped via slicing, so asking for more threads than the
      page lists no longer raises ``IndexError``;
    - the working directory is restored in a ``finally`` block, so a
      failed download can no longer leave the process chdir'ed inside a
      thread folder (which previously corrupted all later paths).
    """
    if not os.path.exists('downloads'):
        os.mkdir('downloads')
    root_path = os.getcwd()
    # NOTE(review): the [23:] slice assumes the 'http://tieba.baidu.com'
    # prefix added by getArticleLinks — confirm if the link format changes.
    for thread_url in url_list[:page]:
        img_dir = 'downloads/' + thread_url[23:].replace("/", '')
        if not os.path.exists(img_dir):
            os.mkdir(img_dir)
        os.chdir(img_dir)
        try:
            get_img(thread_url)
        finally:
            os.chdir(root_path)
50
if __name__ == '__main__':
    print('-----贴吧图片爬取装置2.0-----')
    # Ask for a forum URL; fall back to the wallpaper forum when left blank.
    print('请输入贴吧地址:')
    target_url = input('')
    if not target_url:
        print('---没有地址输入正在使用默认地址(baidu壁纸吧)---')
        target_url = 'http://tieba.baidu.com/f?kw=%E5%A3%81%E7%BA%B8&ie=utf-8'

    # Keep prompting until the user types a positive integer.
    while True:
        print('请输入你要下载的帖子数:')
        raw_count = input('')
        if re.findall(r'^[0-9]*[1-9][0-9]*$', raw_count):
            thread_count = int(raw_count)
            break

    print('----------正在下载图片---------')
    download_img(getArticleLinks(target_url), thread_count)
    print('-----------下载成功-----------')
    input('Press Enter to exit')