1 # -*- coding:utf-8 -*-
2 import urllib, urllib2
3 import re
4 import sys
5
class Cuzz(object):
    """Download the JPEG images posted in the threads of a forum listing.

    The crawl is three levels deep: listing pages -> thread pages -> image
    files.  Every HTTP request is sent with the headers given to the
    constructor.

    Attributes:
        url: Listing URL prefix; the numeric page offset is appended to it.
        header: Dict of HTTP headers sent with every request.
        start_page: First listing page to crawl (1-based, inclusive).
        end_page: Last listing page to crawl (inclusive).
    """

    # Directory the downloaded images are written to (relative to the CWD).
    image_dir = "./images/"

    def __init__(self, url, header, start_page, end_page):
        self.url = url
        self.header = header
        self.start_page = start_page
        self.end_page = end_page

    def _fetch(self, url):
        """Return the response body of *url*, always closing the connection."""
        request = urllib2.Request(url, headers=self.header)
        response = urllib2.urlopen(request)
        # urllib2 responses are not context managers in Python 2, hence the
        # explicit try/finally to avoid leaking the socket.
        try:
            return response.read()
        finally:
            response.close()

    @staticmethod
    def _unique(items):
        """Return *items* without duplicates, keeping first-seen order."""
        seen = set()
        ordered = []
        for item in items:
            if item not in seen:
                seen.add(item)
                ordered.append(item)
        return ordered

    @staticmethod
    def _thread_paths(htmltext):
        """Return the distinct thread paths (``/p/<digits>``) in *htmltext*."""
        return Cuzz._unique(re.findall(r"/p/\d+", htmltext))

    @staticmethod
    def _image_links(htmltext):
        """Return the distinct ``imgsa.baidu`` JPEG URLs found in *htmltext*."""
        return Cuzz._unique(
            re.findall(r"https://imgsa\.baidu.+?\.jpg", htmltext))

    @staticmethod
    def _image_filename(url):
        """Return the last path segment of *url*, used as the local file name.

        The whole segment is kept (including the ``.jpg`` extension) so that
        different images do not collide on a truncated name.
        """
        return url.rsplit("/", 1)[-1]

    def deal_url(self):
        """Fetch every listing page in the configured range and crawl it.

        Page ``i`` is requested at offset ``50 * (i - 1)`` appended to
        ``self.url``.  Nothing is fetched when ``start_page > end_page``.
        """
        for page in range(self.start_page, self.end_page + 1):
            offset = 50 * (page - 1)
            self.load_images(self._fetch(self.url + str(offset)))

    def load_images(self, htmltext):
        """Visit every thread linked from a listing page and save its images.

        Args:
            htmltext: HTML source of one listing page.
        """
        # A listing usually links the same thread several times; each distinct
        # thread is fetched only once.
        for path in self._thread_paths(htmltext):
            self.save_images(self._fetch("http://tieba.baidu.com" + path))

    def save_images(self, htmltext1):
        """Download every JPEG referenced in a thread page into ``image_dir``.

        Args:
            htmltext1: HTML source of one thread page.
        """
        # Local import: ``os`` is not among the imports at the top of the file.
        import os

        links = self._image_links(htmltext1)
        if not links:
            return

        # The target directory is created on demand instead of failing with
        # IOError on the first write.
        if not os.path.isdir(self.image_dir):
            os.makedirs(self.image_dir)

        for link in links:
            data = self._fetch(link)
            target = os.path.join(self.image_dir, self._image_filename(link))
            # Image data is binary: "wb" prevents newline translation from
            # corrupting the file.
            with open(target, "wb") as image_file:
                image_file.write(data)
47
def main():
    """Ask for a forum name and a page range, then run the image crawl.

    Reads three values from standard input, in this order: the forum name,
    the first page number and the last page number.  The forum name is
    URL-encoded into the ``kw`` query parameter of the listing URL, and a
    ``Cuzz`` instance built from these values performs the download.
    """
    request_headers = {"User-Agent":"Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1"}

    forum_name = raw_input("请输入您要下载的贴吧:")
    # urlencode turns the one-entry dict into "kw=<percent-escaped name>",
    # which is required when the name contains non-ASCII characters.
    query = urllib.urlencode({"kw": forum_name})

    # The listing URL deliberately ends with "pn=": the crawler appends the
    # page offset to it.
    listing_url = "".join(["https://tieba.baidu.com/f?", query, "&ie=utf-8&pn="])

    first_page = int(raw_input("请输入起始页面:"))
    last_page = int(raw_input("请输入截止页面:"))

    crawler = Cuzz(listing_url, request_headers, first_page, last_page)
    crawler.deal_url()