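#!/usr/bin/python
#coding: utf-8
# The original plan was to scrape Baidu Images, but those images are loaded by JavaScript and it is not yet clear how to fetch them, so this script downloads the images from a Baidu Tieba thread instead.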
#
# name: download images from baiduTieba
#
#author: Hacker_MJW
#
#date: 2014-02-15
#
import os
import re
import urllib
import urllib2
class reptile:
    """Download the images embedded in one Baidu Tieba thread page.

    Each step stores its result on the instance for the next step to use::

        r = reptile(url)
        r.buil_re()      # compile the <img> pattern   -> self.p
        r.open_page()    # fetch the HTML              -> self.page
        r.find()         # extract the image URLs      -> self.img
        r.download(50)   # save up to 50 of them under photos/
    """

    def __init__(self, url):
        """Remember the page URL and the User-Agent header sent with it.

        url: address of the thread page to scrape.
        """
        self.url = url
        user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
        self.headers = {'User-Agent': user_agent}

    def buil_re(self):
        """Compile the pattern capturing the src of BDE_Image <img> tags."""
        # Raw string so that \s reaches the regex engine untouched.
        self.p = re.compile(r'<img.*?class="BDE_Image"\s*src="(.*?)".*?>')

    def open_page(self):
        """Fetch self.url and store the HTML, newlines removed, in self.page."""
        self.req = urllib2.Request(url=self.url, headers=self.headers)
        response = urllib2.urlopen(self.req)
        try:
            # "." does not match a newline, so flatten the page to let the
            # pattern match tags that span several lines.
            self.page = response.read().replace('\n', '')
        finally:
            # Release the connection even if reading fails.
            response.close()

    def find(self):
        """Store every image URL matched in self.page as the list self.img.

        Requires buil_re() and open_page() to have been called first.
        """
        self.img = self.p.findall(self.page)

    def cbk(self, a=0, b=0, c=0):
        """Record the progress percentage in self.per and print it.

        The arguments mirror the (block count, block size, total size)
        triple that urlretrieve hands to a reporthook; download() does not
        install this hook itself.

        a: number of blocks transferred so far.
        b: size of one block in bytes.
        c: total size in bytes; zero or negative means "unknown".
        """
        if c <= 0:
            # No usable total size: a percentage cannot be computed.
            self.per = 0.0
            return
        # The last block usually overshoots the total, so cap at 100.
        self.per = min(100.0 * a * b / c, 100)
        print('%.2f%%' % self.per)

    def download(self, total):
        """Save at most `total` images from self.img as photos/<index>.jpg.

        Only the URLs collected from the current page are handled; to cover
        several pages, locate the "next page" link and repeat the steps.

        total: upper bound on the number of images to fetch.
        """
        print(self.img)
        if not os.path.isdir("photos"):
            # urlretrieve does not create missing directories.
            os.makedirs("photos")
        # Slice up front so that exactly `total` images (or fewer, when the
        # page has fewer) are fetched; negative values fetch nothing.
        targets = self.img[:max(total, 0)]
        for index, img in enumerate(targets):
            urllib.urlretrieve(img, os.path.join("photos", "%d.jpg" % index))
        # Report what was actually saved rather than what was requested.
        print("%s张图片下载完成" % str(len(targets)))
if __name__ == '__main__':
    # Sample thread chosen by the author; substitute any other Tieba
    # thread URL here.
    crawler = reptile('http://tieba.baidu.com/p/2791466984')
    # Run the preparation steps in order: compile the pattern, fetch the
    # page, then collect the image URLs.
    for step in (crawler.buil_re, crawler.open_page, crawler.find):
        step()
    # Request 50 images from the collected URLs.
    crawler.download(50)