爬图交互界面及翻页初尝试

# -*- coding:utf-8 -*-


import requests, re, os,urllib2
class TP(object):
    """Download the JPEG images posted in one Baidu Tieba thread.

    The crawler is built from a thread URL (for example
    ``http://tieba.baidu.com/p/5307547413``). It fetches each page of the
    thread with ``?pn=<page>`` appended and saves every ``BDE_Image``
    picture it finds into the current working directory.
    """

    # Captures the text of the second <span> inside the "l_reply_num" list
    # item; start() treats that text as the number of pages in the thread.
    _PAGE_COUNT_RE = re.compile(
        r'<li class="l_reply_num" .*?</span>.*?<span.*?>(.*?)</span>', re.S)

    # Captures the full URL of each posted JPEG. Other attributes may sit
    # between ``class`` and ``src``, and the dot before "jpg" is escaped so
    # only a real ".jpg" suffix matches.
    _IMAGE_RE = re.compile(
        r'<img class="BDE_Image"[^>]*?\ssrc="([^"]+?\.jpg)"')

    # Request headers sent with every image download.
    _HEADERS = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
    }

    # Seconds to wait on any single network request before giving up.
    _TIMEOUT = 10

    def __init__(self, baseUrl):
        """Remember the thread address.

        Args:
            baseUrl: URL of the thread without any ``?pn=`` query string.
        """
        self.baseURL = baseUrl

    def getPage(self, pageNum):
        """Fetch one page of the thread.

        Args:
            pageNum: page number appended to the base URL as ``?pn=``.

        Returns:
            The page markup decoded as UTF-8, or ``None`` when the request
            fails with ``urllib2.URLError`` (the error is printed).
        """
        url = self.baseURL + '?pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url),
                                       timeout=self._TIMEOUT)
            try:
                raw = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except urllib2.URLError as e:
            # repr() keeps the message ASCII-safe, so joining it to the
            # unicode prefix cannot fail on a localized error text.
            if hasattr(e, "reason"):
                print(u'错误 ' + repr(e.reason))
            else:
                print(u'错误 ' + repr(e))
            return None
        return raw.decode('UTF-8')

    def getPageNum(self, page):
        """Read the page count out of a thread page.

        Args:
            page: decoded markup of a thread page. When ``None``, page 1 is
                fetched with getPage() and used instead.

        Returns:
            The captured page count as a stripped string, or ``None`` when
            no page is available or the marker is not found.
        """
        if page is None:
            page = self.getPage(1)
        if page is None:
            return None
        found = self._PAGE_COUNT_RE.search(page)
        if found is None:
            return None
        return found.group(1).strip()

    def getContent(self, html):
        """Save every ``BDE_Image`` JPEG referenced in *html*.

        Each file is written to the current working directory under the
        last path segment of its URL.

        Args:
            html: decoded markup of a thread page, or ``None``.

        Returns:
            List of the file names that were written, in page order.

        Raises:
            IOError: if a download or a file write fails
                (``requests`` exceptions derive from ``IOError``).
        """
        if not html:
            return []
        saved = []
        for img_url in self._IMAGE_RE.findall(html):
            name = img_url.split('/')[-1]
            reply = requests.get(img_url, headers=self._HEADERS,
                                 timeout=self._TIMEOUT)
            if reply.status_code != 200:
                # Do not store an error page under a .jpg name.
                continue
            with open(name, 'wb') as f:
                f.write(reply.content)
            saved.append(name)
        return saved

    def start(self):
        """Crawl the whole thread, printing progress as it goes.

        Prints ``URL error`` and returns early when the first page cannot
        be fetched or its page count cannot be read as an integer. A page
        that fails to download or save is reported and skipped so the
        remaining pages are still processed.
        """
        indexPage = self.getPage(1)
        if indexPage is None:
            print("URL error")
            return
        pageN = self.getPageNum(indexPage)
        if pageN is None:
            print("URL error")
            return
        try:
            total = int(pageN)
        except ValueError:
            print("URL error")
            return
        try:
            print(u'该帖子有' + str(total) + u'页!')
            for i in range(1, total + 1):
                print(u'正在读入第' + str(i) + u'页数据')
                # Page 1 is already in hand; do not download it twice.
                page = indexPage if i == 1 else self.getPage(i)
                if page is None:
                    continue
                try:
                    self.getContent(page)
                except IOError:
                    print(u'写入第' + str(i) + u'页数据时出错')
        finally:
            print(u'爬取任务完成^_^')
# Interactive driver: show the fixed URL prefix as the prompt, read the
# thread id typed after it, then crawl that thread.
print(u'请写入帖子号码')
threadPrefix = 'http://tieba.baidu.com/p/'
threadId = raw_input(u'http://tieba.baidu.com/p/')
baseUrl = threadPrefix + str(threadId)
pt = TP(baseUrl)
pt.start()
            

问题尚未解决:无法翻页,图片也下载不出来。明天再仔细检查语法、梳理逻辑。

posted @ 2017-09-13 22:58  子不语怪力乱神  阅读(248)  评论(0)    收藏  举报