Crawling the original poster's (楼主) images from a Baidu Tieba thread

import urllib.request
from bs4 import BeautifulSoup
import re


def validateTitle(title):
    # Replace characters that are illegal in Windows file names with underscores
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    new_title = re.sub(rstr, "_", title)
    return new_title
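# Example of the sanitizer (illustrative values):
#   validateTitle('猫图/合集:第1页?')  ->  '猫图_合集_第1页_'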

# see_lz=1 restricts the thread view to posts by the thread starter (楼主)
url = "https://tieba.baidu.com/p/5407739329?see_lz=1"

# Fetch the first page of the thread and parse it with lxml
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)

soup = BeautifulSoup(response, 'lxml')

try:
    # Get the total number of pages: the count sits in the <span> right after the "回复贴" text node
    a = soup.find(text=re.compile("回复贴"))
    total_page = a.find_next_sibling('span').string
    total_page = int(total_page)

    if total_page > 0:
        for j in range(1, total_page + 1):
            # Build the URL for page j of the same thread (pn = page number)
            url = "https://tieba.baidu.com/p/5407739329?see_lz=1&pn=" + str(j)
            request1 = urllib.request.Request(url)
            response1 = urllib.request.urlopen(request1)
            soup1 = BeautifulSoup(response1, 'lxml')
            title = soup1.title.string
            # BDE_Image is the class Tieba applies to images embedded in post content
            links = soup1.find_all('img', class_="BDE_Image")
            for i, li in enumerate(links, start=1):
                print(li.get('src'))
                # Save as <title><page>-<index>.jpg under a local directory that must already exist
                file_name = "D:/www/spider/" + validateTitle(title) + str(j) + "-" + str(i) + ".jpg"
                print(file_name)
                urllib.request.urlretrieve(li.get('src'), file_name)
except Exception as e:
    print(e)
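
If Tieba rejects requests sent with the default Python-urllib User-Agent (a common problem with this kind of script, not something confirmed in the original post), the download step can be done with an explicit header and a timeout instead of urlretrieve. This is a minimal sketch; download_image is a hypothetical helper and the header value is only an example:

import shutil
import urllib.request

def download_image(src, file_name, timeout=10):
    # Send a browser-like User-Agent in case the server filters the default one (assumption)
    req = urllib.request.Request(src, headers={"User-Agent": "Mozilla/5.0"})
    # Stream the response body straight into the target file
    with urllib.request.urlopen(req, timeout=timeout) as resp, open(file_name, "wb") as f:
        shutil.copyfileobj(resp, f)

# Inside the loop above, it would replace the urlretrieve call (illustrative):
#   download_image(li.get('src'), file_name)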

  
