scrapy+lxml.etree爬取百度贴吧
分析:首先通过scrapy内置的xpath提取内容,发现为空,所以不行咯
采用正则re匹配出所有的<li>标签,也就是需要提取的所有内容
再把li标签通过resultTree = lxml.etree.HTML(articleBody),变成'lxml.etree._Element'
再通过resultTree.xpath()进行提取
注意 此时的xpath与scrapy的xpath是不一样的
# -*- coding: utf-8 -*-
import scrapy
from ..settings import MAX_PAGE
from ..items import TiebaBaiduItem
import re
import lxml.html
import lxml.etree
import json
class TiebaSpider(scrapy.Spider):
    """Crawl thread listings and thread details from a Baidu Tieba forum.

    Scrapy's built-in ``response.xpath()`` returns nothing for the thread
    list on the listing pages, so ``parse`` first cuts the relevant
    ``<ul id="thread_list">`` region out of the raw HTML with a regex and
    then parses that fragment with ``lxml.etree`` (whose XPath API differs
    slightly from Scrapy's).
    """
    name = 'tieba'
    allowed_domains = ['tieba.baidu.com']
    # One listing page per 50 threads; MAX_PAGE comes from project settings.
    start_urls = ['https://tieba.baidu.com/f?kw=%E9%83%91%E5%AE%B8&ie=utf-8&pn={}'.format(str(page * 50)) for page in range(MAX_PAGE + 1)]

    def parse(self, response):
        """Extract each thread from a listing page and follow it.

        Yields one request per thread to ``parseArticleDetail``, carrying
        the thread's id/author/title/href in ``meta['articleInfo']``.
        """
        # Cut out the block that holds all the thread <li> elements.
        articleBodyRe = re.search(
            '<ul id="thread_list" class="threadlist_bright j_threadlist_bright">(.*?)<div class="thread_list_bottom clearfix">',
            response.text, re.DOTALL)
        if not articleBodyRe:
            # Layout changed or an anti-bot page was served; nothing to parse.
            # (Previously this fell through and lxml choked on an empty string.)
            self.logger.warning("thread list block not found: %s", response.url)
            return
        articleBody = articleBodyRe.group(1)
        # Note: this is lxml's XPath on an _Element tree, not Scrapy's XPath.
        resultTree = lxml.etree.HTML(articleBody)
        if resultTree is None:
            # lxml returns None for fragments it cannot build a tree from.
            return
        for articleElem in resultTree.xpath('//li[contains(@class,"j_thread_list")]'):
            dataFields = articleElem.xpath("@data-field")
            # 'cleafix' is a typo in Tieba's own markup — do not "fix" it.
            titles = articleElem.xpath(".//div[@class='t_con cleafix']//a/@title")
            hrefs = articleElem.xpath(".//div[@class='t_con cleafix']//a/@href")
            if not (dataFields and titles and hrefs):
                # Ad rows / malformed items lack these attributes; skip them
                # instead of raising IndexError on [0].
                continue
            dataFieldJson = json.loads(dataFields[0])
            articleInfo = {
                'id': dataFieldJson['id'],
                'author': dataFieldJson['author_name'],
                'title': titles[0],
                'href': hrefs[0],
            }
            yield response.follow(
                url=articleInfo['href'] + "?see_lz=1",  # see_lz=1: original poster only
                meta={'dont_redirect': True, 'articleInfo': articleInfo},
                callback=self.parseArticleDetail,
                errback=self.errorHandle,
            )

    def parseArticleDetail(self, response):
        """Parse a thread detail page and yield a populated TiebaBaiduItem."""
        print(
            f"parseArticleDetail: statusCode = {response.status}, url = {response.url}")
        contentLst = response.xpath(
            "//div[contains(@id, 'post_content')]//text()").extract()
        imgHrefLst = response.xpath(
            "//div[contains(@id, 'post_content')]//img/@src").extract()
        dateLst = response.xpath(
            "//div[contains(@class, 'post_content_firstfloor')]//span[@class='tail-info']/text()").extract()
        # Normalize fragments (newlines -> commas, spaces dropped) and join
        # once instead of repeated quadratic string concatenation.
        content = ''.join(
            fragment.replace('\n', ',').replace(" ", '').strip() + ', '
            for fragment in contentLst)
        print(f"content = {content}")
        print(f"imgHrefLst = {imgHrefLst}")
        articleInfo = response.meta['articleInfo']
        articleItem = TiebaBaiduItem()
        articleItem['item_type'] = 'articleDetail'
        articleItem['_id'] = articleInfo['id']
        articleItem['title'] = articleInfo['title']
        articleItem['author'] = articleInfo['author']
        articleItem['content'] = content
        articleItem['fromUrl'] = response.url
        articleItem['picHrefLst'] = imgHrefLst
        # The second tail-info span is presumably the post date — guard the
        # index so a missing span no longer raises IndexError.
        articleItem['date'] = dateLst[1] if len(dateLst) > 1 else ''
        yield articleItem

    # Request error handling: could also write to a file or a database.
    def errorHandle(self, failure):
        """errback for failed requests: log the failed response."""
        print(f"request error: {failure.value.response}")
浙公网安备 33010602011771号