import scrapy
from qiubaiPro.items import QiubaiproItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # xpath is the recommended way to parse the target content,
        # because the framework integrates an xpath parsing interface.
        # Target: the author and body of each joke.
        # xpath can be called directly on the response object.
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # extract() pulls the data values out of the selector objects;
            # the result is a list, so take the value by index [0].
            # author = div.xpath('./div/a[2]/h2/text()').extract()[0]
            # extract_first() is equivalent to extract()[0]
            author = div.xpath('./div/a[2]/h2/text()').extract_first()
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            # 1. Store the parsed values (author and content) in an item object;
            #    the fields must be declared in the QiubaiproItem class.
            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            # 2. Submit the item object to the pipeline; the storage code
            #    goes in the pipelines file.
            yield item
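
# As noted in step 1 above, the fields have to be declared in items.py.
# A minimal sketch of that declaration (the field names follow the keys
# used in parse(); the rest is standard scrapy.Item boilerplate):
import scrapy


class QiubaiproItem(scrapy.Item):
    # one scrapy.Field() per value the spider stores on the item
    author = scrapy.Field()
    content = scrapy.Field()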
# Pipeline-based storage
class QiubaiproPipeline(object):
    fp = None

    # Over the whole crawl, this method is called only once, when the spider starts
    def open_spider(self, spider):
        print('Spider started')
        self.fp = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    # Receives the item objects submitted by the spider file and persists
    # the page data stored in them.
    # Parameter: item is the received item object.
    # This method runs once for every item the spider submits to the pipeline.
    def process_item(self, item, spider):
        # Pull the data values out of the item object
        author = item['author']
        content = item['content']
        # Persist the data
        self.fp.write(author + ':' + content + '\n\n\n')
        return item

    # Called only once, when the spider finishes
    def close_spider(self, spider):
        print('Spider finished')
        self.fp.close()
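
# None of these pipelines run unless they are registered in settings.py.
# A minimal sketch, assuming the default pipelines.py module of the
# qiubaiPro project (the number is the priority; lower values run first):
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
}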
# Pipeline that stores the data in a MySQL database
import pymysql


class QiubaiproPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        print('Spider started')
        # 1. Connect to the database (the database and target table must
        #    be created beforehand)
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='qiubai')
        # Create the cursor object once, rather than once per item
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # 2. Execute the SQL statement. The query is parameterized so that
        #    quotes in the scraped text cannot break the statement.
        sql = 'insert into qiubai values (%s, %s)'
        # 3. Commit the transaction
        try:
            self.cursor.execute(sql, (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('Spider finished')
        self.cursor.close()  # close the cursor object
        self.conn.close()    # close the connection object
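
# The pipeline above assumes the qiubai database and table already exist.
# A one-off setup sketch; the two-column schema here is an assumption
# inferred from the insert statement, so adjust names and sizes as needed:
import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123')
cursor = conn.cursor()
cursor.execute('create database if not exists qiubai')
# two text columns, matching the (author, content) insert above
cursor.execute('create table if not exists qiubai.qiubai (author varchar(100), content text)')
conn.commit()
cursor.close()
conn.close()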
# Pipeline that stores the data in Redis
import json

import redis


class QiubaiproPipeline(object):
    conn = None

    def open_spider(self, spider):
        print('Spider started')
        # 1. Connect to the database
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        # 2. Execute the command
        data_dict = {
            'author': item['author'],
            'content': item['content'],
        }
        # Push the data onto a list named data. redis-py cannot store a
        # dict directly, so serialize it to a JSON string first.
        self.conn.lpush('data', json.dumps(data_dict))
        return item
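
# A quick way to check what landed in Redis (a usage sketch: lrange(0, -1)
# returns the whole list, and each entry is decoded back from JSON):
import json

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)
for raw in conn.lrange('data', 0, -1):
    print(json.loads(raw))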