关于使用scrapy框架爬取小说的进一步优化
一、背景:前面使用scrapy框架爬取小说时,需要修改两个文件内容才能开展新的爬取,考虑只修改一个文件,把需要修改的参数都放入爬虫文件中。
二、实现过程:
1、items.py中添加三个字段(name、url_firstchapter、name_txt),用于把爬虫文件中设置的参数通过Item传递给pipelines.py
(python) [root@DL xbiquge]# vi xbiquge/items.py
import scrapy
class XbiqugeItem(scrapy.Item):
    """Item carrying one scraped chapter plus per-novel settings.

    The first three fields are filled in once by the spider and read by the
    pipeline; the remaining fields describe a single chapter page.
    """

    # --- per-novel settings supplied by the spider ---
    # Name the pipeline uses as the MySQL table name.
    name = scrapy.Field()
    # URL of the first chapter; the text export starts from this page.
    url_firstchapter = scrapy.Field()
    # Output path (without the ".txt" suffix) of the exported novel.
    name_txt = scrapy.Field()

    # --- per-chapter data ---
    # URL of the chapter page itself.
    url = scrapy.Field()
    # URL of the previous chapter page.
    preview_page = scrapy.Field()
    # URL of the next chapter page.
    next_page = scrapy.Field()
    # Chapter title followed by the chapter text.
    content = scrapy.Field()
2、pipelines.py修改如下:
(python) [root@DL xbiquge]# vi xbiquge/pipelines.py
import os
import time
import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors
class XbiqugePipeline(object):
    """Store scraped chapters in MySQL and export them as one text file.

    Each item is inserted into a table named after the novel.  When the
    spider closes, the chapters are read back by following the
    ``next_page`` links from the first chapter and written to
    ``<name_txt>.txt``.
    """

    def __init__(self):
        """Connect to the ``novels`` database and reset per-run state."""
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'password',
            'database': 'novels',
            # utf8mb4 avoids errors when storing emoji: MySQL's "utf8"
            # charset stores at most 3 bytes per character, while emoji
            # need 4 bytes.
            'charset': 'utf8mb4',
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None
        # The three values below are taken from the first processed item.
        self.name_novel = ''
        self.url_firstchapter = ''
        self.name_txt = ''

    def open_spider(self, spider):
        """Spider start hook; intentionally does nothing.

        Table creation is triggered from the spider itself via
        :meth:`createtable`.
        """
        return

    def createtable(self, name_novel):
        """Drop and re-create the chapter table called *name_novel*.

        Any existing table with that name (and its rows) is discarded.
        The name is concatenated into the SQL text, so it must be a
        trusted, valid MySQL identifier.
        """
        self.cursor.execute("drop table if exists " + name_novel)
        self.cursor.execute("create table " + name_novel + " (id int unsigned auto_increment not null primary key, url varchar(80) not null, preview_page varchar(80), next_page varchar(80), content TEXT not null) charset=utf8mb4")
        return

    def process_item(self, item, spider):
        """Insert one chapter into the novel's table and return the item."""
        if not self.name_novel:
            # The first item carries the per-novel settings; remember them
            # for the insert statement and for the final text export.
            self.name_novel = item['name']
            self.url_firstchapter = item['url_firstchapter']
            self.name_txt = item['name_txt']
        self.cursor.execute(
            self.sql,
            (item['url'], item['preview_page'], item['next_page'], item['content']),
        )
        self.conn.commit()
        return item

    @property
    def sql(self):
        """Parameterized insert statement, built once and then cached."""
        if not self._sql:
            self._sql = (
                "insert into " + self.name_novel
                + "(id, url, preview_page, next_page, content)"
                + " values(null, %s, %s, %s, %s)"
            )
        return self._sql

    def content2txt(self):
        """Write the stored chapters, in reading order, to the text file.

        Starts at ``url_firstchapter`` and follows each row's ``next_page``
        link, visiting at most as many chapters as there are rows.  Does
        nothing when no item was processed, and stops early when a link
        points to a page that is not in the table.
        """
        if not self.name_novel:
            # No item was ever processed, so there is no table to read.
            return
        self.cursor.execute("select count(*) from " + self.name_novel)
        record_num = self.cursor.fetchall()[0][0]
        print(record_num)
        start_time = time.time()  # start of the export, for the timing print
        # The URL is passed as a query parameter so quotes inside scraped
        # links cannot break or alter the statement.
        sql_chapter = ("select content, next_page from " + self.name_novel
                       + " where url=%s")
        out_path = self.name_txt + ".txt"
        out_dir = os.path.dirname(out_path)
        if out_dir:
            # open() fails if the target directory is missing.
            os.makedirs(out_dir, exist_ok=True)
        url_c = self.url_firstchapter
        with open(out_path, mode='w', encoding='utf-8') as f:
            for _ in range(record_num):
                self.cursor.execute(sql_chapter, (url_c,))
                rows = self.cursor.fetchall()
                if not rows:
                    # Chain is broken: the linked chapter was never stored.
                    break
                chapter_text, next_page = rows[0]
                f.write('\n')
                # Strip non-breaking spaces (U+00A0) from the chapter text.
                f.write(chapter_text.replace('\xa0', '') + '\n')
                f.write('\n\n')
                if not next_page:
                    break
                url_c = next_page  # continue with the following chapter
        print(time.time() - start_time)
        print(self.name_txt + ".txt" + " 文件已生成!")
        return

    def close_spider(self, spider):
        """Spider end hook: export the text file, then release the database."""
        try:
            self.content2txt()
        finally:
            self.cursor.close()
            self.conn.close()
        return
3、爬虫文件修改:
(python) [root@DL xbiquge]# vi xbiquge/spiders/sancun.py
# -*- coding: utf-8 -*-
import scrapy

from xbiquge.items import XbiqugeItem
from xbiquge.pipelines import XbiqugePipeline


class SancunSpider(scrapy.Spider):
    """Crawl every chapter of one novel on www.xbiquge.la.

    To crawl a different novel, change ``name``, ``url_firstchapter``,
    ``name_txt`` and the index URL in :meth:`start_requests`.
    """

    name = 'sancun'
    allowed_domains = ['www.xbiquge.la']
    url_ori = "http://www.xbiquge.la"
    # URL of the novel's first chapter.
    url_firstchapter = "http://www.xbiquge.la/10/10489/4534454.html"
    # Output directory and (Chinese) file name of the novel, without ".txt".
    name_txt = "./novels/三寸人间"

    # NOTE: these statements run when the class body is executed, i.e. as
    # soon as this module is imported: a database connection is opened and
    # the table named after the spider is dropped and re-created.
    pipeline = XbiqugePipeline()
    pipeline.createtable(name)

    # Template item holding the per-novel settings; each response yields
    # its own copy of it.
    item = XbiqugeItem()
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        """Request the novel's chapter index page."""
        start_urls = ['http://www.xbiquge.la/10/10489/']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one request per chapter link found on the index page."""
        for dd in response.css('#list dl dd'):
            href = dd.css('a::attr(href)').extract_first()
            if href is None:
                # Entry without a link; nothing to request.
                continue
            # Chapter links are site-relative, so prefix the site root.
            yield scrapy.Request(self.url_ori + href, callback=self.parse_c)

    def parse_c(self, response):
        """Extract one chapter's links, title and text and yield an item."""
        # Use a fresh copy per response so an item that was already yielded
        # is never modified by a later response.
        item = self.item.copy()
        nav_links = response.css('div .bottem1 a::attr(href)').extract()
        item['url'] = response.url
        item['preview_page'] = self.url_ori + nav_links[1]
        item['next_page'] = self.url_ori + nav_links[3]
        title = response.css('.con_top::text').extract()[4]
        text = ''.join(response.css('#content::text').extract())
        # Carriage returns (^M, octal \15) are replaced by newlines.
        item['content'] = title + "\n" + text.replace('\r', '\n')
        yield item
上述代码修改后,新爬取小说时,只需要修改爬虫文件(如:sancun.py)中的相应参数即可。

浙公网安备 33010602011771号