Scrapy framework:
Crawling Qiushibaike (糗事百科):
# runner script: launch the spider programmatically (run it from inside the Scrapy project)
from scrapy import cmdline
cmdline.execute("scrapy crawl xxx --nolog".split())
Note: cmdline.execute() runs a Scrapy command the same way the shell would;
"scrapy crawl xxx".split() just builds the argument list ["scrapy", "crawl", "xxx"].
# settings.py -- default headers sent with every request
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
    'Referer': 'https://www.qiushibaike.com/',
}
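Headers alone usually aren't enough to get the crawl running; a minimal companion setting (assuming the default project template, which turns robots.txt checking on):

# settings.py -- sketch: the default template sets this to True
ROBOTSTXT_OBEY = False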
spider:
import scrapy
from first.items import FirstItem

class XxxSpider(scrapy.Spider):
    name = 'xxx'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        div_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        for div in div_list:
            author = div.xpath(".//a[2]/h2/text()").extract_first()
            content = div.xpath(".//div[@class='content']/span/text()").extract()
            content = "".join(content).strip()
            # alternative without an Item class:
            # yield {"author": author, "content": content}
            item = FirstItem(author=author, content=content)
            yield item
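first/items.py is not shown above; a minimal sketch matching the two fields the spider fills in:

# first/items.py -- sketch: just the fields used by XxxSpider
import scrapy

class FirstItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()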
Error handling / extraction notes:
    get()    -> first match, already decoded to str (same as extract_first())
    getall() -> list of all matches (same as extract())
    strip()  -> trim surrounding whitespace from the joined text
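A quick, self-contained illustration of those selector methods (the HTML snippet is made up):

from scrapy import Selector

sel = Selector(text="<div><span> a </span><span>b</span></div>")
print(sel.xpath("//span/text()").get())          # ' a '  -- first match, decoded str
print(sel.xpath("//span/text()").getall())       # [' a ', 'b']  -- all matches, as a list
print(sel.xpath("//span/text()").get().strip())  # 'a'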
pipeline (JSON storage):
# first/pipelines.py -- write each item as one JSON object per line
import json

class FirstPipeline(object):
    def __init__(self):
        self.fp = open("duanzi.json", 'w', encoding='utf-8')

    def open_spider(self, spider):
        print('spider started')

    def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('spider finished')
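The pipeline only runs once it is registered in settings.py (the dotted path assumes the project is named first, as in the imports above):

# settings.py -- enable the pipeline; the number is its order (lower runs first)
ITEM_PIPELINES = {
    'first.pipelines.FirstPipeline': 300,
}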
Optimization:
from scrapy.exporters import JsonLinesItemExporter

class FirstPipeline(object):
    def __init__(self):
        self.fp = open("duanzi.json", 'wb')  # exporters want a binary file
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()

    def open_spider(self, spider):
        print('spider started')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
        print('spider finished')
Points to weigh when choosing the storage format:
    1. file size -- JSON Lines output has no enclosing array; each line is an independent JSON object
    2. data volume -- items are flushed line by line, so a large or interrupted crawl still leaves usable partial output
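For contrast, Scrapy also ships JsonItemExporter, which wraps everything in a single JSON array; a standalone sketch (the file name duanzi_all.json is made up):

from scrapy.exporters import JsonItemExporter

fp = open("duanzi_all.json", 'wb')
exporter = JsonItemExporter(fp, ensure_ascii=False, encoding='utf-8')
exporter.start_exporting()                                   # writes the opening '['
exporter.export_item({'author': 'demo', 'content': 'demo'})  # normally called from process_item
exporter.finish_exporting()                                  # writes the closing ']'
fp.close()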
Extension:
    multi-page crawling
    DOWNLOAD_DELAY = 1  sets a pause between requests (1 s)
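DOWNLOAD_DELAY belongs in settings.py; a sketch, with Scrapy's built-in AutoThrottle as an optional alternative to a fixed delay:

# settings.py -- politeness sketch
DOWNLOAD_DELAY = 1           # ~1 s between requests to the same domain
AUTOTHROTTLE_ENABLED = True  # optional: adapt the delay to server response times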
import scrapy
from first.items import FirstItem

class XxxSpider(scrapy.Spider):
    name = 'xxx'
    allowed_domains = ['qiushibaike.com']
    base_url = "https://www.qiushibaike.com"
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        div_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        for div in div_list:
            author = div.xpath(".//a[2]/h2/text()").extract_first()
            content = div.xpath(".//div[@class='content']/span/text()").extract()
            content = "".join(content).strip()
            item = FirstItem(author=author, content=content)
            yield item
        # the last <li> in the pagination bar links to the next page;
        # it is absent on the final page, which ends the crawl
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").extract_first()
        if next_url:
            yield scrapy.Request(self.base_url + next_url, callback=self.parse)
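Since Scrapy 1.4, response.follow resolves relative URLs itself, so the manual base_url concatenation can be dropped; the last two lines could read:

        if next_url:
            yield response.follow(next_url, callback=self.parse)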