Scrapy framework_01

Scrapy framework:

Scraping Qiushibaike (糗事百科):


# Run the spider from a script instead of typing the command in a terminal;
# --nolog suppresses Scrapy's log output.
from scrapy import cmdline
cmdline.execute("scrapy crawl xxx --nolog".split())


s = '''
cmdline.execute() runs a command line the same way the terminal would.
It takes a single list of arguments, so:
cmdline.execute("scrapy crawl xxx".split())  is equivalent to  cmdline.execute(["scrapy", "crawl", "xxx"])
'''

# settings.py: headers sent with every request by default
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
    'Referer': 'https://www.qiushibaike.com/',
}
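
The spider below imports FirstItem from first/items.py, which these notes never show. A minimal sketch, assuming the item carries only the two fields the spider extracts:

import scrapy

class FirstItem(scrapy.Item):
    # Assumed definition; field names match the spider's usage below.
    author = scrapy.Field()   # post author's nickname
    content = scrapy.Field()  # joined text of the post body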

spider:

import scrapy
from first.items import FirstItem

class XxxSpider(scrapy.Spider):
    name = 'xxx'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        # Each post sits in its own <div> under the list container.
        div_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        for div in div_list:
            author = div.xpath(".//a[2]/h2/text()").extract_first()
            content = div.xpath(".//div[@class='content']/span/text()").extract()
            content = "".join(content).strip()
            # A plain dict would also work: yield {"author": author, "content": content}
            item = FirstItem(author=author, content=content)
            yield item



s = '''
Extraction notes:
    get(): returns one decoded string, same as extract_first()
    getall(): returns a list of strings, same as extract()
    strip(): trim the whitespace around the joined text
'''
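
A quick illustration of those equivalences on a throwaway Selector, independent of the live site:

from scrapy.selector import Selector

sel = Selector(text="<div><span> one </span><span> two </span></div>")
print(sel.xpath("//span/text()").get())     # ' one '  -- same as extract_first()
print(sel.xpath("//span/text()").getall())  # [' one ', ' two ']  -- same as extract()
print("".join(sel.xpath("//span/text()").getall()).strip())  # 'one  two'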

Pipeline: JSON storage:

import json

class FirstPipeline(object):
    def __init__(self):
        self.fp = open("duanzi.json", 'w', encoding='utf-8')

    def open_spider(self, spider):
        print('spider started')

    def process_item(self, item, spider):
        # Convert the Item to a dict and dump one JSON object per line.
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('spider finished')
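
Neither pipeline runs unless it is enabled in settings.py. The module path below assumes the project package is named first, as the imports above imply:

ITEM_PIPELINES = {
    'first.pipelines.FirstPipeline': 300,  # lower number = earlier in the pipeline order
}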

Optimization:

from scrapy.exporters import JsonLinesItemExporter

class FirstPipeline(object):
    def __init__(self):
        # The exporter writes bytes, so open the file in binary mode.
        self.fp = open("duanzi.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()

    def open_spider(self, spider):
        print('spider started')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
        print('spider finished')

Trade-offs:

1. File format / size: JsonLinesItemExporter writes one JSON object per line, so the file as a whole is not a single valid JSON document.
2. Data volume: each item is written out as it is exported, so large crawls do not pile up results in memory.
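
For contrast, a minimal sketch of the same pipeline built on JsonItemExporter, which wraps everything in a single JSON array (same duanzi.json target assumed):

from scrapy.exporters import JsonItemExporter

class FirstPipeline(object):
    def __init__(self):
        self.fp = open("duanzi.json", 'wb')
        # Writes "[", then comma-separated items, then "]" at the end,
        # so the finished file is one valid JSON document.
        self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()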

Extension:

Multi-page crawling

DOWNLOAD_DELAY = 1     # settings.py: pause 1 s between requests
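
A related settings.py sketch; RANDOMIZE_DOWNLOAD_DELAY is a real Scrapy setting (on by default) that jitters the delay, and the values here are illustrative:

DOWNLOAD_DELAY = 1               # base wait between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True  # scale each delay by a random factor (0.5x-1.5x)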

import scrapy
from first.items import FirstItem

class XxxSpider(scrapy.Spider):
    name = 'xxx'
    allowed_domains = ['qiushibaike.com']
    base_url = "https://www.qiushibaike.com"
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        div_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        for div in div_list:
            author = div.xpath(".//a[2]/h2/text()").extract_first()
            content = div.xpath(".//div[@class='content']/span/text()").extract()
            content = "".join(content).strip()
            item = FirstItem(author=author, content=content)
            yield item
        # The last <li> in the pagination bar holds the "next page" link;
        # it is absent on the final page, which ends the crawl.
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").extract_first()
        if next_url:
            yield scrapy.Request(self.base_url + next_url, callback=self.parse)
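
As an aside, the manual base_url concatenation can be dropped in favor of response.follow, which resolves a relative href against the current page. An equivalent tail for parse():

        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if next_url:
            yield response.follow(next_url, callback=self.parse)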

