
Scrapy crawling examples

Usage examples

1. Book-scraping example

Source: http://books.toscrape.com/

settings.py

$ grep -v "^#" settings.py
    
BOT_NAME = 'example'
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 5  # download delay

FEED_EXPORT_FIELDS = ['name', 'auth', 'translator', 'publisher', 'date', 'price', 'rating', 'hot', 'url']
# When exporting to CSV with the default feed exporter, the column order is not
# fixed, so it has to be pinned down here in the settings. (This field list
# matches the Douban item of example 2 below, which shares this settings file.)
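With FEED_EXPORT_FIELDS pinned like this, the exported CSV should begin with a header row in exactly that order, i.e.:

name,auth,translator,publisher,date,price,rating,hot,url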

Writing the spider

$ cat book_spider.py
# -*- coding:utf-8 -*-
import scrapy
class BooksSpider(scrapy.Spider):
    # unique name identifying this spider
    name = 'books'

    # starting point(s) of the crawl; there can be several, here just one
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        # Extract data.
        # Each book's info lives in an <article class="product_pod">; the
        # css() method finds all such article elements, which we iterate over.
        # No custom Item class is used here; a plain dict is yielded directly.
        for book in response.css('article.product_pod'):
            name = book.xpath('./h3/a/@title').extract_first()
            price = book.css('p.price_color::text').extract_first()
            yield {
                'name': name,
                'price': price,
            }
    
        # Extract links.
        # The next page's URL sits inside ul.pager > li.next > a,
        # e.g. <li class="next"><a href="catalogue/page-2.html">next</a></li>
        next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
        if next_url:
            # if a next-page URL was found, make it absolute and build a new Request
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
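From inside the project directory the spider can then be run and its items exported with a command like (books.csv is an arbitrary output name):

scrapy crawl books -o books.csv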

A version with a configured Item

$ cat items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    review_rating = scrapy.Field()
    review_num = scrapy.Field()
    upc = scrapy.Field()
    stock = scrapy.Field()

Pipeline, whose purpose is to convert the textual rating 'One' into the number 1

$ cat pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class BookPipeline(object):
    # map the textual star rating scraped from the page to a number
    review_rating_map = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5,
    }

    def process_item(self, item, spider):
        rating = item.get('review_rating')
        if rating:
            item['review_rating'] = self.review_rating_map[rating]
        return item
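As the template comment above notes, the pipeline only takes effect once it is registered in settings.py; assuming the project is named example as in the settings shown earlier, something like:

ITEM_PIPELINES = {
    'example.pipelines.BookPipeline': 300,
}

The integer (0-1000) sets the order in which pipelines run; lower values run first.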

spider

$ cat spiders/books.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import BookItem
# import pudb; pudb.set_trace()

class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        # follow the link to each book's detail page
        le = LinkExtractor(restrict_css='article.product_pod h3')
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_book)

        # follow the next-page link, if there is one
        le = LinkExtractor(restrict_css='ul.pager li.next')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_book(self, response):
        book = BookItem()
        sel = response.css('div.product_main')
        book['name'] = sel.xpath('./h1/text()').extract_first()
        book['price'] = sel.css('p.price_color::text').extract_first()
        book['review_rating'] = sel.css('p.star-rating::attr(class)').re_first('star-rating ([A-Za-z]+)')
        sel = response.css('table.table.table-striped')
        book['upc'] = sel.xpath('(.//tr)[1]/td/text()').extract_first()
        book['stock'] = sel.xpath('(.//tr)[last()-1]/td/text()').re_first(r'\((\d+) available\)')
        book['review_num'] = sel.xpath('(.//tr)[last()]/td/text()').extract_first()
        yield book
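For reference, the product page's info table that parse_book walks looks roughly like this (values illustrative), which is why (.//tr)[1] holds the UPC while the last two rows hold the availability and the review count:

<table class="table table-striped">
  <tr><th>UPC</th><td>a897fe39b1053632</td></tr>
  ...
  <tr><th>Availability</th><td>In stock (22 available)</td></tr>
  <tr><th>Number of reviews</th><td>0</td></tr>
</table>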

2. Scraping books from Douban

This example was collected from the web.

The settings file is the same as in example 1 above; only the Item definition is added.

item

import scrapy
class DouBanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    auth = scrapy.Field()
    translator = scrapy.Field()
    publisher = scrapy.Field()
    date = scrapy.Field()
    price = scrapy.Field()
    rating = scrapy.Field()
    hot = scrapy.Field()
    url = scrapy.Field()
    
    
    

spider

import scrapy
from example.items import DouBanItem
from scrapy.linkextractors import LinkExtractor

class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['book.douban.com']
    # the tag at the end of the URL can be swapped for any other tag
    # (the commented URL is the 小说/fiction tag, the active one 编程/programming)
    #start_urls = ['https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4']
    start_urls = ['https://book.douban.com/tag/%E7%BC%96%E7%A8%8B']

    def parse(self, response):
        booklist = response.css('ul.subject-list')
        bookinfos = booklist.css('div.info')
        for bookinfo in bookinfos:
            # build a fresh item for every book; reusing a single instance
            # would yield the same mutated object over and over
            item = DouBanItem()
            item['name'] = (bookinfo.css('a::text').get() or '').strip()
            # div.pub holds "author / translator / publisher / date / price",
            # with the translator segment present only for translated books
            parts = [p.strip() for p in (bookinfo.css('div.pub::text').get() or '').split('/')]
            item['auth'] = parts[0] if parts else ''
            item['translator'] = parts[1] if len(parts) >= 5 else ''
            item['publisher'] = parts[-3] if len(parts) >= 3 else ''
            item['date'] = parts[-2] if len(parts) >= 2 else ''
            item['price'] = parts[-1] if parts else ''
            item['rating'] = bookinfo.css('span.rating_nums::text').get(default='not-found').strip()
            item['hot'] = bookinfo.css('span.pl::text').re(r'\d+')
            item['url'] = bookinfo.css('a::attr(href)').get(default='not-found').strip()
            yield item

        le = LinkExtractor(restrict_css='span.next')
        links = le.extract_links(response)
        if links:
            yield scrapy.Request(links[0].url, callback=self.parse)
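The split('/') logic above assumes Douban's div.pub line follows the author / translator / publisher / date / price layout, e.g. (a made-up illustrative entry):

[美] Brian W. Kernighan / 韩磊 / 人民邮电出版社 / 2016-1 / 69.00元

Untranslated books drop the translator segment, which is why the trailing fields are indexed from the right.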

Run

scrapy crawl douban -o douban.csv
scrapy crawl douban -o code.csv

3. A spider without a project

$ cat quotes_spider.py
import scrapy
#import heartrate; heartrate.trace(browser=True)


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'author': quote.xpath('span/small/text()').get(),
                'text': quote.css('span.text::text').get(),
            }

        next_page = response.css('li.next a::attr("href")').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

Run

scrapy runspider quotes_spider.py
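runspider runs a standalone spider file without needing a project; items can be exported here as well, e.g.:

scrapy runspider quotes_spider.py -o quotes.json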

4. Scraping Jandan (jandan.net)

1. Project layout

(base) [root@mini-install jiandann]# tree 
.
├── jiandann
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── items.cpython-37.pyc
│   │   ├── pipelines.cpython-37.pyc
│   │   └── settings.cpython-37.pyc
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── jiandan.py
│       └── __pycache__
│           ├── __init__.cpython-37.pyc
│           └── jiandan.cpython-37.pyc
└── scrapy.cfg

4 directories, 14 files

2. item

$ cat items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class JiandanItem(scrapy.Item):
    # these two field names are exactly what the built-in ImagesPipeline expects
    image_urls = scrapy.Field()
    images = scrapy.Field()

3. pipelines

$ cat pipelines.py
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline  # the built-in image pipeline

class JiandanPipeline(ImagesPipeline):  # subclass the built-in ImagesPipeline

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # the spider stripped the scheme, so put it back before downloading
            image_url = "http://" + image_url
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
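One prerequisite worth noting: the built-in ImagesPipeline relies on Pillow for image processing, so it has to be installed first:

pip install Pillow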

4. settings.py

$ grep -v "^#" settings.py

BOT_NAME = 'jiandann'
SPIDER_MODULES = ['jiandann.spiders']
NEWSPIDER_MODULE = 'jiandann.spiders'

IMAGES_STORE = '/root/imagess'  # where downloaded images are saved
# enable the pipeline
ITEM_PIPELINES = {
    'jiandann.pipelines.JiandanPipeline': 1,
}
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 5
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
}
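With IMAGES_STORE set as above, the pipeline's default naming scheme stores each image under a full/ subdirectory, named by the SHA1 hash of its URL, e.g. (illustrative path):

/root/imagess/full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg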

5. spider

$ cat spiders/jiandan.py
import scrapy
from jiandann.items import JiandanItem

class JiandanSpider(scrapy.Spider):
    name = 'jiandan'
    start_urls = ["http://jandan.net/ooxx"]

    def parse(self, response):
        item = JiandanItem()
        # the hrefs are protocol-relative (//...); keep everything after the slashes
        item['image_urls'] = response.xpath('//a[@class="view_img_link"]/@href').re(r'//(.*)')
        yield item

        # "previous comment page" points at the next older page; guard against
        # a missing link instead of indexing into a possibly empty list
        prev = response.css('a.previous-comment-page::attr(href)').re_first(r'//(.*)')
        if prev:
            yield scrapy.Request("http://" + prev, callback=self.parse)
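Run it the same way as the project-based examples:

scrapy crawl jiandan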

This example drives home one point: the URLs a spider requests must carry an explicit scheme (http://). Links scraped from a page can be protocol-relative (starting with //), whereas in the earlier examples the extracted links already included http://.

posted @ 2020-08-05 19:09 Lust4Life