Scrapy crawling examples
Usage examples
1. Book-scraping example
settings.py
$ grep -v "^#" settings.py
BOT_NAME = 'example'
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 5  # download delay, in seconds
FEED_EXPORT_FIELDS = ['name', 'auth', 'translator', 'publisher', 'date', 'price', 'rating', 'hot', 'url']
# With the default feed exporter, the column order of items written to a CSV file is arbitrary, so it is pinned down here in the settings
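With FEED_EXPORT_FIELDS pinned, the CSV columns come out in exactly that order. The nine names above belong to the Douban item of example 2; for the first spider below, which yields plain dicts with two keys, the equivalent sketch would be:

FEED_EXPORT_FIELDS = ['name', 'price']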
Writing the spider
$ cat book_spider.py
# -*- coding:utf-8 -*-
import scrapy
class BooksSpider(scrapy.Spider):
    # Unique identifier for this spider
    name = 'books'
    # The starting point(s) of the crawl; there can be several, here just one
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        # Extract the data.
        # Each book's info lives in an <article class="product_pod">; the
        # css() method finds all such article elements, which we iterate over.
        # No custom Item class is used; plain dicts are yielded directly.
        for book in response.css('article.product_pod'):
            name = book.xpath('./h3/a/@title').extract_first()
            price = book.css('p.price_color::text').extract_first()
            yield {
                'name': name,
                'price': price,
            }
        # Extract the pagination link.
        # The next-page URL sits inside ul.pager > li.next > a, e.g.:
        # <li class="next"><a href="catalogue/page-2.html">next</a></li>
        next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
        if next_url:
            # Turn the relative URL into an absolute one and build a new Request
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
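Selectors like these are easiest to tune interactively before they go into the spider. A quick sketch with scrapy shell (the printed title depends on the live page and is shown only for illustration):

$ scrapy shell 'http://books.toscrape.com/'
>>> response.css('article.product_pod')[0].xpath('./h3/a/@title').extract_first()
'A Light in the Attic'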
Version with a configured Item
cat items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class BookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    review_rating = scrapy.Field()  # textual rating (One..Five), mapped to an int by BookPipeline
    review_num = scrapy.Field()     # number of reviews
    upc = scrapy.Field()            # product code
    stock = scrapy.Field()          # stock availability
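A scrapy.Item behaves like a dict but accepts only the declared fields, so typos surface immediately (an illustrative snippet, not part of the project):

>>> book = BookItem(name='Sample', price='£10.00')
>>> book['author'] = 'X'
Traceback (most recent call last):
  ...
KeyError: 'BookItem does not support field: author'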
Pipeline, whose purpose is to convert the textual rating 'One' into the number 1
cat pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

class BookPipeline(object):
    review_rating_map = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5,
    }

    def process_item(self, item, spider):
        rating = item.get('review_rating')
        if rating:
            item['review_rating'] = self.review_rating_map[rating]
        return item
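As the header comment reminds us, a pipeline only runs once it is registered in settings.py; that registration is missing from the grep output above. A minimal sketch, assuming the project module is named example as in the settings shown earlier (the priority value is arbitrary, 0-1000, lower runs first):

ITEM_PIPELINES = {
    'example.pipelines.BookPipeline': 300,
}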
spider
cat spiders/books.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import BookItem
# import pudb; pudb.set_trace()

class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        # Follow the detail-page link of every book on the listing page
        le = LinkExtractor(restrict_css='article.product_pod h3')
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_book)
        # Follow the "next" pagination link, if present
        le = LinkExtractor(restrict_css='ul.pager li.next')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_book(self, response):
        book = BookItem()
        sel = response.css('div.product_main')
        book['name'] = sel.xpath('./h1/text()').extract_first()
        book['price'] = sel.css('p.price_color::text').extract_first()
        book['review_rating'] = sel.css('p.star-rating::attr(class)').re_first('star-rating ([A-Za-z]+)')
        # Product information table; (.//tr)[1] picks the first row of the
        # whole result set, (.//tr)[last()] the last one
        sel = response.css('table.table.table-striped')
        book['upc'] = sel.xpath('(.//tr)[1]/td/text()').extract_first()
        book['stock'] = sel.xpath('(.//tr)[last()-1]/td/text()').re_first(r'\((\d+) available\)')
        book['review_num'] = sel.xpath('(.//tr)[last()]/td/text()').extract_first()
        yield book
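With the pipeline registered as sketched above, running the spider exports items whose review_rating is already numeric:

scrapy crawl books -o books.csv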
2. Scraping books from Douban
This example was collected from the web.
The settings file is the same as in example 1 (note that FEED_EXPORT_FIELDS there lists exactly the nine fields below); just add the item definition.
item
import scrapy

class DouBanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    auth = scrapy.Field()
    translator = scrapy.Field()
    publisher = scrapy.Field()
    date = scrapy.Field()
    price = scrapy.Field()
    rating = scrapy.Field()
    hot = scrapy.Field()
    url = scrapy.Field()
spider
import scrapy
from example.items import DouBanItem
from scrapy.linkextractors import LinkExtractor

class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['book.douban.com']
    # The tag in the URL can be swapped for any other tag
    #start_urls = ['https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4']
    start_urls = ['https://book.douban.com/tag/%E7%BC%96%E7%A8%8B']

    def parse(self, response):
        booklist = response.css('ul.subject-list')
        for bookinfo in booklist.css('div.info'):
            # Create a fresh item per book; reusing a single instance across
            # iterations would leave every yielded item pointing at the same data
            item = DouBanItem()
            item['name'] = bookinfo.css('a::text').get().strip()
            # div.pub holds "author / translator / publisher / date / price";
            # the translator part is missing for untranslated books
            pub = bookinfo.css('div.pub::text').get(default='').strip().split('/')
            item['auth'] = pub[0]
            item['translator'] = pub[1] if len(pub) > 1 else ""
            item['publisher'] = pub[-3] if len(pub) >= 3 else ""
            item['date'] = pub[-2] if len(pub) >= 2 else ""
            item['price'] = pub[-1]
            item['rating'] = bookinfo.css('span.rating_nums::text').get(default='not-found').strip()
            item['hot'] = bookinfo.css('span.pl::text').re(r'\d+')
            item['url'] = bookinfo.css('a::attr(href)').get(default='not-found').strip()
            yield item
        le = LinkExtractor(restrict_css='span.next')
        links = le.extract_links(response)
        # Guard against the last page, where no "next" link exists
        if links:
            yield scrapy.Request(links[0].url, callback=self.parse)
Run:
scrapy crawl douban -o douban.csv
scrapy crawl douban -o code.csv
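One detail: -o appends to an existing output file. Scrapy 2.x also offers -O (capital) to overwrite instead, which avoids stale rows between runs:

scrapy crawl douban -O douban.csv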
3. A spider without a project
$ cat quotes_spider.py
import scrapy
#import heartrate; heartrate.trace(browser=True)

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'author': quote.xpath('span/small/text()').get(),
                'text': quote.css('span.text::text').get(),
            }
        next_page = response.css('li.next a::attr("href")').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
Run:
scrapy runspider quotes_spider.py
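runspider executes a standalone spider file without any project scaffolding; the -o flag works here too, as in the official tutorial:

scrapy runspider quotes_spider.py -o quotes.jl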
4. Scraping jandan.net
1. Project layout
(base) [root@mini-install jiandann]# tree
.
├── jiandann
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── items.cpython-37.pyc
│   │   ├── pipelines.cpython-37.pyc
│   │   └── settings.cpython-37.pyc
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── jiandan.py
│       └── __pycache__
│           ├── __init__.cpython-37.pyc
│           └── jiandan.cpython-37.pyc
└── scrapy.cfg
4 directories, 14 files
2. Item
cat items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class JiandanItem(scrapy.Item):
    image_urls = scrapy.Field()  # URLs for the ImagesPipeline to download
    images = scrapy.Field()      # download results, filled in by the pipeline
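These two field names follow the ImagesPipeline convention: the pipeline reads download URLs from image_urls and writes the download results back into images. Both names are configurable; the settings below show the defaults, for illustration only:

IMAGES_URLS_FIELD = 'image_urls'
IMAGES_RESULT_FIELD = 'images'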
3. Pipelines
cat pipelines.py
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline  # the built-in image pipeline

class JiandanPipeline(ImagesPipeline):  # subclass the ImagesPipeline class
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # the spider stripped the leading //, so prepend the scheme here
            image_url = "http://" + image_url
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples; on success, info is a
        # dict with 'url', 'path' and 'checksum' keys
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
4. settings.py
grep -v "^#" settings.py
BOT_NAME = 'jiandann'
SPIDER_MODULES = ['jiandann.spiders']
NEWSPIDER_MODULE = 'jiandann.spiders'
IMAGES_STORE = '/root/imagess'  # directory where downloaded images are saved
# Enable the pipeline
ITEM_PIPELINES = {
    'jiandann.pipelines.JiandanPipeline': 1,
}
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 5
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
}
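One dependency worth flagging: Scrapy's ImagesPipeline relies on Pillow for image processing, so it has to be installed alongside Scrapy:

pip install Pillow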
5. Spider
cat spiders/jiandan.py
import scrapy
from jiandann.items import JiandanItem

class JiandanSpider(scrapy.Spider):
    name = 'jiandan'
    start_urls = ["http://jandan.net/ooxx"]

    def parse(self, response):
        item = JiandanItem()
        # The hrefs are protocol-relative (//host/path); strip the leading //
        # here and let the pipeline prepend the scheme
        item['image_urls'] = response.xpath('//a[@class="view_img_link"]/@href').re(r'//(.*)')
        yield item
        # Follow the link to the previous comment page, if there is one
        prev = response.css('a.previous-comment-page::attr(href)').re(r'//(.*)')
        if prev:
            yield scrapy.Request("http://" + prev[0], callback=self.parse)
This example drives home one point: the URLs a spider requests must carry an explicit scheme (http://...). jandan.net uses protocol-relative links (//host/path), so the scheme has to be added back before building the Request; in the earlier examples the extracted links already included it.
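A more robust alternative to string concatenation is response.urljoin(), already used in example 1, which also resolves protocol-relative //host/path links (a sketch, not the original code):

prev_href = response.css('a.previous-comment-page::attr(href)').get()
if prev_href:
    yield scrapy.Request(response.urljoin(prev_href), callback=self.parse)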
