使用scrapy框架爬取豆瓣电影TOP250

创建一个项目

scrapy startproject dbdianying

创建爬虫
scrapy genspider douban douban.com

设置settings.py

1.USER_AGENT

2.

ROBOTSTXT_OBEY = False  # do not honor robots.txt (douban.com disallows generic crawlers)
DOWNLOAD_DELAY = 3  # throttle requests (seconds) to avoid getting banned
ITEM_PIPELINES = {
'dbdianying.pipelines.ImagePipeline': 300,
}
# Directory where ImagesPipeline stores the downloaded poster images
IMAGES_STORE = "d:/doubandianying"

 

douban.py#爬虫文件

 1 import scrapy
 2 
 3 
 4 class DoubanSpider(scrapy.Spider):
 5     name = 'douban'
 6     allowed_domains = ['douban.com']
 7     start_urls = ['https://movie.douban.com/top250']
 8 
 9     def parse(self, response):
10         movie_urls = response.xpath('//div[@class="pic"]/a/@href')
11         yield from scrapy.follow_all(movie_urls, callback=self.parse_movie)
12 
13         next_url = response.xpath('//span[@class="next"]/a/@href').get()
14         yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
15 
16     def parse_movie(self, response):
17         name = response.xpath('//div[@id="content"]/h1/span[1]/text()').extract_first()
18         director = response.xpath('//span[@class="attrs"]/a[@rel="v:directedBy"]/text()').extract_first()
19         type = ''.join(response.xpath('//span[@property="v:genre"]/text()').extract())
20         score = response.xpath('//div[@typeof="v:Rating"]/strong/text()').extract_first()
21         img_url = response.xpath('//div[@id="mainpic"]/a/img/@src').extract_first()
22         yield {
23             'name': name,
24             'director': director,
25             'type': type,
26             'score': score,
27             'img_url': img_url,
28         }

pipeline.py

 1 # Define your item pipelines here
 2 #
 3 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 5 
 6 
 7 # useful for handling different item types with a single interface
 8 from itemadapter import ItemAdapter
 9 from scrapy.pipelines.images import ImagesPipeline
10 from scrapy import Request
11 
12 # class DbdianyingPipeline:
13 #     def open_spider(self,spider):
14 #         self.filename = open('movie.txt','a',encoding='utf-8')
15 #     def process_item(self, item, spider):
16 #         info = item['name'] + '\n' + item['director'] + '\n' + item['type'] + '\n' + item['score'] + '\n' + item['img_url'] + '\n'
17 #         self.filename.write(info)
18 #         self.filename.flush()
19 #         print(item)
20 #         return item
21 #     def close_spider(self,spider):
22 #         self.filename.close()
class ImagePipeline(ImagesPipeline):
    """Download each item's poster image and save it as '<movie name>.jpg'."""

    def get_media_requests(self, item, info):
        """Schedule the poster download, carrying the movie name in request meta."""
        yield Request(item['img_url'], meta={'name': item['name']})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the save path (relative to IMAGES_STORE) for a downloaded image.

        IMAGES_STORE points at a Windows drive, so every character that is
        illegal in a Windows file name must be stripped — the original code
        removed only ':', leaving titles containing ? * " < > | / \\ broken.
        """
        name = request.meta['name']
        name = name.translate({ord(c): None for c in '\\/:*?"<>|'})
        return name + '.jpg'

数据导出执行

 scrapy crawl douban -o movie.csv 

posted @ 2021-02-03 15:37  简单de人  阅读(87)  评论(0)    收藏  举报