使用scrapy框架爬取豆瓣电影TOP250
创建一个项目
scrapy startproject dbdianying
创建爬虫
scrapy genspider douban douban.com
设置settings.py
1. 设置 USER_AGENT，模拟浏览器请求头
2. 修改以下配置项：
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
'dbdianying.pipelines.ImagePipeline': 300,
}
#定义图片保存的位置
IMAGES_STORE = "d:/doubandianying"
douban.py（爬虫文件）
import scrapy


class DoubanSpider(scrapy.Spider):
    """Crawl the Douban Movie Top250 list and yield one item per movie.

    Each yielded item is a plain dict with the keys
    ``name``, ``director``, ``type``, ``score`` and ``img_url``,
    consumed downstream by the image pipeline.
    """

    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Parse a list page: follow every movie detail link, then the next page."""
        movie_urls = response.xpath('//div[@class="pic"]/a/@href')
        # BUG FIX: follow_all is a method on Response, not a module-level
        # function — `scrapy.follow_all` does not exist and raised
        # AttributeError in the original.
        yield from response.follow_all(movie_urls, callback=self.parse_movie)

        next_url = response.xpath('//span[@class="next"]/a/@href').get()
        # Guard: the last list page has no "next" link, so next_url is None.
        if next_url:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    def parse_movie(self, response):
        """Extract one movie's fields from its detail page and yield the item dict."""
        name = response.xpath('//div[@id="content"]/h1/span[1]/text()').extract_first()
        director = response.xpath(
            '//span[@class="attrs"]/a[@rel="v:directedBy"]/text()'
        ).extract_first()
        # Renamed from `type` so the builtin is not shadowed; the item key
        # stays 'type' for downstream compatibility.
        genres = ''.join(response.xpath('//span[@property="v:genre"]/text()').extract())
        score = response.xpath('//div[@typeof="v:Rating"]/strong/text()').extract_first()
        img_url = response.xpath('//div[@id="mainpic"]/a/img/@src').extract_first()
        yield {
            'name': name,
            'director': director,
            'type': genres,
            'score': score,
            'img_url': img_url,
        }
pipelines.py（管道文件，对应 ITEM_PIPELINES 中的 dbdianying.pipelines）
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request


class ImagePipeline(ImagesPipeline):
    """Download each movie's poster and store it named after the movie.

    Files are written under the IMAGES_STORE directory configured in
    settings.py.
    """

    def get_media_requests(self, item, info):
        # Carry the movie name through request.meta so file_path can name
        # the saved file after it.
        yield Request(item['img_url'], meta={'name': item['name']})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative save path (``<movie name>.jpg``) for a poster."""
        name = request.meta['name']
        # BUG FIX: the original replaced ':' with ':' — the same ASCII
        # character, a no-op. The intent is a Windows-safe filename, so map
        # the ASCII colon to its fullwidth counterpart instead.
        name = name.replace(':', '：')
        return name + '.jpg'
数据导出执行
scrapy crawl douban -o movie.csv

浙公网安备 33010602011771号