爬虫之scrapy简单案例之猫眼

在爬虫py文件下

class TopSpider(scrapy.Spider):
    """Crawl the Maoyan Top-100 movie board (https://maoyan.com/board/4).

    ``parse`` handles the list pages (one item per movie, plus pagination);
    ``xqy_parse`` enriches each item from the movie's detail page.
    """
    name = 'top'
    allowed_domains = ['maoyan.com']
    start_urls = ['https://maoyan.com/board/4']

    def parse(self, response):
        """Parse one board page: yield a detail-page request per movie, then follow pagination."""
        for dd in response.xpath('//dl/dd'):
            dic = MaoyanItem()
            dic['name'] = dd.xpath('.//p[@class="name"]//text()').extract_first()
            # extract_first(default='') guards against a missing node: the original
            # called .replace() / used + on a possibly-None result, which raises
            # AttributeError / TypeError when the markup changes.
            star = dd.xpath('.//p[@class="star"]/text()').extract_first(default='')
            dic['star'] = star.replace('\n', '').replace(' ', '')
            dic['releasetime'] = dd.xpath('.//p[@class="releasetime"]/text()').extract_first()
            # The score is split across two <i> tags (integer part + fraction part).
            score1 = dd.xpath('.//p[@class="score"]/i[1]/text()').extract_first(default='')
            score2 = dd.xpath('.//p[@class="score"]/i[2]/text()').extract_first(default='')
            dic['score'] = score1 + score2
            # Detail page: pass the partially-filled item along via meta.
            detail_path = dd.xpath('.//p[@class="name"]/a/@href').extract_first()
            if detail_path:
                xqy_url = 'https://maoyan.com' + detail_path
                yield scrapy.Request(xqy_url, callback=self.xqy_parse, meta={'dic': dic})
            else:
                # No detail link — yield what we have instead of crashing on None + str.
                yield dic
        # Pagination: the href is relative to the board URL (e.g. "?offset=10").
        next_url = response.xpath('//a[text()="下一页"]/@href').extract_first()
        if next_url:
            url = 'https://maoyan.com/board/4' + next_url
            yield scrapy.Request(url, callback=self.parse)

    def xqy_parse(self, response):
        """Parse a movie detail page: add genre and region/runtime, then yield the item."""
        dic = response.meta['dic']
        dic['type'] = response.xpath('//ul/li[@class="ellipsis"][1]/text()').extract_first()
        area_time = response.xpath(
            '//ul/li[@class="ellipsis"][2]/text()').extract_first(default='')
        dic['area_time'] = area_time.replace('\n', '').replace(' ', '')
        yield dic

在items.py 文件中写入要展示的字段

class MaoyanItem(scrapy.Item):
    """Container for one Maoyan Top-100 movie.

    The field set matches exactly what ``TopSpider`` assigns (the published
    snippet defined an unrelated ``DoubanItem``, so the spider failed with
    NameError/KeyError).
    """
    name = scrapy.Field()         # movie title (list page)
    star = scrapy.Field()         # starring actors (list page)
    releasetime = scrapy.Field()  # release date line (list page)
    score = scrapy.Field()        # rating, integer + fraction parts joined (list page)
    type = scrapy.Field()         # genre (detail page)
    area_time = scrapy.Field()    # region / runtime (detail page)
    _id = scrapy.Field()          # assigned by the MongoDB pipeline before insert

在 pipelines.py 文件中编写管道,把抓到的数据逐条保存到文本文件

class DoubanPipeline(object):
    """Append every scraped item, one per line, to ``douban.txt``."""

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file for appending.
        self.file = open('douban.txt', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(str(item) + '\n')
        # A pipeline must return the item so later pipelines still receive it;
        # the original implicitly returned None, silently dropping every item.
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes: release the file handle.
        self.file.close()

pipelines.py文件也可用MongoDB书写

 1 from pymongo import MongoClient
 2 
 3 
class DoubanPipeline(object):
    """Pipeline variant that stores each item in MongoDB instead of a text file."""

    def open_spider(self,spider):
        # Connect to a local MongoDB instance (MongoClient() defaults to
        # localhost:27017).
        self.client = MongoClient()
        # NOTE(review): '库名'/'集合名' are Chinese placeholders for
        # "database name" / "collection name" — substitute real names.
        self.collection = self.client['库名']['集合名']
        # Running counter used as the document _id (see process_item).
        self.count = 0

    def process_item(self, item, spider):
        # NOTE(review): _id restarts at 0 on every run, so re-running the spider
        # against a non-empty collection will hit a duplicate-key error on insert.
        item['_id'] = self.count
        self.count += 1
        self.collection.insert_one(item)
        # Return the item so any later pipelines still receive it.
        return item

    def close_spider(self, spider):
        # Release the MongoDB connection when the spider finishes.
        self.client.close()

 

另外,记得在 settings.py 文件中配置一些信息,如 USER_AGENT(请求头)、ITEM_PIPELINES(启用上面的管道),

或者 ROBOTSTXT_OBEY(ROBOTS 协议)以及其他设置。

posted @ 2019-10-25 21:52  一心取信  阅读(380)  评论(0)    收藏  举报