# Scrapy框架登陆抽屉并点赞新闻 — Scrapy spider: log in to Chouti (chouti.com) and upvote news items

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector

class ChoutiSpider(scrapy.Spider):
    """Log in to chouti.com, then crawl the news pages and upvote every item.

    Flow: start_requests -> parse1 (capture cookies + POST login)
          -> parse2 (fetch front page) -> parse3 (vote + paginate)
          -> parse4 (print vote response).
    """
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance cookie store. (The original declared this as a
        # class-level dict, which would be shared by every instance of
        # the spider — a mutable-shared-state bug.)
        self.cookie_dict = {}

    def start_requests(self):
        """Yield a Request per start URL, routed to parse1.

        dont_filter=True keeps the scheduler from dropping the seed
        request as a duplicate.
        """
        for url in self.start_urls:
            yield Request(url, dont_filter=True, callback=self.parse1)

    def parse1(self, response):
        """Capture session cookies from the landing page, then POST the login form."""
        from scrapy.http.cookies import CookieJar
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        # Iterate the public underlying http.cookiejar jar instead of
        # digging through the private _cookies nested-dict layout.
        for cookie in cookie_jar.jar:
            self.cookie_dict[cookie.name] = cookie.value

        # NOTE(review): credentials are blank placeholders — supply a real
        # phone number and password before running.
        post_dict = {
            'phone': '',
            'password': '',
            'oneMonth': 1,
        }

        # Log in to Chouti with the captured session cookies.
        import urllib.parse
        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            cookies=self.cookie_dict,
            body=urllib.parse.urlencode(post_dict),
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            callback=self.parse2,
        )

    def parse2(self, response):
        """After the login POST returns, fetch the news front page with the saved cookies."""
        yield Request(url='http://dig.chouti.com/', cookies=self.cookie_dict, callback=self.parse3)

    def parse3(self, response):
        """Upvote every news item on the page, then follow pagination links back into parse3."""
        hxs = Selector(response)
        # Each item's share-linkid attribute is the id the vote endpoint expects.
        link_id_list = hxs.xpath('//div[@class="part2"]/@share-linkid').extract()
        for link_id in link_id_list:
            url = 'http://dig.chouti.com/link/vote/?linksId=%s' % (link_id,)
            yield Request(url=url, method='POST', cookies=self.cookie_dict, callback=self.parse4)

        # Follow every pagination link; the scheduler's default duplicate
        # filter stops already-seen pages from being re-crawled.
        page_list = hxs.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for href in page_list:
            page_url = 'http://dig.chouti.com%s' % (href,)
            yield Request(page_url, method='GET', callback=self.parse3)

    def parse4(self, response):
        """Print the vote endpoint's response body for manual inspection."""
        print(response.text)

 

# posted @ 2017-11-13 23:12  魅力宁波  阅读(170)  评论(0)    收藏  举报