# (27) 爬虫类方法综合实例

from urllib.parse import urljoin

import requests
from fake_useragent import UserAgent
from lxml import etree
#url管理
class URLManger(object):
    """Keep track of URLs waiting to be crawled and URLs already handed out.

    ``new_url`` is the list of pending URLs and ``old_url`` the list of URLs
    that ``get_new_url`` has returned. A private set mirrors every URL that
    was ever accepted so duplicate checks do not have to scan both lists.
    Change the two lists only through the methods below; editing them
    directly leaves the set out of date.
    """

    def __init__(self):
        self.new_url = []
        self.old_url = []
        # Every URL accepted so far (pending or handed out), for O(1) dedup.
        self._seen = set()

    def get_new_url(self):
        """Remove the most recently queued URL, record it as handed out, return it.

        Raises:
            IndexError: when no URL is pending; call ``has_new_url`` first.
        """
        url = self.new_url.pop()
        self.old_url.append(url)
        return url

    def add_new_url(self, url):
        """Queue ``url`` unless it is empty/None or has been accepted before."""
        if not url or url in self._seen:
            return
        self._seen.add(url)
        self.new_url.append(url)

    def add_new_urls(self, urls):
        """Queue every URL of the iterable ``urls``, skipping duplicates."""
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while at least one URL is pending."""
        return self.get_new_url_size() > 0

    def get_new_url_size(self):
        """Return the number of pending URLs."""
        return len(self.new_url)

    def get_old_url_size(self):
        """Return the number of URLs already handed out."""
        return len(self.old_url)

#下载

class Downloader:
    """Fetch pages over HTTP, sending a randomly chosen User-Agent header."""

    def download(self, url, timeout=10):
        """Return the body of ``url`` decoded as UTF-8, or None on failure.

        Args:
            url: Address to request with HTTP GET.
            timeout: Seconds to wait on the server before giving up, so a
                stalled connection cannot block the crawl forever.

        Returns:
            The response text when the status code is 200. None for any
            other status code or when the request itself fails.
        """
        headers = {'User-Agent': UserAgent().random}
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            # Report transport failures the same way as a non-200 answer
            # (None) so one unreachable page does not raise out of download().
            return None
        if response.status_code != 200:
            return None
        # The body is always decoded as UTF-8, whatever the server declared.
        response.encoding = 'utf-8'
        return response.text


#解析

class Parser:
    """Pull text entries and pagination links out of a downloaded HTML page."""

    def parse(self, html):
        """Parse ``html`` and return ``(texts, urls)``.

        Args:
            html: Page source as a string. None or an empty string (for
                example the result of a failed download) is accepted.

        Returns:
            A tuple of two lists: the text entries found by ``parse_info``
            and the absolute pagination URLs found by ``parse_url``. Both
            lists are empty when there is nothing to parse.
        """
        if not html:
            return [], []
        e = etree.HTML(html)
        if e is None:
            # Defensive: no root element came back, so there is nothing to query.
            return [], []
        data_s = self.parse_info(e)
        urls = self.parse_url(e)
        return data_s, urls

    def parse_info(self, e):
        """Return the full text of every ``span`` directly under ``div.content``."""
        spans = e.xpath('''//div[@class='content']/span''')
        return [span.xpath('string(.)') for span in spans]

    def parse_url(self, e):
        """Return the pagination links of the page as absolute URLs."""
        base_url = 'https://www.qiushibaike.com'
        hrefs = e.xpath('''//ul[@class='pagination']/li/a/@href''')
        # urljoin resolves root-relative hrefs exactly like the former string
        # concatenation, and also copes with absolute or slash-less hrefs.
        return [urljoin(base_url, href) for href in hrefs]

#数据处理
class DataOutput:
    """Append scraped text entries to a UTF-8 text file."""

    def save(self, data_s, path='duanzi.txt'):
        """Append every string of ``data_s`` to ``path``.

        Args:
            data_s: Iterable of strings to write. They are written back to
                back; no separator or newline is inserted between them.
            path: File to append to. Defaults to ``duanzi.txt`` in the
                current working directory.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(data_s)



#调度
class DiaoDu:
    """Scheduler that wires URL manager, downloader, parser and writer together."""

    def __init__(self):
        self.downloader = Downloader()
        self.url_manger = URLManger()
        # The attribute name, including its spelling, is kept as is so code
        # that already accesses it keeps working.
        self.paeser = Parser()
        self.data_saver = DataOutput()

    def run(self, url):
        """Crawl from ``url`` until the URL manager has no pending URL left.

        Each pending URL is downloaded, parsed, its entries are saved and the
        links found on it are queued. Pages that could not be downloaded are
        skipped.
        """
        self.url_manger.add_new_url(url)
        while self.url_manger.has_new_url():
            page_url = self.url_manger.get_new_url()
            html = self.downloader.download(page_url)
            if not html:
                # download() signals failure with None; skip the page instead
                # of handing the parser something it cannot parse.
                continue
            data, urls = self.paeser.parse(html)
            self.data_saver.save(data)
            self.url_manger.add_new_urls(urls)


if __name__ == '__main__':
    # Script entry point: build the scheduler and crawl from the seed URL.
    # NOTE(review): the seed URL looks truncated (the parser builds its links
    # on https://www.qiushibaike.com) - confirm the intended start page.
    DiaoDu().run('https://www.qius')
# posted @ 2020-08-08 10:38  kuanleung  阅读(9)  评论(0)    收藏  举报  来源