# python爬取之url管理器 (Python crawling: the URL manager)

class UrlManager:
    """Track which URLs still need to be crawled and which were handed out.

    Two sets are kept:

    * ``new_urls`` -- URLs that have been added but not yet handed out.
    * ``old_urls`` -- URLs that have already been returned by ``get_url``.

    A URL present in either set is never added again.
    """

    def __init__(self):
        """Create a manager with empty pending and visited sets."""
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue a single URL unless it is empty or already known.

        Args:
            url: The URL to queue. ``None`` and empty values are ignored.
        """
        if not url:
            return
        # Skip anything already pending or already handed out.
        if url in self.new_urls or url in self.old_urls:
            return
        self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL from an iterable, applying ``add_new_url`` to each.

        Args:
            urls: Iterable of URLs. ``None`` and empty collections are
                ignored.
        """
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def get_url(self):
        """Hand out one pending URL and mark it as visited.

        The URL is taken with ``set.pop()``, so which pending URL is
        returned is arbitrary (sets are unordered).

        Returns:
            A pending URL, or ``None`` when nothing is pending.
        """
        if not self.has_new_url():
            return None
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def has_new_url(self) -> bool:
        """Return ``True`` when at least one URL is still pending."""
        return len(self.new_urls) > 0

if __name__ == "__main__":
    # Executed directly as a script: instantiate a manager.
    url_manger = UrlManager()

 

# posted @ 2023-07-17 14:33  子过杨梅  阅读(44)  评论(0)    收藏  举报