Link Collector

A link collector visits a website and records the links found on every page it can reach. The code is as follows:

from urllib.request import urlopen
from urllib.parse import urlparse
import re
import sys
LINK_REGEX = re.compile("<a [^>]*href=['\"]([^'\"]+)['\"][^>]*>")


class LinkCollector:
    def __init__(self, url):
        self.url = "http://%s" % urlparse(url).netloc  # normalize the base URL to scheme + host
        self.collected_links = {}  # maps each discovered page URL to the set of links found on it
        self.visited_links = set()  # URLs already fetched, so no page is crawled twice

    def collect_links(self, path="/"):
        full_url = self.url + path
        self.visited_links.add(full_url)
        page = urlopen(full_url).read().decode("utf-8", errors="replace")  # fetch the page as text (assumes UTF-8)
        links = LINK_REGEX.findall(page)  # extract the href of every <a> tag
        links = {self.normalize_url(path, link) for link in links}  # a set de-duplicates, so the same link is never processed twice
        self.collected_links[full_url] = links
        for link in links:
            self.collected_links.setdefault(link, set())  # register every discovered link, even ones never visited
        unvisited_links = links.difference(self.visited_links)
        for link in unvisited_links:
            if link.startswith(self.url):
                self.collect_links(urlparse(link).path)  # recurse into same-site pages so every reachable page gets visited

    def normalize_url(self, path, link):  # turn every href on the page into an absolute URL
        if link.startswith("http://") or link.startswith("https://"):
            return link  # already absolute, keep as-is
        elif link.startswith("/"):
            return self.url + link  # root-relative: prepend the site base
        else:
            return self.url + path.rpartition('/')[0] + '/' + link  # page-relative: resolve against the current page's directory


if __name__ == "__main__":
    collector = LinkCollector(sys.argv[1])  # the target site comes from the first command-line argument
    collector.collect_links()
    for link, item in collector.collected_links.items():
        print("{}:{}".format(link, item))
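To see how normalize_url handles the three kinds of hrefs, here is a minimal sketch (my illustration, not from the original post; example.com and the paths are placeholders, and no network request is made, since the constructor only parses the URL):

collector = LinkCollector("http://example.com")
print(collector.normalize_url("/a/b.html", "http://other.com/x.html"))  # absolute: kept as-is
print(collector.normalize_url("/a/b.html", "/top.html"))  # root-relative -> http://example.com/top.html
print(collector.normalize_url("/a/b.html", "c.html"))     # page-relative -> http://example.com/a/c.html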


How to run: change into the directory containing the script, then enter this command in CMD: python collection_links.py "http://www.xxx.com"
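If you have no site handy to crawl, one way to try the script (a suggestion of mine, not part of the original post) is to serve a folder of interlinked .html files with Python's built-in web server and point the collector at it:

python -m http.server 8000                            # run inside the folder with the .html files
python collection_links.py "http://localhost:8000"    # run in a second terminal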
