Link Collector
Link collector: it visits a website and collects the links found on every page. The code is as follows:
from urllib.request import urlopen
from urllib.parse import urlparse
import re
import sys

LINK_REGEX = re.compile("<a [^>]*href=['\"]([^'\"]+)['\"][^>]*>")


class LinkCollector:
    def __init__(self, url):
        self.url = "http://%s" % urlparse(url).netloc  # normalize the start URL down to its host
        self.collected_links = {}  # maps each page URL to the set of links found on it
        self.visited_links = set()  # pages that have already been crawled

    def collect_links(self, path="/"):
        full_url = self.url + path
        self.visited_links.add(full_url)
        page = str(urlopen(full_url).read())
        links = LINK_REGEX.findall(page)  # extract every href on the page
        links = {self.normalize_url(path, link) for link in links}  # a set removes duplicates so the same link is not followed twice
        self.collected_links[full_url] = links
        for link in links:
            self.collected_links.setdefault(link, set())
        unvisited_links = links.difference(self.visited_links)
        for link in unvisited_links:
            if link.startswith(self.url):
                self.collect_links(urlparse(link).path)  # recurse into same-site links so every page gets visited

    def normalize_url(self, path, link):  # turn each href into an absolute URL
        if link.startswith("http://"):
            return link
        elif link.startswith("/"):
            return self.url + link
        else:
            return self.url + path.rpartition('/')[0] + '/' + link


if __name__ == "__main__":
    collector = LinkCollector(sys.argv[1])  # the start URL is the first command-line argument
    collector.collect_links()
    for link, item in collector.collected_links.items():
        print("{}:{}".format(link, item))
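To illustrate the normalize_url logic, the short sketch below (using hypothetical URLs and paths, not taken from the original text) shows how the three kinds of href values are resolved. It assumes the LinkCollector class above is defined in the same file, and it makes no network requests:

collector = LinkCollector("http://example.com")  # hypothetical site; the constructor only parses the URL
print(collector.normalize_url("/blog/index.html", "http://other.com/page"))  # absolute link is returned unchanged
print(collector.normalize_url("/blog/index.html", "/about.html"))            # root-relative -> http://example.com/about.html
print(collector.normalize_url("/blog/index.html", "contact.html"))           # page-relative -> http://example.com/blog/contact.html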
How to run: change into the directory containing the script and enter the following command in CMD: python collection_links.py "http://www.xxx.com"
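If you prefer to drive the collector from another script instead of CMD, a minimal sketch could look like the following. It assumes the listing above is saved as collection_links.py, and the URL is the same placeholder used in the command above:

from collection_links import LinkCollector  # assumes the listing above is saved as collection_links.py

collector = LinkCollector("http://www.xxx.com")  # placeholder URL; replace with a real site
collector.collect_links()                        # crawls the site recursively, page by page
for page, links in collector.collected_links.items():
    print("{} -> {} links".format(page, len(links)))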