Using collections and set to Extract All of a Website's Links
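The idea is simple: keep a collections.deque as a FIFO queue of URLs still to visit, and a set of URLs already scraped so nothing is fetched twice. Each iteration pops a URL off the left of the queue, downloads the page with requests, extracts every <a href> with an lxml XPath query, and appends any link it has not seen before back onto the queue.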
import optparse
import sys
import time
from collections import deque

import requests
from lxml import etree


class MySpider:
    def __init__(self) -> None:
        self.start_url = self.url_prefix(self.get_params())
        self.url_list = deque([self.start_url])  # FIFO queue of URLs still to visit
        self.scraped_urls = set()                # visited URLs, deduplicated by the set
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:101.0) "
                          "Gecko/20100101 Firefox/101.0"
        }

    def get_params(self):
        # Read the start URL from the -u/--url command-line option
        parser = optparse.OptionParser('Usage: <Program> -u url')
        parser.add_option('-u', '--url', dest='url', type='string',
                          help='Specify url to scrape')
        options, args = parser.parse_args()
        if options.url is None:
            print(parser.usage)
            sys.exit()
        return options.url

    def url_prefix(self, url):
        # Prepend a scheme when the user omits one
        if url.startswith('http://') or url.startswith('https://'):
            return url
        return 'http://' + url

    def retrieve_web_page(self, url):
        # Fetch the page; return its HTML on success, None otherwise
        try:
            print('[-] Scraping %s' % url)
            response = requests.get(url=url, headers=self.headers)
            self.scraped_urls.add(url)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            print(e)
        return None

    def retrieve_links(self, response):
        # Pull every <a href> out of the page and queue the unseen ones
        html = etree.HTML(response)
        links = html.xpath('//a/@href')
        for link in links:
            # if link.startswith('/'):   # relative link -- see the urljoin sketch below
            #     link = url + link
            if link.startswith('#'):     # in-page anchor, nothing new to fetch
                continue
            if link not in self.url_list and link not in self.scraped_urls:
                print(link)
                self.url_list.append(link)

    def run(self):
        i = 0
        while len(self.url_list):
            print("[-] Scraping %d" % i)
            i += 1
            url = self.url_list.popleft()
            response = self.retrieve_web_page(url)
            if response:                 # skip pages that failed or returned non-200
                self.retrieve_links(response)
            time.sleep(2)                # be polite: pause between requests
        print(self.scraped_urls)


if __name__ == '__main__':
    myspider = MySpider()
    myspider.run()
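The commented-out branch in retrieve_links points at a real gap: most sites emit relative hrefs such as /about, which this spider queues verbatim and then fails to fetch. Below is a minimal sketch of how that branch could be filled in with urllib.parse.urljoin from the standard library; note that page_url and normalize_link are hypothetical additions, since the original retrieve_links never receives the URL of the page being parsed.

from urllib.parse import urljoin

# Sketch only: resolving each href against the page it came from.
# page_url is a hypothetical extra parameter not present in the original code.
def normalize_link(page_url, href):
    if href.startswith('#'):      # in-page anchor, nothing to fetch
        return None
    # urljoin handles '/path', 'path', and already-absolute URLs uniformly
    return urljoin(page_url, href)

# Example: normalize_link('http://example.com/a/', '/about')
#          -> 'http://example.com/about'

To try the spider itself, run it as python spider.py -u example.com (the filename is whatever you saved the script as); url_prefix adds the missing http:// scheme before the first request.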
