Implementing a web crawler with Python threads
import threading
import requests
import time
import timeit
import re


class SpiderThreading(threading.Thread):
    """One crawler thread per URL: fetch the page, parse it, append results."""

    def __init__(self, url, file_name):
        super(SpiderThreading, self).__init__()
        self.url = url
        self.file_name = file_name
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/60.0.3112.113 Safari/537.36"
        }
        http_proxy = "http://XXXX"
        https_proxy = "http://XXXX"
        self.proxyDict = {
            "http": http_proxy,
            "https": https_proxy,
        }

    def get_response(self):
        # Fetch the page through the proxy; verify=False skips TLS
        # certificate checks (requests will emit an InsecureRequestWarning).
        print("get response from ", self.url)
        response = requests.get(
            url=self.url,
            headers=self.header,
            proxies=self.proxyDict,
            timeout=10,
            verify=False)
        html = response.text
        print("Crawl from {0}; content: {1}".format(self.url, html[:500]))
        return html

    def transfer_html(self, html):
        # Extract (link, title) pairs from the post list. Implemented as a
        # generator, so pairs are produced lazily as save_results consumes them.
        pattern = re.compile(
            '<a class="titlelnk" href="(.*?)" target="_blank">(.*?)</a>',
            re.S)
        for link, title in re.findall(pattern, html):
            yield [link.strip(), title.strip()]

    def save_results(self, results):
        # Append one "link;title" line per result; open the file once
        # instead of reopening it on every iteration.
        file_full_path = r"../showcase/data/{0}".format(self.file_name)
        with open(file_full_path, 'a+', encoding='utf-8') as f:
            for result in results:
                f.write("{0};{1}\n".format(result[0], result[1]))
        print("save file done to ", file_full_path)

    def run(self):
        html = self.get_response()
        results = self.transfer_html(html)
        self.save_results(results)


def main():
    base_url = "https://www.cnblogs.com/#p{}"
    urls = [base_url.format(index) for index in range(1, 20)]
    file_name = "blog_threading.txt"
    threads = [SpiderThreading(url, file_name) for url in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    start = time.time()
    # Alternative timing with timeit:
    # t = timeit.timeit(
    #     stmt='main()',
    #     setup='from __main__ import main',
    #     number=2)
    # print(t)
    main()
    print("timing ", time.time() - start)
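Two things are worth noting about the code above. First, all 19 threads append to the same output file, and nothing guarantees that lines written from different threads will not interleave. Second, the manual start()/join() bookkeeping is exactly what concurrent.futures.ThreadPoolExecutor automates. The sketch below shows both points together; it is a minimal illustration rather than the original code: fetch_titles, OUTPUT_LOCK, and blog_pool.txt are names invented here, and the proxy settings are dropped for brevity.

# A minimal alternative sketch using concurrent.futures instead of a
# Thread subclass. fetch_titles, OUTPUT_LOCK, and blog_pool.txt are
# illustrative names, not part of the original code. A shared lock
# serializes writes, since all workers append to the same file.
import re
import threading
from concurrent.futures import ThreadPoolExecutor

import requests

OUTPUT_LOCK = threading.Lock()
PATTERN = re.compile(
    r'<a class="titlelnk" href="(.*?)" target="_blank">(.*?)</a>', re.S)


def fetch_titles(url, file_name):
    # Fetch one page and append every (link, title) pair to the shared file.
    html = requests.get(url, timeout=10).text
    with OUTPUT_LOCK:  # keep lines from different threads from interleaving
        with open(file_name, 'a', encoding='utf-8') as f:
            for link, title in PATTERN.findall(html):
                f.write("{0};{1}\n".format(link.strip(), title.strip()))


def main():
    urls = ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 20)]
    # The pool caps concurrency and handles start()/join() for us.
    with ThreadPoolExecutor(max_workers=8) as pool:
        for url in urls:
            pool.submit(fetch_titles, url, "blog_pool.txt")


if __name__ == '__main__':
    main()

Capping the pool at max_workers=8 is also kinder to the target site than launching one thread per page.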
Result
timing 1.3821380138397217
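Because the 19 requests run concurrently, the total wall-clock time is close to the latency of the slowest single page rather than the sum over all pages. A quick way to see the contrast is a sequential baseline (hypothetical; the post does not measure this):

# A hypothetical sequential baseline, not from the post: fetching the
# same 19 URLs one after another, so total time is roughly the sum of
# the individual request latencies instead of the slowest one.
import time

import requests

urls = ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 20)]

start = time.time()
for url in urls:
    requests.get(url, timeout=10)
print("sequential timing ", time.time() - start)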
