Web crawling with Python multiprocessing: one worker process per URL
"""Crawl a set of pages concurrently, one multiprocessing worker per URL."""
from multiprocessing import Process
import time

import requests


class SpiderProcess(Process):
    """Worker process that fetches a single URL and prints a content preview.

    Interface is unchanged from the original: construct with the target
    ``url`` and ``start()`` it like any ``multiprocessing.Process``.
    """

    def __init__(self, url):
        super(SpiderProcess, self).__init__()
        self.url = url
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }
        # Fill in real proxy endpoints here if needed. Empty by default:
        # the original hard-coded "http://XXXX" placeholder made every
        # request fail with a ProxyError before reaching the target site.
        self.http_proxy = ""
        self.https_proxy = ""
        self.proxyDict = {
            "http": self.http_proxy,
            "https": self.https_proxy,
        }

    def get_response(self):
        """Fetch ``self.url`` and print the first 500 characters of the body.

        Network failures are caught and reported instead of killing the
        child process with a raw traceback.
        """
        print("get response from ", self.url)
        # Only route through the proxies when they are actually configured.
        proxies = self.proxyDict if any(self.proxyDict.values()) else None
        try:
            # NOTE(security): verify=False disables TLS certificate checks,
            # as in the original code — re-enable verification (verify=True)
            # for anything beyond local experimentation.
            response = requests.get(
                url=self.url,
                headers=self.header,
                proxies=proxies,
                timeout=10,
                verify=False,
            )
        except requests.RequestException as exc:
            print("Failed to crawl {0}: {1}".format(self.url, exc))
            return
        http = response.text
        print("Crawl from {0}; content: {1}".format(self.url, http[:500]))

    def run(self):
        # Process entry point: delegate to the fetch logic.
        self.get_response()


if __name__ == '__main__':
    start = time.time()
    base_url = "https://www.cnblogs.com/#p{}"
    urls = [base_url.format(index) for index in range(1, 20)]
    process_list = []
    # Fan out: start one process per URL, then wait for all of them.
    for url in urls:
        spider = SpiderProcess(url)
        spider.start()
        process_list.append(spider)
    for t in process_list:
        t.join()
    print("Total timing ", time.time() - start)
Result: total timing ≈ 7.3 seconds.

 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号