Web crawling with Python processes (python进程实现爬网)

 

from multiprocessing import Process, Queue
import time
import requests

class SpiderProcess(Process):
    """Worker process that fetches a single URL and prints a content preview."""

    def __init__(self, url):
        """Store the target URL plus the request headers and proxy settings.

        :param url: page to fetch when the process runs.
        """
        super(SpiderProcess, self).__init__()
        self.url = url
        # Desktop Chrome User-Agent so the target site serves the normal page.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }
        # NOTE(review): "http://XXXX" is a placeholder proxy endpoint — every
        # request will fail to connect until real proxy URLs are filled in.
        self.http_proxy = "http://XXXX"
        self.https_proxy = "http://XXXX"
        self.proxyDict = {
            "http": self.http_proxy,
            "https": self.https_proxy,
        }

    def get_response(self):
        """Fetch self.url and print the first 500 characters of the body.

        Network failures are reported instead of crashing the child process
        with a raw traceback.
        """
        print("get response from ", self.url)
        try:
            # SECURITY: verify=False disables TLS certificate checking; it is
            # acceptable only for throwaway crawling through a local proxy.
            response = requests.get(url=self.url, headers=self.header,
                                    proxies=self.proxyDict, timeout=10,
                                    verify=False)
            http = response.text
            print("Crawl from {0}; content: {1}".format(self.url, http[:500]))
        except requests.RequestException as exc:
            # Covers timeouts, proxy errors, and connection failures — report
            # the problem and let the worker process exit cleanly.
            print("Failed to crawl {0}: {1}".format(self.url, exc))

    def run(self):
        """Process entry point: perform the single fetch."""
        self.get_response()

if __name__ == '__main__':
    start_time = time.time()
    base_url = "https://www.cnblogs.com/#p{}"
    # NOTE(review): "#p{}" is a URL fragment, which browsers never send to the
    # server — all 19 workers likely fetch the same page; confirm intent.
    workers = []
    # Launch one worker process per page, keeping handles so we can wait on
    # all of them below.
    for page in range(1, 20):
        worker = SpiderProcess(base_url.format(page))
        workers.append(worker)
        worker.start()

    # Block until every worker has finished before reporting the elapsed time.
    for worker in workers:
        worker.join()
    print("Total timing ", time.time() - start_time)

Result: total elapsed time was about 7.3 seconds (结果为:7.3...)

posted @ 2019-05-21 15:58  Bradwarden  阅读(217)  评论(0)    收藏  举报