Crawling the web with Python threads

 

import threading
import requests
import time
import timeit
import re


class SpiderThreading(threading.Thread):
    """One crawler thread: fetch a URL, extract post links, append them to a file."""

    def __init__(self, url, file_name):
        """
        :param url: page URL to crawl.
        :param file_name: output file name, saved under ../showcase/data/.
        """
        super(SpiderThreading, self).__init__()
        self.url = url
        self.file_name = file_name
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }
        # NOTE(review): placeholder proxy endpoints -- "http://XXXX" must be
        # replaced with real proxies or requests will fail to connect.
        http_proxy = "http://XXXX"
        https_proxy = "http://XXXX"
        self.proxyDict = {
            "http": http_proxy,
            "https": https_proxy,
        }

    def get_response(self):
        """Fetch self.url and return the response body as text.

        Raises requests.HTTPError on a non-2xx status so a failed fetch is
        visible instead of silently feeding an error page to the parser.
        """
        print("get response from ", self.url)
        response = requests.get(
            url=self.url,
            headers=self.header,
            proxies=self.proxyDict,
            timeout=10,
            verify=False)  # NOTE(review): verify=False disables TLS certificate checks
        response.raise_for_status()  # fail fast on 4xx/5xx
        http = response.text
        print("Crawl from {0}; content: {1}".format(self.url, http[:500]))
        return http

    def transfer_html(self, html):
        """Yield [href, title] pairs for each post link found in the page HTML."""
        pattern = re.compile(
            '<a class="titlelnk" href="(.*?)" target="_blank">(.*?)</a>', re.S)
        for href, title in re.findall(pattern, html):
            yield [href.strip(), title.strip()]

    def save_results(self, results):
        """Append each [href, title] pair as 'href;title' to the output file.

        The file is opened once for the whole batch instead of once per line.
        """
        file_full_path = r"../showcase/data/{0}".format(self.file_name)
        with open(file_full_path, 'a+', encoding='utf-8') as f:
            for href, title in results:
                f.write("{0};{1}\n".format(href, title))
        print("save file done to ", file_full_path)

    def run(self):
        """Thread entry point: fetch, parse, persist."""
        html = self.get_response()
        results = self.transfer_html(html)
        self.save_results(results)


def main():
    """Spawn one crawler thread per cnblogs page index and wait for all of them.

    NOTE(review): the "#p{}" part is a URL fragment, which browsers/clients do
    not send to the server -- every thread likely fetches the same page; verify
    the intended pagination URL.
    """
    page_url_template = "https://www.cnblogs.com/#p{}"
    output_name = "blog_threading.txt"
    workers = [
        SpiderThreading(page_url_template.format(page), output_name)
        for page in range(1, 20)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    # perf_counter() is the recommended monotonic clock for timing code;
    # time.time() can jump if the system clock is adjusted mid-run.
    start = time.perf_counter()
    main()
    print("timing ", time.perf_counter() - start)

Results

timing  1.3821380138397217

posted @ 2019-05-21 15:55  Bradwarden  阅读(251)  评论(0)    收藏  举报