1 # -*- coding=utf-8 -*-
2 # software: scrapy
3 # datetime:2020/4/8 2:48 下午
4 import gevent
5 from gevent import monkey
6 monkey.patch_all()
7 import requests
8 from lxml import etree
9 import time
10 from concurrent.futures.thread import ThreadPoolExecutor
11 from concurrent.futures.process import ProcessPoolExecutor
12
13
def timer(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapper forwards all positional and keyword arguments to *func* and
    returns its result unchanged. After a successful call it prints the
    elapsed time in seconds next to the repr of *func*. If *func* raises,
    nothing is printed and the exception propagates to the caller.

    :param func: callable to be timed
    :return: wrapper that carries the name and docstring of *func*
    """
    # Local import keeps this block self-contained; the module-level import
    # section is left untouched.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        ret = func(*args, **kwargs)
        print(f"耗时:{func}", time.perf_counter() - start_time)
        return ret

    return wrapper
22
23
class OrderSpider(object):
    """Download a single page and extract text nodes from it.

    :param url: address to fetch; defaults to the page this script was
        written for
    :param timeout: seconds handed to ``requests.get`` so that a stalled
        server cannot block a worker indefinitely
    """

    DEFAULT_URL = "http://www.bewindoweb.com/dwg.php"

    def __init__(self, url=DEFAULT_URL, timeout=10):
        self.url = url
        self.timeout = timeout

    def request(self):
        """Fetch ``self.url`` and return the parsed result.

        :return: whatever :meth:`parse` returns for the response body when
            the status code is 200, otherwise ``None``
        """
        res = requests.get(self.url, timeout=self.timeout)
        if res.status_code != 200:
            return None
        return self.parse(res.text)

    def parse(self, html):
        """Run the extraction XPath over *html*.

        :param html: HTML document as text
        :return: result of the XPath query (text nodes under the
            ``card-dwg-hrefc`` blocks)
        """
        node = etree.HTML(html)
        return node.xpath("//div[@class='card-dwg-hrefc']/a/div/div[2]/text()")
37
38
@timer
def run(function, n):
    """Sequential fetch: call *function* *n* times, one after another.

    Only the value returned by the final call is printed; ``None`` is
    printed when *n* is 0.

    :param function: zero-argument callable to invoke
    :param n: number of invocations
    :return: None
    """
    last_result = None
    for _ in range(n):
        last_result = function()
    print(last_result)
51
52
def callback(future):
    """Done-callback that hands back the outcome of *future*.

    :param future: a finished future
    :return: the future's result; if the task raised, that exception is
        re-raised here
    """
    outcome = future.result()
    return outcome
60
61
@timer
def thread_run(function, n, max_workers=6):
    """Threaded fetch: run *function* *n* times on a thread pool.

    All calls are submitted first and only then is a result read, so the
    calls can overlap. The result of the last submitted call is printed;
    ``None`` is printed when *n* is 0.

    :param function: zero-argument callable to invoke
    :param n: number of invocations
    :param max_workers: size of the thread pool (defaults to 6)
    :return: None
    """
    # The ``with`` block shuts the pool down and waits for pending work
    # even when reading a result raises.
    with ThreadPoolExecutor(max_workers) as pool:
        futures = []
        for _ in range(n):
            future = pool.submit(function)
            future.add_done_callback(callback)
            futures.append(future)
        # Guard the empty case: with n == 0 there is no future to query.
        last_result = futures[-1].result() if futures else None
        print(last_result)
76
77
@timer
def process_run(function, n, max_workers=6):
    """Multi-process fetch: run *function* *n* times on a process pool.

    All calls are submitted first and only then is a result read, so the
    calls can overlap. The result of the last submitted call is printed;
    ``None`` is printed when *n* is 0.

    :param function: zero-argument callable to invoke; it is sent to worker
        processes, so it has to be picklable
    :param n: number of invocations
    :param max_workers: number of worker processes (defaults to 6)
    :return: None
    """
    # The ``with`` block shuts the pool down and waits for pending work
    # even when reading a result raises.
    with ProcessPoolExecutor(max_workers) as pool:
        futures = []
        for _ in range(n):
            future = pool.submit(function)
            future.add_done_callback(callback)
            futures.append(future)
        # Guard the empty case: with n == 0 there is no future to query.
        last_result = futures[-1].result() if futures else None
        print(last_result)
92
93
@timer
def gevent_run(function, n):
    """Coroutine fetch: run *function* in *n* gevent greenlets.

    Every greenlet is spawned up front, then all of them are joined. The
    ``value`` of the last spawned greenlet is printed; ``None`` is printed
    when *n* is 0.

    :param function: zero-argument callable to invoke
    :param n: number of greenlets to spawn
    :return: None
    """
    greenlets = [gevent.spawn(function) for _ in range(n)]
    gevent.joinall(greenlets)
    last_value = greenlets[-1].value if greenlets else None
    print(last_value)
110
111
if __name__ == '__main__':
    # Time the same request under each execution strategy, in this order:
    # sequential, threads, processes, greenlets.
    n = 100
    order_spider = OrderSpider()
    for runner in (run, thread_run, process_run, gevent_run):
        runner(order_spider.request, n)