web爬虫02-多线程爬虫
多线程爬虫
原理
利用 CPU 计算与 IO 等待可以同时进行的原理,让 CPU 不必干巴巴地等待 IO 完成
#网站:还是豆瓣250 https://movie.douban.com/top250
import requests
from lxml import etree
import time
from threading import Thread
import codecs
# Timing decorator
def timer(func):
    """Wrap ``func`` so that each call is timed and its duration reported.

    The wrapper forwards all positional and keyword arguments to ``func``,
    prints the elapsed time, and returns that elapsed time in seconds.
    The return value of ``func`` itself is discarded.
    """
    # Imported locally so this block does not depend on a file-level import.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's __name__ and __doc__
    def inner(*args, **kw):
        # perf_counter is monotonic, so the measurement cannot be skewed
        # by a system clock adjustment while ``func`` is running.
        start = time.perf_counter()
        func(*args, **kw)
        elapsed = time.perf_counter() - start
        print("-------一共花费时间:{}秒".format(elapsed))
        return elapsed

    return inner
def get_content(url):
    """Download ``url`` and pass the response text to ``deal_content``.

    The request is sent with the module-level ``headers`` dict.
    """
    # Without a timeout a stalled connection would block this call, and
    # any thread join waiting on it, indefinitely.
    res = requests.get(url, headers=headers, timeout=10)
    deal_content(res.text)
def deal_content(content):
    """Parse one HTML page and write the extracted text fields to ``f``.

    For every matched "info" div, three text values are collected in
    order: the first span under the link, the first paragraph text
    (stripped of surrounding whitespace), and the fourth span of the
    "star" div. All values are joined with newlines and written to the
    module-level file object ``f``, followed by one trailing newline.
    """
    tree = etree.HTML(content)
    entries = tree.xpath('//div[@id="wrapper"]//ol//li/div[@class="item"]/div[@class="info"]')
    fields = []
    for entry in entries:
        # All three lookups run before anything is appended, so a missing
        # node raises before the list is touched for this entry.
        fields.extend((
            entry.xpath('./div/a/span[1]/text()')[0],
            entry.xpath('./div/p/text()')[0].strip(),
            entry.xpath('./div[2]/div[@class="star"]/span[4]/text()')[0],
        ))
    joined = '\n'.join(fields)
    f.write(u'{movies}\n'.format(movies=joined))
@timer
def multi_thread(urls):
    """Run ``get_content`` for every URL in ``urls``, one thread per URL.

    All threads are started before any is joined, and the function body
    finishes only after every thread has completed. The function is
    wrapped by the ``timer`` decorator.
    """
    workers = [Thread(target=get_content, args=(link,)) for link in urls]
    for worker in workers:
        worker.start()
    # Join in a separate pass so that all downloads overlap.
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    # Request headers read by get_content as a module-level name.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    }
    # Ten page URLs with start offsets 0, 25, ..., 225.
    urls = [
        'https://movie.douban.com/top250?start={}&filter='.format(offset)
        for offset in range(0, 250, 25)
    ]
    # ``f`` must stay a module-level name: deal_content writes to it.
    with codecs.open('moviess', 'wb', encoding='utf-8') as f:
        multi_thread(urls)
花费时间

单线程为:3.102s
多线程为:0.348s
多线程大约是单线程速度的 9 倍(3.102 ÷ 0.348 ≈ 8.9)

浙公网安备 33010602011771号