多线程版本的网页爬取
首先
多线程和多进程都是一样的格式,这个是python本身语言的封装优势
也就是说python中,多线程和多进程的调用方法几乎是一模一样的
就是把调用线程的函数换成了调用进程的函数
其次就是
调用线程和进程的函数库是python自带的原生库
不用自己去pip下载!!!!!
from concurrent.futures import ThreadPoolExecutor
直接导入即可
最后就是
线程和进程的调用很简单
就是创建一个线程池(进程池,此处以线程池来举例)
with ThreadPoolExecutor(10) as x:
for i in range(1,50):
x.submit(down_1,html)
解释:
创建一个含有10个线程的线程池
循环提交任务,i 依次取 1 到 49(range 不包含上界 50)
调用函数,传入该函数的参数
以下为我们学校举例的完整代码
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
def base_url(i):
    """Fetch listing page *i* of the school news channel and return it parsed.

    Page 1 of the channel has no numeric suffix; pages 2 and up use the
    '9260_<i>.html' naming scheme.

    :param i: 1-based listing-page number.
    :return: lxml element tree of the fetched HTML document.
    """
    if i == 1:
        url = 'https://www.qjnu.edu.cn/channels/9260.html'
    else:
        url = 'https://www.qjnu.edu.cn/channels/9260_' + str(i) + '.html'
    # timeout so a stalled server cannot hang a pool thread forever
    # (the original call had no timeout and could block indefinitely).
    request = requests.get(url, timeout=10)
    request.encoding = 'utf-8'
    return etree.HTML(request.text)
def down_1(html):
    """Scan one parsed listing page for articles whose title contains the
    search keyword, append their URLs to an index file, and save each
    matching article's paragraph text to '<title>.txt'.

    :param html: lxml element tree of a listing page (from ``base_url``).

    NOTE: reads the module-global ``key`` set in the __main__ block.
    """
    name_list = html.xpath('//div[@class="media"]/h4/a/text()')
    url_list = html.xpath('//div[@class="media"]/h4/a/@href')
    # Titles and hrefs come from the same <a> nodes; zip keeps them paired.
    for name, article_url in zip(name_list, url_list):
        if key not in name:
            continue
        # Record the matching article's URL in the shared index file.
        with open('学校党员主题网址.txt', 'a', encoding='UTF-8') as fp:
            fp.write(article_url + '\n')
        # Fetch the article itself; timeout avoids hanging a pool thread.
        request = requests.get(article_url, timeout=10)
        request.encoding = 'utf-8'
        html2 = etree.HTML(request.text)
        tex_list = html2.xpath('//div[@class="field-item even"]//p/span/text()')
        # Join the paragraph fragments into readable text; the original
        # wrote str(tex_list), dumping a Python list repr into the file.
        with open(name + '.txt', 'w', encoding='UTF-8') as fp:
            fp.write('\n'.join(tex_list))
if __name__ == '__main__':
    page = int(input('请输入需要爬取的页数:'))
    key = str(input('请输入要查找的关键词:'))

    def _crawl(page_no):
        # Fetch AND parse inside the worker: the original called
        # base_url(i) serially in the main thread before submitting,
        # so the pool parallelized only the cheap parsing step while
        # all network I/O stayed sequential.
        down_1(base_url(page_no))

    # Pool of 10 threads; submit one task per listing page.
    with ThreadPoolExecutor(10) as x:
        for i in range(1, page + 1):
            x.submit(_crawl, i)
修改版
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
def base_url(i, url=None):
    """Fetch a page and return its parsed lxml tree.

    :param i: 1-based listing-page number, used only when *url* is falsy.
    :param url: explicit URL to fetch; when falsy (None, [] or '') the
        listing-page URL is derived from *i*. Accepting ``[]`` keeps the
        original ``base_url(i, [])`` calling convention working.
    :return: lxml element tree of the fetched HTML document.
    """
    # Truthiness check replaces the original mutable-list sentinel
    # comparison (url == []); None/[]/'' all mean "build the URL from i".
    if not url:
        if i == 1:
            url = 'https://www.qjnu.edu.cn/channels/9260.html'
        else:
            url = 'https://www.qjnu.edu.cn/channels/9260_' + str(i) + '.html'
    # timeout so a stalled server cannot hang a pool thread forever.
    request = requests.get(url, timeout=10)
    request.encoding = 'utf-8'
    return etree.HTML(request.text)
def down_1(html):
    """Scan one parsed listing page for articles whose title contains the
    search keyword, append their URLs to an index file, and save each
    matching article's paragraph text to '<title>.txt'.

    :param html: lxml element tree of a listing page (from ``base_url``).

    NOTE: reads the module-global ``key`` set in the __main__ block.
    """
    name_list = html.xpath('//div[@class="media"]/h4/a/text()')
    url_list = html.xpath('//div[@class="media"]/h4/a/@href')
    # Titles and hrefs come from the same <a> nodes; zip keeps them paired.
    for name, article_url in zip(name_list, url_list):
        if key not in name:
            continue
        # Record the matching article's URL in the shared index file.
        with open('学校党员主题网址.txt', 'a', encoding='UTF-8') as fp:
            fp.write(article_url + '\n')
        # Reuse base_url for the fetch; the page index is ignored when an
        # explicit (truthy) URL is supplied, so 0 is a harmless placeholder.
        html2 = base_url(0, article_url)
        tex_list = html2.xpath('//div[@class="field-item even"]//p/span/text()')
        # Join the paragraph fragments into readable text; the original
        # wrote str(tex_list), dumping a Python list repr into the file.
        with open(name + '.txt', 'w', encoding='UTF-8') as fp:
            fp.write('\n'.join(tex_list))
if __name__ == '__main__':
    page = int(input('请输入需要爬取的页数:'))
    key = str(input('请输入要查找的关键词:'))

    def _crawl(page_no):
        # [] tells base_url to derive the listing URL from the page number.
        # Fetching inside the worker lets the pool parallelize the network
        # I/O; the original fetched every page serially in the main thread
        # before submitting, so the threads gained almost nothing.
        down_1(base_url(page_no, []))

    # Pool of 10 threads; submit one task per listing page.
    with ThreadPoolExecutor(10) as pool:
        for i in range(1, page + 1):
            pool.submit(_crawl, i)

浙公网安备 33010602011771号