Web Scraping Basics - Concurrency
1. Using Coroutines
1.1 Running Multiple Tasks in a Coroutine Environment
import asyncio
async def work_1():
for _ in range(5):
print('work-1...')
"""
await:
1.可以获取IO任务执行成功之后的返回值
2.在这个任务执行之前我会堵塞这个程序
"""
await asyncio.sleep(1)
print(123)
async def work_2():
for _ in range(5):
print('work-2...')
await asyncio.sleep(1)
print(456)
loop = asyncio.get_event_loop()
coro_list = [work_1(), work_2()]
loop.run_until_complete(asyncio.wait(coro_list))
1.2 Running Coroutine Tasks on Newer Interpreter Versions
# For Python 3.10 and above
import asyncio
async def work_1():
for _ in range(5):
print('work-1...')
await asyncio.sleep(1)
async def work_2():
for _ in range(5):
print('work-2...')
await asyncio.sleep(1)
async def main():
tasks = [asyncio.create_task(work_1()), asyncio.create_task(work_2())]
await asyncio.wait(tasks)
asyncio.run(main())
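On Python 3.11 and above, asyncio.TaskGroup is another way to run several coroutine tasks: it waits for every task created inside its context. A minimal sketch, assuming a 3.11+ interpreter:
# Python 3.11+ only
import asyncio
async def work(name):
    for _ in range(5):
        print(f'{name}...')
        await asyncio.sleep(1)
async def main():
    # The task group implicitly waits for both tasks before leaving the with-block.
    async with asyncio.TaskGroup() as tg:
        tg.create_task(work('work-1'))
        tg.create_task(work('work-2'))
asyncio.run(main())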
1.3 Getting Return Values from Coroutine Tasks
import asyncio
from asyncio import as_completed
async def work_1():
await asyncio.sleep(3)
return 'hello world - 1'
async def work_2():
await asyncio.sleep(10)
return 'hello world - 2'
if __name__ == '__main__':
loop = asyncio.get_event_loop()
coro_list = [work_1(), work_2()]
    # Way 1: asyncio.wait returns the set of finished tasks; read each task's result()
# done, pending = loop.run_until_complete(asyncio.wait(coro_list))
# for task in done:
# print(task.result())
    # Way 2: asyncio.gather returns the results as a list (the call takes as long as the slowest task)
# result_list = loop.run_until_complete(asyncio.gather(*coro_list))
# print(result_list)
    # Way 3: asyncio.as_completed yields tasks in the order they finish
for task in as_completed(coro_list):
res = loop.run_until_complete(task)
print(res)
1.4 Running a Scraper in a Coroutine Environment
import asyncio
import requests
from functools import partial
from bs4 import BeautifulSoup
url = 'https://movie.douban.com/top250?start={}&filter='
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}
async def get_movie_info(page_number):
    # loop = asyncio.get_running_loop()  # uncomment when running via asyncio.run(main()); otherwise the module-level loop below is used
response = await loop.run_in_executor(None, partial(requests.get, url.format(page_number * 25), headers=headers))
soup = BeautifulSoup(response.text, 'lxml')
div_list = soup.find_all('div', class_='hd')
for title in div_list:
print(title.get_text())
async def main():
task_list = [asyncio.create_task(get_movie_info(i)) for i in range(10)]
await asyncio.wait(task_list)
if __name__ == '__main__':
    # This branch is meant for Python 3.9 and below
loop = asyncio.get_event_loop()
coro_list = [get_movie_info(i) for i in range(10)]
loop.run_until_complete(asyncio.wait(coro_list))
    # On interpreters 3.10 and above, run it this way instead
# asyncio.run(main())
    # How to create an event loop object:
    # loop = asyncio.new_event_loop()    # always creates a brand-new event loop, whether or not one already exists
    # loop = asyncio.get_event_loop()    # returns the current event loop if there is one, otherwise creates a new one
    # loop = asyncio.get_running_loop()  # returns the running event loop, and raises an error if there is none
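The three helpers above differ only in when a loop gets created; a minimal sketch of explicitly creating a loop, installing it as the current one, and driving a coroutine with it:
import asyncio
async def demo():
    await asyncio.sleep(1)
    return 'done'
# Create a fresh loop, make it the current loop, run a coroutine, then clean up.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    print(loop.run_until_complete(demo()))
finally:
    loop.close()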
1.5 Basic Usage of aiohttp
The requests library itself does not support async, so using it under asyncio means pushing the calls into a thread pool, which is somewhat clumsy. To avoid that, we use aiohttp, which natively supports asynchronous operation, for the scraping tasks.
aiohttp is an asynchronous networking library that can act as both an HTTP client and an HTTP server; for scraping we only use its HTTP client side.
Official site: https://docs.aiohttp.org/en/stable/
Client documentation: https://docs.aiohttp.org/en/stable/client.html#aiohttp-client
# pip install aiohttp
import asyncio
import aiohttp
url = 'https://www.baidu.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}
# Fetch the Baidu homepage inside a coroutine and read the response body
async def get_baidu_page():
    # 1. Create a client session
    # session = aiohttp.ClientSession()
    # 2. Send the request
    # response = await session.get(url, headers=headers)
    # 3. Print the response body
    # print(await response.text())
    # 4. Release resources
    # await session.close()
    # response.close()
    # The same logic written with async context managers
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers) as response:
            # Python 3.10 and below: print here and drive this coroutine with run_until_complete
            print(await response.text())
            # Python 3.10 and above: return the text and print it in main()
            # return await response.text()
async def main():
task = asyncio.create_task(get_baidu_page())
result = await task
print(result)
if __name__ == '__main__':
    # Python 3.10 and below
loop = asyncio.get_event_loop()
loop.run_until_complete(get_baidu_page())
    # Python 3.10 and above
# asyncio.run(main())
1.6 Getting Return Values from Coroutine Scraper Tasks
import time
import asyncio
import aiohttp
url = 'https://www.baidu.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}
async def get_baidu_page():
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers) as response:
await asyncio.sleep(3)
return await response.text()
# Way 1: await the coroutines one after another
# async def main():
#     result_1 = await get_baidu_page()  # this await blocks until the request finishes
#     result_2 = await get_baidu_page()
#     print(result_1)
#     print(result_2)
# Way 2: asyncio.wait returns the set of finished tasks
# async def main():
#     task = [asyncio.create_task(get_baidu_page()) for _ in range(2)]
#     done, pending = await asyncio.wait(task)
#     for temp in done:
#         print(temp.result())
# Way 3: asyncio.gather collects the results in submission order
# async def main():
#     task = [asyncio.create_task(get_baidu_page()) for _ in range(2)]
#     result_list = await asyncio.gather(*task)  # gather takes the awaitables as separate positional arguments, hence the unpacking
#     print(result_list)
# Way 4: attach a done-callback to each task
def callback(task):
print(task.result())
async def main():
task_list = [asyncio.create_task(get_baidu_page()) for _ in range(2)]
for task in task_list:
task.add_done_callback(callback)
await task
if __name__ == '__main__':
start = time.time()
asyncio.run(main())
end = time.time()
print(f'总耗时:{end - start}')
1.7 Scraping Douban Movie Info with Coroutines
import asyncio
import aiohttp
from bs4 import BeautifulSoup
url = 'https://movie.douban.com/top250?start={}&filter='
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}
async def get_movie_info(page):
async with aiohttp.ClientSession() as session:
async with session.get(url=url.format(page * 25), headers=headers) as response:
soup = BeautifulSoup(await response.text(), 'lxml')
div_list = soup.find_all('div', class_='hd')
result_list = list()
for title in div_list:
result_list.append(title.get_text())
return result_list
async def main():
task_list = [asyncio.create_task(get_movie_info(page_number)) for page_number in range(10)]
result_list = await asyncio.gather(*task_list)
for temp in result_list:
print(temp, str(len(temp)) + '个')
if __name__ == '__main__':
asyncio.run(main())
1.8 Basic Usage of aiomysql
With the async/await keywords added in Python 3, each blocking operation gets an asynchronous counterpart: aiohttp replaces requests for network requests, and motor replaces the synchronous pymongo library for working with MongoDB. Likewise, where a synchronous Python program would use PyMySQL to talk to MySQL, an asynchronous one uses aiomysql.
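motor is mentioned here but only used later (section 1.10) without introduction, so here is a minimal sketch of an asynchronous insert with it; the host, port, database and collection names are placeholders:
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient  # pip install motor
async def insert_demo():
    # Placeholder connection details and collection name.
    collection = AsyncIOMotorClient('localhost', 27017)['py_spider']['motor_demo']
    result = await collection.insert_one({'name': 'demo'})
    print(result.inserted_id)
asyncio.run(insert_demo())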
# pip install aiomysql
import asyncio
import aiomysql
async def get_sql_result():
    # 1. Connect to the database
    # db = await aiomysql.connect(host='localhost', port=3306, user='root', password='root', db='py_spider')
    # 2. Get a cursor object
    # cursor = await db.cursor()
    # 3. Run the query
    # sql = 'select * from tx_work;'
    # await cursor.execute(sql)
    # 4. Print the query result
    # result = await cursor.fetchall()
    # print(result)
    # 5. Release resources
    # await cursor.close()
    # db.close()
async with aiomysql.connect(host='localhost', port=3306, user='root', password='root', db='py_spider') as db:
async with db.cursor() as cursor:
sql = 'select * from tx_work;'
await cursor.execute(sql)
result = await cursor.fetchall()
print(result)
asyncio.run(get_sql_result())
1.9 Coroutine Case Study 1: Autohome (che168) - MySQL
Use asyncio to scrape car parameter data from Autohome and save it into a MySQL database.
URL: https://www.che168.com/china/a0_0msdgscncgpi1ltocsp7exf4x0/?pvareaid=102179#currengpostion
Approach:
- The listing page is static; when paging, the sp1 segment of the URL becomes sp2, so the listing data can be extracted with xpath.
- The detail page reached from the listing shows the car's configuration, which is loaded dynamically, so the underlying api can be found by capturing network traffic.
- The captured api URL carries a query-string parameter: specid.
- Back on the listing page, each li element carries the car's id value (specid); collect these ids and splice them into the api URL.
- Request the assembled api URLs to obtain the data.
Caveats:
- The data returned by the api is not plain JSON (it is wrapped in a callback), so it has to be cleaned up first, in one of two ways (see the sketch below):
  - slice the returned string and keep only the JSON part, or
  - remove the callback=configTitle query-string parameter from the api URL.
- The page encoding on Autohome changes randomly, so the third-party chardet package is used to detect the encoding at runtime; when a page comes back as UTF-8-SIG, the specid data is missing.
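A minimal sketch of the first option, stripping a hypothetical configTitle(...) wrapper to recover the JSON payload (the case-study code below takes the second route and calls response.json() directly):
import json
def strip_jsonp(text):
    # For a response like 'configTitle({...});', keep only the {...} part.
    start = text.find('(') + 1
    end = text.rfind(')')
    return json.loads(text[start:end])
# Example with a made-up payload:
print(strip_jsonp('configTitle({"result": {"paramtypeitems": []}});'))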
import redis
import chardet  # used to detect the encoding of the returned bytes (pip install chardet)
import hashlib
import asyncio
import aiohttp
import aiomysql
from lxml import etree
class CarSpider:
def __init__(self):
self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exf4x0/?pvareaid=102179#currengpostion'
self.api_url = 'https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}'
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
self.redis_client = redis.Redis()
    # Extract the car specid values from a listing page
async def get_car_id(self, page, session, pool):
async with session.get(self.url.format(page), headers=self.headers) as response:
            # Raw bytes returned for the current page
content = await response.read()
            # Detect the character encoding of the returned bytes
encoding = chardet.detect(content)['encoding']
if encoding == 'GB2312' or encoding == 'ISO-8859-1':
result = content.decode('gbk')
else:
print('请求频繁...')
result = content.decode(encoding)
tree = etree.HTML(result)
id_list = tree.xpath("//ul[@class='viewlist_ul']/li/@specid")
if id_list:
print('id信息:', id_list)
tasks = [asyncio.create_task(self.get_car_info(spec_id, session, pool)) for spec_id in id_list]
await asyncio.wait(tasks)
    # Fetch the car's parameter data from the api
async def get_car_info(self, spec_id, session, pool):
async with session.get(self.api_url.format(spec_id), headers=self.headers) as response:
result = await response.json()
if result['result'].get('paramtypeitems'):
item = dict()
item['name'] = result['result']['paramtypeitems'][0]['paramitems'][0]['value']
item['price'] = result['result']['paramtypeitems'][0]['paramitems'][1]['value']
item['brand'] = result['result']['paramtypeitems'][0]['paramitems'][2]['value']
item['altitude'] = result['result']['paramtypeitems'][1]['paramitems'][2]['value']
item['breadth'] = result['result']['paramtypeitems'][1]['paramitems'][1]['value']
item['length'] = result['result']['paramtypeitems'][1]['paramitems'][0]['value']
print(item)
await self.save_car_info(item, pool)
else:
print('暂无数据...')
    # Deduplication helper
@staticmethod
def get_md5(dict_item):
md5 = hashlib.md5()
md5.update(str(dict_item).encode())
return md5.hexdigest()
    # Persist the data
async def save_car_info(self, item, pool):
"""pool: aiomysql支持创建数据库连接池,减少数据库连接的创建和销毁的次数,提高性能"""
async with pool.acquire() as conn:
async with conn.cursor() as cursor:
value_md5 = self.get_md5(item)
redis_result = self.redis_client.sadd('car:filter', value_md5)
if redis_result:
sql = """
insert into car_info values (%s, %s, %s, %s, %s, %s, %s);
"""
try:
await cursor.execute(sql, (
0,
item['name'],
item['price'],
item['brand'],
item['altitude'],
item['breadth'],
item['length'],
))
await conn.commit()
print('数据保存成功:', item)
except Exception as e:
print('数据保存失败:', e)
await conn.rollback() # 数据回滚
else:
print('数据已存在...')
async def main(self):
async with aiomysql.create_pool(user='root', password='123456', db='py_spider') as pool:
async with pool.acquire() as conn:
async with conn.cursor() as cursor:
create_table_sql = """
create table car_info(
id int primary key auto_increment,
name varchar(100),
price varchar(100),
brand varchar(100),
altitude varchar(100),
breadth varchar(100),
length varchar(100)
);
"""
                    # Check separately whether the table already exists
check_table_sql = "show tables like 'car_info';"
result = await cursor.execute(check_table_sql)
if not result:
await cursor.execute(create_table_sql)
async with aiohttp.ClientSession() as session:
            # A semaphore could be used here to cap how many tasks run at once
            # semaphore = asyncio.Semaphore(5)
tasks = [asyncio.create_task(self.get_car_id(page, session, pool)) for page in range(1, 11)]
await asyncio.wait(tasks)
if __name__ == '__main__':
car_spider = CarSpider()
asyncio.run(car_spider.main())
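The commented-out semaphore above only hints at limiting concurrency; a minimal sketch of how asyncio.Semaphore could wrap the page requests (the helper function and the limit of 5 are assumptions, not part of the original class):
import asyncio
# Hypothetical helper: allow at most 5 listing pages to be fetched concurrently.
semaphore = asyncio.Semaphore(5)
async def limited_get_car_id(spider, page, session, pool):
    async with semaphore:  # acquired on entry, released on exit
        await spider.get_car_id(page, session, pool)
Inside main(), each task would then be created as asyncio.create_task(limited_get_car_id(self, page, session, pool)).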
1.10 Coroutine Case Study 2: Autohome (che168) - MongoDB
import redis
import chardet
import hashlib
import asyncio
import aiohttp
from lxml import etree
from motor.motor_asyncio import AsyncIOMotorClient # pip install motor
class CarSpider:
def __init__(self):
self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exf4x0/?pvareaid=102179#currengpostion'
self.api_url = 'https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}'
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
self.redis_client = redis.Redis()
self.mongo_client = AsyncIOMotorClient('localhost', 27017)['py_spider']['car_info']
    # Extract the car specid values from a listing page
async def get_car_id(self, page, session):
async with session.get(self.url.format(page), headers=self.headers) as response:
            # Raw bytes returned for the current page
content = await response.read()
            # Detect the character encoding of the returned bytes
encoding = chardet.detect(content)['encoding']
if encoding == 'GB2312' or encoding == 'ISO-8859-1':
result = content.decode('gbk')
else:
print('请求频繁...')
result = content.decode(encoding)
tree = etree.HTML(result)
id_list = tree.xpath("//ul[@class='viewlist_ul']/li/@specid")
if id_list:
print('id信息:', id_list)
tasks = [asyncio.create_task(self.get_car_info(spec_id, session)) for spec_id in id_list]
await asyncio.wait(tasks)
    # Fetch the car's parameter data from the api
async def get_car_info(self, spec_id, session):
async with session.get(self.api_url.format(spec_id), headers=self.headers) as response:
result = await response.json()
if result['result'].get('paramtypeitems'):
item = dict()
item['name'] = result['result']['paramtypeitems'][0]['paramitems'][0]['value']
item['price'] = result['result']['paramtypeitems'][0]['paramitems'][1]['value']
item['brand'] = result['result']['paramtypeitems'][0]['paramitems'][2]['value']
item['altitude'] = result['result']['paramtypeitems'][1]['paramitems'][2]['value']
item['breadth'] = result['result']['paramtypeitems'][1]['paramitems'][1]['value']
item['length'] = result['result']['paramtypeitems'][1]['paramitems'][0]['value']
print(item)
await self.save_car_info(item)
else:
print('暂无数据...')
    # Deduplication helper
@staticmethod
def get_md5(dict_item):
md5 = hashlib.md5()
md5.update(str(dict_item).encode())
return md5.hexdigest()
    # Persist the data
async def save_car_info(self, item):
md5_hash = self.get_md5(item)
redis_result = self.redis_client.sadd('car:filter', md5_hash)
if redis_result:
try:
await self.mongo_client.insert_one(item)
print('数据保存成功:', item)
except Exception as e:
print('数据保存失败:', e)
else:
print('数据已存在...')
async def main(self):
async with aiohttp.ClientSession() as session:
tasks = [asyncio.create_task(self.get_car_id(page, session)) for page in range(1, 11)]
await asyncio.wait(tasks)
if __name__ == '__main__':
car_spider = CarSpider()
loop = asyncio.get_event_loop()
loop.run_until_complete(car_spider.main())
2. Using Threads
The sections above built concurrent scrapers with asyncio, but in practice the most common approach is still multithreading.
Thread basics recap
import time
import threading
def work_1():
for _ in range(5):
print('work_1')
time.sleep(1)
def work_2():
for _ in range(5):
print('work_2')
time.sleep(1)
t1 = threading.Thread(target=work_1)
t2 = threading.Thread(target=work_2)
t2.daemon = True
t1.start()
# t1.join()  # block the main thread: wait for t1 to finish before continuing on to start t2
t2.start()
"""
1.守护线程: 主线程任务执行完毕之后不会等待未完成的子线程任务直接退出
2.线程堵塞: 主线程必须等待子线程任务执行完毕之后才能继续向下执行
"""
2.1 Scraping Douban Movie Info with Threads
import requests
import threading
from lxml import etree
url = 'https://movie.douban.com/top250?start={}&filter='
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
def get_movie_info(page_number):
response = requests.get(url.format(page_number * 25), headers=headers).text
tree = etree.HTML(response)
result = tree.xpath("//div[@class='hd']/a/span[1]/text()")
print(result)
if __name__ == '__main__':
thread_list = [threading.Thread(target=get_movie_info, args=(page,)) for page in range(10)]
for thread in thread_list:
thread.start()
2.2 Scraping Douban Movie Info with a Thread Pool
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, as_completed
url = 'https://movie.douban.com/top250?start={}&filter='
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
def get_movie_info(page_number):
response = requests.get(url.format(page_number * 25), headers=headers).text
tree = etree.HTML(response)
result = tree.xpath("//div[@class='hd']/a/span[1]/text()")
return result
if __name__ == '__main__':
with ThreadPoolExecutor(max_workers=5) as pool:
futures = [pool.submit(get_movie_info, page) for page in range(10)]
        # for future in futures:
        #     print(future.result())  # result() blocks until that particular task has finished
        for future in as_completed(futures):  # as_completed() yields the futures as they finish, i.e. in completion order
            print(future.result())
3. Using Processes
Because of Python's GIL, a single process cannot fully exploit multiple CPU cores, so to improve throughput we also use processes.
Process basics recap
from multiprocessing import Process
def func(name):
    print('worker:', name)
# Create a process object
p = Process(target=func, args=('demo',))
# Mark it as a daemon process
p.daemon = True
# Start the process
p.start()
Queues across processes
The ordinary queue module cannot be shared between processes; instead you use the JoinableQueue class provided by multiprocessing, whose API works the same way as the queue used with threads.
Code example
The Tencent Careers example below shows how to use the JoinableQueue module across processes; a small illustration of the pattern comes first.
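A minimal sketch, assuming one producer and one daemon consumer, of the put/get/task_done/join pattern:
from multiprocessing import Process, JoinableQueue
def producer(queue):
    for i in range(5):
        queue.put(i)
def consumer(queue):
    while True:
        item = queue.get()
        print('consumed:', item)
        queue.task_done()  # decrement the queue's unfinished-task counter
if __name__ == '__main__':
    q = JoinableQueue()
    p_producer = Process(target=producer, args=(q,))
    p_consumer = Process(target=consumer, args=(q,))
    p_consumer.daemon = True  # the consumer loops forever, so let it exit with the main process
    p_producer.start()
    p_consumer.start()
    p_producer.join()  # wait until everything has been produced
    q.join()           # wait until every item has been marked task_done()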
import pymongo
import requests
import jsonpath
from multiprocessing import Process, JoinableQueue as Queue
url = 'https://careers.tencent.com/tencentcareer/api/post/Query'
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
mongo_client = pymongo.MongoClient()
db = mongo_client['py_spider']['process_tx_work']
def get_work_info(page_num, queue):
params = {
"timestamp": "1741526713421",
"countryId": "",
"cityId": "",
"bgIds": "",
"productId": "",
"categoryId": "",
"parentCategoryId": "",
"attrId": "",
"keyword": "python",
"pageIndex": page_num,
"pageSize": "10",
"language": "zh-cn",
"area": "cn"
}
response = requests.get(url, params=params, headers=headers).json()
    # Some pages do not contain this JSON structure, which raises an exception below
try:
for info in response['Data']['Posts']:
work_info_dict = dict()
work_info_dict['recruit_post_name'] = jsonpath.jsonpath(info, '$..RecruitPostName')[0]
work_info_dict['country_name'] = jsonpath.jsonpath(info, '$..CountryName')[0]
work_info_dict['location_name'] = jsonpath.jsonpath(info, '$..LocationName')[0]
work_info_dict['category_name'] = jsonpath.jsonpath(info, '$..CategoryName')[0]
work_info_dict['responsibility'] = jsonpath.jsonpath(info, '$..Responsibility')[0]
work_info_dict['last_update_time'] = jsonpath.jsonpath(info, '$..LastUpdateTime')[0]
queue.put(work_info_dict)
except TypeError:
print('数据不存在:', params.get('pageIndex'))
def save_work_info(queue):
while True:
dict_info = queue.get()
db.insert_one(dict_info)
print('数据插入成功:', dict_info)
        # task_done() decrements the queue's unfinished-task counter; queue.join() returns once it reaches zero
queue.task_done()
if __name__ == '__main__':
dict_info_queue = Queue()
process_list = list()
for page in range(1, 39):
p_get_info = Process(target=get_work_info, args=(page, dict_info_queue))
process_list.append(p_get_info)
p_save_work = Process(target=save_work_info, args=(dict_info_queue,))
for p in process_list:
p.start()
p_save_work.daemon = True
p_save_work.start()
    # The main process must wait for the p_get_info tasks to finish before moving on
for p in process_list:
        # release the main process once each p_get_info worker has finished
p.join()
    # release the main process again once the queue's task counter drops to zero
dict_info_queue.join()
    # Scraping finished
print('任务结束...')
mongo_client.close()
print('释放连接对象...')
4. Comprehensive Case Studies
4.1 Case Study 1: Queue + Multithreading for the iQIYI Library
import pymongo
import requests
import threading
from queue import Queue
class AiQiYi:
def __init__(self):
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
self.mongo_client = pymongo.MongoClient()
self.db = self.mongo_client['py_spider']['thread_aqy_movie']
self.api_url = 'https://pcw-api.iqiyi.com/search/recommend/list?channel_id=2&data_type=1&mode=11&page_id={}&ret_num=48&session=85dd981b69cead4b60f6d980438a5664&three_category_id=15;must'
        self.url_queue = Queue()  # queue of URLs to fetch
        self.json_queue = Queue()  # queue of JSON responses returned by the api
        self.content_dict_queue = Queue()  # queue of parsed item dicts
def get_url(self):
for page in range(1, 11):
self.url_queue.put(self.api_url.format(page))
def get_api_url(self):
while True:
url = self.url_queue.get()
response = requests.get(url, headers=self.headers).json()
self.json_queue.put(response)
self.url_queue.task_done()
def parse_movie_info(self):
while True:
json_info = self.json_queue.get()
for movie in json_info['data']['list']:
item = dict()
item['title'] = movie['title']
item['playUrl'] = movie['playUrl']
item['description'] = movie['description']
self.content_dict_queue.put(item)
self.json_queue.task_done()
def save_movie_info(self):
while True:
item = self.content_dict_queue.get()
self.db.insert_one(item)
print('数据保存成功:', item)
self.content_dict_queue.task_done()
def main(self):
thread_list = list()
self.get_url()
for _ in range(3):
t_get_json = threading.Thread(target=self.get_api_url)
thread_list.append(t_get_json)
for _ in range(3):
t_parse = threading.Thread(target=self.parse_movie_info)
thread_list.append(t_parse)
t_save_info = threading.Thread(target=self.save_movie_info)
thread_list.append(t_save_info)
for thread_obj in thread_list:
thread_obj.daemon = True
thread_obj.start()
        # Wait for all queued work to be processed
for q in [self.url_queue, self.json_queue, self.content_dict_queue]:
q.join()
        # Release resources
self.mongo_client.close()
print('爬虫结束...')
if __name__ == '__main__':
aqy = AiQiYi()
aqy.main()
4.2 Case Study 2: Thread Pool for Baidu Job Listings
import pymysql
import requests
from dbutils.pooled_db import PooledDB
from concurrent.futures import ThreadPoolExecutor, as_completed
class BaiDuWorkSpider:
def __init__(self):
self.pool = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=1,
maxcached=2,
maxshared=3,
            blocking=True,  # block and wait when the pool is exhausted
host='localhost',
port=3306,
user='root',
password='123456',
database='py_spider',
charset='utf8'
)
self.api_url = 'https://talent.baidu.com/httservice/getPostListNew'
self.headers = {
"Accept": "application/json, text/plain, */*",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
"Origin": "https://talent.baidu.com",
"Pragma": "no-cache",
"Referer": "https://talent.baidu.com/jobs/social-list?search=",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
"sec-ch-ua": "\"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"macOS\""
}
def get_work_info(self, page_num):
post_data = {
'recruitType': 'SOCIAL',
'pageSize': 10,
'keyWord': '',
'curPage': page_num,
'projectType': '',
}
cookies = {
"BIDUPSID": "76C653EDEB40C10F7F060DF3835854E3",
"PSTM": "1742473418",
"BAIDUID": "AD2B22AA47105A367D8F718EF45D4DB7:FG=1",
"H_WISE_SIDS": "62325_62969_63018_63056",
"BAIDUID_BFESS": "AD2B22AA47105A367D8F718EF45D4DB7:FG=1",
"ZFY": "wyT2CVZBsmiD2N7HpYAmmlD2cuhAfyL:Bl:AO7H4Z:BHN0:C",
"H_PS_PSSID": "61027_62325_62484_62969_63056_63140_63188_63195_63211_63241_63248_63253_63266_63074",
"Hm_lvt_50e85ccdd6c1e538eb1290bc92327926": "1747748555",
"HMACCOUNT": "B332573DD7B815D7",
"Hm_lpvt_50e85ccdd6c1e538eb1290bc92327926": "1747748942",
"RT": "\"z=1&dm=baidu.com&si=abb7eb1e-1e10-4864-89ec-1bf5c2cae096&ss=mawkd7ob&sl=4&tt=41q9&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=8gcx\""
}
response = requests.post(self.api_url, headers=self.headers, data=post_data, cookies=cookies).json()
return response
def parse_work_info(self, response):
work_list = response['data']['list']
for work_info in work_list:
education = work_info['education'] if work_info['education'] else '空'
name = work_info['name']
service_condition = work_info['serviceCondition']
self.save_work_info(0, name, education, service_condition)
def save_work_info(self, *args):
"""
args: id, education, name, service_condition
"""
with self.pool.connection() as db:
with db.cursor() as cursor:
sql = "insert into baidu_work_thread_pool values (%s, %s, %s, %s);"
try:
cursor.execute(sql, args)
db.commit()
print(f'保存数据成功:{args}')
except Exception as e:
db.rollback()
print(f'保存数据失败:{e}')
def create_table(self):
with self.pool.connection() as db:
with db.cursor() as cursor:
sql = """
create table if not exists baidu_work_thread_pool(
id int primary key auto_increment,
name varchar(100),
education varchar(200),
service_condition text
);
"""
try:
cursor.execute(sql)
print('创建表成功')
except Exception as e:
print(f'创建表失败:{e}')
def main(self):
self.create_table()
with ThreadPoolExecutor(max_workers=10) as pool:
futures = [pool.submit(self.get_work_info, page) for page in range(1, 51)]
for future in as_completed(futures):
pool.submit(self.parse_work_info, future.result())
if __name__ == '__main__':
baidu_work_spider = BaiDuWorkSpider()
baidu_work_spider.main()
4.3 Case Study 3: Queue + Multiprocessing for the Mango TV Library
import redis
import pymongo
import hashlib
import requests
from multiprocessing import Process, JoinableQueue as Queue
class MovieInfo:
"""
在进程环境中, 数据库连接对象无法在构造函数中创建, 需要将连接对象创建成类属性而不是实例属性
"""
redis_client = redis.Redis(host='localhost', port=6379, db=0)
mongo_client = pymongo.MongoClient(host='localhost', port=27017)
mongo_db = mongo_client['py_spider']['process_mongo_movie']
def __init__(self):
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
self.api_url = 'https://pianku.api.mgtv.com/rider/list/pcweb/v3'
        self.params_queue = Queue()  # queue of query-string parameter dicts
self.json_queue = Queue()
self.content_queue = Queue()
def put_params(self):
for page in range(1, 6):
params_dict = {
"allowedRC": "1",
"platform": "pcweb",
"channelId": "2",
"pn": page,
"pc": "80",
"hudong": "1",
"_support": "10000000",
"kind": "19",
"area": "10",
"year": "all",
"chargeInfo": "a1",
"sort": "c2",
"feature": "all"
}
self.params_queue.put(params_dict)
def get_movie_info(self):
while True:
params_dict = self.params_queue.get()
response = requests.get(url=self.api_url, params=params_dict, headers=self.headers).json()
self.json_queue.put(response)
self.params_queue.task_done()
def parse_movie_info(self):
while True:
response = self.json_queue.get()
movie_list = response['data']['hitDocs']
for movie in movie_list:
item = dict()
item['title'] = movie['title']
item['subtitle'] = movie['subtitle']
item['story'] = movie['story']
self.content_queue.put(item)
self.json_queue.task_done()
@staticmethod
def get_md5(dict_item):
md5_hash = hashlib.md5(str(dict_item).encode('utf-8')).hexdigest()
return md5_hash
def save_movie_info(self):
while True:
item = self.content_queue.get()
md5_hash = self.get_md5(item)
redis_result = self.redis_client.sadd('process_mg_movie:filter', md5_hash)
if redis_result:
try:
self.mongo_db.insert_one(item)
print('数据插入成功:', item)
except Exception as e:
print('数据插入失败:', e)
else:
print('数据已存在...')
self.content_queue.task_done()
def close_spider(self):
self.mongo_client.close()
self.redis_client.close()
print('爬虫任务结束...')
def main(self):
self.put_params()
process_list = list()
for _ in range(3):
p_get_movie_info = Process(target=self.get_movie_info)
process_list.append(p_get_movie_info)
p_parse_movie_info = Process(target=self.parse_movie_info)
process_list.append(p_parse_movie_info)
p_save_movie_info = Process(target=self.save_movie_info)
process_list.append(p_save_movie_info)
for process_obj in process_list:
process_obj.daemon = True
process_obj.start()
for q in [self.params_queue, self.json_queue, self.content_queue]:
q.join()
self.close_spider()
if __name__ == '__main__':
movie_info = MovieInfo()
movie_info.main()
4.4 Case Study 4: Downloading Hero Skins with Coroutines
import os
import asyncio
import aiohttp
import aiofile # pip install aiofile
class HeroSkin:
def __init__(self):
self.json_url = 'https://pvp.qq.com/web201605/js/herolist.json'
self.skin_url = 'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
async def get_image_content(self, session, e_name, c_name):
for skin_id in range(1, 31):
async with session.get(self.skin_url.format(e_name, e_name, skin_id), headers=self.headers) as response:
if response.status == 200:
content = await response.read()
async with aiofile.async_open('./images/' + c_name + '-' + str(skin_id) + '.jpg', 'wb') as f:
await f.write(content)
print('下载成功:', c_name + '-' + str(skin_id) + '.jpg')
else:
break
async def main(self):
task_list = list()
async with aiohttp.ClientSession() as session:
async with session.get(self.json_url, headers=self.headers) as response:
result = await response.json(content_type=None)
for item in result:
e_name = item['ename']
c_name = item['cname']
                    # On Python below 3.7, replace create_task with asyncio.ensure_future
task_obj = asyncio.create_task(self.get_image_content(session, e_name, c_name))
task_list.append(task_obj)
await asyncio.wait(task_list)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
if not os.path.exists('./images'):
os.mkdir('./images')
hero_skin = HeroSkin()
loop.run_until_complete(hero_skin.main())