1 import redis
2 import chardet
3 import hashlib
4 import asyncio
5 import aiohttp
6 from lxml import etree
7 from fake_useragent import UserAgent
8 from motor.motor_asyncio import AsyncIOMotorClient
9
10
class CarSpider:
    """Asynchronous crawler for che168.com car listings.

    Listing pages are fetched to collect the ``specid`` of every car shown,
    each id is resolved through the parameter API, and the resulting records
    are de-duplicated with a Redis set before being inserted into MongoDB.
    """

    # Shared by every instance: random User-Agent source, the Redis connection
    # backing the de-duplication set, and the MongoDB collection for results.
    user_agent = UserAgent()
    redis_client = redis.Redis()
    mongo_client = AsyncIOMotorClient('localhost', 27017)['py_spider']['car_info']

    def __init__(self):
        # Listing page URL; ``{}`` is replaced by the page number.
        self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exf4x0/?pvareaid=102179#currengpostion'
        # Parameter API URL; ``{}`` is replaced by the car's spec id.
        self.api_url = 'https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}'

    def __del__(self):
        # Close the Redis connection once the spider is finished.
        # NOTE: ``redis_client`` is a class attribute, so this closes the
        # connection shared by all instances.
        self.redis_client.close()

    @staticmethod
    async def _gather_and_report(coros):
        """Run *coros* concurrently and print every failure.

        ``return_exceptions=True`` keeps the crawl best-effort (one failing
        request does not cancel its siblings) while still surfacing the error
        instead of dropping it silently.
        """
        outcomes = await asyncio.gather(*coros, return_exceptions=True)
        for outcome in outcomes:
            if isinstance(outcome, BaseException):
                print('任务异常:', repr(outcome))

    # Collect car ids from a listing page
    async def get_car_id(self, page, session):
        """Fetch listing page *page* and crawl the details of every car on it.

        Args:
            page: Page number substituted into ``self.url``.
            session: The ``aiohttp.ClientSession`` used for all requests.
        """
        headers = {'User-Agent': self.user_agent.random}
        async with session.get(self.url.format(page), headers=headers) as response:
            content = await response.read()

        # Only pages detected as GB2312 / ISO-8859-1 are treated as valid
        # listing pages; anything else is reported as an error page.
        encoding = chardet.detect(content)['encoding']
        if encoding not in ('GB2312', 'ISO-8859-1'):
            print('错误页面...')
            return

        # 'replace' keeps a single undecodable byte from discarding the page.
        tree = etree.HTML(content.decode('gbk', errors='replace'))
        # etree.HTML returns None when there is no parsable markup.
        id_list = [] if tree is None else tree.xpath('//ul[@class="viewlist_ul"]/li/@specid')
        if not id_list:
            print('id为空...')
            return

        # One detail request per car, scheduled on the running event loop
        # (no dependency on a module-level ``loop`` variable).
        await self._gather_and_report(
            self.get_car_info(spec_id, session) for spec_id in id_list
        )

    # Fetch the detailed information of one car
    async def get_car_info(self, spec_id, session):
        """Query the parameter API for *spec_id* and persist the extracted record.

        Args:
            spec_id: Car spec id substituted into ``self.api_url``.
            session: The ``aiohttp.ClientSession`` used for the request.
        """
        headers = {'User-Agent': self.user_agent.random}
        async with session.get(self.api_url.format(spec_id), headers=headers) as response:
            result = await response.json()

        payload = result.get('result') if isinstance(result, dict) else None
        groups = payload.get('paramtypeitems') if isinstance(payload, dict) else None
        if not groups:
            print('数据不存在...')
            return

        try:
            first = groups[0]['paramitems']
            second = groups[1]['paramitems']
            # Key order is significant: the de-duplication hash is computed
            # from ``str(item)``, so it must stay name/price/brand/altitude/
            # breadth/length to match hashes already stored in Redis.
            item = {
                'name': first[0]['value'],
                'price': first[1]['value'],
                'brand': first[2]['value'],
                'altitude': second[2]['value'],
                'breadth': second[1]['value'],
                'length': second[0]['value'],
            }
        except (IndexError, KeyError, TypeError):
            # The payload is present but shorter or differently shaped than expected.
            print('数据不完整...', spec_id)
            return

        await self.save_car_info(item)

    # De-duplication fingerprint
    @staticmethod
    def get_md5(dict_item):
        """Return the hex MD5 digest of ``str(dict_item)`` encoded as UTF-8."""
        return hashlib.md5(str(dict_item).encode('utf-8')).hexdigest()

    # Persist a record
    async def save_car_info(self, item):
        """Insert *item* into MongoDB unless its fingerprint is already known.

        The fingerprint is added to the Redis set ``car:filter`` first; if the
        MongoDB insert then fails, the fingerprint is removed again so the
        record is not mistaken for a duplicate on a later attempt.

        Raises:
            Exception: Whatever the MongoDB insert raised, after the rollback.
        """
        md5_hash = self.get_md5(item)
        # NOTE(review): these are synchronous redis calls inside a coroutine;
        # they block the event loop for the duration of the round trip.
        # sadd returns the number of members actually added (0 == seen before).
        if not self.redis_client.sadd('car:filter', md5_hash):
            print('数据重复...')
            return

        try:
            await self.mongo_client.insert_one(item)
        except Exception:
            self.redis_client.srem('car:filter', md5_hash)
            raise
        print('数据插入成功:', item)

    async def main(self):
        """Crawl listing pages 1 through 100 concurrently over one HTTP session."""
        async with aiohttp.ClientSession() as session:
            await self._gather_and_report(
                self.get_car_id(page, session) for page in range(1, 101)
            )
80
81
if __name__ == '__main__':
    # Script entry point: build the spider, then drive its ``main()`` coroutine
    # to completion on the event loop returned by ``asyncio.get_event_loop()``.
    # ``loop`` and ``car_spider`` stay bound as module-level names.
    car_spider = CarSpider()
    loop = asyncio.get_event_loop()
    crawl = car_spider.main()
    loop.run_until_complete(crawl)