4.4 获取产品列表
完成出发地和目的地的构建后,输入以下代码以便获取产品列表。
import time
import urllib.request

import pymongo
import requests

# MongoDB collection that receives one document per fetched result page.
client = pymongo.MongoClient('localhost', 27017)
book_qunar = client['qunar']
sheet_qunar_zyx = book_qunar['qunar_zyx']

# Product-list endpoint. Placeholders, in order: departure city, search query,
# original query, and the paging offset (the page size stays fixed at 20).
# The offset must be a placeholder: with a literal "limit=0,20" every request
# in the paging loop would return the first page again.
list_url = (
    'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail'
    '&dep={}&query={}&dappDealTrace=true'
    '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
    '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true'
    '&originalquery={}&limit={},20&includeAD=true&qsact=search'
)

# The site's anti-crawler check looks at whether a request carries a cookie,
# so the same cookie header is sent with every product-list request.
headers = {
    "cookie": "QN1=00001d80075816179a202bf8; QN300=organic; QN48=tc_f16fbad98113c317_16b788ad2eb_c6fe; _RF1=36.110.118.134; _RSG=s2q26YN7uhCLRxFTBFcimB; _RDG=28a5b8a8ad72ce2"
}

# Step 1: fetch the departure cities.
url = 'https://touch.dujia.qunar.com/depCities.qunar'
strhtml = requests.get(url)
dep_dict = strhtml.json()

for dep_item in dep_dict['data']:
    for dep in dep_dict['data'][dep_item]:
        # Step 2: collect the distinct destination queries recommended for
        # this departure city, keeping first-seen order.
        a = []
        url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(
            urllib.request.quote(dep))
        time.sleep(1)
        strhtml = requests.get(url)
        arrive_dict = strhtml.json()
        for arr_item in arrive_dict['data']:
            for arr_item_1 in arr_item['subModules']:
                for query in arr_item_1['items']:
                    if query['query'] not in a:
                        a.append(query['query'])

        for item in a:
            dep_quoted = urllib.request.quote(dep)
            item_quoted = urllib.request.quote(item)

            # Step 3: request the first page only to read the total number
            # of products for this departure/destination pair.
            url = list_url.format(dep_quoted, item_quoted, item_quoted, 0)
            time.sleep(1)
            strhtml = requests.get(url, headers=headers)
            routeCount = int(strhtml.json()['data']['limit']['routeCount'])

            # Step 4: walk through the result pages, 20 products at a time,
            # and store each raw response.
            for limit in range(0, routeCount, 20):
                url = list_url.format(dep_quoted, item_quoted, item_quoted, limit)
                time.sleep(1)  # throttle: one request per second
                strhtml = requests.get(url, headers=headers)
                result = {
                    'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
                    'dep': dep,
                    'arrive': item,
                    'limit': limit,
                    'result': strhtml.json()
                }
                sheet_qunar_zyx.insert_one(result)
代码运行结果如下图所示。

通过观察发现,每个出发地对应的目的地都会有很多个产品,而产品数就在data.limit.routeCount键中,如下图所示。

接下来取出产品数,代码如下。
# Request the first page (limit=0,20) of the product list for the current
# departure city `dep` and destination query `item`; it is fetched only to
# read the total product count. The destination is also passed as
# `originalquery`, so the third format argument is actually used.
url = 'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true&originalquery={}&limit=0,20&includeAD=true&qsact=search'.format(
    urllib.request.quote(dep),
    urllib.request.quote(item),
    urllib.request.quote(item),
)
time.sleep(1)
# Set a header carrying a cookie: the site's anti-crawler check decides
# whether a request comes from a machine by looking for a cookie.
headers = {
    "cookie": "QN1=00001d80075816179a202bf8; QN300=organic; QN48=tc_f16fbad98113c317_16b788ad2eb_c6fe; _RF1=36.110.118.134; _RSG=s2q26YN7uhCLRxFTBFcimB; _RDG=28a5b8a8ad72ce2"
}
strhtml = requests.get(url, headers=headers)
# print(strhtml.text)  # uncomment to inspect the raw response
# The product count is stored under data.limit.routeCount in the JSON reply.
routeCount = int(strhtml.json()['data']['limit']['routeCount'])
然后,以routeCount作为迭代次数的终点,代码如下。
for limit in range(0,routeCount,20):
注意:由于去哪儿网的反爬机制是通过检查请求中是否带有cookie来判断是否为机器请求,所以应该在每次抓取数据之前添加cookie,代码如下。
# Cookie string copied from a browser session; it is sent so that the
# request is not treated as coming from a machine.
cookie = (
    "QN1=00001d80075816179a202bf8; QN300=organic; "
    "QN48=tc_f16fbad98113c317_16b788ad2eb_c6fe; _RF1=36.110.118.134; "
    "_RSG=s2q26YN7uhCLRxFTBFcimB; _RDG=28a5b8a8ad72ce2"
)
headers = {"cookie": cookie}
# Issue the request with the cookie header attached.
strhtml = requests.get(url, headers=headers)
获取相应cookie值的方法如下:

浙公网安备 33010602011771号