4.4 获取产品列表

完成出发地和目的地的构建后,输入以下代码以便获取产品列表。

# Crawl Qunar free-travel ("zyx") product lists for every departure city /
# destination pair and store each result page in the local MongoDB
# collection qunar.qunar_zyx.
import time
import urllib.request  # a bare `import urllib` does not load the `request` submodule

import pymongo
import requests

client = pymongo.MongoClient('localhost', 27017)
book_qunar = client['qunar']
sheet_qunar_zyx = book_qunar['qunar_zyx']

# The site's anti-crawling check looks at whether a request carries a cookie,
# so every product-list request below sends this header.
headers = {
    "cookie": "QN1=00001d80075816179a202bf8; QN300=organic; QN48=tc_f16fbad98113c317_16b788ad2eb_c6fe; _RF1=36.110.118.134; _RSG=s2q26YN7uhCLRxFTBFcimB; _RDG=28a5b8a8ad72ce2"
}

# Product-list endpoint. Placeholders, in order: departure city, destination
# query, original query, paging offset (each page holds 20 products).
list_url = 'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true&originalquery={}&limit={},20&includeAD=true&qsact=search'

url = 'https://touch.dujia.qunar.com/depCities.qunar'
strhtml = requests.get(url)
dep_dict = strhtml.json()
for dep_item in dep_dict['data']:
    for dep in dep_dict['data'][dep_item]:
        a = []  # de-duplicated destination queries for this departure city
        url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(urllib.request.quote(dep))
        time.sleep(1)
        strhtml = requests.get(url)
        arrive_dict = strhtml.json()
        for arr_item in arrive_dict['data']:
            for arr_item_1 in arr_item['subModules']:
                for query in arr_item_1['items']:
                    # The same destination can be recommended by several modules.
                    if query['query'] not in a:
                        a.append(query['query'])
        for item in a:
            # First request (offset 0) is only used to read the total product count.
            url = list_url.format(
                urllib.request.quote(dep), urllib.request.quote(item),
                urllib.request.quote(item), 0)
            time.sleep(1)
            strhtml = requests.get(url, headers=headers)
            routeCount = int(strhtml.json()['data']['limit']['routeCount'])
            # Walk the pages: offsets 0, 20, 40, ... below routeCount.
            for limit in range(0, routeCount, 20):
                # The offset must be substituted into the URL; otherwise every
                # iteration would fetch page 0 again.
                url = list_url.format(
                    urllib.request.quote(dep), urllib.request.quote(item),
                    urllib.request.quote(item), limit)
                time.sleep(1)
                strhtml = requests.get(url, headers=headers)
                result = {
                    'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
                    'dep': dep,
                    'arrive': item,
                    'limit': limit,
                    'result': strhtml.json()
                }
                sheet_qunar_zyx.insert_one(result)

代码运行结果如下图所示。

通过观察发现,每个出发地对应的目的地都会有很多个产品,而产品数就在data.limit.routeCount键中,如下图所示。

 接下来取出产品数,代码如下。

# Excerpt: request the first result page for (dep, item) and read the total
# number of products. `dep` and `item` come from the enclosing loops of the
# full script. Placeholders, in order: departure city, destination query,
# original query.
url = 'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true&originalquery={}&limit=0,20&includeAD=true&qsact=search'.format(
    urllib.request.quote(dep), urllib.request.quote(item), urllib.request.quote(item)
)
time.sleep(1)
# Send a cookie header: the site's anti-crawling check looks at whether the
# request carries a cookie to decide if it comes from a machine.
headers = {
    "cookie": "QN1=00001d80075816179a202bf8; QN300=organic; QN48=tc_f16fbad98113c317_16b788ad2eb_c6fe; _RF1=36.110.118.134; _RSG=s2q26YN7uhCLRxFTBFcimB; _RDG=28a5b8a8ad72ce2"
}
strhtml = requests.get(url, headers=headers)
# print(strhtml.text)  # uncomment to inspect the raw response
routeCount = int(strhtml.json()['data']['limit']['routeCount'])

然后,以routeCount作为迭代次数的终点,代码如下。

# Excerpt (loop header only): iterate paging offsets 0, 20, 40, ... that are below routeCount.
for limit in range(0,routeCount,20):

注意:由于去哪儿网的反爬机制是通过检查请求中是否带有cookie来判断请求是否由机器发出,所以应该在每次抓取数据之前添加cookie,代码如下。

 # Excerpt: attach a cookie header to the request; `url` is built earlier in the full script.
 headers = {
     "cookie": "QN1=00001d80075816179a202bf8; QN300=organic; QN48=tc_f16fbad98113c317_16b788ad2eb_c6fe; _RF1=36.110.118.134; _RSG=s2q26YN7uhCLRxFTBFcimB; _RDG=28a5b8a8ad72ce2"
 }
 strhtml = requests.get(url, headers=headers)

获取相应cookie值的方法如下:

 

posted @ 2019-06-24 14:40  taoziya  阅读(245)  评论(0)    收藏  举报