import re
import json
from odps import ODPS
from threading import Thread
import threading
from urllib import parse
import datetime
from lxml import etree

import random 
import requests
import time

from models import *

# def write_txt(html_data):
#     f = open("a.txt", 'a+')
#     f.write(html_data)
#     f.write("\n")
#     f.close()

# Base URL of the hotel list pages; the city pinyin and a query string are appended to it.
domain_hotel = "https://hotel.qunar.com/cn/"
# District navigation endpoint; the city pinyin is appended as the value of `city`.
district_url = "https://hotel.qunar.com/napi/seo?path=%2Fseo%2Fnav&city="
hotel_comment = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1" # URL for fetching hotel review data; the hotelSeq value has to be replaced
# Example of a complete hotel list URL:
#domain_hotel = "https://hotel.qunar.com/cn/sanya/?fromDate=2020-08-03&toDate=2020-08-04&cityName=%E4%B8%89%E4%BA%9A"

def get_cookies():
    """Fetch a fresh ``QN1`` cookie value from the Qunar hotel site.

    Posts a hotel-list search for the city ``bazhong`` (check-in today,
    check-out tomorrow) through a short-lived session and reads the ``QN1``
    cookie that the session holds afterwards.

    Returns:
        The value of the ``QN1`` cookie.

    Raises:
        KeyError: if the session holds no ``QN1`` cookie after the request.
    """
    # Take a single timestamp so the URL and the payload always agree on the
    # dates, even when the call happens right at midnight.
    today = datetime.datetime.now()
    from_date = today.strftime('%Y-%m-%d')
    to_date = (today + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    url = (
        "https://hotel.qunar.com/cn/bazhong/?fromDate=" + from_date
        + "&toDate=" + to_date + "&cityName=巴中"
    )
    payload = {
        'b': {
            'bizVersion': 17,
            'cityUrl': 'bazhong',
            'fromDate': from_date,
            'toDate': to_date,
            'q': '',
            'qFrom': 3,
            'start': 640,
            'num': 20,
            'minPrice': 0,
            'maxPrice': -1,
            'level': '',
            'sort': 0,
            'cityType': 1,
            'fromForLog': 1,
            'uuid': '',
            'userName': '',
            'userId': '',
            'fromAction': '',
            'searchType': 0,
            'hourlyRoom': False,
            'locationAreaFilter': [],
            'comprehensiveFilter': [],
            'channelId': 1,
        },
        'qrt': 'h_hlist',
        'source': 'website',
    }

    # NOTE(review): the nested 'b' dict is sent through `data=` (form
    # encoding), not as a JSON body -- confirm this is what the endpoint
    # expects; only the cookies set by the response are used here.
    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        session.post(url, data=payload)
        cookies = requests.utils.dict_from_cookiejar(session.cookies)
    return cookies['QN1']

def change_cookie(headers_data):
    """Put a freshly fetched ``QN1`` value into ``headers_data['Cookie']``.

    Whatever ``QN1=...`` entry is currently in the cookie header is replaced,
    so the refresh keeps working on every call and not only while the header
    still carries its initial value. A header without a ``QN1`` entry is left
    unchanged.

    Args:
        headers_data: request headers dict with a ``'Cookie'`` string entry.
            It is modified in place.

    Returns:
        The same ``headers_data`` dict, after the update.

    Raises:
        KeyError: if ``headers_data`` has no ``'Cookie'`` entry, or if
            ``get_cookies()`` finds no ``QN1`` cookie.
    """
    cookie_data = get_cookies()
    fresh_entry = "QN1=" + str(cookie_data)
    # A callable replacement keeps backslashes in the cookie value literal.
    headers_data['Cookie'] = re.sub(
        r'\bQN1=[^;]*', lambda _match: fresh_entry, headers_data['Cookie']
    )
    print(headers_data)
    return headers_data

# Read the city list from the local JSON file and store every city/district.
def save_city_list():
    """Fill the ``qunar_List_City`` table from ``cityList.json``.

    The JSON file is walked as a list of lists of dicts whose ``'value'``
    entry is a list of cities, each with a ``'name'`` and a ``'url'`` (the
    value appended to ``district_url``). For every city the district
    navigation endpoint is queried:

    * If the first ``data`` entry is named ``"<city>行政区酒店"`` and has type
      ``"city"``, one row per item of its ``'list'`` is stored, unless a row
      with the same ``district_spell`` already exists.
    * Otherwise the city itself is stored as its own district, unless a row
      with the same ``city_name`` already exists.

    Any error while reading the response or saving rows for a city is
    reported on stdout and the loop moves on to the next city.
    """
    # Load the whole list first so the file is closed before any network or
    # database work starts.
    with open('cityList.json', 'r', encoding='utf8') as fp:
        json_data = json.load(fp)

    for data in json_data:
        for data_0 in data:
            for data_value in data_0['value']:
                district_url_0 = district_url + str(data_value['url'])
                response = requests.request("GET", district_url_0)
                json_city = json.loads(response.text)
                try:
                    head = json_city['data'][0]
                    if (head["name"] == data_value['name'] + "行政区酒店") and (head['type'] == "city"):
                        for item in head['list']:
                            # Keep only the part before "酒店" and before the first space.
                            data_i = item["name"].split("酒店")[0]
                            data_i = data_i.split(" ")[0]
                            _save_catalogue_if_new(
                                data_i,
                                item['id'],
                                data_value['name'],
                                data_value['url'],
                                qunar_List_City.district_spell == item['id'],
                            )
                    else:
                        _save_catalogue_if_new(
                            data_value['name'],
                            data_value['url'],
                            data_value['name'],
                            data_value['url'],
                            qunar_List_City.city_name == data_value['name'],
                        )
                except Exception:
                    # Best effort per city. `Exception` (not a bare except)
                    # lets Ctrl-C and SystemExit still stop the run.
                    print("非大陆数据或者城市酒店数据为空")
                    print(district_url_0)


def _save_catalogue_if_new(district_name, district_spell, city_name, city_spell, duplicate_filter):
    """Insert one ``qunar_List_City`` row unless ``duplicate_filter`` matches an existing row."""
    catalogue = qunar_List_City()
    catalogue.district_name = district_name  # district name
    catalogue.district_spell = district_spell  # district pinyin / id
    catalogue.city_name = city_name  # city name
    catalogue.city_spell = city_spell  # city pinyin
    catalogue.create_time = datetime.datetime.now()  # time of the crawl
    existed_id = qunar_List_City.select().where(duplicate_filter)
    if not existed_id:
        catalogue.save(force_insert=True)

# Build the hotel list URLs from the stored city catalogue rows.
def save_hotel_url_to_redis():
    """Queue one hotel-list entry per ``qunar_List_City`` row in Redis.

    Each entry pushed to the ``test.com:hotel_url`` list has the form
    ``"<hotel list url> <district name>"``: the URL (check-in today,
    check-out tomorrow, percent-encoded city name), a single space, and the
    row's district name.
    """
    # The dates are the same for every row: compute them once, from a single
    # timestamp, so all queued URLs carry a consistent date pair.
    today = datetime.datetime.now()
    from_date = today.strftime('%Y-%m-%d')
    to_date = (today + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    for item in qunar_List_City.select():
        url = (
            domain_hotel + item.city_spell
            + '/?fromDate=' + from_date
            + '&toDate=' + to_date
            + '&cityName=' + parse.quote(item.city_name)
            + " " + str(item.district_name)
        )
        r.lpush('test.com:hotel_url', url)  # crawl queue for hotel data


# Build the ticket URLs from the stored city catalogue rows.
def save_ticket_url_to_redis():
    """Queue one ticket URL per ``qunar_List_City`` row in Redis.

    The percent-encoded placeholder inside ``tickect_url`` is swapped for the
    percent-encoded district name of each row, and the result is pushed to
    the ``test.com:ticket_url`` list.

    NOTE(review): ``tickect_url`` is not defined in the visible part of this
    file; presumably it is provided by ``models`` -- confirm.
    """
    placeholder = '%E5%A6%82%E7%9A%8B%E5%B8%82'
    for row in qunar_List_City.select():
        district = row.district_name  # district name of this row
        ticket_link = tickect_url.replace(placeholder, str(parse.quote(district)))
        r.lpush('test.com:ticket_url', ticket_link)  # crawl queue for ticket data

# URL prefix of the vacation product list; the fixed percent-encoded segment
# decodes to "南通". The percent-encoded district name and a query suffix are appended.
domain_vacation = "https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_"
# Example of a complete vacation list URL:
#'https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_%E8%8B%8F%E5%B7%9E_all?ti=3&tm=l01_all_search_newc'
# Build the vacation list URLs from the stored city catalogue rows.
def save_vacation_url_to_redis():
    """Queue one vacation-list entry per ``qunar_List_City`` row in Redis.

    Each entry pushed to the ``test.com:vacation_url`` list is the vacation
    list URL for the row's district, a single space, and the row's city name.
    """
    for row in qunar_List_City.select():
        district = row.district_name
        entry = f"{domain_vacation}{parse.quote(district)}_all?ti=3&tm=l01_all_search_newc {row.city_name!s}"
        r.lpush('test.com:vacation_url', entry)  # crawl queue for vacation products

def get_nodes_json():
    """Pop one queued hotel-list entry from Redis and crawl all its pages.

    The entry popped from ``test.com:hotel_url`` is expected to look like
    ``"<hotel list url> <district name>"`` (the form pushed by
    ``save_hotel_url_to_redis``). The city pinyin and the district name are
    extracted from it, filled into the ``payload`` string template together
    with today's and tomorrow's date, and the hotel API is asked for the
    total hotel count (``data.tcount``). The result pages are then handed to
    ``process_response_data`` one by one, 20 hotels per page.

    ``payload``, ``headers`` and ``url_hotel_api`` are not defined in the
    visible part of this file -- presumably they come from ``models``.

    NOTE(review): ``headers`` is used directly (not copied), so setting
    ``referer`` changes the shared dict -- confirm this is intended before
    running several worker threads.
    """
    url = r.lpop('test.com:hotel_url')
    # Example URL part: https://hotel.qunar.com/cn/wuzhishan/?fromDate=2020-08-06&toDate=2020-08-07&cityName=%E4%BA%94%E6%8C%87%E5%B1%B1
    city_spell = re.search(r"cn\/(.*)\/\?", url).group(1)  # city pinyin from the path
    # The city name in the URL is percent-encoded, so the first run of CJK
    # characters is the district name that follows the space.
    district_name = re.search(r"([\u2E80-\u9FFF]+)", url).group(1)

    url = url.split(" ")[0]  # keep only the URL part

    # One timestamp for both dates so they cannot disagree around midnight.
    today = datetime.datetime.now()
    from_date = today.strftime('%Y-%m-%d')
    to_date = (today + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    # Fill the placeholders of the payload template.
    payload_data = payload
    payload_data = payload_data.replace("\"cityUrl\":\" \"","\"cityUrl\":\"" + city_spell + "\"")
    payload_data = payload_data.replace("\"大兴区\"","\"" + district_name + "\"")  # district name
    payload_data = payload_data.replace("\"fromDate\":\" \"","\"fromDate\":\"" + from_date + "\"")
    payload_data = payload_data.replace("\"toDate\":\" \"","\"toDate\":\"" + to_date + "\"")

    headers_data = headers
    headers_data['referer'] = url
    hotel_number = 0

    # Ask for the total hotel count. Every request counts as an attempt, so
    # the loop also ends when the API keeps answering with a non-200 status;
    # if no attempt succeeds the district is treated as having no hotels.
    max_attempts = 16
    for _ in range(max_attempts):
        response = requests.request("POST", url_hotel_api, headers=headers_data, data=payload_data.encode("utf-8"))
        print(district_name)
        if response.status_code != 200:
            continue
        # Parse only successful responses.
        json_data = json.loads(response.text)
        print(json_data['bstatus'])
        if json_data['bstatus']['code'] == 0:
            hotel_number = json_data['data']['tcount']
            break

    print(hotel_number,"酒店总数量")
    start_num = 0
    before_num = 0
    # hotel_number now counts the hotels that still have to be fetched.
    while hotel_number > 0:
        print(hotel_number,"剩余未处理酒店数量")
        # Move the "start" offset of the payload to the next page.
        payload_data = payload_data.replace("\"start\":" + str(before_num),"\"start\":" + str(start_num))
        process_response_data(headers_data, payload_data.encode("utf-8"), hotel_number, district_name)
        before_num = start_num
        start_num = start_num + 20
        hotel_number = hotel_number - 20

# Fetch the review rating counts of one hotel.
def get_hotel_comment(hotel_id):
    """Return the review rating counts of a hotel.

    Requests the first page of the hotel's review list and reads the
    ``data.ratingStat`` section of the JSON answer.

    Args:
        hotel_id: the hotel sequence id put into the ``hotelSeq`` parameter.

    Returns:
        ``[negativeCount, neutralCount, positiveCount]``.

    NOTE(review): the request is repeated, without delay or limit, for as
    long as the response body is shorter than 40 characters.
    """
    comment_url = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1".replace("nantong_5058", hotel_id)
    while True:
        response = requests.request("GET", comment_url)
        if len(response.text) >= 40:
            break
    rating_stat = json.loads(response.text)["data"]["ratingStat"]
    return [
        rating_stat["negativeCount"],
        rating_stat["neutralCount"],
        rating_stat["positiveCount"],
    ]
    

def process_response_data(headers_data,payload_data,hotel_number,district_name):
    """Fetch one page of the hotel list and store every hotel on it.

    The request is repeated until the API answers with ``bstatus.code == 0``
    and a complete page, i.e. exactly ``min(hotel_number, 20)`` hotels. This
    also covers the case of exactly 20 remaining hotels.

    Args:
        headers_data: request headers; refreshed through ``change_cookie``
            on every 200th request made by this call.
        payload_data: encoded request body for the page to fetch.
        hotel_number: number of hotels still to be fetched, this page included.
        district_name: district name stored with every hotel row.

    NOTE(review): a non-200 answer is retried immediately, without delay or
    limit -- confirm this is intended.
    """
    page_size = 20
    expected_count = min(hotel_number, page_size)
    connect_times = 20  # failures left before a long pause
    flag_num = 0  # requests made so far; drives the cookie refresh
    while True:
        flag_num = flag_num + 1
        if flag_num % 200 == 0:
            headers_data = change_cookie(headers_data)
        response = requests.request("POST", url_hotel_api, headers=headers_data, data=payload_data)
        if response.status_code != 200:
            print("网页请求错误")
            continue

        json_data = json.loads(response.text)
        status = json_data['bstatus']['code']
        if status == -1000:
            print("搜索条件修改")
            time.sleep(3)
            continue
        if status != 0:
            connect_times = connect_times - 1
            if connect_times < 3:
                # 18 failures since the last reset: start over after a pause.
                connect_times = 20
                # NOTE(review): the message says 900s but the pause is 120s.
                print("连接次数达到上线,休眠900s")
                time.sleep(120)
            continue

        time.sleep(random.randint(0,2))  # random pause between pages
        hotel_city = json_data['data']['cityName']  # city the hotels belong to
        hotels = json_data['data']['hotels']
        print(len(hotels),"hotels的数量")
        if len(hotels) != expected_count:
            # Incomplete page: drop it and fetch it again after a pause.
            connect_times = 20
            print(f"当前hotel_number={hotel_number}")
            print("获取残缺数据,数据不完整,跳出此处获取,重新抓取")
            print("休眠120s")
            time.sleep(120)
            continue

        for data_hotel in hotels:
            _save_hotel_record(data_hotel, district_name, hotel_city)
        return


def _save_hotel_record(data_hotel, district_name, hotel_city):
    """Store one hotel entry of the list response as a ``qunar_Hotel_data`` row."""
    hotel_data = qunar_Hotel_data()
    hotel_data.hotel_district = district_name
    hotel_data.hotel_city = hotel_city
    hotel_data.hotel_name = data_hotel['name']
    hotel_data.hotel_level = data_hotel['dangciText']
    hotel_data.hotel_score = data_hotel['score']
    hotel_data.hotel_price = float(data_hotel['price'])
    hotel_data.hotel_commentCount = data_hotel['commentCount']

    negative_count, neutral_count, positive_count = get_hotel_comment(data_hotel["seqNo"])
    hotel_data.hotel_negativeCount = negative_count
    hotel_data.hotel_neutralCount = neutral_count
    hotel_data.hotel_positiveCount = positive_count

    hotel_data.hotel_Number = data_hotel['phoneNumber']
    hotel_data.hotel_LocationInfo = data_hotel['locationInfo']
    hotel_data.hotel_image = data_hotel["imageid"]
    hotel_data.create_time = datetime.datetime.now()  # time of the crawl

    hotel_data.save(force_insert=True)

class parse_qunar_url_Thread(Thread):
    """Worker thread that processes queued hotel-list entries one after another."""

    def run(self):
        """Call ``get_nodes_json`` in an endless loop; there is no exit condition."""
        while True:
            get_nodes_json()

if __name__ == "__main__":
    # Run the setup steps in order: create_tables() first, then the city
    # catalogue, then the three Redis URL queues.
    # create_tables is not defined in the visible part of this file
    # (presumably it comes from `models`).
    for step in (
        create_tables,
        save_city_list,
        save_hotel_url_to_redis,
        save_vacation_url_to_redis,
        save_ticket_url_to_redis,
    ):
        step()
    # The crawl workers are currently disabled:
    # for i in range(20):
    #     parse_qunar_url_thread = parse_qunar_url_Thread()     
    #     parse_qunar_url_thread.start()