#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Fade Zhao'
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import datetime
class QunaSpider(object):
    """Scrape hotel listings from qunar.com with Selenium and store them in MongoDB."""

    def get_hotel(self, driver, toCity, stop_Page=3, starData=None, endData=None):
        """Search hotels in *toCity* and scrape up to *stop_Page* result pages.

        :param driver: live selenium WebDriver already positioned on the qunar home page
        :param toCity: destination city typed into the search box
        :param stop_Page: maximum number of result pages to scrape
        :param starData: check-in date string (YYYY-MM-DD); field left untouched if None
        :param endData: check-out date string (YYYY-MM-DD); field left untouched if None

        Quits *driver* when finished. Each page's results are saved via
        :meth:`save_data` as they are parsed.
        """
        # Switch to the hotel-search tab.
        driver.find_element(By.XPATH, '//*[@id="js_nva_cgy"]/li[2]').click()
        time.sleep(1)
        # City input box
        city_ele = driver.find_element(By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='toCity']")
        # Hotel name / landmark input (located but intentionally left blank)
        query_ele = driver.find_element(By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='q']")
        # Check-in date input
        startDate_ele = driver.find_element(By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='fromDate']")
        # Check-out date input
        endDate_ele = driver.find_element(By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='toDate']")
        city_ele.clear()
        city_ele.send_keys(toCity)
        # Bug fix: selenium's send_keys(None) raises, so only type dates when given.
        if starData is not None:
            startDate_ele.send_keys(starData)
        if endData is not None:
            endDate_ele.send_keys(endData)
        # Submit button
        sub_ele = driver.find_element(By.XPATH, "//*[@id='js_hotel_searchbox']//button")
        sub_ele.click()

        page_num = 0
        while page_num < stop_Page:
            try:
                WebDriverWait(driver, 5).until(
                    EC.title_contains(toCity)
                )
            except Exception as e:
                print(e)
                break
            # Scroll to the bottom via JS so the Ajax-loaded results are triggered.
            driver.execute_script('''
                window.scrollTo(0,document.body.scrollHeight);
            ''')
            # Give the Ajax request time to finish before reading the DOM.
            time.sleep(5)
            data_list = self._parse_hotels(driver.page_source)
            # Persist this page's results to MongoDB.
            self.save_data(data_list)
            try:
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(driver.find_element(By.CSS_SELECTOR, '.item.next'))
                )
                next_page.click()
                page_num += 1
            except Exception as e:
                # No (visible) "next" link — assume the last page was reached.
                print('错误:', e)
                break
            time.sleep(5)
        driver.quit()

    def _parse_hotels(self, html_content):
        """Parse one search-result page and return a list of hotel dicts.

        :param html_content: full HTML source of a qunar hotel result page
        :return: list of dicts with keys name/url/score/type/price/address
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        data_list = []
        for item in soup.find_all(class_='item_hotel_info'):
            title = item.find('span', class_='hotel_item')
            if title is None or title.a is None:
                # Skip malformed cards instead of crashing the whole page.
                continue
            name = title.a.get_text()
            hotel_url = title.a['href']
            hotel_type = title.em.get_text() if title.em else ''
            address_ele = item.find('span', class_='area_contair')
            hotel_address = (address_ele.get_text().replace('\n', '').replace('\t', '').strip()
                             if address_ele else '')
            # Bug fix: the original called get_text() BEFORE its None check, so a
            # missing score raised AttributeError and the fallback never ran.
            facilities = item.find('td', class_='hotel_facilities')
            score_ele = facilities.find('p', class_='score') if facilities else None
            if score_ele is None:
                common_score = '暂无评分'
            else:
                # Score is rendered as "x/5"; keep only the numeric part.
                common_score = score_ele.get_text().split('/')[0]
            print(common_score)
            price_ele = item.find('div', class_='hotel_price')
            hotel_price = price_ele.b.get_text() if price_ele and price_ele.b else ''
            hotel_data = {
                'name': name,
                'url': hotel_url,
                'score': common_score,
                'type': hotel_type,
                'price': hotel_price,
                'address': hotel_address,
            }
            print('hotel_price=', hotel_price)
            print('name=', name)
            print('url=', hotel_url)
            print('hotel_type=', hotel_type)
            data_list.append(hotel_data)
        return data_list

    def save_data(self, data):
        """Bulk-insert *data* (a list of dicts) into mydb.hotel on a local MongoDB.

        :param data: list of hotel dicts; an empty list is a no-op
        """
        if not data:
            # Bug fix: insert_many raises InvalidOperation on an empty list.
            return
        conn = MongoClient('localhost', 27017)
        try:
            # mydb / hotel are created automatically on first write.
            # Bug fix: Collection.insert was removed in pymongo 4 — use insert_many.
            conn.mydb.hotel.insert_many(data)
        finally:
            conn.close()

    def crawl(self, root_url, to_city):
        """Open Chrome on *root_url* and scrape hotels in *to_city* for tonight.

        :param root_url: site entry page (the qunar home page)
        :param to_city: destination city name
        """
        today = datetime.date.today().strftime('%Y-%m-%d')
        tomorrow = datetime.date.today() + datetime.timedelta(days=1)
        tomorrow = tomorrow.strftime('%Y-%m-%d')
        driver = webdriver.Chrome()
        try:
            driver.set_page_load_timeout(20)
            driver.get(root_url)
            driver.maximize_window()
            driver.implicitly_wait(10)
            self.get_hotel(driver, to_city, 4, today, tomorrow)
        finally:
            # Bug fix: make sure the browser dies even if the scrape raises;
            # quitting an already-quit driver is harmless.
            driver.quit()
if __name__ == '__main__':
    # Script entry point: crawl Hangzhou hotels starting from the qunar home page.
    url = 'https://www.qunar.com/'
    spider = QunaSpider()
    spider.crawl(url, '杭州')
    # TODO: still incomplete / needs refinement