Fork me on GitHub

利用Python爬取去哪儿景点

 1 from tqdm import tqdm
 2 import time
 3 from selenium import webdriver
 4 from selenium.common.exceptions import TimeoutException, WebDriverException
 5 import pandas as pd
 6 import numpy as np
 7 
 8 
 9 position = ["北京","天津","上海","重庆",
10          "河北","山西","辽宁","吉林",
11          "黑龙江","江苏","浙江","安徽",
12          "福建","江西","山东","河南",
13          "湖北","湖南","广东","海南",
14          "四川","贵州","云南","陕西",
15          "甘肃","青海","台湾","内蒙古",
16          "广西","西藏","宁夏","新疆",
17          "香港","澳门"]
18 positions = ['湖北']
19 
20 name,level,hot,address,num=[],[],[],[],[]
def _text_or_default(element, class_name, default):
    """Return the text of the child with *class_name*, or *default* if absent."""
    # Local import: the file's top-level import list does not include this name.
    from selenium.common.exceptions import NoSuchElementException

    try:
        return element.find_element_by_class_name(class_name).text
    except NoSuchElementException:
        # Only "element not present" means the field is missing; any other
        # WebDriver error propagates so the caller can retry the page.
        return default


def _extract_sight(item):
    """Return (name, level, hot, address, num) for one "sight_item" element."""
    return (
        item.find_element_by_class_name("name").text,
        # Not every sight has a rating badge.
        _text_or_default(item, "level", ""),
        # Drops the first three characters of the text (presumably a label
        # prefix in front of the score -- confirm against the live page).
        item.find_element_by_class_name("product_star_level").text[3:],
        item.find_element_by_class_name("area").text,
        # Sights without a sales counter are recorded as 0.
        _text_or_default(item, "hot_num", 0),
    )


def get_one_page(key, page, max_retries=3):
    """Scrape one Qunar ticket-search result page into the global column lists.

    Opens a headless Chrome session, loads the result page for keyword
    *key* and page number *page*, and appends one entry per "sight_item"
    element to the module-level lists ``name``, ``level``, ``hot``,
    ``address`` and ``num``.

    Rows are collected locally and committed only after the whole page has
    been read, so a failure part-way through never leaves the five lists
    with different lengths and a retry never duplicates rows.

    Args:
        key: Search keyword (a region name); converted with ``str``.
        page: Result page number; converted with ``str``.
        max_retries: How many times to retry after a WebDriver error before
            giving up on the page. The page is then skipped with a printed
            message.

    Returns:
        None. Results are delivered through the module-level lists.
    """
    option_chrome = webdriver.ChromeOptions()
    option_chrome.add_argument('--headless')

    url = ("http://piao.qunar.com/ticket/list.htm?keyword=" + str(key)
           + "&region=&from=mpl_search_suggest&page=" + str(page))

    attempts = max(1, max_retries + 1)
    for _ in range(attempts):
        driver = None
        try:
            # Open the browser window.
            driver = webdriver.Chrome(chrome_options=option_chrome)
            time.sleep(1)
            driver.get(url)
            rows = [
                _extract_sight(item)
                for item in driver.find_elements_by_class_name("sight_item")
            ]
        except WebDriverException:
            # TimeoutException is a subclass of WebDriverException, so this
            # single clause covers both exceptions the original named.
            continue
        finally:
            # Always release the browser, including on failure.
            if driver is not None:
                driver.quit()

        for sight_name, sight_level, sight_hot, sight_address, sight_num in rows:
            name.append(sight_name)
            level.append(sight_level)
            hot.append(sight_hot)
            address.append(sight_address)
            num.append(sight_num)
        return

    print("爬取{}第{}页失败,已跳过".format(key, page))
55 
# Crawl result pages 1 through 13 for every region in `position`.
for key in tqdm(position):
    print("正在爬取{}".format(key))
    for page in range(1, 14):
        print("正在爬取第{}页".format(page))
        get_one_page(key, page)

# Assemble the accumulated columns into a table and export it; the
# "utf_8_sig" encoding writes a BOM at the start of the file.
column_order = ['name', 'level', 'hot', 'address', 'num']
column_data = dict(zip(column_order, (name, level, hot, address, num)))
sight = pd.DataFrame(column_data, columns=column_order)
sight.to_csv("sight.csv", encoding="utf_8_sig")

本文主要使用selenium库。调用Chrome时,需要将chromedriver.exe放在与python文件相同的目录下(或加入系统PATH),否则会报错;
将信息存入csv文件后,可以进一步对数据进行分析,获得想要的信息。

 

posted @ 2018-10-03 23:18  西西嘛呦  阅读(1154)  评论(0)    收藏  举报