1 from tqdm import tqdm
2 import time
3 from selenium import webdriver
4 from selenium.common.exceptions import TimeoutException, WebDriverException
5 import pandas as pd
6 import numpy as np
7
8
# Province-level regions of China used as search keywords, in crawl order.
position = [
    "北京", "天津", "上海", "重庆", "河北", "山西",
    "辽宁", "吉林", "黑龙江", "江苏", "浙江", "安徽",
    "福建", "江西", "山东", "河南", "湖北", "湖南",
    "广东", "海南", "四川", "贵州", "云南", "陕西",
    "甘肃", "青海", "台湾", "内蒙古", "广西", "西藏",
    "宁夏", "新疆", "香港", "澳门",
]
# Single-region list; not referenced by the code visible in this file.
positions = ['湖北']

# Column accumulators filled by get_one_page(); index i of every list
# describes the same scraped sight.
name = []
level = []
hot = []
address = []
num = []
def _optional_text(item, class_name, default):
    """Return the text of the first ``class_name`` element under ``item``, or ``default`` if none exists."""
    # find_elements_* yields an empty list when nothing matches, so a missing
    # optional field needs no exception handling.
    matches = item.find_elements_by_class_name(class_name)
    return matches[0].text if matches else default


def get_one_page(key, page, max_attempts=3):
    """Scrape one page of Qunar ticket search results into the module-level lists.

    Opens a fresh headless Chrome session, loads the search page for ``key``
    and appends one entry per ``sight_item`` element to the module-level
    ``name``, ``level``, ``hot``, ``address`` and ``num`` lists.

    Args:
        key: Search keyword (converted with ``str``) placed in the URL.
        page: Result page number (converted with ``str``) placed in the URL.
        max_attempts: How many times to try the page when Selenium raises a
            ``WebDriverException`` (which includes ``TimeoutException``).
            With a value below 1 nothing is scraped.

    Returns:
        None. If every attempt fails the page is skipped after printing a
        message for each failure.
    """
    # The separator before "region" must be a literal '&'; a mis-decoded
    # "&reg" entity would otherwise end up inside the keyword parameter.
    url = (
        "http://piao.qunar.com/ticket/list.htm?keyword=" + str(key)
        + "&region=&from=mpl_search_suggest&page=" + str(page)
    )

    for attempt in range(1, max_attempts + 1):
        driver = None
        try:
            # Open a headless browser window.
            option_chrome = webdriver.ChromeOptions()
            option_chrome.add_argument('--headless')
            driver = webdriver.Chrome(chrome_options=option_chrome)
            time.sleep(1)

            driver.get(url)

            # Buffer the rows locally so a failure half-way through the page
            # cannot leave the global lists with different lengths.
            rows = []
            for item in driver.find_elements_by_class_name("sight_item"):
                rows.append((
                    # Sight name.
                    item.find_element_by_class_name("name").text,
                    # Sight rating; empty string when the element is absent.
                    _optional_text(item, "level", ""),
                    # Sight popularity; the first three characters are dropped
                    # (presumably a fixed label prefix -- verify against the page).
                    item.find_element_by_class_name("product_star_level").text[3:],
                    # Sight address.
                    item.find_element_by_class_name("area").text,
                    # Sales volume; 0 when the element is absent.
                    _optional_text(item, "hot_num", 0),
                ))
        except (TimeoutException, WebDriverException) as err:
            print("爬取{}第{}页失败(第{}次尝试): {}".format(key, page, attempt, err))
            continue
        finally:
            # Always release the browser, including on failed attempts.
            if driver is not None:
                driver.quit()

        # Commit the whole page at once now that it was read successfully.
        for sight_name, sight_level, sight_hot, sight_address, sight_num in rows:
            name.append(sight_name)
            level.append(sight_level)
            hot.append(sight_hot)
            address.append(sight_address)
            num.append(sight_num)
        return
55
# Crawl result pages 1 through 13 for every region in `position`,
# reporting progress per region (tqdm bar) and per page (print).
for key in tqdm(position):
    print("正在爬取{}".format(key))
    for page in range(1, 14):
        print("正在爬取第{}页".format(page))
        get_one_page(key, page)

# Assemble the accumulated columns into a table and export it as CSV;
# the "utf_8_sig" encoding writes a UTF-8 byte-order mark at the start.
columns = ['name', 'level', 'hot', 'address', 'num']
sight = pd.DataFrame(
    dict(zip(columns, [name, level, hot, address, num])),
    columns=columns,
)
sight.to_csv("sight.csv", encoding="utf_8_sig")
本脚本主要使用 selenium 库。调用 Chrome 时，需要将 chromedriver.exe 放在与该 Python 文件相同的目录下，否则会报错；
将信息存入 csv 文件后，可以进一步对数据进行分析，获得想要的信息。