



























# Fliggy search-results URL for the keyword "杭州" (percent-encoded below),
# "popular" ordering; `page` supplies the value of the pagenum parameter.
url = (
    'https://travelsearch.fliggy.com/index.htm'
    '?spm=181.15077045.1398723350.1.48f3620d7UbQ9z'
    '&searchType=product'
    '&keyword=%E6%9D%AD%E5%B7%9E'
    '&category=MULTI_SEARCH'
    '&pagenum={}'
    '&-1=popular'
    '&conditions=-1%3Apopular'
).format(page)












 1 # 主题
 2     title = html.xpath("//*       
 3               [@id='content']/div[6]/div[1]/div[1]/div/div[{}]
 4               /div[2]/div[1]/a/h3/div/text()"
 6 # 价格
 7     price = html.xpath("//*    
 8                [@id='content']/div[6]/div[1]/div[1]/div/div[{}]
 9                 /div[3]/div/div/span/text()"
11 # 销售数
12     sell = html.xpath("//*    
13              [@id='content']/div[6]/div[1]/div[1]/div/div[{}]
14              /div[2]/p[2]/span[1]/text()"
16 # 评论数
17     coumm = html.xpath("//* 
18                    [@id='content']/div[6]/div[1]/div[1]/div/div[{}]
19                    /div[2]/p[2]/span[2]/text()"     




 1 # 综合排序
 2 def synthesize(page):
 3     # 创建Feizhu_synthesize.xlsx
 4     file = open("Feizhu_synthesize.xlsx", "a")
 5     file.write("title" + "," + "price" + "," + "sell" + "," + "coumm" + '\n')
 6     file = file.close()
 7     try:
 8         for i in range(page):
 9             # 请求访问
10             url = 'https://travelsearch.fliggy.com/index.htm?spm=181.15077045.1398723350.1.48f3620d7UbQ9z&searchType=product&keyword=%E6%9D%AD%E5%B7%9E&category=MULTI_SEARCH&pagenum='+str(page)+'&-1=popular&conditions=-1%3Apopular'
11             res = requests.get(url, headers=headers)
12             res.encoding = 'utf-8'
13             html = etree.HTML(res.text)
14             coun = 1
15             # 主题title、价格price、已售sell、评论数coumm
16             for i in range(48):
17                 title = html.xpath("//*[@id='content']/div[6]/div[1]/div[1]/div/div[{}]/div[2]/div[1]/a/h3/div/text()".format(coun))
18                 for i in title:
19                     title = i
20                 price = html.xpath("//*[@id='content']/div[6]/div[1]/div[1]/div/div[{}]/div[3]/div/div/span/text()".format(coun))
21                 for i in price:
22                     price = i
23                 sell = html.xpath("//*[@id='content']/div[6]/div[1]/div[1]/div/div[{}]/div[2]/p[2]/span[1]/text()".format(coun))
24                 sell1 = []
25                 for i in sell:
26                     sell = i.strip('月售')
27                     sell = sell.strip('')
28                 if sell == sell1:
29                     sell = '0'
30                     # print(sell)
31                 coumm = html.xpath("//*[@id='content']/div[6]/div[1]/div[1]/div/div[{}]/div[2]/p[2]/span[2]/text()".format(coun))
32                 coumm1 = []
33                 for i in coumm:
34                     if i in '评价':
35                         pass
36                     elif i in '':
37                         pass
38                     elif int(i) > 1:
39                         coumm = i
40                 if coumm == coumm1:
41                     coumm = '0'
42                 coun += 1
43                 # 保存数据
44                 with open("Feizhu_synthesize.xlsx", "a", encoding='utf-8') as f2:
45                     f2.writelines(title + "," + price + "," + sell + "," + coumm + "," + '\n')
46                 print('主题:', title, '\n',
47                       '价格:', price, '元\n',
48                       '已售出:', sell, '笔\n',
49                       '评论:', coumm, '条\n')
50             page +=1
51             time.sleep(1)
52     except:
53         pass





import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the scraped listings into a DataFrame.
# NOTE(review): synthesize() writes comma-separated text to this path; confirm
# the file has been converted to a real workbook before read_excel is used.
Fz_data = pd.read_excel(io='Feizhu_synthesize.xlsx')




 1 # 重复值处理
 2 Fz_data = Fz_data.drop_duplicates()
 3 # Nan处理
 4 Fz_data = Fz_data.dropna(axis = 0)
 5 # 删除无效行
 6 Fz_data = Fz_data.drop([''], axis = 1)
 7 # 空白值处理
 8 Fz_data = Fz_data.dropna()
 9 # 替换值
10 Fz_data.replace('', '0',inplace = True)
11 Fz_data.replace('', '0',inplace = True)


 1 # 价格进行降序排列分析
 2 Fz_data.sort_values(by=["price"],inplace=True,ascending=[False])
 3 x = Fz_data['title'].head(50)
 4 y = Fz_data['price'].head(50)
 5 fig = plt.figure(figsize=(16, 8), dpi=80)
 6 plt.rcParams['font.sans-serif']=['SimHei'] # 用来正常显示中文标签
 7 plt.rcParams['axes.unicode_minus']=False
 8 plt.xticks(rotation=90)
 9 plt.plot(x,y,'s-',color = 'r',label="价格")# s-:方形
10 plt.legend(loc = "best")# 图例
11 plt.title("杭州旅行项目趋势图价格TOP50",fontsize=18)
12 plt.ylabel("价格/元")# 纵坐标名字
13 plt.show()



 1 # 销售情况进行降序排列分析
 2 Fz_data.sort_values(by=["sell"],inplace=True,ascending=[False])
 3 x = Fz_data['title'].head(50)
 4 y = Fz_data['price'].head(50)
 5 fig = plt.figure(figsize=(16, 8), dpi=80)
 6 plt.rcParams['font.sans-serif']=['SimHei'] # 用来正常显示中文标签
 7 plt.rcParams['axes.unicode_minus']=False
 8 plt.xticks(rotation=90)
 9 plt.plot(x,y,'s-',color = 'lightcoral')# s-:方形
10 plt.legend(loc = "best")# 图例
11 plt.title("杭州旅行项目销售趋势图TOP50",fontsize=18)
12 plt.ylabel("价格/元")# 纵坐标名字
13 plt.show()


 1 Fz_data.sort_values(by=["coumm"],inplace=True,ascending=[False])
 2 x = Fz_data['title'].head(50)
 3 y = Fz_data['price'].head(50)
 4 fig = plt.figure(figsize=(16, 8), dpi=80)
 5 plt.rcParams['font.sans-serif']=['SimHei'] # 用来正常显示中文标签
 6 plt.rcParams['axes.unicode_minus']=False
 7 plt.xticks(rotation=90)
 8 plt.plot(x,y,'s-',color = 'plum')# s-:方形
 9 plt.legend(loc = "best")# 图例
10 plt.title("杭州旅行项目热度趋势图TOP50",fontsize=18)
11 plt.ylabel("价格/元")# 纵坐标名字
12 plt.show()



 1 x = Fz_data['title'].head(50)
 2 y = Fz_data['price'].head(50)
 3 fig = plt.figure(figsize=(16, 8), dpi=80)
 4 plt.rcParams['font.sans-serif']=['SimHei'] # 用来正常显示中文标签
 5 plt.rcParams['axes.unicode_minus']=False
 6 plt.xticks(rotation=90)
 7 plt.bar(x,y,alpha=0.2, width=0.6, color='b', lw=3)
 8 plt.legend(loc = "best")# 图例
 9 plt.title("飞猪杭州旅游项目柱状图",fontsize=18)
10 plt.ylabel("价格/元")# 纵坐标名字
11 plt.show()



# Horizontal bar chart of price for the first 40 rows of Fz_data, in whatever
# order the table currently has.
x = Fz_data['title'].head(40)
y = Fz_data['price'].head(40)
fig = plt.figure(figsize=(16, 8), dpi=80)
# The bars need a label, otherwise legend() has nothing to show and warns.
plt.barh(x, y, alpha=0.2, height=0.6, color='coral', label="价格")
plt.title("飞猪杭州旅游项目水平图", fontsize=18)
plt.legend(loc="best")
plt.xticks(rotation=90)
# The horizontal axis carries the price values, so it gets the price caption.
plt.xlabel("价格/元")
# Show the figure here so later plotting calls do not draw onto it.
plt.show()





 1 # 散点图
 2 x = Fz_data['title']
 3 y = Fz_data['price']
 4 fig = plt.figure(figsize=(16, 8), dpi=80)
 5 ax = plt.subplot(1, 1, 1)
 6 plt.rcParams['font.sans-serif']=['SimHei'] # 用来正常显示中文标签
 7 plt.scatter(x,y,color='lightgreen',marker='o',s=60,alpha=1)
 8 plt.xticks(rotation=90)
 9 plt.xticks([])
10 plt.ylabel("价格/元")# 纵坐标名字
11 plt.title("飞猪杭州旅游项目价格散点图",fontsize=16)



 1 # 散点图
 2 x = Fz_data['title']
 3 y = Fz_data['coumm']
 4 fig = plt.figure(figsize=(16, 8), dpi=80)
 5 ax = plt.subplot(1, 1, 1)
 6 plt.rcParams['font.sans-serif']=['SimHei'] # 用来正常显示中文标签
 7 plt.scatter(x,y,color='blueviolet',marker='o',s=60,alpha=1)
 8 plt.xticks(rotation=90)
 9 plt.xticks([])
10 plt.ylabel("评论数")# 纵坐标名字
11 plt.title("飞猪杭州旅游项目评论散点图",fontsize=16)



# Box plot of the comment-count column.
y = Fz_data.loc[:, 'coumm']
plt.boxplot(y)
plt.title(label="飞猪杭州旅游项目评论盒图", fontsize=16)
plt.show()


# Box plot of the price column.
y = Fz_data.loc[:, 'price']
plt.boxplot(y)
plt.ylabel("价格/元")  # y-axis caption
plt.title(label="飞猪杭州旅游项目价格盒图", fontsize=16)
plt.show()



 1 # 词云
 2 import random
 3 import wordcloud as wc
 4 import matplotlib.pyplot as plt
 6 # 定义图片尺寸
 7 word_cloud = wc.WordCloud(
 8    background_color='mintcream',  
 9    font_path='msyhbd.ttc',  
10    max_font_size=300, 
11    random_state=50,  
12                        )
13 text = Fz_data['title']
14 text = " ".join(text)
15 # 绘制词云
16 fig = plt.figure(figsize=(10, 5), dpi=80)
17 ax = plt.subplot(1, 1, 1)
18 word_cloud.generate(text)
19 plt.imshow(word_cloud)
20 plt.show()



# Scatter plot with a fitted regression line: comment count against price.
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings(action="ignore")  # silence all warnings from here on

# read_excel already returns a DataFrame.
four = pd.read_excel('C:/Users/TR/Desktop/Feizhu_synthesize.xlsx')
sns.regplot(data=four, x='price', y='coumm', color='r')


 1 # 线性回归方程
 2 from sklearn import datasets
 3 from sklearn.linear_model import LinearRegression
 4 import pandas as pd
 5 import numpy as np
 6 import seaborn as sns
 7 predict_model = LinearRegression()
 8 three=pd.DataFrame(pd.read_excel('C:/Users/TR/Desktop/Feizhu_synthesize.xlsx'))
 9 X = three['price'].values
10 X = X.reshape(-1,1)
11 predict_model.fit(X , three['coumm'])
12 np.set_printoptions(precision = 3, suppress = True)
13 a = predict_model.coef_
14 b = predict_model.intercept_
15 print("回归方程系数{}".format(predict_model.coef_))
16 print("回归方程截距{0:2f}".format(predict_model.intercept_))
17 print("线性回归预测模型表达式为{}*x+{}".format(predict_model.coef_,predict_model.intercept_))


