# Kaggle competition: Hotel Booking Prediction -- a translated walk-through of a popular public notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
import folium
from folium.plugins import HeatMap
import plotly.express as px
plt.style.use('fivethirtyeight')
%matplotlib inline
# Allow up to 32 columns in DataFrame output so the middle columns are not
# collapsed into "...".
pd.set_option('display.max_columns', 32)

# Load the raw bookings table and take a first look at it.
df = pd.read_csv(r"C:\Users\14034\Desktop\hotel_bookings.csv")
df.head()      # first five rows
df.describe()  # per-column summary statistics
df.info()      # per-column non-null count and dtype

# Missing-value summary per column: absolute count and share of rows,
# most-affected columns first.
Total = df.isnull().sum().sort_values(ascending=False)
Percent = (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False)
data = pd.concat([Total, Percent], axis=1)

# A missing value is treated as "none" here, so fill every gap with 0.
df.fillna(0, inplace=True)

# Bookings with no adults, no children and no babies have nobody staying;
# they are removed.
# NOTE(review): `filter` shadows the builtin of the same name. The name is kept
# because later cells may reference it -- consider renaming once confirmed.
filter = (df.children == 0) & (df.adults == 0) & (df.babies == 0)
df = df[~filter]  # keep only the rows NOT flagged by the mask
# Origin of the guests whose booking was not cancelled: one row per country
# with the number of such bookings.
country_wise_guests = df[df['is_canceled'] == 0]['country'].value_counts().reset_index()
country_wise_guests.columns = ['Country', 'Number']

basemap = folium.Map()

# Choropleth coloured by booking count. The column names must match the ones
# assigned just above.
# NOTE(review): plotly's default location mode expects ISO-3 country codes --
# confirm the `country` column uses them.
guests_map = px.choropleth(
    country_wise_guests,
    locations='Country',
    color='Number',
    hover_name='Country',
)
guests_map.show()
# Box plot of the `adr` price column per reserved room type (a categorical
# variable), split by hotel, using non-cancelled bookings only.
data = df[df['is_canceled'] == 0]
px.box(
    data_frame=data,
    x='reserved_room_type',
    y='adr',
    color='hotel',
    template='plotly_dark',
).show()  # explicit show(): a bare figure expression only renders as the last line of a cell
# Non-cancelled bookings, split by hotel type.
data_resort = df[(df['hotel'] == 'Resort Hotel') & (df['is_canceled'] == 0)]
data_city = df[(df['hotel'] == 'City Hotel') & (df['is_canceled'] == 0)]

# Mean `adr` per arrival month, one table per hotel type. The city table must
# be built from `data_city`, not from the resort rows.
resort_data = data_resort.groupby('arrival_date_month')['adr'].mean().reset_index()
city_data = data_city.groupby('arrival_date_month')['adr'].mean().reset_index()

# Join the two monthly tables on the month. The merge yields adr_x / adr_y, so
# the price columns get readable names; the key column keeps the name
# 'arrival_date_month' because the sorting and plotting steps below refer to it.
final_hotel = resort_data.merge(city_data, on='arrival_date_month')
final_hotel.columns = ['arrival_date_month', 'price_for_resort', 'price_for_city_hotel']
# The months above are not in calendar order, so the sort-dataframeby-monthorweek and sorted-months-weekdays packages are installed to sort them.
!pip install sort-dataframeby-monthorweek
!pip install sorted-months-weekdays
import sort_dataframeby_monthorweek as sd
def sort_month(df, column_name):
    """Sort ``df`` by the month values stored in ``column_name``.

    Thin wrapper around
    ``sort_dataframeby_monthorweek.Sort_Dataframeby_Month`` (imported as ``sd``).

    Args:
        df: DataFrame to sort.
        column_name: Name of the column that holds the month values.

    Returns:
        Whatever ``sd.Sort_Dataframeby_Month`` returns for these arguments --
        presumably the frame in calendar-month order; confirm against the
        package documentation.
    """
    return sd.Sort_Dataframeby_Month(df, column_name)


# Order the merged monthly price table (`final_hotel`) by month. `final_data`
# does not exist yet at this point and never carries the price columns.
final_prices = sort_month(final_hotel, 'arrival_date_month')
# Line chart of the monthly mean price, one line per hotel type.
# No plt.figure() here: Plotly does not draw on matplotlib figures, so that
# call would only leave an empty matplotlib canvas behind.
px.line(
    final_prices,
    x='arrival_date_month',
    y=['price_for_resort', 'price_for_city_hotel'],
).show()
# Number of non-cancelled bookings per arrival month, for each hotel type.
resort_number = data_resort['arrival_date_month'].value_counts().reset_index()
resort_number.columns = ['arrival_date_month', 'resorts']

# The city counts must come from `data_city`; counting `data_resort` twice
# would draw two identical lines.
city_number = data_city['arrival_date_month'].value_counts().reset_index()
city_number.columns = ['arrival_date_month', 'cities']

# One row per month with both counts, sorted by month.
all_data = resort_number.merge(city_number, on='arrival_date_month')
final_data = sort_month(all_data, 'arrival_date_month')

# Plotly figure; no matplotlib canvas is needed for it.
px.line(
    final_data,
    x='arrival_date_month',
    y=['cities', 'resorts'],
    title='Total no of guests per Months',
).show()
# Length of stay (weekend nights + week nights) for non-cancelled bookings.
# .copy() makes `data` an independent frame, so adding a column does not write
# into a slice of `df`.
data = df[df['is_canceled'] == 0].copy()
data['total'] = data['stays_in_weekend_nights'] + data['stays_in_week_nights']

# Count rows per (total nights, hotel) pair.
stay = data.groupby(['total', 'hotel']).agg('count').reset_index()

# Keep the two group keys plus one count column. Selecting by name rather than
# by position keeps this correct even if the column order changes.
final_data = stay[['total', 'hotel', 'is_canceled']]
final_data = final_data.rename(columns={'is_canceled': 'Number of stays'})

# Bar chart of the number of stays per length of stay, coloured by hotel.
px.bar(final_data, x='total', y=['Number of stays'], color='hotel').show()

# Large matplotlib canvas for the correlation heatmap drawn next.
plt.figure(figsize=(20, 12))
# Correlation matrix of the numeric columns. Non-numeric columns are excluded
# explicitly: recent pandas versions no longer drop them silently in corr()
# and raise on string data instead.
corr = df.select_dtypes(include='number').corr()

# Annotated heatmap of the correlation matrix.
sns.heatmap(corr, linewidths=1, annot=True)
# Columns dropped on the basis of the correlation matrix (low values).
# Each label is listed once.
useless_col = [
    'days_in_waiting_list',
    'arrival_date_year',
    'assigned_room_type',
    'booking_changes',
    'reservation_status',
    'country',
]
df.drop(useless_col, axis=1, inplace=True)
# Categorical part of the data: every object-dtype column of `df`.
cat_cols = [col for col in df.columns if df[col].dtype == 'O']
# .copy() so the new columns below are added to an independent frame, not to a
# slice of `df`.
cat_df = df[cat_cols].copy()

# Split the reservation status date into numeric year / month / day parts.
cat_df['reservation_status_date'] = pd.to_datetime(cat_df['reservation_status_date'])
cat_df['year'] = cat_df['reservation_status_date'].dt.year
cat_df['month'] = cat_df['reservation_status_date'].dt.month
cat_df['day'] = cat_df['reservation_status_date'].dt.day

# Drop the raw date and the month-name column. drop() returns a new frame, so
# the result has to be assigned back.
# NOTE(review): 'arrival of month' is not a column of this dataset;
# 'arrival_date_month' is assumed to be the intended one -- confirm.
cat_df = cat_df.drop(['reservation_status_date', 'arrival_date_month'], axis=1)

# Print the distinct values of each remaining column of cat_df.
for columns in cat_df.columns:
    print(f"{columns}\n{cat_df[columns].unique()}\n")

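# (Web-page footer picked up when the notebook was copied; not part of the code.) Zhejiang public network security filing No. 33010602011771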