Python共享单车数据分析

一、选题背景:

共享单车自2015年起在国内掀起热潮,目前已逐渐成为一种新的出行方式。在日常的短途出行和代步出行中,人们更倾向于选择共享单车:这样既免去了等车的时间和花费,又方便快捷、经济实惠。共享单车的出现为我们的生活带来了诸多便利。

二、数据说明:

Kaggle 的 Bike Sharing Demand 项目提供了美国某城市2011年到2012年的共享单车数据集,该数据集包含租车日期、季节、气温、空气湿度等字段。

三、实施过程及代码:

导入库

1 import numpy as np
2 import pandas as pd
3 import calendar
4 import seaborn as sn
5 import matplotlib.pyplot as plt

查看数据大小

# Read the training and test sets, then report each one's (rows, columns) shape.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
print('训练数据集:', train.shape, '测试数据集:', test.shape)

# Preview the first five rows of the training set.
train.head(5)

 

# Preview the first five rows of the test set.
test.head(5)

 

 

比较上述的两个表,我们可以知道test比train少了“casual”,“registered”,“count”三个字段

查看数据总体情况

# Column names, dtypes and non-null counts of both data sets.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called as a statement; wrapping it in print() would emit both reports
# first and then a label line that reads "... None ... None".
print('训练集数据信息: ')
train.info()
print('测试集数据信息: ')
test.info()

 

数据清洗

时间特征处理

 # Time-feature engineering: derive date parts from the raw `datetime` strings.
# Take an explicit copy so the new columns are added to an independent frame
# instead of a slice of `train` (the old `is_copy = None` trick no longer
# exists in current pandas and does not silence the chained-assignment warning).
periodDf = train[['datetime', 'season', 'holiday', 'workingday', 'count']].copy()

# Split each timestamp on whitespace into its date and time halves.
stamp_parts = periodDf['datetime'].str.split()
periodDf['date'] = stamp_parts.str[0]
periodDf['time'] = stamp_parts.str[1]

# year / month / day / hour stay as the original strings so that
# group-by order and plot positions downstream are unchanged.
date_parts = periodDf['date'].str.split('-')
periodDf['year'] = date_parts.str[0]
periodDf['month'] = date_parts.str[1]
periodDf['day'] = date_parts.str[2]
periodDf['hour'] = periodDf['time'].str.split(':').str[0]

# Day of the week (Monday=0 ... Sunday=6); parse the whole column once
# instead of calling pd.to_datetime row by row.
periodDf['weekday'] = pd.to_datetime(periodDf['datetime']).dt.weekday

# Inspect the result.
periodDf.head()

 

绘图

 # Figure 1: total rentals per calendar month, one area per year.
fig1 = plt.figure(figsize=(16, 4))
ax1 = plt.subplot(111)
# Aggregate only `count` (the frame also holds string columns that must not be
# summed). unstack() moves the inner index level (year) into the columns,
# giving a month x year table.
df1 = periodDf.groupby(['month', 'year'])['count'].sum().unstack()
df1.plot(kind='area', ax=ax1, alpha=0.6)
ax1.set_title('2011-2012 bikes sharing demand by month')
ax1.set_xlabel('Figure 1')
ax1.set_xticks(list(range(12)))
ax1.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June',
                     'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
ax1.set_xlim(0, 11)

 

 通过上图分析:我们可以看到2012年共享单车的租借数量比2011年是有提升的,一年中6-10月是租借的高峰期。

节假日和非节假日租车情况

# Figure 2: distribution of rental counts on holidays versus non-holidays.
fig2, ax2 = plt.subplots(figsize=(16, 6))
df2 = periodDf.loc[:, ['count', 'holiday']]
df2.boxplot(by='holiday', ax=ax2)
ax2.set_title('2011-2012 bike sharing demand by holiday')
ax2.set_xlabel('Figure 2')
ax2.set_xticklabels(['Non holiday', 'holiday'], rotation='horizontal')
ax2.set_ylim(0, 800)

 工作日和周末的租车情况

# Figure 3: distribution of rental counts for each day of the week.
fig3, ax3 = plt.subplots(figsize=(16, 6))
df3 = periodDf.loc[:, ['count', 'weekday']]
df3.boxplot(by='weekday', ax=ax3)
ax3.set_title('2011-2012 bike sharing demand by weekday')
ax3.set_xlabel('Figure 3')
ax3.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation='horizontal')
ax3.set_ylim(0, 800)

租车数量随季节变化趋势 

 # Hourly demand profiles. Only `count` is averaged: periodDf also holds string
# columns, and averaging those raises TypeError on pandas >= 2.0.

# Figure 4: mean rentals per hour, one line per season.
fig4 = plt.figure(figsize=(14, 4))
ax4 = plt.subplot(111)
df4 = periodDf.groupby(['hour', 'season'])['count'].mean().unstack()
# Season codes are relabelled positionally, in sorted column order.
df4.columns = ['Spring', 'Summer', 'Fall', 'Winter']
df4.plot(ax=ax4, style='--.')
ax4.set_title('2011-2012 bike sharing demand by hours')
ax4.set_xlabel('Figure 4')
ax4.set_xticks(list(range(24)))
ax4.set_xticklabels(list(range(24)))
ax4.set_xlim(0, 23)

# Figure 5: mean rentals per hour, split by holiday flag and by working-day flag.
fig5 = plt.figure(figsize=(14, 4))
ax5 = plt.subplot(111)
df51 = (periodDf.groupby(['hour', 'holiday'])['count'].mean().unstack()
        .rename(columns={0: 'Non holiday', 1: 'holiday'}))
df52 = (periodDf.groupby(['hour', 'workingday'])['count'].mean().unstack()
        .rename(columns={0: 'weekend', 1: 'workingday'}))
df51.plot(ax=ax5, style=':,')
df52.plot(ax=ax5, style='-o')
ax5.set_title('2011-2012 bike sharing demand by hours')
ax5.set_xlabel('figure 5')
ax5.set_xticks(list(range(24)))
ax5.set_xticklabels(list(range(24)))
ax5.set_xlim(0, 23)
ax5.legend()
plt.show()

 

非时间特征处理

# Weather, temperature, humidity and wind-speed columns, plus the derived hour.
climate_columns = ['weather', 'temp', 'atemp', 'humidity', 'windspeed', 'count']
climateDf = pd.concat([train[climate_columns], periodDf['hour']], axis=1)
 # Effect of weather and wind speed on rentals: totals on the left axis,
# per-record means on the secondary (right) axis.
fig, (ax6, ax7) = plt.subplots(1, 2, figsize=(20, 6))

# Aggregate only `count`: climateDf also carries the string column `hour`,
# which cannot be averaged. reset_index() gives positions 0..n-1 so that the
# bars and the mean line share the same x coordinates.
df1 = climateDf.groupby('weather')['count'].agg(['sum', 'mean']).reset_index()
df1['sum'].plot(kind='bar', width=0.4, ax=ax6, alpha=0.6, label='')
df1['mean'].plot(style='r-', alpha=0.6, ax=ax6, secondary_y=True, label='mean')
ax6.set_xlabel('weather')
ax6.set_xticks(df1.index)
ax6.set_xticklabels(['sunny&cloudy', 'Fog&overcast', 'light rain&light snow', 'bad weather'],
                    rotation='horizontal')
ax6.set_ylabel('total')
ax6.right_ax.set_ylabel('mean')
ax6.set_title('2011-2012 bike sharing demand by weather')

# Keep windspeed as the index so the x axis shows actual wind speeds rather
# than row positions.
df2 = climateDf.groupby('windspeed')['count'].agg(['sum', 'mean'])
df2['sum'].plot(kind='area', ax=ax7, alpha=0.7, color='orange', label='')
df2['mean'].plot(style='-', alpha=0.6, color='red', ax=ax7, secondary_y=True, label='mean')
ax7.set_xlabel('windspeed')
ax7.set_ylabel('total')
ax7.right_ax.set_ylabel('mean')
ax7.set_title('2011-2012 bike sharing demand by windspeed')
plt.show()

 

 

左图的柱状图反映了不同天气下租车总数的变化,大雨、大雪、大雾这类恶劣天气下的租车总数最低。折线图反映了各种天气下的平均租车数量,异常的是,平均数量在恶劣天气下反而显著增加;

右图反映了随着风速变大,租车的总数量趋向于0,但高风速下的平均租车数却最高。

# Rows recorded under weather category 4.
train.loc[train['weather'] == 4]

 

# Rows with a wind speed above 50.
train.loc[train['windspeed'] > 50]

 

通过查看原始数据发现,这种极端情况的数据仅为个例,样本太少使得平均值不具代表性,所以造成了上述异常现象

湿度、温度对租车数量的影响

 # Effect of humidity and temperature on rentals (marker size and colour both
# scale with the rental count).
# plt.subplots returns (figure, axes); unpack it so `fig` really is the Figure.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

df1 = climateDf[['humidity', 'count']]
ax1.scatter(df1['humidity'], df1['count'], s=df1['count'] / 5, c=df1['count'],
            marker='.', alpha=0.6)
ax1.set_title('2011-2012 bike sharing demand by humidity')
ax1.set_xlabel('humidity')
ax1.set_ylabel('count')

df2 = climateDf[['temp', 'count']]
# Size and colour come from this panel's own frame (previously borrowed from df1).
ax2.scatter(df2['temp'], df2['count'], s=df2['count'] / 5, c=df2['count'],
            marker='.', alpha=0.6)
ax2.set_title('2011-2012 bike sharing demand by temperature')
ax2.set_xlabel('temperature')
ax2.set_ylabel('count')
plt.show()

 

 

租车最适宜的湿度在30-40附近;最适宜的温度在25-30左右,温度过高时租车数量反而减少

租车数量和其它变量的相关性

# Correlation between the rental count and every other variable.
# The derived time parts are stored as strings, so cast them to int first and
# select them by name rather than by position.
time_parts = periodDf[['year', 'month', 'day', 'hour', 'weekday']].astype(int)
# train.iloc[:, 1:] drops train's first column (presumably the raw `datetime`
# string, which cannot be correlated - confirm against the CSV layout).
df = pd.concat([time_parts, train.iloc[:, 1:]], axis=1)
corrDf = df.corr()
# Hide the strictly upper triangle with an explicit boolean mask (True = hidden).
# Deriving the mask from the correlation values themselves would leave any
# cell whose correlation is exactly 0 visible.
mask = np.triu(np.ones(corrDf.shape, dtype=bool), k=1)
fig = plt.figure(figsize=(15, 15))
sn.heatmap(corrDf, mask=mask, annot=True, square=True)
plt.show()

 

 

完整代码

  # Complete script: exploratory analysis of the Kaggle Bike Sharing Demand data.
import numpy as np
import pandas as pd
import calendar
import seaborn as sn
import matplotlib.pyplot as plt

# Load the data and report each set's (rows, columns) shape.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print('训练数据集:', train.shape, '测试数据集:', test.shape)

# Preview the data (only displayed when run interactively).
train.head()
test.head()

# Column names, dtypes and non-null counts.
# DataFrame.info() prints its own report and returns None, so it is called as
# a statement instead of being passed to print().
print('训练集数据信息: ')
train.info()
print('测试集数据信息: ')
test.info()


# Time-feature engineering.
# Take an explicit copy so the new columns are added to an independent frame
# instead of a slice of `train`.
periodDf = train[['datetime', 'season', 'holiday', 'workingday', 'count']].copy()
# Split each timestamp on whitespace into its date and time halves.
stamp_parts = periodDf['datetime'].str.split()
periodDf['date'] = stamp_parts.str[0]
periodDf['time'] = stamp_parts.str[1]
# year / month / day / hour stay as the original strings so that group-by
# order and plot positions below are unchanged.
date_parts = periodDf['date'].str.split('-')
periodDf['year'] = date_parts.str[0]
periodDf['month'] = date_parts.str[1]
periodDf['day'] = date_parts.str[2]
periodDf['hour'] = periodDf['time'].str.split(':').str[0]
# Day of the week (Monday=0 ... Sunday=6), parsed once for the whole column.
periodDf['weekday'] = pd.to_datetime(periodDf['datetime']).dt.weekday
periodDf.head()


# Figure 1: total rentals per calendar month, one area per year.
fig1 = plt.figure(figsize=(16, 4))
ax1 = plt.subplot(111)
# Aggregate only `count`; unstack() moves the inner index level (year) into
# the columns, giving a month x year table.
df1 = periodDf.groupby(['month', 'year'])['count'].sum().unstack()
df1.plot(kind='area', ax=ax1, alpha=0.6)
ax1.set_title('2011-2012 bikes sharing demand by month')
ax1.set_xlabel('Figure 1')
ax1.set_xticks(list(range(12)))
ax1.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June',
                     'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
ax1.set_xlim(0, 11)


# Figure 2: rental counts on holidays versus non-holidays.
fig2 = plt.figure(figsize=(16, 6))
ax2 = plt.subplot(111)
df2 = periodDf[['count', 'holiday']]
df2.boxplot(by='holiday', ax=ax2)
ax2.set_title('2011-2012 bike sharing demand by holiday')
ax2.set_xlabel('Figure 2')
ax2.set_xticklabels(['Non holiday', 'holiday'], rotation='horizontal')
ax2.set_ylim(0, 800)


# Figure 3: rental counts for each day of the week.
fig3 = plt.figure(figsize=(16, 6))
ax3 = plt.subplot(111)
df3 = periodDf[['count', 'weekday']]
df3.boxplot(by='weekday', ax=ax3)
ax3.set_title('2011-2012 bike sharing demand by weekday')
ax3.set_xlabel('Figure 3')
ax3.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation='horizontal')
ax3.set_ylim(0, 800)


# Figure 4: mean rentals per hour, one line per season.
# Only `count` is averaged: periodDf also holds string columns, and averaging
# those raises TypeError on pandas >= 2.0.
fig4 = plt.figure(figsize=(14, 4))
ax4 = plt.subplot(111)
df4 = periodDf.groupby(['hour', 'season'])['count'].mean().unstack()
# Season codes are relabelled positionally, in sorted column order.
df4.columns = ['Spring', 'Summer', 'Fall', 'Winter']
df4.plot(ax=ax4, style='--.')
ax4.set_title('2011-2012 bike sharing demand by hours')
ax4.set_xlabel('Figure 4')
ax4.set_xticks(list(range(24)))
ax4.set_xticklabels(list(range(24)))
ax4.set_xlim(0, 23)


# Figure 5: mean rentals per hour, split by holiday flag and by working-day flag.
fig5 = plt.figure(figsize=(14, 4))
ax5 = plt.subplot(111)
df51 = (periodDf.groupby(['hour', 'holiday'])['count'].mean().unstack()
        .rename(columns={0: 'Non holiday', 1: 'holiday'}))
df52 = (periodDf.groupby(['hour', 'workingday'])['count'].mean().unstack()
        .rename(columns={0: 'weekend', 1: 'workingday'}))
df51.plot(ax=ax5, style=':,')
df52.plot(ax=ax5, style='-o')
ax5.set_title('2011-2012 bike sharing demand by hours')
ax5.set_xlabel('figure 5')
ax5.set_xticks(list(range(24)))
ax5.set_xticklabels(list(range(24)))
ax5.set_xlim(0, 23)
ax5.legend()
plt.show()


# Weather, temperature, humidity and wind-speed columns.
climateDf = train[['weather', 'temp', 'atemp', 'humidity', 'windspeed', 'count']]


# Effect of weather and wind speed on rentals: totals on the left axis,
# per-record means on the secondary (right) axis.
fig, (ax6, ax7) = plt.subplots(1, 2, figsize=(20, 6))
# reset_index() gives positions 0..n-1 so that the bars and the mean line
# share the same x coordinates.
df1 = climateDf.groupby('weather')['count'].agg(['sum', 'mean']).reset_index()
df1['sum'].plot(kind='bar', width=0.4, ax=ax6, alpha=0.6, label='')
df1['mean'].plot(style='r-', alpha=0.6, ax=ax6, secondary_y=True, label='mean')
ax6.set_xlabel('weather')
ax6.set_xticks(df1.index)
ax6.set_xticklabels(['sunny&cloudy', 'Fog&overcast', 'light rain&light snow', 'bad weather'],
                    rotation='horizontal')
ax6.set_ylabel('total')
ax6.right_ax.set_ylabel('mean')
ax6.set_title('2011-2012 bike sharing demand by weather')
# Keep windspeed as the index so the x axis shows actual wind speeds rather
# than row positions.
df2 = climateDf.groupby('windspeed')['count'].agg(['sum', 'mean'])
df2['sum'].plot(kind='area', ax=ax7, alpha=0.7, color='orange', label='')
df2['mean'].plot(style='-', alpha=0.6, color='red', ax=ax7, secondary_y=True, label='mean')
ax7.set_xlabel('windspeed')
ax7.set_ylabel('total')
ax7.right_ax.set_ylabel('mean')
ax7.set_title('2011-2012 bike sharing demand by windspeed')
plt.show()

# Attach the derived hour column, then inspect the extreme-condition rows
# behind the odd-looking means (only displayed when run interactively).
climateDf = pd.concat([climateDf, periodDf['hour']], axis=1)
train[train['weather'] == 4]
train[train['windspeed'] > 50]


# Effect of humidity and temperature on rentals (marker size and colour both
# scale with the rental count).
# plt.subplots returns (figure, axes); unpack it so `fig` really is the Figure.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
df1 = climateDf[['humidity', 'count']]
ax1.scatter(df1['humidity'], df1['count'], s=df1['count'] / 5, c=df1['count'],
            marker='.', alpha=0.6)
ax1.set_title('2011-2012 bike sharing demand by humidity')
ax1.set_xlabel('humidity')
ax1.set_ylabel('count')
df2 = climateDf[['temp', 'count']]
# Size and colour come from this panel's own frame (previously borrowed from df1).
ax2.scatter(df2['temp'], df2['count'], s=df2['count'] / 5, c=df2['count'],
            marker='.', alpha=0.6)
ax2.set_title('2011-2012 bike sharing demand by temperature')
ax2.set_xlabel('temperature')
ax2.set_ylabel('count')
plt.show()


# Correlation between the rental count and every other variable.
# The derived time parts are stored as strings, so cast them to int first and
# select them by name rather than by position.
time_parts = periodDf[['year', 'month', 'day', 'hour', 'weekday']].astype(int)
# train.iloc[:, 1:] drops train's first column (presumably the raw `datetime`
# string - confirm against the CSV layout).
df = pd.concat([time_parts, train.iloc[:, 1:]], axis=1)
corrDf = df.corr()
# Hide the strictly upper triangle with an explicit boolean mask (True = hidden).
mask = np.triu(np.ones(corrDf.shape, dtype=bool), k=1)
fig = plt.figure(figsize=(15, 15))
sn.heatmap(corrDf, mask=mask, annot=True, square=True)
plt.show()

总结

经过上述的可视化分析,我们对共享单车租车数据有了大致的把握,对数据特征之间的关系有了初步的了解。季节、小时、月份、工作日非工作日、天气状况、温度、湿度、风速等特征对总体需求量有相关性。总的来说,效果还是蛮不错的,后面会再多加强这方面的知识。

 

posted @ 2021-06-24 01:00  haniooo  阅读(1079)  评论(1)  编辑  收藏  举报