Python数据分析-Kobe Bryant生涯数据读取及分析
1.将数据(csv格式)导入jupyter
import pandas as pd
import matplotlib.pyplot as plt
# Load the CSV file into a DataFrame.
filename = 'data.csv'
raw = pd.read_csv(filename)  # read_csv is a function and must be called with parentheses
print(raw.shape)  # (rows, columns)
raw.head()  # preview the first few rows
2.去掉某一列的空值
# Keep only the rows whose 'shot_made_flag' value is not null.
has_flag = pd.notnull(raw['shot_made_flag'])
kobe = raw[has_flag]
print(kobe.shape)
3.用matplotlib画图
# Draw two side-by-side scatter plots of the shot positions.
alpha = 0.02  # point opacity; the smaller the value, the more transparent the point
plt.figure(figsize=(10, 10))

plt.subplot(121)  # 1 row x 2 columns, first panel
plt.scatter(kobe.loc_x, kobe.loc_y, color='blue', alpha=alpha)
plt.title('loc_x and loc_y')

plt.subplot(122)  # 1 row x 2 columns, second panel
# The panel is titled 'lat and lon', so plot those columns rather than
# repeating loc_x/loc_y.
# NOTE(review): assumes the CSV provides 'lon' and 'lat' columns - confirm.
plt.scatter(kobe.lon, kobe.lat, color='green', alpha=alpha)
plt.title('lat and lon')
4.打印唯一值
# Print the distinct values of each shot-description column.
for column in ('action_type', 'combined_shot_type', 'shot_type'):
    print(kobe[column].unique())
# Number of occurrences of each distinct 'shot_type' value.
shot_type_counts = kobe['shot_type'].value_counts()
print(shot_type_counts)
5.数据预处理
# Inspect the distinct season labels before conversion.
kobe['season'].unique()


def _season_suffix(label):
    """Return the part of `label` after the first '-' as an int."""
    parts = label.split('-')
    return int(parts[1])


# Replace each season label in `raw` with its integer suffix, then inspect the result.
raw['season'] = raw['season'].apply(_season_suffix)
raw['season'].unique()
6.取关键字段做分析
# Show the 'matchup' and 'opponent' columns side by side.
pd.DataFrame({
    'matchup': kobe['matchup'],
    'opponent': kobe['opponent'],
})
7.线性关系预测
# Scatter 'dist' against 'shot_distance' to look for a linear relationship.
# NOTE(review): no visible code creates a 'dist' column on `raw`; this assumes
# the CSV already provides it - confirm.
plt.figure(figsize=(5, 5))
x_values = raw.dist
y_values = raw.shot_distance
plt.scatter(x_values, y_values, color='blue')
plt.title('dist and shot_distance')
8.列值计算&散点图着色
# Group the shots by 'shot_zone_area' and report the per-value row counts
# followed by the number of groups.
gs = kobe.groupby('shot_zone_area')
area_counts = kobe['shot_zone_area'].value_counts()
print(area_counts)
group_total = len(gs)
print(group_total)
import matplotlib.cm as cm  # colormaps
import numpy as np

plt.figure(figsize=(20, 10))


def scatter_plot_by_category(feat):
    """Scatter loc_x against loc_y on the current axes, coloured by `feat`.

    The rows of `kobe` are grouped by the column named `feat`; each group is
    drawn with its own colour taken from the rainbow colormap.

    Args:
        feat: Name of the `kobe` column to group by.
    """
    alpha = 0.1
    gs = kobe.groupby(feat)
    # np.linspace returns len(gs) evenly spaced values in [0, 1]; the colormap
    # turns each value into one colour, so every group gets a distinct colour.
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))
    # Iterating a groupby yields (key, sub-frame) pairs; only the frame is used.
    for (_, group), colour in zip(gs, cs):
        plt.scatter(group.loc_x, group.loc_y, color=colour, alpha=alpha)


# One panel per zone feature: 1 row x 3 columns, panels 1 to 3.
# shot_zone_area
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')

# shot_zone_basic
plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')

# shot_zone_range
plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')
9.离散特征编码
# Count each 'combined_shot_type' value, then preview the first two rows of
# its one-hot (dummy) encoding.
shot_type_column = raw['combined_shot_type']
print(shot_type_column.value_counts())
shot_type_dummies = pd.get_dummies(shot_type_column, prefix='combined_shot_type')
shot_type_dummies[0:2]
10.列合并
# Replace each categorical column of `raw` with its one-hot (dummy) columns.
categorical_vars = [
    'action_type',
    'combined_shot_type',
    'shot_type',
    'opponent',
    'period',
    'season',
]
for var in categorical_vars:
    dummies = pd.get_dummies(raw[var], prefix=var)
    # `axis` is passed by keyword: recent pandas releases reject a positional axis.
    raw = pd.concat([raw, dummies], axis=1)
    raw = raw.drop(var, axis=1)
11.区分测试数据和训练数据
# Split `raw` by whether 'shot_made_flag' is present: rows with a value form
# the training set, rows without one form the test set.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
# Read the label BEFORE dropping the column; reading it afterwards raises KeyError.
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)

test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)
posted on 2017-05-06 20:44 Lily_maybe1994 阅读(691) 评论(0) 收藏 举报
浙公网安备 33010602011771号