Lily

Data analyst,Python, MFE,CUHK, CS,CSU. 10/21/1994

导航

Python数据分析-Kobe Bryant生涯数据读取及分析

1.将数据(csv格式)导入jupyter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Path of the CSV file to load, relative to the working directory.
filename = 'data.csv'

# read_csv is a function: it has to be called with parentheses.
raw = pd.read_csv(filename)

print(raw.shape)

raw.head()  # show the first few rows

2.去掉某一列的空值

# Keep only the rows whose 'shot_made_flag' value is present.
kobe = raw.loc[raw['shot_made_flag'].notna()]

print(kobe.shape)

3.用matplotlib画图

# Marker opacity: the smaller the value, the more transparent each point.
alpha = 0.02

plt.figure(figsize=(10, 10))

# Left panel (1 row, 2 columns, slot 1): loc_x against loc_y.
plt.subplot(121)
plt.scatter(kobe.loc_x, kobe.loc_y, color='blue', alpha=alpha)
plt.title('loc_x and loc_y')

# Right panel (1 row, 2 columns, slot 2): plot lon against lat so the data
# matches the panel title instead of repeating the loc_x/loc_y scatter.
# NOTE(review): assumes the CSV provides 'lon' and 'lat' columns - confirm.
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, color='green', alpha=alpha)
plt.title('lat and lon')

4.打印唯一值

# Print the distinct values found in each of these columns.
for column in ('action_type', 'combined_shot_type', 'shot_type'):
    print(kobe[column].unique())

# Number of occurrences of each shot_type value.
print(kobe['shot_type'].value_counts())

5.数据预处理

kobe['season'].unique()

# Replace every season string with the integer that follows its hyphen.
raw['season'] = raw['season'].map(lambda label: int(label.split('-')[1]))

raw['season'].unique()

6.取关键字段做分析

# Two-column frame pairing each row's 'matchup' with its 'opponent'.
pd.DataFrame({'matchup': kobe['matchup'], 'opponent': kobe['opponent']})

7.线性关系预测

# NOTE(review): nothing earlier in this file creates a 'dist' column, so
# `raw.dist` fails unless the CSV itself provides one. It is assumed here to
# be the straight-line distance of (loc_x, loc_y) from the origin - confirm.
# The guard leaves an existing 'dist' column untouched.
if 'dist' not in raw.columns:
    raw['dist'] = (raw['loc_x'] ** 2 + raw['loc_y'] ** 2) ** 0.5

plt.figure(figsize=(5, 5))

# Plot the two distance columns against each other to see how they relate.
plt.scatter(raw['dist'], raw['shot_distance'], color='blue')
plt.title('dist and shot_distance')

8.列值计算&散点图着色

# Group the rows by 'shot_zone_area', then print the size of every group
# and the number of groups.
gs = kobe.groupby('shot_zone_area')

print(kobe['shot_zone_area'].value_counts())

print(len(gs))

# Colormap module; the package name was previously misspelled "matpoltlib".
import matplotlib.cm as cm

plt.figure(figsize=(20, 10))

def scatter_plot_by_category(feat):
    """Scatter-plot loc_x/loc_y with one colour per distinct value of ``feat``.

    Groups the module-level ``kobe`` frame by the column ``feat`` and draws
    each group's ``loc_x``/``loc_y`` points through ``plt.scatter``.

    Args:
        feat: Name of the ``kobe`` column to group by.
    """
    alpha = 0.1
    gs = kobe.groupby(feat)

    # np.linspace (not "linespace") returns a 1-D array of evenly spaced
    # values; here it picks one rainbow colour per group.
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))

    # Iterating a groupby yields (key, sub-frame) pairs; only the frame is used.
    for (_, group), c in zip(gs, cs):
        plt.scatter(group.loc_x, group.loc_y, color=c, alpha=alpha)

# Three panels in a 1x3 grid. Each panel needs its own slot (131, 132, 133);
# reusing 131 for all of them would put every plot in the first slot.

# shot_zone_area
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')

# shot_zone_basic
plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')

# shot_zone_range
plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')

9.离散特征编码

# Frequency of each combined_shot_type value.
print(raw['combined_shot_type'].value_counts())

# One-hot encode the column and show the first two rows of the result.
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type').iloc[:2]

10.列合并

# Columns to replace with their one-hot indicator columns.
categorical_vars = [
    'action_type',
    'combined_shot_type',
    'shot_type',
    'opponent',
    'period',
    'season',
]

for var in categorical_vars:
    # Append the indicator columns, then remove the source column.
    # `axis` is passed by keyword because pandas 2.0 no longer accepts it
    # positionally in concat() or drop().
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(var, axis=1)

11.区分测试数据和训练数据

# Training rows: those whose 'shot_made_flag' is present.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]

# Read the label column BEFORE dropping it; the reverse order raises KeyError.
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)

# Test rows: those whose 'shot_made_flag' is missing.
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)

 

posted on 2017-05-06 20:44  Lily_maybe1994  阅读(691)  评论(0)    收藏  举报