Kobe Bryant Shot Selection---1

import pandas as pd  # local import: `pd` is used here before the file's own pandas import

# Load the shot log from data.csv into a DataFrame.
data = pd.read_csv('data.csv')
# DataFrame.info() prints the row count, column count and per-column dtypes
# itself and returns None, so it is called directly; wrapping it in print()
# only appended a stray "None" line to the output.
data.info()

 

# Distinct values that occur in the shot_type column.
{*data['shot_type']}

# Distinct values of the shot_type column together with how often each occurs.
data.loc[:, 'shot_type'].value_counts()

 

######### Keep every row whose shot_made_flag column is not null #########

import pandas as pd

raw = pd.read_csv(filename)

# Boolean-mask selection: rows with a missing shot_made_flag are left out.
nona = raw.loc[raw['shot_made_flag'].notnull()]

 

############## matplotlib example ##############

alpha = 0.02  # opacity: 0 is fully transparent, 1 is fully opaque
plt.figure(figsize=(10, 10))  # figure size

# Left panel: loc_x against loc_y.
plt.subplot(1, 2, 1)
plt.scatter(nona['loc_x'], nona['loc_y'], alpha=alpha, color='blue')
plt.title('loc_x and loc_y')

# Right panel: lon against lat.
plt.subplot(1, 2, 2)
plt.scatter(nona['lon'], nona['lat'], alpha=alpha, color='green')
plt.title('lat and lon')

 

############## Convert the x/y coordinates to polar coordinates ##############
import numpy as np  # local import: no numpy import is visible before this point

# Radial distance from the origin of the loc_x / loc_y plane.
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

loc_x_zero = raw['loc_x'] == 0  # boolean mask of the rows where loc_x is 0
# Seed the whole column with pi/2, the value used where loc_x == 0 (there
# loc_y / loc_x would divide by zero). Seeding with a float also makes the
# column a float column from the start instead of an integer one.
raw['angle'] = np.pi / 2
# Write through .loc on the DataFrame itself. The previous chained form
# raw['angle'][mask] = ... assigns into an intermediate Series, which pandas
# does not guarantee to propagate back into `raw`.
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)

 

# minutes_remaining and seconds_remaining both describe time, so they are
# combined into one column: minutes * 60 + seconds.
raw['remaining_time'] = raw['seconds_remaining'] + 60 * raw['minutes_remaining']

 

# Show the distinct values of action_type, combined_shot_type and shot_type.
for column in ('action_type', 'combined_shot_type', 'shot_type'):
    print(nona[column].unique())

 

# From a season label such as "2000-01", keep the part after the '-' as an int.
raw['season'] = raw['season'].map(lambda label: int(label.split('-')[1]))
raw['season'].unique()

 

# Discard the attributes that are not used.
drops = [
    'shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range',
    'shot_zone_basic', 'matchup', 'lon', 'lat', 'seconds_remaining',
    'minutes_remaining', 'shot_distance', 'loc_x', 'loc_y', 'game_event_id',
    'game_id', 'game_date',
]
# One keyword-based call drops all listed columns. The former per-column
# `raw.drop(name, 1)` passed the axis positionally, which pandas 2.0 no
# longer accepts.
raw = raw.drop(columns=drops)

 

# Turn the listed variables into dummy (indicator) variables.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    # Append the indicator columns generated from `var` (prefixed with its
    # name), then remove the source column. `axis` / `columns` are passed by
    # keyword because pandas 2.0 rejects the positional form `..., 1`.
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(columns=var)

 

Dummy Variable
Dummy Variable 中文称哑变量,或者称虚拟变量(这个名字总觉得怪怪的),指的是反映属性的一种变量。
哑变量的值通常取0或者1, 比如 0表示非本科学位, 1表示本科学位。
在很多数据处理中,我们都需要对数据进行哑变量处理。例如,某个数据中的月份用1-12进行表示,但月份的取值本身并没有数值大小上的意义:如果直接当作数值使用,就等于认为2月份比1月份"大",这显然是不合理的。
常用哑变量:日期,月份等

 

# Turn the listed variables into dummy (indicator) variables.
# A random-forest classifier is used later, so string-valued data must be
# converted to dummy variables first.
# The newly generated dummy columns are appended to the data, and the column
# they were generated from is then removed.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    if var not in raw.columns:
        # The same encoding appears earlier in this file; once it has run the
        # source column is gone, so skip it instead of raising KeyError.
        continue
    # `axis` / `columns` are passed by keyword because pandas 2.0 rejects the
    # positional form `..., 1`.
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(columns=var)

 

# Show the names of all columns of the DataFrame.
raw.columns.to_numpy()

# Show how many columns the DataFrame has.
raw.shape[1]

 

# Split the training data into features (`train`) and labels (`train_y`).
# NOTE(review): `df` is not defined in the visible code — presumably the rows
# of `raw` whose shot_made_flag is known; confirm before running.
# `columns=` replaces the positional axis argument `drop(name, 1)`, which
# pandas 2.0 no longer accepts.
train = df.drop(columns='shot_made_flag')
train_y = df['shot_made_flag']

 

#模型评估方法为对数损失函数logloss

import scipy as sp
def logloss(act, pred):
    """Return the mean binary log loss of `pred` measured against `act`.

    Args:
        act: One-dimensional array-like of true labels (0 or 1).
        pred: Array-like of predicted probabilities of label 1, aligned
            element by element with `act`.

    Returns:
        The negated average of act*log(pred) + (1-act)*log(1-pred) as a float.

    Raises:
        ValueError: If `act` contains no samples.
    """
    # NumPy is called directly: the NumPy aliases in the top-level scipy
    # namespace (sp.maximum, sp.log, ...) are deprecated and no longer
    # available in recent SciPy releases.
    import numpy as np

    act = np.asarray(act, dtype=float)
    pred = np.asarray(pred, dtype=float)
    if act.size == 0:
        raise ValueError("logloss requires at least one sample")

    # Keep predictions strictly inside (0, 1) so that log() stays finite.
    epsilon = 1e-15
    pred = np.clip(pred, epsilon, 1 - epsilon)

    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return float(ll * -1.0 / len(act))

######################模型训练部分######################

import time

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

#随机森林原理 https://blog.csdn.net/y0367/article/details/51501780
#K-Fold原理 https://blog.csdn.net/qq_16949707/article/details/79080432

# Hyper-parameter search for RandomForestClassifier.
# Every candidate value is scored with the mean log loss over 10 shuffled
# K-Fold splits of (train, train_y); the candidate with the lowest score wins.


def _cv_logloss(model, features, labels, n_folds=10):
    """Return the log loss of `model` averaged over `n_folds` shuffled K-Fold splits.

    Args:
        model: Classifier exposing fit() and predict_proba().
        features: DataFrame of input columns, indexed positionally via .iloc.
        labels: Series of targets aligned with `features`.
        n_folds: Number of folds.

    Returns:
        The sum of each fold's logloss() divided by `n_folds`.
    """
    score = 0.
    # sklearn.model_selection.KFold takes n_splits and yields index arrays
    # from .split(); the older KFold(len(X), n_folds=...) form no longer exists.
    for train_k, test_k in KFold(n_splits=n_folds, shuffle=True).split(features):
        model.fit(features.iloc[train_k], labels.iloc[train_k])
        # Log loss is defined on probabilities, so score the probability of
        # the second class in model.classes_ rather than hard predict() labels
        # (which logloss would clip to 1e-15 / 1 - 1e-15).
        # NOTE(review): assumes labels are 0/1 and both occur in every training fold.
        pred = model.predict_proba(features.iloc[test_k])[:, 1]
        score += logloss(labels.iloc[test_k], pred) / n_folds
    return score


# Find the best n_estimators for RandomForestClassifier.
# Candidates come from np.logspace(0, 2, num=3): 1, 10, 100.
print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3).astype(int)
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()

    rfc = RandomForestClassifier(n_estimators=n)
    rfc_score = _cv_logloss(rfc, train, train_y)
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n

    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2-t1))
print(best_n, min_score)

# Find the best max_depth for RandomForestClassifier, using the n_estimators
# selected above. Candidates are again 1, 10, 100.
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()

    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    rfc_score = _cv_logloss(rfc, train, train_y)
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m

    t2 = time.time()
    # `m` is a depth, not a tree count, so the progress message says so.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2-t1))
print(best_m, min_score)

# Plot the recorded scores to check that the selected n_estimators and
# max_depth really are the candidates with the lowest log loss.
plt.figure(figsize=(10, 5))

# Left panel: score per number of trees.
plt.subplot(1, 2, 1)
plt.plot(range_n, scores_n)
plt.xlabel('number of trees')
plt.ylabel('score')

# Right panel: score per max depth.
plt.subplot(1, 2, 2)
plt.plot(range_m, scores_m)
plt.xlabel('max depth')
plt.ylabel('score')

# Build the final model with the selected hyper-parameters.
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train, train_y)
# predict_proba returns one column per class; only the column of the second
# class in model.classes_ is kept, so exactly one value per row is written
# below (assigning the full 2-column array to a single column fails).
# NOTE(review): `submission` is not defined in the visible code — presumably
# the rows to predict, with the same feature columns as `train`; confirm.
pred = model.predict_proba(submission)[:, 1]

# Write the predictions into the submission file.
sub = pd.read_csv("data/sample_submission.csv")
sub['shot_made_flag'] = pred
sub.to_csv("data/real_submission.csv", index=False)

posted on 2019-02-21 15:37  wangzhonghan  阅读(383)  评论(0)    收藏  举报

导航