Kobe Bryant Shot Selection---1
# Load the shot log and take a first look at it.
import pandas as pd  # needed here: this snippet runs before the later import

data = pd.read_csv('data.csv')
# info() prints the row count, column count and per-column dtypes by itself
# and returns None, so wrapping it in print() only appended a stray "None".
data.info()
# Distinct values that occur in the shot_type column.
set(data['shot_type'].tolist())
# Distinct values of shot_type together with how many rows carry each one.
data['shot_type'].value_counts()
# ---- Keep every row whose shot_made_flag column is not empty ----
import pandas as pd
raw = pd.read_csv(filename)
# notna() marks the rows that carry a label; .loc keeps exactly those rows.
nona = raw.loc[raw['shot_made_flag'].notna()]
# ---- matplotlib example: the same shots in two coordinate systems ----
alpha = 0.02  # marker opacity: 0 is fully transparent, 1 is fully opaque
plt.figure(figsize=(10, 10))  # canvas size
# One entry per panel: subplot position, x values, y values, colour, title.
panels = (
    (121, nona.loc_x, nona.loc_y, 'blue', 'loc_x and loc_y'),
    (122, nona.lon, nona.lat, 'green', 'lat and lon'),
)
for position, xs, ys, colour, caption in panels:
    plt.subplot(position)
    plt.scatter(xs, ys, color=colour, alpha=alpha)
    plt.title(caption)
# ---- Convert the Cartesian coordinates (loc_x, loc_y) to polar coordinates ----
# dist is the Euclidean distance of (loc_x, loc_y) from the origin.
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)
loc_x_zero = raw['loc_x'] == 0  # boolean mask of the rows where loc_x is 0
# Start from a float column holding pi/2, the value used wherever loc_x is 0
# (dividing by loc_x there is not possible).
raw['angle'] = np.pi / 2
# Fill the remaining rows through a single .loc assignment. The previous
# chained form raw['angle'][mask] = ... writes into an intermediate Series,
# which triggers SettingWithCopyWarning and is not guaranteed to reach `raw`.
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)
# ---- minutes_remaining and seconds_remaining both express time; combine them ----
# 60 * minutes + seconds, stored in the new remaining_time column.
raw['remaining_time'] = 60 * raw['minutes_remaining'] + raw['seconds_remaining']
# ---- Inspect the values of action_type, combined_shot_type and shot_type ----
for column in ('action_type', 'combined_shot_type', 'shot_type'):
    print(nona[column].unique())
# ---- Keep only the part after the '-' of labels such as 2000-01 ----
def _season_suffix(label):
    """Return the text after the first '-' of *label*, converted to int."""
    return int(label.split('-')[1])

raw['season'] = raw['season'].apply(_season_suffix)
raw['season'].unique()
# ---- Discard the attributes that are not used ----
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
# Remove every listed column in one call. The old per-column form
# raw.drop(name, 1) passes the axis positionally, which pandas 2.x rejects.
raw = raw.drop(columns=drops)
# ---- Turn some variables into dummy variables ----
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    # One indicator column per distinct value, named "<var>_<value>".
    dummies = pd.get_dummies(raw[var], prefix=var)
    # axis / columns are passed by keyword: pandas 2.x no longer accepts the
    # positional form pd.concat(objs, 1) / raw.drop(var, 1).
    raw = pd.concat([raw, dummies], axis=1)
    raw = raw.drop(columns=var)
Dummy Variable
Dummy Variable 中文称哑变量,也称虚拟变量(这个名字总觉得怪怪的),指的是用来表示类别属性(定性特征)的一种变量。
哑变量的值通常取0或者1, 比如 0表示非本科学位, 1表示本科学位。
在很多数据处理中,我们都需要对数据进行哑变量处理。例如,某个数据中的月份用1-12进行表示,但月份的取值本身并没有数值大小上的意义:如果直接把它当作数值使用,模型会认为2月份比1月份“大”,这显然是不合理的。因此应把每个月份分别转换成一个取值为0或1的哑变量。
常用哑变量:日期,月份等
# ---- Turn some variables into dummy variables ----
# A random forest classifier is used, so string-typed data must be converted
# into dummy variables.
# The newly generated dummy columns are appended to the data, then the
# original column they were generated from is removed.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    # Skip columns that were already encoded and dropped, so that running this
    # section a second time does not raise KeyError.
    if var not in raw.columns:
        continue
    dummies = pd.get_dummies(raw[var], prefix=var)
    # axis / columns are passed by keyword: pandas 2.x no longer accepts the
    # positional form pd.concat(objs, 1) / raw.drop(var, 1).
    raw = pd.concat([raw, dummies], axis=1)
    raw = raw.drop(columns=var)
# Show the names of all columns of the DataFrame
raw.columns.values
# Show how many columns the DataFrame has in total
raw.shape[1]
# ---- Separate the training features from their labels ----
# `df` was used below without ever being defined. The labelled rows (those
# with a non-null shot_made_flag) form the training set.
df = raw[pd.notnull(raw['shot_made_flag'])]
# The unlabelled rows are the ones to predict; `submission` is read later by
# the final predict_proba call.
submission = raw[pd.isnull(raw['shot_made_flag'])].drop(columns='shot_made_flag')
train = df.drop(columns='shot_made_flag')
train_y = df['shot_made_flag']
#模型评估方法为对数损失函数logloss
import scipy as sp
def logloss(act, pred):
    """Return the mean binary log loss of predictions against 0/1 labels.

    The NumPy aliases in the top-level ``scipy`` namespace that this function
    used to call (``sp.maximum``, ``sp.log``, ``sp.subtract`` ...) are
    deprecated by SciPy, so the computation is done with NumPy directly.

    Args:
        act: Array-like of true labels (0 or 1).
        pred: Array-like of predicted probabilities of the positive class,
            with the same length as ``act``.

    Returns:
        The average negative log-likelihood as a float; lower is better.

    Raises:
        ValueError: If ``act`` contains no observations.
    """
    import numpy as np  # local import keeps the function self-contained

    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    if act.size == 0:
        raise ValueError("logloss() requires at least one observation")
    # Clip exact 0 and 1 away so that neither log() below can return -inf.
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return float(ll * -1.0 / act.size)
# ---- Model training ----
# RandomForestClassifier and KFold are used below but were never imported.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
import time
# How random forests work: https://blog.csdn.net/y0367/article/details/51501780
# How K-Fold works: https://blog.csdn.net/qq_16949707/article/details/79080432
# Find the best n_estimators for RandomForestClassifier.
# The candidates are 1, 10 and 100; the one with the lowest cross-validated
# log loss is kept in best_n.
print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3).astype(int)
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()
    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    # model_selection.KFold takes n_splits and yields index pairs via split();
    # the old KFold(len(train), n_folds=10) signature no longer exists.
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train):
        rfc.fit(train.iloc[train_k], train_y.iloc[train_k])
        # Log loss is defined on probabilities, so score the predicted
        # probability of the second class (label 1 for 0/1 labels) rather
        # than the hard labels returned by predict().
        pred = rfc.predict_proba(train.iloc[test_k])[:, 1]
        rfc_score += logloss(train_y.iloc[test_k], pred) / 10  # mean over the 10 folds
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2-t1))
print(best_n, min_score)
# Find the best max_depth for RandomForestClassifier.
# The candidates are 1, 10 and 100; the one with the lowest cross-validated
# log loss is kept in best_m. best_n comes from the n_estimators search.
# RandomForestClassifier and KFold are used below but were never imported.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import time
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()
    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    # model_selection.KFold takes n_splits and yields index pairs via split();
    # the old KFold(len(train), n_folds=10) signature no longer exists.
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train):
        rfc.fit(train.iloc[train_k], train_y.iloc[train_k])
        # Log loss is defined on probabilities, so score the predicted
        # probability of the second class (label 1 for 0/1 labels) rather
        # than the hard labels returned by predict().
        pred = rfc.predict_proba(train.iloc[test_k])[:, 1]
        rfc_score += logloss(train_y.iloc[test_k], pred) / 10  # mean over the 10 folds
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    # This loop varies the depth, not the number of trees; the message used to
    # say "trees" because it was copied from the n_estimators search.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2-t1))
print(best_m, min_score)
# Check visually whether the chosen n_estimators and max_depth really are the
# candidates with the lowest logloss score.
plt.figure(figsize=(10, 5))
# One entry per panel: subplot position, x values, scores, x-axis label.
curves = (
    (121, range_n, scores_n, 'number of trees'),
    (122, range_m, scores_m, 'max depth'),
)
for position, candidates, scores, x_label in curves:
    plt.subplot(position)
    plt.plot(candidates, scores)
    plt.ylabel('score')
    plt.xlabel(x_label)
# ---- Build the model with the selected parameters ----
from sklearn.ensemble import RandomForestClassifier  # used below, never imported
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train, train_y)
# predict_proba returns one column per class. A two-column array cannot be
# assigned to the single shot_made_flag column, so keep only the second
# column (the probability of label 1, assuming the labels are 0/1).
pred = model.predict_proba(submission)[:, 1]
# ---- Write the predicted column into the file to be submitted ----
sub = pd.read_csv("data/sample_submission.csv")
sub['shot_made_flag'] = pred
sub.to_csv("data/real_submission.csv", index=False)
posted on 2019-02-21 15:37 wangzhonghan 阅读(383) 评论(0) 收藏 举报
浙公网安备 33010602011771号