kaggle入门之titanic

看过吴恩达的视频,觉得自己学得已经很多了,虽然很多知识点是囫囵吞枣,但是纸上得来终觉浅,刚好看到网上有人刷kaggle,其中titanic为大家公认的入门必学。

一拿到题目,就蒙了,从来没有过ML的实战经验,第一次上来,不知道从哪里下手,于是从网上又找到了入门的教程,特此声明,此程序并非自己想出来的,而是照搬前人的代码,一行一行撸出来的,目的是为了建立自信和入门的方法。

 

代码如下:

 

1,首先引入pandas和numpy包,读取训练文件(第一次用python,不得不说,python去除了前戏和清理工作,上来就是直奔主题,这一点深得现代人的喜欢)

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# BUG FIX: the pasted snippet carried leading line-number artifacts
# ("1 ", "2 ", ...) that make it non-runnable; stripped here.
# Load the Kaggle Titanic training set; the bare name on the last line
# echoes the frame when executed in a notebook cell.
data_train = pd.read_csv("/home/helong/share/kaggle/titannic/train.csv")
data_train

 2. 打印出data_train的基本信息

data_train.info

 3, 光看excel没有直观的概念,把数据转化为图表后会一目了然, 获取获救的情况,乘客的等级分布,获救年龄分布, 各个舱的年龄分布,各登船口岸上船人数, 还有需要注意的一点是,你会发现pyplot的中文会显示乱码,这种情况下,有几种解决方案,

这里提供的是用font_manager

import matplotlib.pyplot as plt
import matplotlib as mpl

# One figure holding several exploratory subplots.
fig = plt.figure()
fig.set(alpha=0.2)

# matplotlib's default font has no CJK glyphs, so load a Chinese font
# explicitly and pass it to every text element.
zhfont = mpl.font_manager.FontProperties(fname='/usr/share/fonts/truetype/arphic/uming.ttc')

# Survival counts (0 = died, 1 = survived).
plt.subplot2grid((2,3),(0,0))
data_train.Survived.value_counts().plot(kind='bar')
plt.title(u"获救情况(1为获救)",fontproperties=zhfont)
plt.ylabel(u"人数",fontproperties=zhfont)

# Passenger-class distribution.
plt.subplot2grid((2,3),(0,1))
data_train.Pclass.value_counts().plot(kind="bar")
plt.ylabel(u"人数",fontproperties=zhfont)
plt.title(u"乘客等级分布",fontproperties=zhfont)

# Age vs. survival scatter.
plt.subplot2grid((2,3),(0,2))
plt.scatter(data_train.Survived, data_train.Age)
plt.ylabel(u"年龄",fontproperties=zhfont)
# BUG FIX: the `b=` keyword was removed in matplotlib 3.6 (renamed to
# `visible=`); passing the flag positionally works on every version.
plt.grid(True, which='major', axis='y')
plt.title(u"按年龄看获救分布(1为获救)",fontproperties=zhfont)

# Age density (KDE) per passenger class, spanning two grid cells.
plt.subplot2grid((2,3),(1,0), colspan=2)
data_train.Age[data_train.Pclass == 1].plot(kind='kde')
data_train.Age[data_train.Pclass == 2].plot(kind='kde')
data_train.Age[data_train.Pclass == 3].plot(kind='kde')
plt.xlabel(u"年龄",fontproperties=zhfont)
plt.ylabel(u"密度",fontproperties=zhfont)
plt.title(u"各等级的乘客年龄分布",fontproperties=zhfont)
# BUG FIX: the legend previously omitted the CJK font, so its Chinese
# labels rendered as boxes; legend takes the font via `prop=`.
plt.legend((u'头等舱',u'2等舱',u'3等舱'),loc='best', prop=zhfont)

# Embarkation-port head counts.
plt.subplot2grid((2,3),(1,2))
data_train.Embarked.value_counts().plot(kind='bar')
plt.title(u"各登船口岸上船人数",fontproperties=zhfont)
plt.ylabel(u"人数",fontproperties=zhfont)
plt.show()

 

获取各等级舱的获救情况 

fig = plt.figure()
fig.set(alpha=0.2)

# Per-class passenger counts, split by outcome (stacked bar chart).
perished_by_class = data_train.Pclass[data_train.Survived == 0].value_counts()
rescued_by_class = data_train.Pclass[data_train.Survived == 1].value_counts()

summary = pd.DataFrame({u'获救': rescued_by_class, u'未获救': perished_by_class})
summary.plot(kind='bar', stacked=True)

plt.title(u"各乘客等级的获救情况",fontproperties=zhfont)
plt.xlabel(u"乘客等级",fontproperties=zhfont)
plt.ylabel(u"人数",fontproperties=zhfont)
plt.show()

 

 获取按照性别分类的获救情况

fig = plt.figure()
fig.set(alpha=0.2)

# Survival counts split by sex (index is Survived: 0/1).
Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
df = pd.DataFrame({u'男性':Survived_m, u'女性':Survived_f})
df.plot(kind='bar', stacked=True)
# CONSISTENCY FIX: the sibling plots all pass fontproperties=zhfont;
# without it the Chinese title and labels render as empty boxes.
plt.title(u"按性别看获救情况", fontproperties=zhfont)
plt.xlabel(u"性别", fontproperties=zhfont)
plt.ylabel(u"人数", fontproperties=zhfont)
plt.show()

 

利用随机森林算法补全空的age和cabin 

from sklearn.ensemble import RandomForestRegressor

def set_missing_ages(df):
    """Fill missing Age values with random-forest predictions.

    Trains a RandomForestRegressor on the rows whose Age is known, using
    the numeric columns Fare/Parch/SibSp/Pclass as features, then predicts
    Age for the rows where it is missing and writes the predictions back.

    Returns (df, rfr): the mutated DataFrame and the regressor, so the
    same model can be reused on the test set.
    NOTE(review): if no Age is missing, rfr is returned UNFITTED — callers
    that later call rfr.predict must ensure training data had missing ages.
    """
    age_df = df[['Age','Fare','Parch','SibSp','Pclass']]

    # BUG FIX: .as_matrix() was removed in pandas 1.0 — use .to_numpy().
    known_age = age_df[age_df.Age.notna()].to_numpy()
    unknown_age = age_df[age_df.Age.isna()].to_numpy()
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)

    if len(unknown_age) != 0:
        y = known_age[:, 0]   # target: the known ages (first column)
        X = known_age[:, 1:]  # features: the remaining columns

        rfr.fit(X, y)
        predictedAges = rfr.predict(unknown_age[:, 1:])
        df.loc[(df.Age.isna()), 'Age'] = predictedAges

    return df, rfr

def set_Cabin_type(df):
    """Collapse Cabin to a binary flag: "Yes" if a cabin was recorded, else "No"."""
    has_cabin = df.Cabin.notna()
    df.loc[has_cabin, 'Cabin'] = "Yes"
    df.loc[~has_cabin, 'Cabin'] = "No"
    return df

# Impute missing ages (keeping the fitted regressor for reuse on the test
# set) and binarize Cabin into Yes/No.
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)

 

利用独热编码,将以下数据划分成更多列,值为0或1,再将原先不需要的数据删掉 

# One-hot encode the categorical columns; each prefix equals the source
# column name, so e.g. Embarked becomes Embarked_C / Embarked_Q / Embarked_S.
categorical_cols = ['Cabin', 'Embarked', 'Sex', 'Pclass']
dummy_frames = [pd.get_dummies(data_train[col], prefix=col) for col in categorical_cols]

df = pd.concat([data_train] + dummy_frames, axis=1)
# Drop the now-encoded originals plus the free-text columns the model won't use.
df.drop(['Pclass', 'Name', 'Sex','Ticket','Cabin','Embarked'], axis=1, inplace=True)
df

 将跨度比较大的age和fare进行收缩,为的是让结果尽快收敛

import sklearn.preprocessing as preprocessing

# Kept so later cells that reference the name `scaler` still run.
scaler = preprocessing.StandardScaler()

# BUG FIX: the original reused ONE StandardScaler instance and passed the
# fitted scaler as the (silently ignored) `y` argument of fit_transform.
# Since fit() returns self, `age_scale_param` and `fare_scale_param`
# aliased the SAME object, and the Fare fit overwrote the Age parameters.
# Use one scaler per column so each keeps its own mean/std.
age_scale_param = preprocessing.StandardScaler().fit(df['Age'].values.reshape(-1,1))
df['Age_scaled'] = age_scale_param.transform(df['Age'].values.reshape(-1,1))

fare_scale_param = preprocessing.StandardScaler().fit(df['Fare'].values.reshape(-1,1))
df['Fare_scaled'] = fare_scale_param.transform(df['Fare'].values.reshape(-1,1))
df

 

生成模型

from sklearn import linear_model

# Keep only the Survived target plus the numeric / one-hot feature columns.
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# BUG FIX: .as_matrix() was removed in pandas 1.0 — use .values.
train_np = train_df.values

y = train_np[:, 0]   # first column is Survived
X = train_np[:, 1:]  # remaining columns are the features

# BUG FIX: modern scikit-learn's default solver (lbfgs) does not support
# the L1 penalty; liblinear does, matching the tutorial-era behaviour.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
clf.fit(X, y)

clf

 

对测试数据进行预处理,和训练数据流程大体一致 

data_test = pd.read_csv("/home/helong/share/kaggle/titannic/test.csv")
# The test set contains a missing Fare; zero-fill it so the feature matrix
# fed to the regressor has no NaNs.
data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0

# Predict the missing ages with the forest fitted on the training set.
# BUG FIX: .as_matrix() was removed in pandas 1.0 — use .to_numpy().
tmp_df = data_test[['Age','Fare','Parch','SibSp','Pclass']]
null_age = tmp_df[data_test.Age.isnull()].to_numpy()
X = null_age[:,1:]
predictedAges = rfr.predict(X)
data_test.loc[(data_test.Age.isnull()), 'Age'] = predictedAges

data_test = set_Cabin_type(data_test)

# Same one-hot encoding as applied to the training frame.
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')

df_test = pd.concat([data_test,dummies_Cabin,dummies_Embarked,dummies_Sex,dummies_Pclass], axis=1)
df_test.drop(['Pclass','Name','Sex','Ticket','Cabin','Embarked'], axis=1, inplace=True)
# NOTE(review): fit_transform silently ignores its second argument (it is
# the supervised `y` slot), so this REFITS the scaler on the TEST data.
# Strictly one should reuse the scaling parameters fitted on the training
# set — confirm before relying on the scaled values.
df_test['Age_scaled'] = scaler.fit_transform(df_test['Age'].values.reshape(-1,1), age_scale_param)
df_test['Fare_scaled'] = scaler.fit_transform(df_test['Fare'].values.reshape(-1,1), fare_scale_param)
df_test

 

验证数据,看看结果怎么样 

# Select the same feature columns used in training (df_test has no Survived).
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)
# BUG FIX: .as_matrix() was removed in pandas 1.0 — use .values.
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("/home/helong/share/kaggle/titannic/result1.csv", index=False)

 

至此,基本的操作已经结束了,提交到kaggle上,也能有不错的成绩,下面就是需要进一步优化:

 

看一下各个特征对于结果的关联程度

pd.DataFrame({"columns":list(train_df.columns)[1:], "coef":list(clf.coef_.T)})

  

交叉验证的重要性不言而喻,以下是具体的方法

from sklearn.model_selection import cross_val_score

# BUG FIX: the L1 penalty needs solver='liblinear' in modern scikit-learn.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
all_data = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# BUG FIX: .as_matrix() was removed in pandas 1.0; also convert once
# instead of materialising the matrix twice.
all_np = all_data.values
X = all_np[:, 1:]
y = all_np[:, 0]
print(cross_val_score(clf, X, y, cv=5))

  

将训练数据分为训练和测试数据

from sklearn.model_selection import train_test_split

# Hold out 30% of the training frame as a validation split.
split_train, split_cv = train_test_split(df, test_size=0.3, random_state=0)
train_df = split_train.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

# BUG FIX: the L1 penalty needs solver='liblinear' in modern scikit-learn,
# and .as_matrix() was removed in pandas 1.0 (convert once via .values).
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
train_np = train_df.values
clf.fit(train_np[:, 1:], train_np[:, 0])

cv_df = split_cv.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
cv_np = cv_df.values
predictions = clf.predict(cv_np[:, 1:])

# Pull the misclassified validation rows out of the ORIGINAL csv so the
# dropped text columns (Name, Ticket, ...) are visible for error analysis.
origin_data_train = pd.read_csv("/home/helong/share/kaggle/titannic/train.csv")
bad_cases = origin_data_train.loc[origin_data_train['PassengerId'].isin(
    split_cv[predictions != cv_np[:, 0]]['PassengerId'].values)]

bad_cases

  

看一下我们的学习曲线怎么样,是过拟合还是欠拟合,以此作为调整特征参数或者数据的参考。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                       train_sizes=np.linspace(.05,1., 20), verbose=0, plot=True):
    """Plot the learning curve of *estimator* and return (midpoint, diff).

    midpoint / diff summarise the gap between the final training and
    cross-validation score envelopes, which helps judge over- vs
    under-fitting when tuning features or data.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes, verbose=verbose)

    mean_train = np.mean(train_scores, axis=1)
    std_train = np.std(train_scores, axis=1)
    mean_test = np.mean(test_scores, axis=1)
    std_test = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"xun lian yang ben shu")
        plt.ylabel(u"de fen")
        plt.gca().invert_yaxis()
        plt.grid()

        # Shaded bands: one standard deviation around each mean curve.
        plt.fill_between(train_sizes, mean_train - std_train,
                         mean_train + std_train, alpha=0.1, color="b")
        plt.fill_between(train_sizes, mean_test - std_test,
                         mean_test + std_test, alpha=0.1, color="r")

        plt.plot(train_sizes, mean_train, 'o-', color="b",
                 label=u"xun lian ji shang de fen")
        plt.plot(train_sizes, mean_test, 'o-', color="r",
                 label=u"jiaocha yanzhengji shang defen")

        plt.legend(loc="best")

        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()

    # Midpoint of, and gap between, the final train/CV score envelopes.
    upper_last = mean_train[-1] + std_train[-1]
    lower_last = mean_test[-1] - std_test[-1]
    midpoint = (upper_last + lower_last) / 2
    diff = upper_last - lower_last
    return midpoint, diff

plot_learning_curve(clf, u"xuexi quxian", X, y)

 

融合训练。。。

from sklearn.ensemble import BaggingRegressor

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# BUG FIX: .as_matrix() was removed in pandas 1.0 — use .values.
train_np = train_df.values

y = train_np[:, 0]   # Survived target
X = train_np[:, 1:]  # feature columns

# BUG FIX: the L1 penalty needs solver='liblinear' in modern scikit-learn.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
# Bag 20 logistic regressions, each trained on a bootstrap of 80% of rows.
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0,
                               bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)

# df_test has no Survived column, so that part of the regex never matches.
test = df_test.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = bagging_clf.predict(test)
# BUG FIX: BaggingRegressor averages the base models' 0/1 outputs into a
# fraction, and astype() TRUNCATES toward zero (0.95 -> 0), collapsing
# almost every prediction to 0.  Threshold at 0.5 to recover the majority
# vote before casting to int.
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': (predictions >= 0.5).astype(np.int32)})
result.to_csv("/home/helong/share/kaggle/titannic/test2.csv", index=False)

 

posted @ 2018-09-28 19:42  调皮的贝叶斯  阅读(560)  评论(0)    收藏  举报