Getting started with Kaggle: Titanic
After watching Andrew Ng's videos I felt I had learned quite a lot, even if many of the concepts were only half-digested. But knowledge from lectures alone only goes so far, and I happened to see people online working through Kaggle, where Titanic is widely regarded as the must-do beginner competition.
As soon as I opened the problem I was lost: I had no hands-on ML experience and no idea where to start, so I found a beginner tutorial online. To be clear, this program is not my own invention; it follows earlier code, typed out line by line, with the goal of building confidence and learning a way in.
The code is as follows:
1. First import the pandas and numpy packages and read the training file. (This was my first time using Python; I have to say Python skips the preamble and cleanup and gets straight to the point, which suits impatient modern people very well.)
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

data_train = pd.read_csv("/home/helong/share/kaggle/titannic/train.csv")
data_train
2. Print the basic information of data_train
data_train.info()
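info() shows the column types and non-null counts. As a small extra step that is not part of the original tutorial, it can be paired with describe() and a per-column null count to see where the gaps are; a minimal sketch:

data_train.describe()        # count / mean / std / quartiles for the numeric columns
data_train.isnull().sum()    # number of missing values per column (Age and Cabin have gaps)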
3. Looking at the raw spreadsheet gives little intuition; turning the data into charts makes it much clearer. Below we plot the survival counts, the passenger-class distribution, survival by age, the age distribution within each class, and the number of passengers boarding at each port. One thing to note: matplotlib renders Chinese labels as garbled boxes by default. There are several ways around this; here we use font_manager to load a Chinese font.
import matplotlib.pyplot as plt
import matplotlib as mpl

fig = plt.figure()
fig.set(alpha=0.2)
# Load a Chinese font so the labels are not rendered as boxes
zhfont = mpl.font_manager.FontProperties(fname='/usr/share/fonts/truetype/arphic/uming.ttc')

plt.subplot2grid((2,3),(0,0))
data_train.Survived.value_counts().plot(kind='bar')
plt.title(u"获救情况(1为获救)", fontproperties=zhfont)
plt.ylabel(u"人数", fontproperties=zhfont)

plt.subplot2grid((2,3),(0,1))
data_train.Pclass.value_counts().plot(kind="bar")
plt.ylabel(u"人数", fontproperties=zhfont)
plt.title(u"乘客等级分布", fontproperties=zhfont)

plt.subplot2grid((2,3),(0,2))
plt.scatter(data_train.Survived, data_train.Age)
plt.ylabel(u"年龄", fontproperties=zhfont)
plt.grid(True, which='major', axis='y')
plt.title(u"按年龄看获救分布(1为获救)", fontproperties=zhfont)

plt.subplot2grid((2,3),(1,0), colspan=2)
data_train.Age[data_train.Pclass == 1].plot(kind='kde')
data_train.Age[data_train.Pclass == 2].plot(kind='kde')
data_train.Age[data_train.Pclass == 3].plot(kind='kde')
plt.xlabel(u"年龄", fontproperties=zhfont)
plt.ylabel(u"密度", fontproperties=zhfont)
plt.title(u"各等级的乘客年龄分布", fontproperties=zhfont)
plt.legend((u'头等舱', u'2等舱', u'3等舱'), loc='best')

plt.subplot2grid((2,3),(1,2))
data_train.Embarked.value_counts().plot(kind='bar')
plt.title(u"各登船口岸上船人数", fontproperties=zhfont)
plt.ylabel(u"人数", fontproperties=zhfont)

plt.show()
Survival broken down by passenger class
fig = plt.figure()
fig.set(alpha=0.2)

Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()
df = pd.DataFrame({u'获救': Survived_1, u'未获救': Survived_0})
df.plot(kind='bar', stacked=True)
plt.title(u"各乘客等级的获救情况", fontproperties=zhfont)
plt.xlabel(u"乘客等级", fontproperties=zhfont)
plt.ylabel(u"人数", fontproperties=zhfont)
plt.show()
Survival broken down by sex
fig = plt.figure()
fig.set(alpha=0.2)

Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
df = pd.DataFrame({u'男性': Survived_m, u'女性': Survived_f})
df.plot(kind='bar', stacked=True)
plt.title(u"按性别看获救情况", fontproperties=zhfont)
plt.xlabel(u"性别", fontproperties=zhfont)
plt.ylabel(u"人数", fontproperties=zhfont)
plt.show()
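The stacked bars make the pattern visible, but it can also be read off as plain numbers. A minimal sketch, not part of the original tutorial, that computes the survival rate per sex:

# 'Survived' is 0/1, so the group mean is the survival rate for each sex
data_train.groupby('Sex')['Survived'].mean()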
Use a random forest regressor to fill in the missing Age values, and turn Cabin into a Yes/No flag for whether a cabin was recorded
from sklearn.ensemble import RandomForestRegressor

def set_missing_ages(df):
    # Use the other numeric columns to predict Age where it is missing
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notna()].values
    unknown_age = age_df[age_df.Age.isna()].values
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    if len(unknown_age) != 0:
        y = known_age[:, 0]      # Age is the target
        X = known_age[:, 1:]     # the remaining columns are the features
        rfr.fit(X, y)
        predictedAges = rfr.predict(unknown_age[:, 1:])
        df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df, rfr

def set_Cabin_type(df):
    # Whether a cabin was recorded at all is more usable than the sparse cabin numbers
    df.loc[(df.Cabin.notnull()), 'Cabin'] = "Yes"
    df.loc[(df.Cabin.isnull()), 'Cabin'] = "No"
    return df

data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
Use one-hot encoding to expand the categorical columns below into multiple 0/1 columns, then drop the original columns that are no longer needed
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')

df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df
Age and Fare have a much larger range than the other features, so we standardize them to help training converge faster
import sklearn.preprocessing as preprocessing

# Fit a separate scaler per column so the same parameters can be reused on the test set later
age_scale_param = preprocessing.StandardScaler().fit(df['Age'].values.reshape(-1, 1))
df['Age_scaled'] = age_scale_param.transform(df['Age'].values.reshape(-1, 1))
fare_scale_param = preprocessing.StandardScaler().fit(df['Fare'].values.reshape(-1, 1))
df['Fare_scaled'] = fare_scale_param.transform(df['Fare'].values.reshape(-1, 1))
df
Build the model
from sklearn import linear_model

# Keep only the engineered feature columns plus the Survived label
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values

y = train_np[:, 0]     # the Survived column
X = train_np[:, 1:]    # everything else is a feature

# liblinear is a solver that supports the L1 penalty
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(X, y)
clf
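As a quick sanity check (not part of the original tutorial), the mean accuracy on the training data itself can be read off with score():

clf.score(X, y)   # mean accuracy on the data the model was trained on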
Preprocess the test data; the pipeline is largely the same as for the training data
data_test = pd.read_csv("/home/helong/share/kaggle/titannic/test.csv")
data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0

# Fill missing ages with the random forest fitted on the training data
tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values
X = null_age[:, 1:]
predictedAges = rfr.predict(X)
data_test.loc[(data_test.Age.isnull()), 'Age'] = predictedAges
data_test = set_Cabin_type(data_test)

dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')

df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

# Reuse the scalers fitted on the training data instead of refitting on the test set
df_test['Age_scaled'] = age_scale_param.transform(df_test['Age'].values.reshape(-1, 1))
df_test['Fare_scaled'] = fare_scale_param.transform(df_test['Fare'].values.reshape(-1, 1))
df_test
Predict on the test data and write out a submission file to see how the result looks
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("/home/helong/share/kaggle/titannic/result1.csv", index=False)
At this point the basic workflow is complete. Submitting the result to Kaggle already earns a decent score; everything below is further optimization.
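As a side note on the submission itself: it can be done through the website, or, if the official kaggle command-line tool is installed and configured, from the terminal. A sketch, assuming the result file produced above:

kaggle competitions submit -c titanic -f /home/helong/share/kaggle/titannic/result1.csv -m "logistic regression baseline"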
Look at how strongly each feature is associated with the outcome by inspecting the fitted coefficients
pd.DataFrame({"columns":list(train_df.columns)[1:], "coef":list(clf.coef_.T)})
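Positive coefficients push the prediction toward survival and negative ones away from it. To rank features by influence, one option (my own addition, not from the tutorial) is to sort by absolute coefficient value:

coef_df = pd.DataFrame({"columns": list(train_df.columns)[1:], "coef": clf.coef_[0]})
# The largest absolute coefficients correspond to the features the model leans on most
coef_df.reindex(coef_df["coef"].abs().sort_values(ascending=False).index)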
The importance of cross-validation goes without saying; here is how to do it
from sklearn.model_selection import cross_val_score

clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
all_data = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
X = all_data.values[:, 1:]
y = all_data.values[:, 0]
print(cross_val_score(clf, X, y, cv=5))
Split the training data into a training set and a validation set, then look at the cases the model gets wrong
from sklearn.model_selection import train_test_split
split_train, split_cv = train_test_split(df, test_size=0.3, random_state=0)
train_df = split_train.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(train_df.values[:, 1:], train_df.values[:, 0])
cv_df = split_cv.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(cv_df.values[:, 1:])
origin_data_train = pd.read_csv("/home/helong/share/kaggle/titannic/train.csv")
# Pull the misclassified validation passengers back out of the original data for inspection
bad_cases = origin_data_train.loc[origin_data_train['PassengerId'].isin(split_cv[predictions != cv_df.values[:, 0]]['PassengerId'].values)]
bad_cases
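Before digging into individual bad cases, it helps to know the overall hit rate on the held-out split. A minimal sketch, not part of the original tutorial, using accuracy_score:

from sklearn.metrics import accuracy_score

# Fraction of the validation split predicted correctly
accuracy_score(cv_df.values[:, 0], predictions)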
Look at the learning curve to judge whether the model is overfitting or underfitting, and use that as a guide for adjusting the features or the data.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"Training examples")
        plt.ylabel(u"Score")
        plt.gca().invert_yaxis()
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"Training score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"Cross-validation score")
        plt.legend(loc="best")
        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

plot_learning_curve(clf, u"Learning curve", X, y)
Ensemble training with bagging
from sklearn.ensemble import BaggingRegressor

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values
y = train_np[:, 0]
X = train_np[:, 1:]

clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
# Bag 20 logistic regressions, each trained on a random 80% sample of the rows
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0,
                               bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)

test = df_test.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = bagging_clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("/home/helong/share/kaggle/titannic/test2.csv", index=False)
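One caveat: BaggingRegressor averages the 0/1 outputs of its base models, so its raw predictions are fractions, and astype(np.int32) truncates anything below 1.0 down to 0. A sketch of my own adjustment, not from the tutorial, that rounds at 0.5 instead:

# Treat an averaged prediction above 0.5 as "survived"
predictions = (bagging_clf.predict(test) > 0.5).astype(np.int32)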
