These past couple of days I signed up for the Alibaba Tianchi "Bus Route Passenger Flow Prediction" competition, so I took the chance to go back over the Kaggle Titanic tutorial code I had looked at before and refresh myself on some of the data processing. The task is to predict whether each Titanic passenger survived, based on the passenger information. Two data tables are provided: titanic_test.csv and titanic_train.csv. First, a description of the fields:
PassengerId -- A numerical id assigned to each passenger.
Survived -- Whether the passenger survived (1) or didn't (0). We'll be making predictions for this column.
Pclass -- The class the passenger was in: first class (1), second class (2), or third class (3).
Name -- The name of the passenger.
Sex -- The gender of the passenger: male or female.
Age -- The age of the passenger. May be fractional.
SibSp -- The number of siblings and spouses the passenger had on board.
Parch -- The number of parents and children the passenger had on board.
Ticket -- The ticket number of the passenger.
Fare -- How much the passenger paid for the ticket.
Cabin -- Which cabin the passenger was in.
Embarked -- Where the passenger boarded the Titanic.
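Before filling in missing values it helps to see which of these columns actually have gaps. A minimal sketch of how to check (this snippet is my own addition; it only assumes the same titanic_train.csv file used below):

import pandas as pd

train = pd.read_csv("titanic_train.csv")

# Count missing values per column; Age, Cabin and Embarked are the usual offenders
print(train.isnull().sum())

# Share of missing values per column, sorted from most to least incomplete
print((train.isnull().mean().sort_values(ascending=False) * 100).round(1))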
Below is the Python processing code:
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

train = pd.read_csv("titanic_train.csv", dtype={"Age": np.float64})
test = pd.read_csv("titanic_test.csv", dtype={"Age": np.float64})

print("\n\nTop of the training data:")
print(train.head())

print("\n\nSummary statistics of training data")
print(train.describe())

# train.to_csv('copy_of_the_training_data.csv', index=False)

# Fill missing ages with a sentinel value
train["Age"] = train["Age"].fillna(-1)
test["Age"] = test["Age"].fillna(-1)

# Encode Sex as 0 (male) / 1 (female)
train.loc[train["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
test.loc[test["Sex"] == "female", "Sex"] = 1

print(train["Embarked"].unique())
# Fill missing embarkation ports with the most common value, then encode
train["Embarked"] = train["Embarked"].fillna("S")
test["Embarked"] = test["Embarked"].fillna("S")

train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2

test.loc[test["Embarked"] == "S", "Embarked"] = 0
test.loc[test["Embarked"] == "C", "Embarked"] = 1
test.loc[test["Embarked"] == "Q", "Embarked"] = 2

train["Fare"] = train["Fare"].fillna(train["Fare"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Generating a FamilySize column
train["FamilySize"] = train["SibSp"] + train["Parch"]
test["FamilySize"] = test["SibSp"] + test["Parch"]  # fixed: original mixed train["SibSp"] with test["Parch"]

train["NameLength"] = train["Name"].apply(lambda x: len(x))
test["NameLength"] = test["Name"].apply(lambda x: len(x))

import re

def get_title(name):
    # Use a regular expression to search for a title. Titles always consist of capital and lowercase letters, and end with a period.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Get all the titles.
train_titles = train["Name"].apply(get_title)
test_titles = test["Name"].apply(get_title)

# Map each title to an integer. Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7,
                 "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9,
                 "Capt": 7, "Ms": 2, "Dona": 9}
for k, v in title_mapping.items():
    train_titles[train_titles == k] = v
    test_titles[test_titles == k] = v

# Add in the title column.
train["Title"] = train_titles
test["Title"] = test_titles

# print(test["Title"])

import operator

# A dictionary mapping family name to id
family_id_mapping = {}

# A function to get the id given a row
def get_family_id(row):
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family id
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    # Look up the id in the mapping
    if family_id not in family_id_mapping:
        if len(family_id_mapping) == 0:
            current_id = 1
        else:
            # Get the maximum id from the mapping and add one to it if we don't have an id
            current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)
        family_id_mapping[family_id] = current_id
    return family_id_mapping[family_id]

# Get the family ids with the apply method
train_family_ids = train.apply(get_family_id, axis=1)
test_family_ids = test.apply(get_family_id, axis=1)

# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.
train_family_ids[train["FamilySize"] < 3] = -1
test_family_ids[test["FamilySize"] < 3] = -1

train["FamilyId"] = train_family_ids
test["FamilyId"] = test_family_ids

alg = AdaBoostClassifier()
# alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "FamilyId"]

# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(train[predictors], train["Survived"])

# Get the raw p-values for each feature, and transform from p-values into scores
scores = -np.log10(selector.pvalues_)

# Plot the scores. See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

print("#######")
predictors = ["Pclass", "Sex", "Fare", "Title"]

x_train = train[predictors]
y_train = train['Survived']
x_test = test[predictors]
alg.fit(x_train, y_train)
predictions = alg.predict(x_test)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": predictions
})

submission.to_csv('submission.csv', index=False)
While I'm at it, here is a quick summary of str.format(), the string formatting method used above:
# coding=utf-8
# str.format()

# '{}' placeholders
print('what\'s {},{}'.format('wrong', 'hong!'))

# positional placeholders like {0}, {1}
print('{},I\'m {},my qq is {}'.format('hello', 'hong', '123'))
print('{},I\'m {},my E-mail is {}'.format('Hello', 'Hongten', 'hongtenzone@foxmail.com'))

print('{1},{0}'.format('hello', 'world'))

# named placeholders like {name}
print('Hi,{name},{message}'.format(name="hongten", message='how are you?'))

# format control:
import math
print('The value of PI is approximately {0:.3f}.'.format(math.pi))

table = {'Sjoerd': 4127, 'Jack': 4098, 'Dcab': 7678}

for name, phone in table.items():
    print('{0:10}==>{1:10d}'.format(name, phone))
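The {0:10} and {1:10d} specs above come from the format-spec mini-language, which also covers fill characters, alignment, and things like thousands separators. A few more examples in the same vein (my own additions for illustration):

# alignment: < left, > right, ^ center, with an optional fill character
print('{:<10}|'.format('left'))    # 'left      |'
print('{:>10}|'.format('right'))   # '     right|'
print('{:*^10}|'.format('mid'))    # '***mid****|'

# thousands separator and percentage
print('{:,}'.format(1234567))      # '1,234,567'
print('{:.1%}'.format(0.382))      # '38.2%'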
And finally, some notes on Python's regular expression module, re:
# coding=utf-8
import re

# re.match() -- match only at the beginning of the string
s = 'I like hongten! he is so cool!'
m = re.match(r'(\w+)\s', s)
if m:
    # group()/groups(): the parentheses () in the pattern mark the groups to extract
    print(m.group(0))
else:
    print('not match')

# re.search() -- scan the whole string for the first match
text = "JGood is a handsome boy, he is cool, clever, and so on..."
m = re.search(r'\shan(ds)ome\s', text)
if m:
    print(m.groups())
else:
    print("No search")

# re.sub() -- replace every match in the string
text = "JGood is a handsome boy, he is cool, clever, and so on..."
print(re.sub(r'\s+', '-', text))

# re.split()
# re.findall()

# re.compile() -- precompile a pattern for reuse
re_telephone = re.compile(r'^(\d{3})-(\d{3,8})$')
print(re_telephone.match('010-8086').groups())
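re.split() and re.findall() are only named in the comments above; a short sketch of what they do (examples are my own, reusing the same text string):

import re

text = "JGood is a handsome boy, he is cool, clever, and so on..."

# re.split(): split the string on every run of whitespace
print(re.split(r'\s+', text))
# ['JGood', 'is', 'a', 'handsome', 'boy,', 'he', 'is', 'cool,', 'clever,', 'and', 'so', 'on...']

# re.findall(): return every non-overlapping match as a list
print(re.findall(r'\b\w+\b', text))
# ['JGood', 'is', 'a', 'handsome', 'boy', 'he', 'is', 'cool', 'clever', 'and', 'so', 'on']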