These past couple of days I signed up for the Alibaba Tianchi "Bus Route Passenger Flow Prediction" competition, so I took the chance to go back over the Kaggle Titanic tutorial code I had looked at before and refresh myself on some of the data processing. The task is to predict whether each Titanic passenger survived, based on the passenger information. Two data tables are provided: titanic_test.csv and titanic_train.csv. First, a description of the fields:
PassengerId -- A numerical id assigned to each passenger.
Survived -- Whether the passenger survived (1) or didn't (0). We'll be making predictions for this column.
Pclass -- The class the passenger was in: first class (1), second class (2), or third class (3).
Name -- The name of the passenger.
Sex -- The gender of the passenger: male or female.
Age -- The age of the passenger. May be fractional.
SibSp -- The number of siblings and spouses the passenger had on board.
Parch -- The number of parents and children the passenger had on board.
Ticket -- The ticket number of the passenger.
Fare -- How much the passenger paid for the ticket.
Cabin -- Which cabin the passenger was in.
Embarked -- Where the passenger boarded the Titanic.
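Before filling in missing values it helps to see which of these columns actually have gaps. A minimal sketch of how to check (this snippet is my own addition; it only assumes the same titanic_train.csv file used below):

import pandas as pd

train = pd.read_csv("titanic_train.csv")

# Count missing values per column; Age, Cabin and Embarked are the usual offenders
print(train.isnull().sum())

# Share of missing values per column, sorted from most to least incomplete
print((train.isnull().mean().sort_values(ascending=False) * 100).round(1))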
Below is the Python processing code:
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

train = pd.read_csv("titanic_train.csv", dtype={"Age": np.float64})
test = pd.read_csv("titanic_test.csv", dtype={"Age": np.float64})

print("\n\nTop of the training data:")
print(train.head())

print("\n\nSummary statistics of training data")
print(train.describe())

# train.to_csv('copy_of_the_training_data.csv', index=False)

# Fill missing ages with a sentinel value
train["Age"] = train["Age"].fillna(-1)
test["Age"] = test["Age"].fillna(-1)

# Encode Sex as 0 (male) / 1 (female)
train.loc[train["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
test.loc[test["Sex"] == "female", "Sex"] = 1

print(train["Embarked"].unique())
# Fill missing embarkation ports with the most common value, then encode
train["Embarked"] = train["Embarked"].fillna("S")
test["Embarked"] = test["Embarked"].fillna("S")

train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2

test.loc[test["Embarked"] == "S", "Embarked"] = 0
test.loc[test["Embarked"] == "C", "Embarked"] = 1
test.loc[test["Embarked"] == "Q", "Embarked"] = 2

train["Fare"] = train["Fare"].fillna(train["Fare"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Generating a FamilySize column
train["FamilySize"] = train["SibSp"] + train["Parch"]
test["FamilySize"] = test["SibSp"] + test["Parch"]  # fixed: original mixed train["SibSp"] with test["Parch"]

train["NameLength"] = train["Name"].apply(lambda x: len(x))
test["NameLength"] = test["Name"].apply(lambda x: len(x))

import re

def get_title(name):
    # Use a regular expression to search for a title. Titles always consist of capital and lowercase letters, and end with a period.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Get all the titles.
train_titles = train["Name"].apply(get_title)
test_titles = test["Name"].apply(get_title)

# Map each title to an integer. Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7,
                 "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9,
                 "Capt": 7, "Ms": 2, "Dona": 9}
for k, v in title_mapping.items():
    train_titles[train_titles == k] = v
    test_titles[test_titles == k] = v

# Add in the title column.
train["Title"] = train_titles
test["Title"] = test_titles

# print(test["Title"])

import operator

# A dictionary mapping family name to id
family_id_mapping = {}

# A function to get the id given a row
def get_family_id(row):
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family id
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    # Look up the id in the mapping
    if family_id not in family_id_mapping:
        if len(family_id_mapping) == 0:
            current_id = 1
        else:
            # Get the maximum id from the mapping and add one to it if we don't have an id
            current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)
        family_id_mapping[family_id] = current_id
    return family_id_mapping[family_id]

# Get the family ids with the apply method
train_family_ids = train.apply(get_family_id, axis=1)
test_family_ids = test.apply(get_family_id, axis=1)

# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.
train_family_ids[train["FamilySize"] < 3] = -1
test_family_ids[test["FamilySize"] < 3] = -1

train["FamilyId"] = train_family_ids
test["FamilyId"] = test_family_ids

alg = AdaBoostClassifier()
# alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "FamilyId"]

# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(train[predictors], train["Survived"])

# Get the raw p-values for each feature, and transform from p-values into scores
scores = -np.log10(selector.pvalues_)

# Plot the scores. See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

print("#######")
predictors = ["Pclass", "Sex", "Fare", "Title"]

x_train = train[predictors]
y_train = train['Survived']
x_test = test[predictors]
alg.fit(x_train, y_train)
predictions = alg.predict(x_test)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": predictions
})

submission.to_csv('submission.csv', index=False)
While I'm at it, here is a quick summary of str.format(), the string formatting method used above:
# coding=utf-8
# str.format()

# '{}' placeholders
print('what\'s {},{}'.format('wrong', 'hong!'))

# positional placeholders like {0}, {1}
print('{},I\'m {},my qq is {}'.format('hello', 'hong', '123'))
print('{},I\'m {},my E-mail is {}'.format('Hello', 'Hongten', 'hongtenzone@foxmail.com'))

print('{1},{0}'.format('hello', 'world'))

# named placeholders like {name}
print('Hi,{name},{message}'.format(name="hongten", message='how are you?'))

# format control:
import math
print('The value of PI is approximately {0:.3f}.'.format(math.pi))

table = {'Sjoerd': 4127, 'Jack': 4098, 'Dcab': 7678}

for name, phone in table.items():
    print('{0:10}==>{1:10d}'.format(name, phone))
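The {0:10} and {1:10d} specs above come from the format-spec mini-language, which also covers fill characters, alignment, and things like thousands separators. A few more examples in the same vein (my own additions for illustration):

# alignment: < left, > right, ^ center, with an optional fill character
print('{:<10}|'.format('left'))    # 'left      |'
print('{:>10}|'.format('right'))   # '     right|'
print('{:*^10}|'.format('mid'))    # '***mid****|'

# thousands separator and percentage
print('{:,}'.format(1234567))      # '1,234,567'
print('{:.1%}'.format(0.382))      # '38.2%'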
And finally, some notes on Python's regular expression module, re:
# coding=utf-8
import re

# re.match() -- match only at the beginning of the string
s = 'I like hongten! he is so cool!'
m = re.match(r'(\w+)\s', s)
if m:
    # group()/groups(): the parentheses () in the pattern mark the groups to extract
    print(m.group(0))
else:
    print('not match')

# re.search() -- scan the whole string for the first match
text = "JGood is a handsome boy, he is cool, clever, and so on..."
m = re.search(r'\shan(ds)ome\s', text)
if m:
    print(m.groups())
else:
    print("No search")

# re.sub() -- replace every match in the string
text = "JGood is a handsome boy, he is cool, clever, and so on..."
print(re.sub(r'\s+', '-', text))

# re.split()
# re.findall()

# re.compile() -- precompile a pattern for reuse
re_telephone = re.compile(r'^(\d{3})-(\d{3,8})$')
print(re_telephone.match('010-8086').groups())
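re.split() and re.findall() are only named in the comments above; a short sketch of what they do (examples are my own, reusing the same text string):

import re

text = "JGood is a handsome boy, he is cool, clever, and so on..."

# re.split(): split the string on every run of whitespace
print(re.split(r'\s+', text))
# ['JGood', 'is', 'a', 'handsome', 'boy,', 'he', 'is', 'cool,', 'clever,', 'and', 'so', 'on...']

# re.findall(): return every non-overlapping match as a list
print(re.findall(r'\b\w+\b', text))
# ['JGood', 'is', 'a', 'handsome', 'boy', 'he', 'is', 'cool', 'clever', 'and', 'so', 'on']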