Predicting whether passengers on the Titanic survived

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

train = pd.read_csv('/Kaggle_compete/Titanic/train.csv')

from plotly.offline import init_notebook_mode,iplot
import plotly.graph_objects as go
import cufflinks as cf
init_notebook_mode(connected=True)

lab = train["Survived"].value_counts().keys().tolist() #labels
val = train["Survived"].value_counts().values.tolist() #values
trace = go.Pie(labels=lab,
               values=val,
               marker=dict(colors=['red']),
               hoverinfo="value"  # show the raw counts on hover
              )
data = [trace]

Installing the packages

https://stackoverflow.com/questions/50713726/how-to-install-cufflinks-in-python3

Note: conda install -c bioconda cufflinks installs the RNA-seq tool of the same name, not this plotting wrapper; the Python package is installed with

pip install cufflinks

pip install plotly

#layout: sets the figure title, axis titles, and legend options
layout = go.Layout(title="Survived Distribution") #set title

#figure: combines the data and layout defined above into a single object that can be plotted
fig = go.Figure(data = data,layout = layout)
iplot(fig)

train['Survived'].replace(1, "Yes",inplace=True)
train['Survived'].replace(0, "No",inplace=True)
train['Survived']

plt.subplots(figsize=(10,10))
sns.countplot('Sex',hue='Survived',data=train, palette='RdBu_r')
plt.show()
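An alternative sketch that avoids the Yes/No round trip (the column is mapped back to 0/1 again further down): keep Survived numeric and build a temporary label series only for this plot. This is my addition, not part of the original notebook.

#temporary label series for the legend; train['Survived'] itself is left untouched
survived_labels = train['Survived'].map({1: "Yes", 0: "No"})
sns.countplot(x='Sex', hue=survived_labels, data=train, palette='RdBu_r')
plt.show()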

#Distribution by age group
plt.figure(figsize=[10,10])
#http://seaborn.pydata.org/generated/seaborn.distplot.html
#seaborn.distplot
sns.distplot(train['Age'].dropna().values, bins=range(0,17), kde=False, color="#007598")   # kde=False: do not draw the kernel density estimate
sns.distplot(train['Age'].dropna().values, bins=range(16, 33), kde=False, color="#7B97A0")
sns.distplot(train['Age'].dropna().values, bins=range(32, 49), kde=False, color="#06319B")
sns.distplot(train['Age'].dropna().values, bins=range(48,65), kde=False, color="#007598")
sns.distplot(train['Age'].dropna().values, bins=range(64,81), kde=False, color="#000000",
axlabel='Age')
plt.show()
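seaborn.distplot was deprecated in seaborn 0.11 in favor of histplot; a rough single-colour sketch of the same 16-year-wide age bins (the per-band colours above would still need separate calls):

#assumes seaborn >= 0.11; one call draws all five age bins in a single colour
sns.histplot(x=train['Age'].dropna(), bins=range(0, 81, 16), color="#007598")
plt.xlabel('Age')
plt.show()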
#https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html?highlight=loc#pandas.DataFrame.loc
#usage of pandas.DataFrame.loc -- see the pandas API reference
train.loc[ train['Age'] <= 16, 'Age'] = 0   # select the rows matching the condition and assign the new value
train.loc[(train['Age'] > 16) & (train['Age'] <= 32), 'Age'] = 1 
train.loc[(train['Age'] > 32) & (train['Age'] <= 48), 'Age'] = 2 
train.loc[(train['Age'] > 48) & (train['Age'] <= 64), 'Age'] = 3 
train.loc[ train['Age'] > 64, 'Age'] = 4
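The five loc assignments above can also be written with pd.cut; a sketch assuming the same bin edges (missing ages stay NaN either way):

#labels=False returns the integer bin index 0-4, matching the manual mapping above
train['Age'] = pd.cut(train['Age'], bins=[0, 16, 32, 48, 64, np.inf],
                      labels=False, include_lowest=True)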

#Create a column "Family" which will store the number of family members who travel together (need to add Parch and SibSp)
train['Family'] = train['SibSp'] + train['Parch'] + 1
train['Alone'] = 0
train.loc[train['Family'] == 1, 'Alone'] = 1

#convert Survived back to 0/1 and plot the survival rate by passenger class
train['Survived'].replace("Yes", 1,inplace=True) 
train['Survived'].replace("No", 0, inplace=True) 
survived = train[train['Survived'] == 1] 
not_survived = train[train['Survived'] == 0] 
sns.barplot(x='Pclass', y='Survived', data=train, palette='RdBu_r');  # palette='RdBu_r': reversed red-blue colormap
train['Sex'].replace("male", 0, inplace=True) 
train['Sex'].replace("female", 1, inplace=True)

#Divide the cost of tickets into 4 categories
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
train['FareBand'] = pd.qcut(train['Fare'], 4)
print (train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean())

train.loc[ train['Fare'] <= 7.91, 'Fare'] = 0
train.loc[(train['Fare'] > 7.91) & (train['Fare'] <= 14.454), 'Fare'] = 1
train.loc[(train['Fare'] > 14.454) & (train['Fare'] <= 31), 'Fare'] = 2
train.loc[ train['Fare'] > 31, 'Fare'] = 3
train['Fare'] = train['Fare'].astype(int)
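The thresholds 7.91, 14.454 and 31 above are just the FareBand quartile edges printed a few lines earlier; an equivalent sketch lets qcut assign the band index directly (applied to the raw Fare values instead of the four loc lines, not in addition to them):

#labels=False makes qcut return the quartile index 0-3 instead of an interval
train['Fare'] = pd.qcut(train['Fare'], 4, labels=False)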

#Replace in the Embarked column from 'S' to 0, 'C' to 1, 'Q' to 2
train['Embarked'] = train['Embarked'].fillna('S') 
train['Embarked'] = train['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) # map each port to its numeric code

Prepare the data, predict with a random forest, and save the model

#Divide the dataset in random order into training and test in proportions
#https://www.cnblogs.com/cindycindy/p/13515115.html  notes on the train_test_split function
training, testing = train_test_split(train, test_size=0.2, random_state=0)
cols = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Family', 'Alone']
tcols = np.append(['Survived'],cols)
#pandas dropna() is mainly used to filter out rows with missing data
df = training.loc[:,tcols].dropna()  # select the tcols columns from training and drop rows with missing values

X = df.loc[:,cols]
y = np.ravel(df.loc[:,['Survived']])

df_test = testing.loc[:,tcols].dropna()
X_test = df_test.loc[:,cols]
y_test = np.ravel(df_test.loc[:,['Survived']])

from sklearn.ensemble import RandomForestClassifier   #https://www.cnblogs.com/cgmcoding/p/13597389.html
clf = RandomForestClassifier()   # random forest classifier
clf.fit(X, y)                   #Build a forest of trees from the training set (X, y).
print(y_test)
y_pred_random_forest = clf.predict(X_test)          # predict(X_test): predict the class of each test sample from its features
print(y_pred_random_forest)
acc_random_forest = round(clf.score(X, y)*100, 2)  # score(X, y) returns the mean accuracy -- here on the training set; round keeps two decimals
print(acc_random_forest)
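Since clf.score(X, y) is accuracy on the training data, it tends to be optimistic; a short sketch of two more informative checks, reusing the held-out split and the cross_val_score import from the top (not part of the original post):

#accuracy on the 20% held-out split
print(round(clf.score(X_test, y_test) * 100, 2))
#5-fold cross-validation accuracy on the training features
print(round(cross_val_score(RandomForestClassifier(), X, y, cv=5).mean() * 100, 2))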

import joblib   # sklearn.externals.joblib was removed in scikit-learn 0.23; import joblib directly
joblib.dump(clf,'/Kaggle_compete/RFC.model')
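To reuse the saved model later, it can be loaded back with joblib (same path as above; a minimal sketch):

#reload the trained classifier and predict on the held-out features
clf_loaded = joblib.load('/Kaggle_compete/RFC.model')
print(clf_loaded.predict(X_test))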

numpy.ravel flattens an array to 1-D, which is why it is used above to turn the Survived column into a plain label vector:

>>> x = np.array([[1, 2, 3], [4, 5, 6]])
>>> np.ravel(x)
array([1, 2, 3, 4, 5, 6])

 
