Predicting whether passengers on the Titanic survived
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

train = pd.read_csv('/Kaggle_compete/Titanic/train.csv')

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
import cufflinks as cf
init_notebook_mode(connected=True)

lab = train["Survived"].value_counts().keys().tolist()     # labels
val = train["Survived"].value_counts().values.tolist()     # values
trace = go.Pie(labels=lab,
               values=val,
               marker=dict(colors=['red']),
               hoverinfo="value"     # show the raw count on hover
              )
data = [trace]
Installing the required packages
https://stackoverflow.com/questions/50713726/how-to-install-cufflinks-in-python3
pip install cufflinks
pip install plotly
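After installing, a quick import check (a minimal sketch, assuming a notebook environment) confirms both libraries are available before running the plotting code below:

# Sanity check: both imports should succeed without errors.
import plotly
import cufflinks as cf
print(plotly.__version__)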
# layout: set the plot title, x/y axis titles, or legend options
layout = go.Layout(title="Survived Distribution")     # set the title
# figure: combines the data and layout defined above into a plottable object
fig = go.Figure(data=data, layout=layout)
iplot(fig)

train['Survived'].replace(1, "Yes", inplace=True)
train['Survived'].replace(0, "No", inplace=True)
train['Survived']

plt.subplots(figsize=(10, 10))
sns.countplot(x='Sex', hue='Survived', data=train, palette='RdBu_r')
plt.show()

# Distribution by age group
plt.figure(figsize=[10, 10])
# http://seaborn.pydata.org/generated/seaborn.distplot.html
# seaborn.distplot
sns.distplot(train['Age'].dropna().values, bins=range(0, 17), kde=False, color="#007598")     # kde=False: do not draw the Gaussian kernel density estimate
sns.distplot(train['Age'].dropna().values, bins=range(16, 33), kde=False, color="#7B97A0")
sns.distplot(train['Age'].dropna().values, bins=range(32, 49), kde=False, color="#06319B")
sns.distplot(train['Age'].dropna().values, bins=range(48, 65), kde=False, color="#007598")
sns.distplot(train['Age'].dropna().values, bins=range(64, 81), kde=False, color="#000000", axlabel='Age')
plt.show()
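Note that sns.distplot has been deprecated in recent seaborn releases. If it is unavailable, an equivalent age histogram can be drawn with sns.histplot; the snippet below is an illustrative alternative (not the original author's code) that reproduces the five age groups with explicit bin edges:

# Alternative to the deprecated sns.distplot (seaborn >= 0.11):
# a single histogram with explicit edges for the five age groups used above.
plt.figure(figsize=[10, 10])
sns.histplot(x=train['Age'].dropna(), bins=[0, 16, 32, 48, 64, 80], color="#007598")
plt.xlabel('Age')
plt.show()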
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html?highlight=loc#pandas.DataFrame.loc
# pandas.DataFrame.loc usage -- see the pandas API reference
train.loc[ train['Age'] <= 16, 'Age'] = 0     # select the rows matching the condition and assign the value
train.loc[(train['Age'] > 16) & (train['Age'] <= 32), 'Age'] = 1
train.loc[(train['Age'] > 32) & (train['Age'] <= 48), 'Age'] = 2
train.loc[(train['Age'] > 48) & (train['Age'] <= 64), 'Age'] = 3
train.loc[ train['Age'] > 64, 'Age'] = 4

# Create a column "Family" which stores the number of family members travelling together (Parch + SibSp + the passenger)
train['Family'] = train['SibSp'] + train['Parch'] + 1
train['Alone'] = 0
train.loc[train['Family'] == 1, 'Alone'] = 1

# Plot the graph
train['Survived'].replace("Yes", 1, inplace=True)
train['Survived'].replace("No", 0, inplace=True)
survived = train[train['Survived'] == 1]
not_survived = train[train['Survived'] == 0]
sns.barplot(x='Pclass', y='Survived', data=train, palette='RdBu_r')     # palette: red-blue reversed gradient

train['Sex'].replace("male", 0, inplace=True)
train['Sex'].replace("female", 1, inplace=True)

# Divide the ticket fare into 4 categories
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
train['FareBand'] = pd.qcut(train['Fare'], 4)
print(train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean())

train.loc[ train['Fare'] <= 7.91, 'Fare'] = 0
train.loc[(train['Fare'] > 7.91) & (train['Fare'] <= 14.454), 'Fare'] = 1
train.loc[(train['Fare'] > 14.454) & (train['Fare'] <= 31), 'Fare'] = 2
train.loc[ train['Fare'] > 31, 'Fare'] = 3
train['Fare'] = train['Fare'].astype(int)

# Replace 'S' with 0, 'C' with 1 and 'Q' with 2 in the Embarked column
train['Embarked'] = train['Embarked'].fillna('S')
train['Embarked'] = train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)     # map each value through the given correspondence
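The fare cut-offs 7.91, 14.454 and 31 used above are the quartile edges computed by pd.qcut. Printing the FareBand categories (an illustrative check, not part of the original notebook) shows where they come from:

# FareBand is categorical; its categories are the quartile intervals,
# whose edges are the thresholds used for the integer Fare buckets above.
print(train['FareBand'].cat.categories)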
Preparing the data, predicting with a random forest, and saving the model
# Split the dataset randomly into training and test sets in the given proportions
# https://www.cnblogs.com/cindycindy/p/13515115.html  (notes on the train_test_split function)
training, testing = train_test_split(train, test_size=0.2, random_state=0)
cols = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Family', 'Alone']
tcols = np.append(['Survived'], cols)

# pandas dropna() is mainly used to filter out missing data
df = training.loc[:, tcols].dropna()     # take the tcols columns of the training split and drop rows with missing values
X = df.loc[:, cols]
y = np.ravel(df.loc[:, ['Survived']])

df_test = testing.loc[:, tcols].dropna()
X_test = df_test.loc[:, cols]
y_test = np.ravel(df_test.loc[:, ['Survived']])

from sklearn.ensemble import RandomForestClassifier
# https://www.cnblogs.com/cgmcoding/p/13597389.html
clf = RandomForestClassifier()     # random-forest classifier
clf.fit(X, y)     # build a forest of trees from the training set (X, y)

print(y_test)
y_pred_random_forest = clf.predict(X_test)     # predict(X) predicts the class for X, i.e. predicts y from the test features
print(y_pred_random_forest)

acc_random_forest = round(clf.score(X, y) * 100, 2)     # round to two decimals; score(X, y[, sample_weight]) returns the mean accuracy on the given data and labels
print(acc_random_forest)

import joblib     # in older scikit-learn versions: from sklearn.externals import joblib
joblib.dump(clf, '/Kaggle_compete/RFC.model')
As a reminder, np.ravel flattens an array into one dimension:

>>> x = np.array([[1, 2, 3], [4, 5, 6]])
>>> np.ravel(x)
array([1, 2, 3, 4, 5, 6])
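Once the model has been dumped, it can be reloaded with joblib and applied to new data. The sketch below is illustrative only: it reuses the X_test built above, and any Kaggle test.csv would first need the same feature engineering applied to it.

import joblib

# Reload the saved random forest and predict survival for already-preprocessed features.
clf_loaded = joblib.load('/Kaggle_compete/RFC.model')
print(clf_loaded.predict(X_test)[:10])     # first ten predicted labels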