第二章:数据清洗与可视化
# 1.数据清洗
import numpy as np
import pandas as pd
# Load the training data from train.csv into a DataFrame and display it.
df=pd.read_csv('train.csv')
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
df.info()  # Show basic information about train.csv: columns, non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
df.isnull().sum()  # Count the missing values in each column of train.csv
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
# Boolean mask of the whole table: True where a value is missing.
df.isnull()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | True | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | True | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | True | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | False | False | False | False | False | False | False | False | False | False | True | False |
| 887 | False | False | False | False | False | False | False | False | False | False | False | False |
| 888 | False | False | False | False | False | True | False | False | False | False | True | False |
| 889 | False | False | False | False | False | False | False | False | False | False | False | False |
| 890 | False | False | False | False | False | False | False | False | False | False | True | False |
891 rows × 12 columns
df[['Age','Cabin','Embarked']]  # Look at the columns that contain missing data
| Age | Cabin | Embarked | |
|---|---|---|---|
| 0 | 22.0 | NaN | S |
| 1 | 38.0 | C85 | C |
| 2 | 26.0 | NaN | S |
| 3 | 35.0 | C123 | S |
| 4 | 35.0 | NaN | S |
| ... | ... | ... | ... |
| 886 | 27.0 | NaN | S |
| 887 | 19.0 | B42 | S |
| 888 | NaN | NaN | S |
| 889 | 26.0 | C148 | C |
| 890 | 32.0 | NaN | Q |
891 rows × 3 columns
df1=df.fillna({'Age':0})  # Fill missing Age values with 0; the result is a copy, df itself keeps its NaN values
df1
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 0.0 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
df.loc[df['Age'].isnull()]  # Select the rows whose Age is missing
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
| 17 | 18 | 1 | 2 | Williams, Mr. Charles Eugene | male | NaN | 0 | 0 | 244373 | 13.0000 | NaN | S |
| 19 | 20 | 1 | 3 | Masselmani, Mrs. Fatima | female | NaN | 0 | 0 | 2649 | 7.2250 | NaN | C |
| 26 | 27 | 0 | 3 | Emir, Mr. Farred Chehab | male | NaN | 0 | 0 | 2631 | 7.2250 | NaN | C |
| 28 | 29 | 1 | 3 | O'Dwyer, Miss. Ellen "Nellie" | female | NaN | 0 | 0 | 330959 | 7.8792 | NaN | Q |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 859 | 860 | 0 | 3 | Razi, Mr. Raihed | male | NaN | 0 | 0 | 2629 | 7.2292 | NaN | C |
| 863 | 864 | 0 | 3 | Sage, Miss. Dorothy Edith "Dolly" | female | NaN | 8 | 2 | CA. 2343 | 69.5500 | NaN | S |
| 868 | 869 | 0 | 3 | van Melkebeke, Mr. Philemon | male | NaN | 0 | 0 | 345777 | 9.5000 | NaN | S |
| 878 | 879 | 0 | 3 | Laleff, Mr. Kristo | male | NaN | 0 | 0 | 349217 | 7.8958 | NaN | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
177 rows × 12 columns
df.loc[df['Age'].isnull(),'Age']=0  # Set Age to 0, directly in df, for every row where Age is missing
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 0.0 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
# Handle the remaining missing values of the whole table by replacing them with 0.
# NOTE: this also writes the integer 0 into the text columns Cabin and Embarked,
# so those columns then hold a mix of strings and the number 0.
df=df.fillna(0)
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 147 | 2 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | 81 | 0 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 147 | 2 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | 55 | 2 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | 147 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | 147 | 2 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | 30 | 2 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 0.0 | 1 | 2 | W./C. 6607 | 23.4500 | 147 | 2 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | 60 | 0 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | 147 | 1 |
891 rows × 12 columns
df.duplicated()  # Flag duplicated rows: True marks a row that repeats an earlier row, False marks a row seen for the first time
df[df.duplicated()]  # Select only the rows flagged as duplicates
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|
# Small demo frame for duplicate handling: rows 0 and 1 are identical,
# rows 2 and 3 differ only in column C.
a = pd.DataFrame(
    dict(
        A=list('aacc'),
        B=list('aacc'),
        C=list('1123'),
    )
)
a
| A | B | C | |
|---|---|---|---|
| 0 | a | a | 1 |
| 1 | a | a | 1 |
| 2 | c | c | 2 |
| 3 | c | c | 3 |
a.drop_duplicates()  # Drop the duplicated rows of the demo frame (the first occurrence of each row is kept)
| A | B | C | |
|---|---|---|---|
| 0 | a | a | 1 |
| 2 | c | c | 2 |
| 3 | c | c | 3 |
# Display df with duplicated rows dropped; the result is not assigned back to df.
df.drop_duplicates()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 0.0 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
# Save the cleaned table to train_clear.csv.
df.to_csv('train_clear.csv')
# 特征大概分为两大类:
# 数值型特征:Survived ,Pclass, Age ,SibSp, Parch, Fare,其中Survived, Pclass为离散型数值特征,Age,SibSp, Parch, Fare为连续型数值特征
# 文本型特征:Name, Sex, Cabin, Embarked, Ticket,其中Sex, Cabin, Embarked, Ticket为类别型文本特征,Name为自由文本特征。
# 数值型特征一般可以直接用于模型的训练,但有时候为了模型的稳定性及鲁棒性会对连续变量进行离散化。文本型特征往往需要转换成数值型特征才能用于建模分析。
# Split Age into 5 equal-width bins labelled '1'..'5' and store the result
# in df as a new column named 'Age bins'.
df['Age bins']=pd.cut(df['Age'],5,labels=list('12345'))
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 0 | S | 2 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 3 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 0 | S | 2 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 3 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | 0 | S | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | 0 | S | 2 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 2 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 0.0 | 1 | 2 | W./C. 6607 | 23.4500 | 0 | S | 1 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 2 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | 0 | Q | 2 |
891 rows × 13 columns
from matplotlib import pyplot as plt
plt.hist(df['Age'])  # Plot a histogram of Age
(array([227., 33., 164., 181., 123., 74., 50., 26., 11., 2.]),
array([ 0., 8., 16., 24., 32., 40., 48., 56., 64., 72., 80.]),
<BarContainer object of 10 artists>)

plt.hist(df['Age bins'])  # Plot a histogram of the 'Age bins' labels
(array([346., 0., 188., 0., 0., 277., 0., 69., 0., 11.]),
array([0. , 0.4, 0.8, 1.2, 1.6, 2. , 2.4, 2.8, 3.2, 3.6, 4. ]),
<BarContainer object of 10 artists>)

# Bin Age into the intervals [0,5), [5,15), [15,30), [30,50), [50,inf), labelled '1'..'5'.
# right=False makes every interval left-closed/right-open; the default right=True
# gives left-open/right-closed intervals instead.
# The last edge is np.inf rather than 80: with right=False an upper edge of 80 excludes
# the value 80 itself, and Age reaches 80 in this data, so that row would get NaN
# instead of label '5'.
df['Age bins'] = pd.cut(
    df['Age'],
    [0, 5, 15, 30, 50, np.inf],
    right=False,
    labels=list('12345'),
)
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 0 | S | 3 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 4 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 0 | S | 3 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | 0 | S | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | 0 | S | 3 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 3 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 0.0 | 1 | 2 | W./C. 6607 | 23.4500 | 0 | S | 1 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 3 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | 0 | Q | 4 |
891 rows × 13 columns
# Bin Age by quantiles (share of the data) with cut points at 0%, 10%, 30%, 50%, 70% and 90%.
# duplicates='drop' drops repeated bin edges instead of raising (the default is 'raise');
# one edge is repeated for the data above, so one label fewer (four) is supplied.
# NOTE(review): the quantile list stops at 0.9, so ages above the 90th percentile fall
# outside every bin - presumably they end up as NaN in 'Age bins'; confirm this is intended.
df['Age bins']=pd.qcut(df['Age'],[0,0.1,0.3,0.5,0.7,0.9],duplicates='drop',labels=list('1234'))
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 0 | S | 2 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 4 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 0 | S | 3 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | 0 | S | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | 0 | S | 3 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 2 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 0.0 | 1 | 2 | W./C. 6607 | 23.4500 | 0 | S | 1 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 3 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | 0 | Q | 3 |
891 rows × 13 columns
df.to_csv('train_bin.csv')  # Save the table, including the 'Age bins' column, to train_bin.csv
df['Sex'].unique()  # List the distinct values of the text variable Sex
array(['male', 'female'], dtype=object)
df['Cabin'].unique()  # List the distinct values of Cabin
array([0, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'C23 C25 C27',
'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33', 'F G73', 'E31',
'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101', 'F E69', 'D47',
'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4', 'A32', 'B4',
'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35', 'C87', 'B77',
'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49', 'D',
'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66', 'C7',
'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128', 'D37',
'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44', 'A34', 'C104',
'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30',
'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22',
'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20',
'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126',
'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63', 'C62 C64',
'E24', 'C90', 'C45', 'E8', 'B101', 'D45', 'C46', 'D30', 'E121',
'D11', 'E77', 'F38', 'B3', 'D6', 'B82 B84', 'D17', 'A36', 'B102',
'B69', 'E49', 'C47', 'D28', 'E17', 'A24', 'C50', 'B42', 'C148'],
dtype=object)
df['Embarked'].unique()  # List the distinct values of Embarked
array(['S', 'C', 'Q', 0], dtype=object)
# Encode Sex numerically: 'male' -> 1, 'female' -> 2.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Sex']: the in-place call mutates an intermediate
# Series, which pandas does not guarantee to propagate to df (it warns about this
# chained pattern and ignores it when copy-on-write is enabled).
df['Sex'] = df['Sex'].replace(['male', 'female'], [1, 2])
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 0 | S | 2 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 2 | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 4 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 2 | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 0 | S | 3 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 2 | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.0 | 0 | 0 | 373450 | 8.0500 | 0 | S | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | 1 | 27.0 | 0 | 0 | 211536 | 13.0000 | 0 | S | 3 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | 2 | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 2 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | 2 | 0.0 | 1 | 2 | W./C. 6607 | 23.4500 | 0 | S | 1 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | 1 | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 3 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | 1 | 32.0 | 0 | 0 | 370376 | 7.7500 | 0 | Q | 3 |
891 rows × 13 columns
from sklearn.preprocessing import LabelEncoder
# Turn the text variable Cabin into integer codes with sklearn's LabelEncoder.
# After df.fillna(0) the column holds the integer 0 next to the cabin strings, and
# LabelEncoder requires uniformly typed input, so every value is cast to str first
# (the filled-in 0 becomes the category '0').
df['Cabin'] = LabelEncoder().fit_transform(df['Cabin'].astype(str))
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 147 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | 81 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 147 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | 55 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | 147 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | 147 | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | 30 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | 147 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | 60 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | 147 | Q |
891 rows × 12 columns
# Turn the text variable Embarked into integer codes with sklearn's LabelEncoder.
# After df.fillna(0) the column holds the integer 0 next to 'S'/'C'/'Q', and
# LabelEncoder requires uniformly typed input, so every value is cast to str first
# (the filled-in 0 becomes the category '0').
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'].astype(str))
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 147 | 2 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | 81 | 0 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 147 | 2 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | 55 | 2 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | 147 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | 147 | 2 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | 30 | 2 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | 147 | 2 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | 60 | 0 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | 147 | 1 |
891 rows × 12 columns
# One-hot encode Cabin, Age and Embarked and append the indicator columns to df.
# The loop body must be indented; without the indentation the cell is a syntax error.
# NOTE: Age is encoded on its raw values, so every distinct age gets its own column.
for column in ['Cabin', 'Age', 'Embarked']:
    x = pd.get_dummies(df[column], prefix=column)  # indicator columns named '<column>_<value>'
    df = pd.concat([df, x], axis=1)  # join df and the indicator columns side by side
df.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | ... | Age_66.0 | Age_70.0 | Age_70.5 | Age_71.0 | Age_74.0 | Age_80.0 | Embarked_0 | Embarked_1 | Embarked_2 | Embarked_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | ... | False | False | False | False | False | False | False | False | True | False |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | ... | False | False | False | False | False | False | True | False | False | False |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | ... | False | False | False | False | False | False | False | False | True | False |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | ... | False | False | False | False | False | False | False | False | True | False |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | ... | False | False | False | False | False | False | False | False | True | False |
5 rows × 253 columns
# Take the Name column and match it against a regular expression with str.extract:
# the captured group is the run of letters immediately before a '.', which is stored
# in a new 'Title' column.
df['Title']=df.Name.str.extract(r'([A-Za-z]+)\.')
df
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | ... | Age_70.0 | Age_70.5 | Age_71.0 | Age_74.0 | Age_80.0 | Embarked_0 | Embarked_1 | Embarked_2 | Embarked_3 | Title | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | ... | False | False | False | False | False | False | False | True | False | Mr |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | ... | False | False | False | False | False | True | False | False | False | Mrs |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | ... | False | False | False | False | False | False | False | True | False | Miss |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | ... | False | False | False | False | False | False | False | True | False | Mrs |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | ... | False | False | False | False | False | False | False | True | False | Mr |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | ... | False | False | False | False | False | False | False | True | False | Rev |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | ... | False | False | False | False | False | False | False | True | False | Miss |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 0.0 | 1 | 2 | W./C. 6607 | 23.4500 | ... | False | False | False | False | False | False | False | True | False | Miss |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | ... | False | False | False | False | False | True | False | False | False | Mr |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | ... | False | False | False | False | False | False | True | False | False | Mr |
891 rows × 254 columns
#2.数据重构
# Load the upper-left piece of the split training data and display it.
left_up = pd.read_csv(filepath_or_buffer='data/train-left-up.csv')
left_up
| PassengerId | Survived | Pclass | Name | |
|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry |
| ... | ... | ... | ... | ... |
| 434 | 435 | 0 | 1 | Silvey, Mr. William Baird |
| 435 | 436 | 1 | 1 | Carter, Miss. Lucile Polk |
| 436 | 437 | 0 | 3 | Ford, Miss. Doolina Margaret "Daisy" |
| 437 | 438 | 1 | 2 | Richards, Mrs. Sidney (Emily Hocking) |
| 438 | 439 | 0 | 1 | Fortune, Mr. Mark |
439 rows × 4 columns
# Load the lower-left piece of the split training data and display it.
left_down = pd.read_csv(filepath_or_buffer='data/train-left-down.csv')
left_down
| PassengerId | Survived | Pclass | Name | |
|---|---|---|---|---|
| 0 | 440 | 0 | 2 | Kvillner, Mr. Johan Henrik Johannesson |
| 1 | 441 | 1 | 2 | Hart, Mrs. Benjamin (Esther Ada Bloomfield) |
| 2 | 442 | 0 | 3 | Hampe, Mr. Leon |
| 3 | 443 | 0 | 3 | Petterson, Mr. Johan Emil |
| 4 | 444 | 1 | 2 | Reynaldo, Ms. Encarnacion |
| ... | ... | ... | ... | ... |
| 447 | 887 | 0 | 2 | Montvila, Rev. Juozas |
| 448 | 888 | 1 | 1 | Graham, Miss. Margaret Edith |
| 449 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" |
| 450 | 890 | 1 | 1 | Behr, Mr. Karl Howell |
| 451 | 891 | 0 | 3 | Dooley, Mr. Patrick |
452 rows × 4 columns
# Load the upper-right piece of the split training data and display it.
right_up = pd.read_csv(filepath_or_buffer='data/train-right-up.csv')
right_up
| Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | male | 50.0 | 1 | 0 | 13507 | 55.9000 | E44 | S |
| 435 | female | 14.0 | 1 | 2 | 113760 | 120.0000 | B96 B98 | S |
| 436 | female | 21.0 | 2 | 2 | W./C. 6608 | 34.3750 | NaN | S |
| 437 | female | 24.0 | 2 | 3 | 29106 | 18.7500 | NaN | S |
| 438 | male | 64.0 | 1 | 4 | 19950 | 263.0000 | C23 C25 C27 | S |
439 rows × 8 columns
# Load the lower-right piece of the split training data and display it.
right_down = pd.read_csv(filepath_or_buffer='data/train-right-down.csv')
right_down
| Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | male | 31.0 | 0 | 0 | C.A. 18723 | 10.500 | NaN | S |
| 1 | female | 45.0 | 1 | 1 | F.C.C. 13529 | 26.250 | NaN | S |
| 2 | male | 20.0 | 0 | 0 | 345769 | 9.500 | NaN | S |
| 3 | male | 25.0 | 1 | 0 | 347076 | 7.775 | NaN | S |
| 4 | female | 28.0 | 0 | 0 | 230434 | 13.000 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 447 | male | 27.0 | 0 | 0 | 211536 | 13.000 | NaN | S |
| 448 | female | 19.0 | 0 | 0 | 112053 | 30.000 | B42 | S |
| 449 | female | NaN | 1 | 2 | W./C. 6607 | 23.450 | NaN | S |
| 450 | male | 26.0 | 0 | 0 | 111369 | 30.000 | C148 | C |
| 451 | male | 32.0 | 0 | 0 | 370376 | 7.750 | NaN | Q |
452 rows × 8 columns
# Stitching the pieces together with pd.concat
# Put left_up and right_up side by side to form one table.
result_up = pd.concat(objs=[left_up, right_up], axis='columns')
result_up
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 435 | 0 | 1 | Silvey, Mr. William Baird | male | 50.0 | 1 | 0 | 13507 | 55.9000 | E44 | S |
| 435 | 436 | 1 | 1 | Carter, Miss. Lucile Polk | female | 14.0 | 1 | 2 | 113760 | 120.0000 | B96 B98 | S |
| 436 | 437 | 0 | 3 | Ford, Miss. Doolina Margaret "Daisy" | female | 21.0 | 2 | 2 | W./C. 6608 | 34.3750 | NaN | S |
| 437 | 438 | 1 | 2 | Richards, Mrs. Sidney (Emily Hocking) | female | 24.0 | 2 | 3 | 29106 | 18.7500 | NaN | S |
| 438 | 439 | 0 | 1 | Fortune, Mr. Mark | male | 64.0 | 1 | 4 | 19950 | 263.0000 | C23 C25 C27 | S |
439 rows × 12 columns
# Put left_down and right_down side by side to form one table.
result_down = pd.concat(objs=[left_down, right_down], axis='columns')
result_down
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 440 | 0 | 2 | Kvillner, Mr. Johan Henrik Johannesson | male | 31.0 | 0 | 0 | C.A. 18723 | 10.500 | NaN | S |
| 1 | 441 | 1 | 2 | Hart, Mrs. Benjamin (Esther Ada Bloomfield) | female | 45.0 | 1 | 1 | F.C.C. 13529 | 26.250 | NaN | S |
| 2 | 442 | 0 | 3 | Hampe, Mr. Leon | male | 20.0 | 0 | 0 | 345769 | 9.500 | NaN | S |
| 3 | 443 | 0 | 3 | Petterson, Mr. Johan Emil | male | 25.0 | 1 | 0 | 347076 | 7.775 | NaN | S |
| 4 | 444 | 1 | 2 | Reynaldo, Ms. Encarnacion | female | 28.0 | 0 | 0 | 230434 | 13.000 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 447 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.000 | NaN | S |
| 448 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.000 | B42 | S |
| 449 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.450 | NaN | S |
| 450 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.000 | C148 | C |
| 451 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.750 | NaN | Q |
452 rows × 12 columns
# Stack the upper and lower halves vertically (concat defaults to axis=0)
# and renumber the rows from 0 instead of keeping each half's own index.
result = pd.concat([result_up, result_down], ignore_index=True)
result
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
# Stitching the pieces together with DataFrame.join
# Attach right_up's columns to left_up.
up = left_up.join(other=right_up)
up
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 435 | 0 | 1 | Silvey, Mr. William Baird | male | 50.0 | 1 | 0 | 13507 | 55.9000 | E44 | S |
| 435 | 436 | 1 | 1 | Carter, Miss. Lucile Polk | female | 14.0 | 1 | 2 | 113760 | 120.0000 | B96 B98 | S |
| 436 | 437 | 0 | 3 | Ford, Miss. Doolina Margaret "Daisy" | female | 21.0 | 2 | 2 | W./C. 6608 | 34.3750 | NaN | S |
| 437 | 438 | 1 | 2 | Richards, Mrs. Sidney (Emily Hocking) | female | 24.0 | 2 | 3 | 29106 | 18.7500 | NaN | S |
| 438 | 439 | 0 | 1 | Fortune, Mr. Mark | male | 64.0 | 1 | 4 | 19950 | 263.0000 | C23 C25 C27 | S |
439 rows × 12 columns
# Attach right_down's columns to left_down.
down = left_down.join(other=right_down)
down
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 440 | 0 | 2 | Kvillner, Mr. Johan Henrik Johannesson | male | 31.0 | 0 | 0 | C.A. 18723 | 10.500 | NaN | S |
| 1 | 441 | 1 | 2 | Hart, Mrs. Benjamin (Esther Ada Bloomfield) | female | 45.0 | 1 | 1 | F.C.C. 13529 | 26.250 | NaN | S |
| 2 | 442 | 0 | 3 | Hampe, Mr. Leon | male | 20.0 | 0 | 0 | 345769 | 9.500 | NaN | S |
| 3 | 443 | 0 | 3 | Petterson, Mr. Johan Emil | male | 25.0 | 1 | 0 | 347076 | 7.775 | NaN | S |
| 4 | 444 | 1 | 2 | Reynaldo, Ms. Encarnacion | female | 28.0 | 0 | 0 | 230434 | 13.000 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 447 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.000 | NaN | S |
| 448 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.000 | B42 | S |
| 449 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.450 | NaN | S |
| 450 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.000 | C148 | C |
| 451 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.750 | NaN | Q |
452 rows × 12 columns
# join handled the horizontal step; concat handles the vertical one.
# Rows are renumbered from 0 after stacking.
res = pd.concat([up, down], ignore_index=True)
res
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
# Stitching the pieces together with merge
# left_index=True / right_index=True make the row index the merge key.
up = left_up.merge(right_up, left_index=True, right_index=True)
up
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 435 | 0 | 1 | Silvey, Mr. William Baird | male | 50.0 | 1 | 0 | 13507 | 55.9000 | E44 | S |
| 435 | 436 | 1 | 1 | Carter, Miss. Lucile Polk | female | 14.0 | 1 | 2 | 113760 | 120.0000 | B96 B98 | S |
| 436 | 437 | 0 | 3 | Ford, Miss. Doolina Margaret "Daisy" | female | 21.0 | 2 | 2 | W./C. 6608 | 34.3750 | NaN | S |
| 437 | 438 | 1 | 2 | Richards, Mrs. Sidney (Emily Hocking) | female | 24.0 | 2 | 3 | 29106 | 18.7500 | NaN | S |
| 438 | 439 | 0 | 1 | Fortune, Mr. Mark | male | 64.0 | 1 | 4 | 19950 | 263.0000 | C23 C25 C27 | S |
439 rows × 12 columns
# left_index=True / right_index=True make the row index the merge key.
down = left_down.merge(right_down, left_index=True, right_index=True)
down
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 440 | 0 | 2 | Kvillner, Mr. Johan Henrik Johannesson | male | 31.0 | 0 | 0 | C.A. 18723 | 10.500 | NaN | S |
| 1 | 441 | 1 | 2 | Hart, Mrs. Benjamin (Esther Ada Bloomfield) | female | 45.0 | 1 | 1 | F.C.C. 13529 | 26.250 | NaN | S |
| 2 | 442 | 0 | 3 | Hampe, Mr. Leon | male | 20.0 | 0 | 0 | 345769 | 9.500 | NaN | S |
| 3 | 443 | 0 | 3 | Petterson, Mr. Johan Emil | male | 25.0 | 1 | 0 | 347076 | 7.775 | NaN | S |
| 4 | 444 | 1 | 2 | Reynaldo, Ms. Encarnacion | female | 28.0 | 0 | 0 | 230434 | 13.000 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 447 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.000 | NaN | S |
| 448 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.000 | B42 | S |
| 449 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.450 | NaN | S |
| 450 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.000 | C148 | C |
| 451 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.750 | NaN | Q |
452 rows × 12 columns
# merge handled the horizontal step; concat handles the vertical one.
# Rows are renumbered from 0 after stacking.
res1 = pd.concat([up, down], ignore_index=True)
res1
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
# Persist the reassembled table and read it back.
# index=False keeps the row index out of the file, so it does not return as
# an extra 'Unnamed: 0' column when the file is reloaded.
result.to_csv('result.csv', index=False)
data = pd.read_csv('result.csv')
data
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 887 | 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 888 | 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 889 | 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 890 | 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 13 columns
# Reshape the table into a long Series indexed by (row label, column name).
data_unit = data.stack()
data_unit
0 Unnamed: 0 0
PassengerId 1
Survived 0
Pclass 3
Name Braund, Mr. Owen Harris
...
890 SibSp 0
Parch 0
Ticket 370376
Fare 7.75
Embarked Q
Length: 10717, dtype: object
# Data restructuring, part two
df = pd.read_csv(filepath_or_buffer='result.csv')
df.head(n=2)
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
# Split the passengers by sex and materialise the (sex, sub-table) pairs.
[(sex_key, sex_rows) for sex_key, sex_rows in df.groupby('Sex')]
[('female',
Unnamed: 0 PassengerId Survived Pclass \
1 1 2 1 1
2 2 3 1 3
3 3 4 1 1
8 8 9 1 3
9 9 10 1 2
.. ... ... ... ...
880 880 881 1 2
882 882 883 0 3
885 885 886 0 3
887 887 888 1 1
888 888 889 0 3
Name Sex Age SibSp \
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0
9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1
.. ... ... ... ...
880 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0
882 Dahlberg, Miss. Gerda Ulrika female 22.0 0
885 Rice, Mrs. William (Margaret Norton) female 39.0 0
887 Graham, Miss. Margaret Edith female 19.0 0
888 Johnston, Miss. Catherine Helen "Carrie" female NaN 1
Parch Ticket Fare Cabin Embarked
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
8 2 347742 11.1333 NaN S
9 0 237736 30.0708 NaN C
.. ... ... ... ... ...
880 1 230433 26.0000 NaN S
882 0 7552 10.5167 NaN S
885 5 382652 29.1250 NaN Q
887 0 112053 30.0000 B42 S
888 2 W./C. 6607 23.4500 NaN S
[314 rows x 13 columns]),
('male',
Unnamed: 0 PassengerId Survived Pclass \
0 0 1 0 3
4 4 5 0 3
5 5 6 0 3
6 6 7 0 1
7 7 8 0 3
.. ... ... ... ...
883 883 884 0 2
884 884 885 0 3
886 886 887 0 2
889 889 890 1 1
890 890 891 0 3
Name Sex Age SibSp Parch \
0 Braund, Mr. Owen Harris male 22.0 1 0
4 Allen, Mr. William Henry male 35.0 0 0
5 Moran, Mr. James male NaN 0 0
6 McCarthy, Mr. Timothy J male 54.0 0 0
7 Palsson, Master. Gosta Leonard male 2.0 3 1
.. ... ... ... ... ...
883 Banfield, Mr. Frederick James male 28.0 0 0
884 Sutehall, Mr. Henry Jr male 25.0 0 0
886 Montvila, Rev. Juozas male 27.0 0 0
889 Behr, Mr. Karl Howell male 26.0 0 0
890 Dooley, Mr. Patrick male 32.0 0 0
Ticket Fare Cabin Embarked
0 A/5 21171 7.2500 NaN S
4 373450 8.0500 NaN S
5 330877 8.4583 NaN Q
6 17463 51.8625 E46 S
7 349909 21.0750 NaN S
.. ... ... ... ...
883 C.A./SOTON 34068 10.5000 NaN S
884 SOTON/OQ 392076 7.0500 NaN S
886 211536 13.0000 NaN S
889 111369 30.0000 C148 C
890 370376 7.7500 NaN Q
[577 rows x 13 columns])]
# Summary statistics of every numeric column, separately for each sex.
df.groupby(by='Sex').describe()
| Unnamed: 0 | PassengerId | ... | Parch | Fare | |||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| Sex | |||||||||||||||||||||
| female | 314.0 | 430.028662 | 256.846324 | 1.0 | 230.75 | 413.5 | 640.25 | 888.0 | 314.0 | 431.028662 | ... | 1.0 | 6.0 | 314.0 | 44.479818 | 57.997698 | 6.75 | 12.071875 | 23.0 | 55.00 | 512.3292 |
| male | 577.0 | 453.147314 | 257.486139 | 0.0 | 221.00 | 463.0 | 679.00 | 890.0 | 577.0 | 454.147314 | ... | 0.0 | 5.0 | 577.0 | 25.523893 | 43.138263 | 0.00 | 7.895800 | 10.5 | 26.55 | 512.3292 |
2 rows × 64 columns
# Same summary, restricted to the Age column.
df.groupby(by='Sex')['Age'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Sex | ||||||||
| female | 261.0 | 27.915709 | 14.110146 | 0.75 | 18.0 | 27.0 | 37.0 | 63.0 |
| male | 453.0 | 30.726645 | 14.678201 | 0.42 | 21.0 | 29.0 | 39.0 | 80.0 |
# Mean age only, per sex.
df.groupby(by='Sex')['Age'].agg('mean')
Sex
female 27.915709
male 30.726645
Name: Age, dtype: float64
# Fare summary statistics per sex.
mean_fare_sex = df.groupby(by='Sex')['Fare'].describe()
mean_fare_sex
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Sex | ||||||||
| female | 314.0 | 44.479818 | 57.997698 | 6.75 | 12.071875 | 23.0 | 55.00 | 512.3292 |
| male | 577.0 | 25.523893 | 43.138263 | 0.00 | 7.895800 | 10.5 | 26.55 | 512.3292 |
# Number of survivors per sex (Survived is 0/1, so the sum is a head count).
survived_sex = df.groupby(by='Sex')['Survived'].agg('sum')
survived_sex
Sex
female 233
male 109
Name: Survived, dtype: int64
# Number of survivors in each passenger class (1, 2, 3).
survived_Pclass = df.groupby(by='Pclass')['Survived'].agg('sum')
survived_Pclass
Pclass
1 136
2 87
3 119
Name: Survived, dtype: int64
# The same tasks done in one pass with agg:
# survivor count (sum of Survived) and mean fare per sex, with the output
# columns named directly instead of being renamed afterwards.
df.groupby('Sex').agg(
    Survived_sum=('Survived', 'sum'),
    Fare_mean=('Fare', 'mean'),
)
| Survived_sum | Fare_mean | |
|---|---|---|
| Sex | ||
| female | 233 | 44.479818 |
| male | 109 | 25.523893 |
# Mean fare for every (Pclass, Age) combination.
df.groupby(by=['Pclass', 'Age'])['Fare'].agg('mean')
Pclass Age
1 0.92 151.5500
2.00 151.5500
4.00 81.8583
11.00 120.0000
14.00 120.0000
...
3 61.00 6.2375
63.00 9.5875
65.00 7.7500
70.50 7.7500
74.00 7.7750
Name: Fare, Length: 182, dtype: float64
# Row labels of the per-sex fare summary.
mean_fare_sex.index
Index(['female', 'male'], dtype='object', name='Sex')
# Convert the Series into a single-column DataFrame, then confirm the type.
survived_sex = survived_sex.to_frame()
type(survived_sex)
pandas.core.frame.DataFrame
# Combine the survivor counts with the fare summary, matching rows on 'Sex'.
survived_sex.merge(mean_fare_sex, on='Sex')
| Survived | count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|
| Sex | |||||||||
| female | 233 | 314.0 | 44.479818 | 57.997698 | 6.75 | 12.071875 | 23.0 | 55.00 | 512.3292 |
| male | 109 | 577.0 | 25.523893 | 43.138263 | 0.00 | 7.895800 | 10.5 | 26.55 | 512.3292 |
# Number of survivors at each age.
survived_age = df.groupby(by=['Age'])['Survived'].agg('sum')
survived_age
Age
0.42 1
0.67 1
0.75 2
0.83 2
0.92 1
..
70.00 0
70.50 0
71.00 0
74.00 0
80.00 1
Name: Survived, Length: 88, dtype: int64
# Largest survivor count found at any single age (vectorised Series.max
# instead of iterating the Series with the builtin max).
survived_age.max()
15
# Age(s) with the largest number of survivors. This is a head count,
# not a survival rate.
survived_age[survived_age == survived_age.max()]
Age
24.0 15
Name: Survived, dtype: int64
# Survival rate at the age with the most survivors: survivors of that age
# divided by all passengers of that age. The age is looked up with idxmax()
# rather than hard-coded. This is not necessarily the highest survival rate
# over all ages.
rate = survived_age.max() / (df['Age'] == survived_age.idxmax()).sum()
rate
0.5
# Format the rate computed above for display.
f'最大存活率:{rate}'
'最大存活率:0.5'
# Data visualisation
df = pd.read_csv(filepath_or_buffer='result.csv')
df.head(n=2)
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
# Survivor count per sex.
sex = df.groupby(by='Sex')['Survived'].agg('sum')
sex
Sex
female 233
male 109
Name: Survived, dtype: int64
# Bar chart of female vs. male survivor counts, drawn through the Series'
# built-in plot interface.
sex.plot(kind='bar')
plt.title('survived')
Text(0.5, 1.0, 'survived')

# Head count for every (Sex, Survived) combination.
df.groupby(by=['Sex', 'Survived'])['Survived'].count()
Sex Survived
female 0 81
1 233
male 0 468
1 109
Name: Survived, dtype: int64
# Died (0) / survived (1) head counts per sex; unstack() moves the Survived
# level out of the row index into the columns.
sex_survived = (
    df.groupby(by=['Sex', 'Survived'])['Survived']
    .count()
    .unstack(level=-1)
)
sex_survived
| Survived | 0 | 1 |
|---|---|---|
| Sex | ||
| female | 81 | 233 |
| male | 468 | 109 |
# Column 0 holds the passengers who did not survive; chart it per sex.
died = sex_survived.loc[:, 0]
died.plot(kind='bar')
plt.title('died')
Text(0.5, 1.0, 'died')

# Grouped bars: died and survived counts next to each other for each sex.
sex_survived.plot(kind='bar')
plt.title('survived and died')
Text(0.5, 1.0, 'survived and died')

# Stacked bar chart: died and survived counts piled into one bar per sex.
# `stacked` takes a bool; the string 'True' only worked because any
# non-empty string is truthy.
sex_survived.plot(kind='bar', stacked=True)
<Axes: xlabel='Sex'>

# Died (0) / survived (1) head counts for each distinct fare; unstack()
# moves the Survived level into the columns.
fare = (
    df.groupby(by=['Fare', 'Survived'])['Survived']
    .count()
    .unstack(level=-1)
)
fare
| Survived | 0 | 1 |
|---|---|---|
| Fare | ||
| 0.0000 | 14.0 | 1.0 |
| 4.0125 | 1.0 | NaN |
| 5.0000 | 1.0 | NaN |
| 6.2375 | 1.0 | NaN |
| 6.4375 | 1.0 | NaN |
| ... | ... | ... |
| 227.5250 | 1.0 | 3.0 |
| 247.5208 | 1.0 | 1.0 |
| 262.3750 | NaN | 2.0 |
| 263.0000 | 2.0 | 2.0 |
| 512.3292 | NaN | 3.0 |
248 rows × 2 columns
# Line chart (the default plot kind) of the counts against fare.
fare.plot(kind='line')
<Axes: xlabel='Fare'>

# Died (0) / survived (1) head counts for each passenger class; unstack()
# moves the Survived level into the columns.
pclass = (
    df.groupby(by=['Pclass', 'Survived'])['Survived']
    .count()
    .unstack(level=-1)
)
pclass
| Survived | 0 | 1 |
|---|---|---|
| Pclass | ||
| 1 | 80 | 136 |
| 2 | 97 | 87 |
| 3 | 372 | 119 |
# Grouped bars: died and survived counts per passenger class.
pclass.plot(kind='bar')
<Axes: xlabel='Pclass'>

# Overlaid age histograms: first the passengers who died (Survived == 0),
# then those who survived (Survived == 1).
# bins=5 splits the ages into five bins; alpha=0.5 makes the bars translucent.
for outcome in (0, 1):
    df.loc[df.Survived == outcome, 'Age'].hist(bins=5, alpha=0.5)
plt.legend([0, 1])  # label the two groups by their Survived value
plt.xlabel('age')  # x-axis label
plt.ylabel('count')  # y-axis label
Text(0, 0.5, 'count')

# Same histograms with density=1, which rescales the y axis to a density,
# followed by a density curve for each group.
# bins=5 splits the ages into five bins; alpha=0.5 makes the bars translucent.
for outcome in (0, 1):
    df.loc[df.Survived == outcome, 'Age'].hist(bins=5, alpha=0.5, density=1)
for outcome in (0, 1):
    df.loc[df.Survived == outcome, 'Age'].plot.density()
plt.legend([0, 1])  # label the two groups by their Survived value
plt.xlabel('age')  # x-axis label
plt.ylabel('density')  # y-axis label
Text(0, 0.5, 'density')

# Age density curve for first-class passengers only (Pclass == 1).
df.loc[df.Pclass == 1, 'Age'].plot(kind='density')
<Axes: ylabel='Density'>

# Distinct passenger classes in ascending order.
unique_pclass = np.sort(df.Pclass.unique())
unique_pclass
array([1, 2, 3], dtype=int64)
# Overlay one age density curve per passenger class, then label the axis
# and add a legend with the class numbers.
for i in unique_pclass:
    df.Age[df.Pclass == i].plot.density()
plt.xlabel('age')
plt.legend(unique_pclass)
<matplotlib.legend.Legend at 0x1e2ff3706d0>

import seaborn as sns#画图库
# Filled KDE of age for each passenger class. fill=True shades the area
# under the curve (it replaces seaborn's deprecated `shade` argument);
# linewidth=0 hides the outline.
for i in unique_pclass:
    sns.kdeplot(df.Age[df.Pclass == i], fill=True, linewidth=0)


浙公网安备 33010602011771号