lesson3

import numpy as np
import pandas as pd
df = pd.read_csv('train.csv')
df.head(3)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS

缺省值查看

  • info
  • isnull().sum()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
df.isnull().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
df.head(6)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
df.drop_duplicates().head(4) #对整行有缺失值的清理方法
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
df.to_csv('test_clear.csv')
df.head(5)
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeBand
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS3
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C4
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS3
33411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S4
44503Allen, Mr. William Henrymale35.0003734508.0500NaNS4
df = pd.read_csv('test_clear.csv')
df['AgeBand'] = pd.cut(df['Age'],[0,5,15,30,50,80],labels = ['1','2','3','4','5'])#将年龄按照年龄段离散化并用12345表示
df.head(5)

Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeBand
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS3
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C4
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS3
33411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S4
44503Allen, Mr. William Henrymale35.0003734508.0500NaNS4
df.to_csv('test_cut.csv')
df['AgeBand'] = pd.cut(df['Age'],5,labels = ['1','2','3','4','5'])#将年龄平均分成五个年龄段并用12345表示
df.head(5)
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeBand
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS2
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C3
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS2
33411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S3
44503Allen, Mr. William Henrymale35.0003734508.0500NaNS3
pd.value_counts(df['AgeBand'])
2    346
3    188
1    100
4     69
5     11
Name: AgeBand, dtype: int64
df.to_csv('test_ave.csv')
df['AgeBand'] = pd.qcut(df['Age'],[0,0.1,0.3,0.5,0.7,1.],labels = ['1','2','3','4','5'])#将年龄按照百分比分为五个年龄段并用12345表示
df.head(5)
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeBand
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS2
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C5
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS3
33411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S4
44503Allen, Mr. William Henrymale35.0003734508.0500NaNS4
pd.value_counts(df['AgeBand'])
5    195
4    157
2    154
3    131
1     77
Name: AgeBand, dtype: int64
df.to_csv('test_ave.csv')

df['Sex'].value_counts() #查看文本变量名及数量,并rename
male      577
female    314
Name: Sex, dtype: int64
df['Sex_num'] = df['Sex'].replace(['female','male'],[2,1])
df.head(5)
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeBandSex_num
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS21
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C52
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS32
33411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S42
44503Allen, Mr. William Henrymale35.0003734508.0500NaNS41

df['Embarked'].value_counts() 
S    644
C    168
Q     77
Name: Embarked, dtype: int64
df['Embarked_num'] = df['Embarked'].map({'S' : 1, 'C' : 2, 'Q': 3})
df.head(3)
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeBandSex_numEmbarked_num
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS211.0
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C522.0
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS321.0

缺省值的处理

  • dropna 根据每个标签的值是否是缺失数据来筛选轴标签,并根据允许丢失的数据量来确定阈值
  • fillna 用某些值来填充缺失的数据或使用插值方法
    参数: value标量值,或者字典型对象用于填充缺失值
    method插值方法,如果没有其他参数,默认‘ffill’
    limit 前向或者后向填充时的最大填充范围
    inplace 修改被调用的对象,而不是生成一个备份
    axis 需要填充的轴,默认axis = 0
  • isnull 返回表明那些值是缺失值的布尔值
  • notnull isnull的反函数
df[df['Age'] == np.nan] = 0
df.head(6)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
df['Age'].isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool
from numpy import nan as NA

data = pd.DataFrame(np.random.randn(7,3))
data.iloc[4:6,0]= NA
data.iloc[3:,1] = NA
data.iloc[5:,2] = NA
data
012
00.2557842.167973-0.583178
1-0.0669631.567319-0.213410
2-2.6123500.314180-0.843624
3-0.087889NaN0.447406
4NaNNaN-0.022148
5NaNNaNNaN
62.363004NaNNaN
data.dropna()
012
01.2116541.6441430.469220
11.1476140.0110540.884146
2-0.6523020.5423381.268530
data.dropna(how = 'all')  # how 删除所有值为NA的行 ,如果同样方式删除列,参数axis = 1;
012
01.2116541.6441430.469220
11.1476140.0110540.884146
2-0.6523020.5423381.268530
3-0.368937NaN0.039350
4NaNNaN0.018442
6-1.392818NaNNaN
data.dropna(thresh = 2) #https://zhuanlan.zhihu.com/p/109535661  thresh = n 保留至少n个有非NA数据的行或者列(axis=1/0)
012
01.2116541.6441430.469220
11.1476140.0110540.884146
2-0.6523020.5423381.268530
3-0.368937NaN0.039350
data.dropna(thresh = 5,axis = 1)
02
01.2116540.469220
11.1476140.884146
2-0.6523021.268530
3-0.3689370.039350
4NaN0.018442
5NaNNaN
6-1.392818NaN
data.fillna(0)
012
01.2116541.6441430.469220
11.1476140.0110540.884146
2-0.6523020.5423381.268530
3-0.3689370.0000000.039350
40.0000000.0000000.018442
50.0000000.0000000.000000
6-1.3928180.0000000.000000
data.fillna({0:1,1:8,2:9}) #不同列填充不同数值,如果要 改变已经存在的对象 加入参数 inplace = Ture
012
01.2116541.6441430.469220
11.1476140.0110540.884146
2-0.6523020.5423381.268530
3-0.3689378.0000000.039350
41.0000008.0000000.018442
51.0000008.0000009.000000
6-1.3928188.0000009.000000
data.fillna(method = 'ffill') #相同的插值方法插值
012
01.2116541.6441430.469220
11.1476140.0110540.884146
2-0.6523020.5423381.268530
3-0.3689370.5423380.039350
4-0.3689370.5423380.018442
5-0.3689370.5423380.018442
6-1.3928180.5423380.018442
data.fillna(method = 'ffill',limit = 2)
012
01.2116541.6441430.469220
11.1476140.0110540.884146
2-0.6523020.5423381.268530
3-0.3689370.5423380.039350
4-0.3689370.5423380.018442
5-0.368937NaN0.018442
6-1.392818NaN0.018442
data.fillna(method = 'ffill',limit = 2,inplace = True)
data
012
00.2557842.167973-0.583178
1-0.0669631.567319-0.213410
2-2.6123500.314180-0.843624
3-0.0878890.3141800.447406
4-0.0878890.314180-0.022148
5-0.087889NaN-0.022148
62.363004NaN-0.022148
df[df['Age'] ==None] =0
df.head(3)
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS

重复数值处理

  • duplicated 返回一个布尔值Series,这个Series反映的是每一行是否存在重复(即是否与之前出现的行相同)的情况
  • drop_duplicates 返回的是DataFrame,内容是duplicate返回数组中为Flase的部分
  • 两个函数默认都是保留第一个观测的数值,参数keep = ‘last’将会返回重复值中最后一个
df[df.duplicated()]
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
data1 = pd.DataFrame({'k1':['one' , 'two'] * 3 + ['two'],
                     'k2':[1,1,2,3,3,4,4]})
data1
k1k2
0one1
1two1
2one2
3two3
4one3
5two4
6two4
data1.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
data1[data1.duplicated()] #查看重复值
k1k2
6two4
data1.drop_duplicates()#过滤重复值
k1k2
0one1
1two1
2one2
3two3
4one3
5two4
data1['v1'] = 0
data1.drop_duplicates(['k1']) #基于K1列去除重复值
k1k2v1
0one10
1two11
data1['v1'] = 0
data1.drop_duplicates(keep = 'last')
k1k2v1
0one10
1two10
2one20
3two30
4one30
6two40

分箱操作

  • cut函数 自定义取间/等区间分
  • qcut函数 基于样本分位数分箱类似[0,0.1,0.5,1.] 还可以进行样本数据量相等的分箱
ages = [20,22,25,27,21,34,56,36,42,28,61]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (35, 60], (35, 60], (35, 60], (25, 35], (60, 100]]
Length: 11
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
array([0, 0, 0, 1, 0, 1, 2, 2, 2, 1, 3], dtype=int8)
pd.value_counts(cats)
(18, 25]     4
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64
cats = pd.cut(ages,5,labels = [1,2,3,4,5])
cats
[1, 1, 1, 1, 1, ..., 5, 2, 3, 1, 5]
Length: 11
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
cats = pd.qcut(ages,[0,0.1,0.5,0.7,0.8,0.9],labels = [1,2,3,4,5])
cats
[1, 2, 2, 2, 1, ..., 5, 3, 4, 2, NaN]
Length: 11
Categories (5, int64): [1 < 2 < 3 < 4 < 5]

替换

  • replace replace(被替代值,替代值),参数可以数组可以,字典传递
  • map
data2 = pd.Series([1,-99,3,-99,-100,8])
data2
0      1
1    -99
2      3
3    -99
4   -100
5      8
dtype: int64
data2.replace([-99,8],0)
0      1
1      0
2      3
3      0
4   -100
5      0
dtype: int64
data2.replace([-99,8],[np.nan,0])
0      1.0
1      NaN
2      3.0
3      NaN
4   -100.0
5      0.0
dtype: float64
data2.replace({-99:np.nan,
               8:0})
0      1.0
1      NaN
2      3.0
3      NaN
4   -100.0
5      0.0
dtype: float64
data2.map({8:0,-99:np.nan,1:7})
0    7.0
1    NaN
2    NaN
3    NaN
4    NaN
5    0.0
dtype: float64
posted @ 2021-06-19 22:23  visionwpc  阅读(48)  评论(0编辑  收藏  举报