已信任
Jupyter 服务器: 本地
Python 3: Not Started
[1]
import pandas as pd
import numpy as np
[3]
# Build a 5x3 frame of standard-normal samples with labelled rows and columns.
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=list('abefh'),
    columns=['one', 'two', 'three'],
)
df
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[5]
# Reindex onto a wider row-label set; the new labels 'c' and 'd' have no
# source data, so their rows are filled with NaN.
df = df.reindex(index=list('abcdefh'))
df
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
c NaN NaN NaN
d NaN NaN NaN
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[6]
# Boolean mask: True where column 'one' is missing (NaN).
df['one'].isna()
a False
b False
c True
d True
e False
f False
h False
Name: one, dtype: bool
[7]
# Boolean mask: True where column 'one' holds a value (not NaN).
df['one'].notna()
a True
b True
c False
d False
e True
f True
h True
Name: one, dtype: bool
[8]
# Select the rows whose 'one' value is missing.
df.loc[df['one'].isna()]
one two three
c NaN NaN NaN
d NaN NaN NaN
[9]
# Select the rows whose 'one' value is present.
df.loc[df['one'].notna()]
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[10]
# Display df again: the boolean-mask selections returned new objects and left df unmodified.
df
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
c NaN NaN NaN
d NaN NaN NaN
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[11]
# Sum the first column. NaN entries are skipped, which is equivalent to treating them as 0.
# NOTE: on pandas >= 0.22 an all-NaN column sums to 0, not NaN; pass min_count=1 to get NaN instead.
df['one'].sum()
-1.9544119617918125
[12]
# Fill every NaN with 0. This returns a new frame; df itself is not modified.
df.fillna(value=0)
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
c 0.000000 0.000000 0.000000
d 0.000000 0.000000 0.000000
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[16]
# Display df again: fillna(0) returned a new frame, so the NaN rows are still present here.
df
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
c NaN NaN NaN
d NaN NaN NaN
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[15]
# Fill each NaN with the mean of its own column (df.mean() yields one mean per column).
df.fillna(value=df.mean(axis=0))
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
c -0.390882 -0.090721 -0.109377
d -0.390882 -0.090721 -0.109377
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[17]
# Forward fill: each NaN takes the last valid value above it in the same column.
# ffill() replaces fillna(method='pad'), whose `method` keyword is deprecated
# since pandas 2.1; the result is the same.
df.ffill()
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
c 0.056577 -0.612873 -1.710761
d 0.056577 -0.612873 -1.710761
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[18]
# Backward fill: each NaN takes the next valid value below it in the same column.
# bfill() replaces fillna(method='backfill'), whose `method` keyword is deprecated
# since pandas 2.1; the result is the same.
df.bfill()
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
c 1.000864 -0.708675 0.690998
d 1.000864 -0.708675 0.690998
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[19]
# Drop every row that contains at least one NaN (row-wise is the default axis).
df.dropna(axis=0, how='any')
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[20]
# Drop every column that contains a NaN. Here all columns do, so the result
# has no columns left (only the index remains).
df.dropna(axis='columns')
a
b
c
d
e
f
h
[22]
# replace() substitutes arbitrary values, missing (NaN) or otherwise;
# here every NaN becomes 10.
df.replace(to_replace={np.nan: 10})
one two three
a -0.134914 -0.183527 1.455060
b 0.056577 -0.612873 -1.710761
c 10.000000 10.000000 10.000000
d 10.000000 10.000000 10.000000
e 1.000864 -0.708675 0.690998
f -2.126286 0.363740 -0.151361
h -0.750653 0.687731 -0.830824
[24]
# Add an integer column 'four' (1..7) aligned to the existing row labels.
df['four'] = pd.Series(range(1, 8), index=list('abcdefh'))
df
one two three four
a -0.134914 -0.183527 1.455060 1
b 0.056577 -0.612873 -1.710761 2
c NaN NaN NaN 3
d NaN NaN NaN 4
e 1.000864 -0.708675 0.690998 5
f -2.126286 0.363740 -0.151361 6
h -0.750653 0.687731 -0.830824 7
[25]
# Several replacements in one call: NaN -> 10 and the value 5 -> 1000.
# The mapping applies to every column of the frame.
df.replace(to_replace={np.nan: 10, 5: 1000})
one two three four
a -0.134914 -0.183527 1.455060 1
b 0.056577 -0.612873 -1.710761 2
c 10.000000 10.000000 10.000000 3
d 10.000000 10.000000 10.000000 4
e 1.000864 -0.708675 0.690998 1000
f -2.126286 0.363740 -0.151361 6
h -0.750653 0.687731 -0.830824 7
[-]