pandas入门:基本功能
重新索引
from pandas import Series,DataFrame
# Series重新索引
obj = Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
print(obj)
'''
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
'''
# 调用reindex将会根据新索进行重排,如果某个索引值当前不存在,就引入缺失值
obj2 = obj.reindex(['a','b','c','d','e'])
print(obj2)
'''
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
'''
obj3 = obj.reindex(['a','b','c','d','e'],fill_value=0)
print(obj3)
# fill_value 实现空值填充
'''
a -5.3
b 7.2
c 3.6
d 4.5
e 0.0
dtype: float64
'''
# ffill实现向前填充,bfill实现向后填充
obj4 = Series(['blue','purpul','yellow'],index=[0,1,4])
obj5 = obj4.reindex(range(6),method='ffill')
print(obj5)
'''
0 blue
1 purpul
2 purpul
3 purpul
4 yellow
5 yellow
dtype: object
'''
obj6 = obj4.reindex(range(6),method='bfill')
print(obj6)
'''
0 blue
1 purpul
2 yellow
3 yellow
4 yellow
5 NaN
dtype: object
'''
from pandas import Series,DataFrame
import numpy as np
# DataFrame重新索引
frame = DataFrame(np.arange(9).reshape(3,3),index=['a','c','d'],columns=['Ohio','Texas','California'])
print(frame)
'''
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8
'''
frame2 = frame.reindex(['a','b','c','d'])
print(frame2)
'''
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0
'''
states = ['Texas','Utah','California']
frame3 = frame.reindex(columns=states)
print(frame3)
'''
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
'''
# 可同时对行列进行索引
frame4 = frame.reindex(index=['a','b','c','d'],columns=['Ohio','Texas','California','Utah'])
print(frame4)
'''
Ohio Texas California Utah
a 0.0 1.0 2.0 NaN
b NaN NaN NaN NaN
c 3.0 4.0 5.0 NaN
d 6.0 7.0 8.0 NaN
'''
# 利用ix的标签索引功能,重新索引任务可以变得更简洁
frame5 = frame.ix[['a','c','d'],['Ohio','Texas','California']]
print(frame5)
'''
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8
'''
丢弃指定轴上的项
from pandas import Series,DataFrame
import numpy as np
# drop方法
obj = Series(np.arange(5),index=['a','b','c','d','e'])
new_obj = obj.drop('c')
print(new_obj)
'''
a 0
b 1
d 3
e 4
dtype: int32
'''
new_obj = obj.drop(['d','c'])
print(new_obj)
'''
a 0
b 1
e 4
dtype: int32
'''
# 对于DataFrame可删除任意轴上的索引值
data = DataFrame(np.arange(16).reshape((4,4)),
index=[1,2,3,4],
columns=['one','two','three','four'])
new_data = data.drop([1,3])
print(new_data)
'''
one two three four
2 4 5 6 7
4 12 13 14 15
'''
new_data = data.drop('two',axis=1)
print(new_data)
'''
one three four
1 0 2 3
2 4 6 7
3 8 10 11
4 12 14 15
'''
new_data = data.drop(['two','four'],axis=1)
print(new_data)
'''
one three
1 0 2
2 4 6
3 8 10
4 12 14
'''
索引、选取和过滤
from pandas import Series,DataFrame
obj = Series([9,5,7,3],index=['a','b','c','d'])
print(obj['b']) # 5
print(obj[2]) # 7
print(obj[2:4])
'''
c 7
d 3
dtype: int64
'''
print(obj[['b','a','d']])
'''
b 5
a 9
d 3
dtype: int64
'''
print(obj[[1,3]])
'''
b 5
d 3
dtype: int64
'''
print(obj[obj<5])
'''
d 3
dtype: int64
'''
# 利用标签的切片运算与普通python切片运算不通过,其末端是包含的
print(obj['b':'c'])
'''
b 5
c 7
dtype: int64
'''
# 赋值方式
obj['b':'c'] = 5
print(obj)
'''
a 9
b 5
c 5
d 3
dtype: int64
'''
from pandas import Series,DataFrame
import numpy as np
data = DataFrame(np.arange(16).reshape((4,4)),
index=['Ohio','Colorado','Utah','New York'],
columns=['one','two','three','four'])
print(data)
'''
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
'''
print(data['two'])
'''
Ohio 1
Colorado 5
Utah 9
New York 13
Name: two, dtype: int32
'''
print(data[['three','one']])
'''
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12
'''
# 特殊情况,通过切片或布尔型数组选取行
print(data[:2])
'''
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
'''
print(data['three'] >5)
'''
Ohio False
Colorado True
Utah True
New York True
Name: three, dtype: bool
'''
print(data[data['three'] >5]) # 等于data[[False,True,True,True]]
'''
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
'''
# 索引字段ix
print(data.ix['Colorado',['two','three']])
'''
two 5
three 6
Name: Colorado, dtype: int32
'''
print(data.ix[['Colorado','Utah'],[3,0,1]])
'''
four one two
Colorado 7 4 5
Utah 11 8 9
'''
print(data.ix[2])
'''
one 8
two 9
three 10
four 11
Name: Utah, dtype: int32
'''
print(data.ix[:'Utah','two'])
'''
Ohio 1
Colorado 5
Utah 9
Name: two, dtype: int32
'''
print(data.ix[data.three>5,:3])
'''
one two three
Colorado 4 5 6
Utah 8 9 10
New York 12 13 14
'''
算术运算和数据对齐
from pandas import Series,DataFrame
import numpy as np
s1 = Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
print(s1+s2)
# 自动的数据对齐在不重叠处引入NA值
'''
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
'''
df1 = DataFrame(np.arange(9).reshape((3,3)),
columns=list('ABC'),
index=['one','two','three'])
df2 = DataFrame(np.arange(12).reshape((4,3)),
columns=list('ABC'),
index=['one','three','four','five'])
print(df1+df2)
# 索引为原来两个DataFrame的并集
'''
A B C
five NaN NaN NaN
four NaN NaN NaN
one 0.0 2.0 4.0
three 9.0 11.0 13.0
two NaN NaN NaN
'''
from pandas import Series,DataFrame
import numpy as np
# 在算术方法中填充值
df1 = DataFrame(np.arange(12).reshape(3,4),columns=list('abcd'))
df2 = DataFrame(np.arange(20).reshape(4,5),columns=list('abcde'))
print(df1+df2)
'''
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 11.0 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
'''
print(df1.add(df2,fill_value=0))
'''
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 11.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0
'''
from pandas import Series,DataFrame
import numpy as np
# DataFrame和Series之间的运算
arr = np.arange(12.).reshape((3,4))
print(arr)
'''
[[ 0. 1. 2. 3.]
[ 4. 5. 6. 7.]
[ 8. 9. 10. 11.]]
'''
print(arr[0])
'''
[0. 1. 2. 3.]
'''
print(arr-arr[0])
'''
[[0. 0. 0. 0.]
[4. 4. 4. 4.]
[8. 8. 8. 8.]]
'''
frame = DataFrame(np.arange(12.).reshape((4,3)),
columns=list('bde'),
index=['one','three','four','five'])
print(frame)
'''
b d e
one 0.0 1.0 2.0
three 3.0 4.0 5.0
four 6.0 7.0 8.0
five 9.0 10.0 11.0
'''
series = frame.ix[0]
print(series)
'''
b 0.0
d 1.0
e 2.0
Name: one, dtype: float64
'''
# 默认情况下,DataFrame和Series之间的算术运算会将Series的索引匹配到DataFrame的列,然后沿着行乡下广播
print(frame-series)
'''
b d e
one 0.0 0.0 0.0
three 3.0 3.0 3.0
four 6.0 6.0 6.0
five 9.0 9.0 9.0
'''
# 如果某个索引在DataFrame或Series的索引中找不到,则参与运算的两个对象会被重新索引以形成并集
series2 = Series(range(3),index=['b','e','f'])
print(series2)
'''
b 0
e 1
f 2
dtype: int64
'''
print(frame+series2)
'''
b d e f
one 0.0 NaN 3.0 NaN
three 3.0 NaN 6.0 NaN
four 6.0 NaN 9.0 NaN
five 9.0 NaN 12.0 NaN
'''
# 如果希望匹配行切在列上广播,需使用蒜素运算方法,如
series3 = frame['d']
print(series3)
'''
one 1.0
three 4.0
four 7.0
five 10.0
Name: d, dtype: float64
'''
print(frame.sub(series3,axis=0))
# 传入的轴号是希望匹配的轴
'''
b d e
one -1.0 0.0 1.0
three -1.0 0.0 1.0
four -1.0 0.0 1.0
five -1.0 0.0 1.0
'''
函数应用和映射
from pandas import Series,DataFrame
import numpy as np
#Numpy的ufuncs也可用于操作pandas对象
frame = DataFrame(np.random.randn(4,3),
columns=list('bde'),
index=['one','two','three','four'])
print(frame)
'''
b d e
one -1.415255 -1.084419 0.724132
two -0.468757 0.493345 0.318408
three 0.913162 -0.513506 0.149354
four -2.219956 1.166779 -0.359199
'''
print(np.abs(frame))
'''
b d e
one 1.415255 1.084419 0.724132
two 0.468757 0.493345 0.318408
three 0.913162 0.513506 0.149354
four 2.219956 1.166779 0.359199
'''
# 将函数应用到由各列或行形成的一维数组上,使用apply方法
data = [[1,2,3],
[5,2,3],
[6,6,6],
[9,7,1]]
frame2 = DataFrame(data,
columns=list('bde'),
index=['one','two','three','four'])
print(frame2)
'''
b d e
one 1 2 3
two 5 2 3
three 6 6 6
four 9 7 1
'''
f = lambda x:x.max()-x.min()
print(frame2.apply(f))
'''
b 8
d 5
e 5
dtype: int64
'''
# axis=1 横向计算,axis=0 默认纵向计算
print(frame2.apply(f,axis=1))
'''
one 2
two 3
three 0
four 8
dtype: int64
'''
# 元素级函数使用applymap
f = lambda x:x+1
print(frame2.applymap(f))
'''
b d e
one 2 3 4
two 6 3 4
three 7 7 7
four 10 8 2
'''
print(frame2['e'].map(f))
'''
one 4
two 4
three 7
four 2
Name: e, dtype: int64
'''
排序和排名
from pandas import Series,DataFrame
import numpy as np
obj = Series([1,4,2,3],index=['d','a','c','b'])
print(obj)
'''
d 1
a 4
c 2
b 3
dtype: int64
'''
print(obj.sort_index())
'''
a 4
b 3
c 2
d 1
dtype: int64
'''
print(obj.sort_values())
'''
d 1
c 2
b 3
a 4
dtype: int64
'''
frame = DataFrame(np.arange(8).reshape((2,4)),
index=['n','c'],
columns=[0,4,1,6])
print(frame)
'''
0 4 1 6
n 0 1 2 3
c 4 5 6 7
'''
print(frame.sort_index())
'''
0 4 1 6
c 4 5 6 7
n 0 1 2 3
'''
# axis=1 横向,axis=0 默认纵向
print(frame.sort_index(axis=1))
'''
0 1 4 6
n 0 2 1 3
c 4 6 5 7
'''
# ascending默认升序,可设置降序
print(frame.sort_index(axis=1,ascending=False))
'''
6 4 1 0
n 3 1 2 0
c 7 5 6 4
'''
# 对Series排序,python3.6版本之后没有order了,可使用sort_values
obj = Series([4,7,-3,2])
print(obj.sort_values())
'''
2 -3
3 2
0 4
1 7
dtype: int64
'''
# 排序时,缺失值会放到Series的末尾
obj = Series([4,np.nan,7,np.nan,-3,2])
print(obj.sort_values())
'''
4 -3.0
5 2.0
0 4.0
2 7.0
1 NaN
3 NaN
dtype: float64
'''
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
print(frame)
'''
a b
0 0 4
1 1 7
2 0 -3
3 1 2
'''
# 根据一个或多个李忠的值进行排序。将一个或多个列的名字传递给by选项即可
print(frame.sort_values(by='b'))
'''
a b
2 0 -3
3 1 2
0 0 4
1 1 7
'''
print(frame.sort_values(by=['a','b']))
'''
a b
2 0 -3
0 0 4
3 1 2
1 1 7
'''
# 排名(ranking) 为各组分配一个平均排名,即排序之后给一个编号
obj = Series([7,-5,7,4,2,0,4])
print(obj.rank())
'''
0 6.5
1 1.0
2 6.5
3 4.5
4 3.0
5 2.0
6 4.5
dtype: float64
'''
print(obj.rank(method='first'))
'''
0 6.0
1 1.0
2 7.0
3 4.0
4 3.0
5 2.0
6 5.0
dtype: float64
'''
print(obj.rank(ascending=False,method='max'))
'''
0 2.0
1 7.0
2 2.0
3 4.0
4 5.0
5 6.0
6 4.0
dtype: float64
'''
frame = DataFrame({'b':[4.3,7,-3,2],
'a':[0,1,0,1],
'c':[-2,5,8,-2.5]})
print(frame)
'''
a b c
0 0 4.3 -2.0
1 1 7.0 5.0
2 0 -3.0 8.0
3 1 2.0 -2.5
'''
print(frame.rank(axis=1))
'''
a b c
0 2.0 3.0 1.0
1 1.0 3.0 2.0
2 2.0 1.0 3.0
3 2.0 3.0 1.0
'''
method
- average:默认,在相等的分组中,为各个值分配平均排名
- min:使用整个组的最小排名
- max:使用整个组的最大排名
- first:按值在原始数据中出现的顺序排名
带有重复值的轴索引
from pandas import Series
obj= Series(range(5),index=['a','a','b','b','c'])
print(obj)
'''
a 0
a 1
b 2
b 3
c 4
dtype: int64
'''
# 索引的is_unique是否唯一
print(obj.index.is_unique) # False
print(obj['a'])
'''
a 0
a 1
dtype: int64
'''
print(obj['c']) # 4
本文来自博客园,作者:OTAKU_nicole,转载请注明原文链接:https://www.cnblogs.com/nicole-zhang/p/12955099.html