Pandas入门基本知识点
1.pandas基础 Series和DataFrame
>>> import numpy as np
>>> import pandas as pd
>>> s1=pd.Series([4,7,-5,3]) #创建一个series 索引为默认值
>>> print(s1)
0 4
1 7
2 -5
3 3
dtype: int64
>>> s2=pd.Series([[1,2,3,4],[5,6,7,8]]) #创建元素为列表的Series(Series本身始终是一维的,每个元素是一个列表,dtype为object)
>>> print(s2)
0 [1, 2, 3, 4]
1 [5, 6, 7, 8]
dtype: object
>>> s2=pd.Series([[1,2,3,4],[5,6,7,8]],index=['a','b']) #为元素是列表的Series指定索引
>>> print(s2)
a [1, 2, 3, 4]
b [5, 6, 7, 8]
dtype: object
>>> s1=pd.Series((4,7,-5,3)) #这种写法与上述创建一维的写法相同
>>> print(s1)
0 4
1 7
2 -5
3 3
dtype: int64
>>> s1.values
array([ 4, 7, -5, 3], dtype=int64) #查看s1的值
>>> s1.index
RangeIndex(start=0, stop=4, step=1) #查看s1的索引,从0开始,4结束,不包括4
>>> s2=pd.Series([4.0,6.5,-0.5,4.2],index=['d','b','a','c'])
>>> print(s2)
d 4.0
b 6.5
a -0.5
c 4.2
dtype: float64
>>> s2['a'] #根据索引提取一个值
-0.5
>>> s2['a']=10
>>> print(s2)
d 4.0
b 6.5
a 10.0
c 4.2
dtype: float64
>>> s2[['a','b','c']] #根据索引提取多个值
a 10.0
b 6.5
c 4.2
dtype: float64
>>> 'b' in s2
True # 'b'在S2里面
>>> 'e' in s2
False # 'e' 不在S2里面
#DataFrame
>>> data={'year':[2014,2015,2016,2017],'income':[10000,30000,50000,80000],'pay':[5000,20000,30000,30000]}
>>> df1=pd.DataFrame(data)
>>> df1
year income pay
0 2014 10000 5000
1 2015 30000 20000
2 2016 50000 30000
3 2017 80000 30000
>>> df2=pd.DataFrame(np.arange(12).reshape((3,4)))
>>> df2
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
>>> df3=pd.DataFrame(np.arange(12).reshape(3,4),index=['a','c','d'],columns=[2,33,44,5]) #指定索引名称
>>> df3
2 33 44 5
a 0 1 2 3
c 4 5 6 7
d 8 9 10 11
>>> df3.columns
Int64Index([2, 33, 44, 5], dtype='int64')
>>> df3.index
Index(['a', 'c', 'd'], dtype='object')
>>> df3.values
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
>>>
>>> df3.describe() #直接统计出最大值,最小值,平均值等
2 33 44 5
count 3.0 3.0 3.0 3.0
mean 4.0 5.0 6.0 7.0
std 4.0 4.0 4.0 4.0
min 0.0 1.0 2.0 3.0
25% 2.0 3.0 4.0 5.0
50% 4.0 5.0 6.0 7.0
75% 6.0 7.0 8.0 9.0
max 8.0 9.0 10.0 11.0
>>> df3.T
a c d
2 0 4 8
33 1 5 9
44 2 6 10
5 3 7 11
>>> df3.sort_index(axis=1) #列排序
2 5 33 44
a 0 3 1 2
c 4 7 5 6
d 8 11 9 10
>>> df3.sort_index(axis=0) #行排序
2 33 44 5
a 0 1 2 3
c 4 5 6 7
d 8 9 10 11
>>> df3.sort_values(by=44) #单独对某一列进行排序
2 33 44 5
a 0 1 2 3
c 4 5 6 7
d 8 9 10 11
2.pandas的数据选择
>>> import numpy as np
>>> import pandas as pd
>>> dates=pd.date_range('20170101',periods=6)
>>> df1=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
>>> df1
A B C D
2017-01-01 0 1 2 3
2017-01-02 4 5 6 7
2017-01-03 8 9 10 11
2017-01-04 12 13 14 15
2017-01-05 16 17 18 19
2017-01-06 20 21 22 23
>>> df1['A'] #将DataFrame的列获取为一个Series
2017-01-01 0
2017-01-02 4
2017-01-03 8
2017-01-04 12
2017-01-05 16
2017-01-06 20
Freq: D, Name: A, dtype: int32
>>> df1.A
2017-01-01 0
2017-01-02 4
2017-01-03 8
2017-01-04 12
2017-01-05 16
2017-01-06 20
Freq: D, Name: A, dtype: int32
>>> df1[0:2] #取第0、1两行(切片不包含结束位置2)
A B C D
2017-01-01 0 1 2 3
2017-01-02 4 5 6 7
>>> df1['20170102':'20170104']
A B C D
2017-01-02 4 5 6 7
2017-01-03 8 9 10 11
2017-01-04 12 13 14 15
>>> df1.loc['20170102'] #通过标签选择数据
A 4
B 5
C 6
D 7
Name: 2017-01-02 00:00:00, dtype: int32
>>> df1.loc['20170101',['A','C']]
A 0
C 2
Name: 2017-01-01 00:00:00, dtype: int32
>>> df1.loc[:,['A','B']]
A B
2017-01-01 0 1
2017-01-02 4 5
2017-01-03 8 9
2017-01-04 12 13
2017-01-05 16 17
2017-01-06 20 21
>>> df1.iloc[2] #通过位置来选择数据 选择位置为2的行(位置从0开始计数,即第3行)
A 8
B 9
C 10
D 11
Name: 2017-01-03 00:00:00, dtype: int32
>>> df1.iloc[1:3,2:4] #按位置提取第1、2行和第2、3列(位置从0开始,切片不包含结束位置)
C D
2017-01-02 6 7
2017-01-03 10 11
>>> df1.iloc[[1,2,4],[1,3]] #提取第1,2,4行 第1,3列
B D
2017-01-02 5 7
2017-01-03 9 11
2017-01-05 17 19
>>> df1.ix[2:4,['A','C']] #混合标签位置选择(注意:.ix 自 pandas 0.20 起弃用,1.0 起已移除,新版本请改用 .loc 或 .iloc)
A C
2017-01-03 8 10
2017-01-04 12 14
>>> df1.ix['20170102':'20170104',2:4]
C D
2017-01-02 6 7
2017-01-03 10 11
2017-01-04 14 15
>>> df1.A
2017-01-01 0
2017-01-02 4
2017-01-03 8
2017-01-04 12
2017-01-05 16
2017-01-06 20
Freq: D, Name: A, dtype: int32
>>> df1.A>6
2017-01-01 False
2017-01-02 False
2017-01-03 True
2017-01-04 True
2017-01-05 True
2017-01-06 True
Freq: D, Name: A, dtype: bool
>>> df1[df1>6]
A B C D
2017-01-01 NaN NaN NaN NaN
2017-01-02 NaN NaN NaN 7.0
2017-01-03 8.0 9.0 10.0 11.0
2017-01-04 12.0 13.0 14.0 15.0
2017-01-05 16.0 17.0 18.0 19.0
2017-01-06 20.0 21.0 22.0 23.0
3.pandas的赋值以及操作
>>> dates=np.arange(20170101,20170107)
>>> df1=pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
>>> df1
A B C D
20170101 0 1 2 3
20170102 4 5 6 7
20170103 8 9 10 11
20170104 12 13 14 15
20170105 16 17 18 19
20170106 20 21 22 23
>>> df1.iloc[2,2]
10
>>> df1.iloc[2,2]=100
>>> df1
A B C D
20170101 0 1 2 3
20170102 4 5 6 7
20170103 8 9 100 11
20170104 12 13 14 15
20170105 16 17 18 19
20170106 20 21 22 23
>>> df1.loc[20170102,'B']=200
>>> df1
A B C D
20170101 0 1 2 3
20170102 4 200 6 7
20170103 8 9 100 11
20170104 12 13 14 15
20170105 16 17 18 19
20170106 20 21 22 23
>>> df1.A
20170101 0
20170102 4
20170103 8
20170104 12
20170105 16
20170106 20
Name: A, dtype: int32
>>> df1.A>10
20170101 False
20170102 False
20170103 False
20170104 True
20170105 True
20170106 True
Name: A, dtype: bool
>>> df1[df1.A>10]=0
>>> df1
A B C D
20170101 0 1 2 3
20170102 4 200 6 7
20170103 8 9 100 11
20170104 0 0 0 0
20170105 0 0 0 0
20170106 0 0 0 0
>>> df1.A[df1.A==0]=1 #将A列中等于0的值改为1(这是链式赋值,可能触发SettingWithCopyWarning,推荐写成 df1.loc[df1.A==0,'A']=1)
>>> df1
A B C D
20170101 1 1 2 3
20170102 4 200 6 7
20170103 8 9 100 11
20170104 1 0 0 0
20170105 1 0 0 0
20170106 1 0 0 0
>>> df1['E']=10
>>> df1
A B C D E
20170101 1 1 2 3 10
20170102 4 200 6 7 10
20170103 8 9 100 11 10
20170104 1 0 0 0 10
20170105 1 0 0 0 10
20170106 1 0 0 0 10
>>> df1['F']=pd.Series([1,2,3,4,5,6],index=dates)
>>> df1
A B C D E F
20170101 1 1 2 3 10 1
20170102 4 200 6 7 10 2
20170103 8 9 100 11 10 3
20170104 1 0 0 0 10 4
20170105 1 0 0 0 10 5
20170106 1 0 0 0 10 6
>>> df1.loc[20170107,['A','B','C']]=[1,2,3] #通过loc添加新的一行,未赋值的列为NaN,原来的整型列因此变为浮点型
>>> df1
A B C D E F
20170101 1.0 1.0 2.0 3.0 10.0 1.0
20170102 4.0 200.0 6.0 7.0 10.0 2.0
20170103 8.0 9.0 100.0 11.0 10.0 3.0
20170104 1.0 0.0 0.0 0.0 10.0 4.0
20170105 1.0 0.0 0.0 0.0 10.0 5.0
20170106 1.0 0.0 0.0 0.0 10.0 6.0
20170107 1.0 2.0 3.0 NaN NaN NaN
>>> s1=pd.Series([1,2,3,4,5,6],index=['A','B','C','D','E','F'])
>>> s1.name='S1'
>>> df2=df1.append(s1) #将Series作为新的一行追加,行索引为Series的name(注意:DataFrame.append 自 pandas 1.4 起弃用,2.0 起已移除,新版本请改用 pd.concat)
>>> df2
A B C D E F
20170101 1.0 1.0 2.0 3.0 10.0 1.0
20170102 4.0 200.0 6.0 7.0 10.0 2.0
20170103 8.0 9.0 100.0 11.0 10.0 3.0
20170104 1.0 0.0 0.0 0.0 10.0 4.0
20170105 1.0 0.0 0.0 0.0 10.0 5.0
20170106 1.0 0.0 0.0 0.0 10.0 6.0
20170107 1.0 2.0 3.0 NaN NaN NaN
S1 1.0 2.0 3.0 4.0 5.0 6.0
>>> df1.insert(1,'G',df2['E']) #在位置1(从0开始计数,即第二列)插入列名为'G'的列,列中的内容为df2中E列的内容,按行索引对齐。
>>> df1
A G B C D E F
20170101 1.0 10.0 1.0 2.0 3.0 10.0 1.0
20170102 4.0 10.0 200.0 6.0 7.0 10.0 2.0
20170103 8.0 10.0 9.0 100.0 11.0 10.0 3.0
20170104 1.0 10.0 0.0 0.0 0.0 10.0 4.0
20170105 1.0 10.0 0.0 0.0 0.0 10.0 5.0
20170106 1.0 10.0 0.0 0.0 0.0 10.0 6.0
20170107 1.0 NaN 2.0 3.0 NaN NaN NaN
>>> g=df1.pop('G') #弹出'G'列
>>> df1.insert(6,'G',g) #将g列插入到最后
>>> df1
A B C D E F G
20170101 1.0 1.0 2.0 3.0 10.0 1.0 10.0
20170102 4.0 200.0 6.0 7.0 10.0 2.0 10.0
20170103 8.0 9.0 100.0 11.0 10.0 3.0 10.0
20170104 1.0 0.0 0.0 0.0 10.0 4.0 10.0
20170105 1.0 0.0 0.0 0.0 10.0 5.0 10.0
20170106 1.0 0.0 0.0 0.0 10.0 6.0 10.0
20170107 1.0 2.0 3.0 NaN NaN NaN NaN
>>> del df1['G'] #删除'G'列
>>> df1
A B C D E F
20170101 1.0 1.0 2.0 3.0 10.0 1.0
20170102 4.0 200.0 6.0 7.0 10.0 2.0
20170103 8.0 9.0 100.0 11.0 10.0 3.0
20170104 1.0 0.0 0.0 0.0 10.0 4.0
20170105 1.0 0.0 0.0 0.0 10.0 5.0
20170106 1.0 0.0 0.0 0.0 10.0 6.0
20170107 1.0 2.0 3.0 NaN NaN NaN
>>> df2=df1.drop(['A','B'],axis=1) #删除AB列
>>> df1
A B C D E F
20170101 1.0 1.0 2.0 3.0 10.0 1.0
20170102 4.0 200.0 6.0 7.0 10.0 2.0
20170103 8.0 9.0 100.0 11.0 10.0 3.0
20170104 1.0 0.0 0.0 0.0 10.0 4.0
20170105 1.0 0.0 0.0 0.0 10.0 5.0
20170106 1.0 0.0 0.0 0.0 10.0 6.0
20170107 1.0 2.0 3.0 NaN NaN NaN
>>> df2
C D E F
20170101 2.0 3.0 10.0 1.0
20170102 6.0 7.0 10.0 2.0
20170103 100.0 11.0 10.0 3.0
20170104 0.0 0.0 10.0 4.0
20170105 0.0 0.0 10.0 5.0
20170106 0.0 0.0 10.0 6.0
20170107 3.0 NaN NaN NaN
>>> df2=df1.drop([20170101,20170102],axis=0) #删除第一行 和 第二行
>>> df2
A B C D E F
20170103 8.0 9.0 100.0 11.0 10.0 3.0
20170104 1.0 0.0 0.0 0.0 10.0 4.0
20170105 1.0 0.0 0.0 0.0 10.0 5.0
20170106 1.0 0.0 0.0 0.0 10.0 6.0
20170107 1.0 2.0 3.0 NaN NaN NaN
>>> df1
A B C D E F
20170101 1.0 1.0 2.0 3.0 10.0 1.0
20170102 4.0 200.0 6.0 7.0 10.0 2.0
20170103 8.0 9.0 100.0 11.0 10.0 3.0
20170104 1.0 0.0 0.0 0.0 10.0 4.0
20170105 1.0 0.0 0.0 0.0 10.0 5.0
20170106 1.0 0.0 0.0 0.0 10.0 6.0
20170107 1.0 2.0 3.0 NaN NaN NaN
4.pandas如何处理丢失的数据
>>> import numpy as np
>>> import pandas as pd
>>> dates=np.arange(20170101,20170105)
>>> df1=pd.DataFrame(np.arange(12).reshape(4,3),index=dates,colums=['A','B','C']) #错误示例:参数名colums拼写有误(应为columns),此行会抛出TypeError,下一行为更正后的写法
>>> df1=pd.DataFrame(np.arange(12).reshape(4,3),index=dates,columns=['A','B','C'])
>>> df1
A B C
20170101 0 1 2
20170102 3 4 5
20170103 6 7 8
20170104 9 10 11
>>> df2=pd.DataFrame(df1,index=dates,columns=['A','B','C','D','E'])
>>> df2
A B C D E
20170101 0 1 2 NaN NaN
20170102 3 4 5 NaN NaN
20170103 6 7 8 NaN NaN
20170104 9 10 11 NaN NaN
>>> s1=pd.Series([3,4,6],index=dates[:3])
>>> s2=pd.Series([32,5,2],index=dates[1:])
>>> df2['D']=s1
>>> df2['E']=s2
>>> df2
A B C D E
20170101 0 1 2 3.0 NaN
20170102 3 4 5 4.0 32.0
20170103 6 7 8 6.0 5.0
20170104 9 10 11 NaN 2.0
>>> df2.dropna(axis=0,how='any') #axis取0或1:0代表按行删除,1代表按列删除;how取'any'或'all':'any'表示只要含有一个空值就删除,'all'表示全部为空值才删除
A B C D E
20170102 3 4 5 4.0 32.0
20170103 6 7 8 6.0 5.0
>>> df2.dropna(axis=0,how='all')
A B C D E
20170101 0 1 2 3.0 NaN
20170102 3 4 5 4.0 32.0
20170103 6 7 8 6.0 5.0
20170104 9 10 11 NaN 2.0
>>> df2.dropna(axis=1,how='any') #有空值的列给删除
A B C
20170101 0 1 2
20170102 3 4 5
20170103 6 7 8
20170104 9 10 11
>>> df2.dropna(axis=1,how='all') #列全部为空值,则删除
A B C D E
20170101 0 1 2 3.0 NaN
20170102 3 4 5 4.0 32.0
20170103 6 7 8 6.0 5.0
20170104 9 10 11 NaN 2.0
>>> df2.fillna(value=0) #将空值赋值为0
A B C D E
20170101 0 1 2 3.0 0.0
20170102 3 4 5 4.0 32.0
20170103 6 7 8 6.0 5.0
20170104 9 10 11 0.0 2.0
>>> df2.isnull() #查看空值
A B C D E
20170101 False False False False True
20170102 False False False False False
20170103 False False False False False
20170104 False False False True False
>>> np.any(df2.isnull()) #只要有一个或者多个空值就会返回true
True
>>> np.all(df2.isnull()) #所有的值为空值,才会返回true
False
5.pandas读取以及写入文件
6.pandas的数据合并
>>> df1=pd.DataFrame(np.arange(12).reshape(3,4),columns=['a','b','c','d'])
>>> df2=pd.DataFrame(np.arange(12,24).reshape(3,4),columns=['a','b','c','d'])
>>> df3=pd.DataFrame(np.arange(24,36).reshape(3,4),columns=['a','b','c','d'])
>>> print(df1)
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
>>> print(df2)
a b c d
0 12 13 14 15
1 16 17 18 19
2 20 21 22 23
>>> print(df3)
a b c d
0 24 25 26 27
1 28 29 30 31
2 32 33 34 35
>>> df4=pd.concat([df1,df2,df3],axis=0) #纵向合并
>>> df4
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
0 12 13 14 15
1 16 17 18 19
2 20 21 22 23
0 24 25 26 27
1 28 29 30 31
2 32 33 34 35
>>> df4=pd.concat([df1,df2,df3],axis=0,ignore_index=True) #纵向合并,忽略掉原来的索引
>>> df4
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
5 20 21 22 23
6 24 25 26 27
7 28 29 30 31
8 32 33 34 35
>>> df5=pd.concat([df1,df2,df3],axis=1) #横向合并
>>> df5
a b c d a b c d a b c d
0 0 1 2 3 12 13 14 15 24 25 26 27
1 4 5 6 7 16 17 18 19 28 29 30 31
2 8 9 10 11 20 21 22 23 32 33 34 35

浙公网安备 33010602011771号