Data
Stay hungry,Stay foolish!

导航

 

GroupBy技术

分组运算的过程可以用下面的流程图表示出来

gai

import pandas as pd
from pandas import Series
import numpy as np

df = pd.DataFrame({
    'key1':['a','a','b','b','a'],
    'key2':['one','two','one','two','one'],
    'data1':np.random.randn(5),
    'data2':np.random.randn(5)
})
df

   key1	key2	data1	    data2
0	a	one	  -0.687740	   1.536286
1	a	two	   0.177156	  -1.132658
2	b	one	   2.024931	   0.755856
3	b	two	  -0.304281	  -0.629296
4	a	one	   0.202405	   0.911700

# 访问key1调用groupby
grouped = df['data1'].groupby(df['key1'])
# grouped是一个GroupBy对象,它实际上还没有进行任何计算。但是它已经包含了执行分组运算的一切信息
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000020473C51978>

# 例如,算平均值
grouped.mean()

      key1
a   -0.102726
b    0.860325
Name: data1, dtype: float64

means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means

      key1      key2
a     one    -0.242668
      two     0.177156
b     one     2.024931
      two    -0.304281
Name: data1, dtype: float64

# 默认操作的是最内层的
means.unstack()

key2	one	      two
key1		
a	 -0.242668	 0.177156
b	  2.024931	-0.304281

# 自定义分组键
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()

California  2005    0.177156
            2006    2.024931
Ohio        2005   -0.496010
            2006    0.202405
Name: data1, dtype: float64

# 将列名用作分组键

df.groupby('key1').mean()

         data1	   data2
key1		
a	  -0.102726	  0.438443
b	   0.860325	  0.063280

df.groupby(['key1','key2']).mean()

                  data1	          data2
key1	key2		
a	    one	    -0.242668	     1.223993
        two	     0.177156	    -1.132658
b	    one	     2.024931	     0.755856
        two	    -0.304281	    -0.629296

# groupby.size方法,返回一个含有分组大小的Series,统计分组的数量
df.groupby(['key1','key2']).size()

      key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

df.groupby('key1').size()
    key1
a    3
b    2
dtype: int64

对分组进行迭代

for name,group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one -0.687740  1.536286
1    a  two  0.177156 -1.132658
4    a  one  0.202405  0.911700
b
  key1 key2     data1     data2
2    b  one  2.024931  0.755856
3    b  two -0.304281 -0.629296

# 对于多重键的情况
for k1,group in df.groupby(['key1','key2']):
    print(k1)
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.687740  1.536286
4    a  one  0.202405  0.911700
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.177156 -1.132658
('b', 'one')
  key1 key2     data1     data2
2    b  one  2.024931  0.755856
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.304281 -0.629296

pieces = dict(list(df.groupby('key1')))
pieces['a']

   key1	key2	data1	  data2
0	a	one	 -0.687740	 1.536286
1	a	two	  0.177156	-1.132658
4	a	one	  0.202405	 0.911700

# 按数据类型对列尽心分组
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

grouped1 = df.groupby(df.dtypes, axis=1)
dict(list(grouped1))

{dtype('float64'):       data1     data2
 0 -0.687740  1.536286
 1  0.177156 -1.132658
 2  2.024931  0.755856
 3 -0.304281 -0.629296
 4  0.202405  0.911700, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

选取一个或一个组列

# 分组通常和运算是一起的,只分组,只能是出来一个分组对象
df['data1'].groupby(df['key1'])
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000002047407C0F0>

df.groupby(['key1','key2'])[['data2']].mean()

              data2
key1 key2	
a	 one	 1.223993
     two	-1.132658
b	 one	 0.755856
     two	-0.629296

通过字典或Series进行分组

people = pd.DataFrame(np.random.randn(5,5),
                      columns = ['a','b','c','d','e'],
                      index = ['Joe','Steve','Wes','Jim','Travis']
                     )
people

           a	        b	        c	        d	        e
Joe	   -0.721168	-0.332402	-0.512183	 0.765475	-0.119330
Steve	0.166230	 2.095133	-0.298735	-2.167574	-0.273929
Wes	   -1.428962	-2.109915	-0.686155	-1.337891	 0.020153
Jim	    1.041952	 0.130567	 0.776067	 0.508666	 0.100419
Travis -0.215110	 0.611383	-0.011443	-0.552580	-0.274047

people.ix['Steve':'Wes',['b','c']] = np.nan
people

              a	       b	       c	       d	        e
Joe	    -0.721168	-0.332402	-0.512183	0.765475	-0.119330
Steve	0.166230	NaN	          NaN	    -2.167574	-0.273929
Wes	    -1.428962	NaN	          NaN	    -1.337891	 0.020153
Jim	    1.041952	0.130567	 0.776067	 0.508666	 0.100419
Travis	-0.215110	0.611383	-0.011443	-0.552580	-0.274047

# 根据分组计算列的总计
mapping = {'a':'red','b':'red','c':'blue',
          'd':'blue','e':'red','f':'orange'}
by_column = people.groupby(mapping,axis=1).sum()
by_column

         blue	       red
Joe	     0.253291	-1.172899
Steve	-2.167574	-0.107699
Wes	    -1.337891	-1.408809
Jim	     1.284733	 1.272937
Travis	-0.564023	 0.122226

# 检查映射的对应关系
map_series = Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

people.groupby(map_series,axis=1).count()

      blue	red
Joe	    2	3
Steve	1	2
Wes	    1	2
Jim	    2	3
Travis	2	3

通过函数进行分组

# 仅仅传入len,就可以实现按名字长度分组
people.groupby(len).sum()

       a	       b	      c	           d	      e
3	-1.108178	-0.201835	0.263884	-0.063751	0.001242
5	0.166230	0.000000	0.000000	-2.167574	-0.273929
6	-0.215110	0.611383	-0.011443	-0.552580	-0.274047

key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).min()

           a	      b	          c	           d	       e
3	one	-1.428962	-0.332402	-0.512183	-1.337891	-0.119330
    two	 1.041952	0.130567	 0.776067	 0.508666	 0.100419
5	one	 0.166230	NaN	         NaN	    -2.167574	-0.273929
6	two	-0.215110	0.611383	-0.011443	-0.552580	-0.274047

根据索引级别分组

columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])
hier_df = pd.DataFrame(np.random.randn(4,5),columns=columns)
hier_df

cty	     US	                                    JP
tenor	1	        3	         5	         1	        3
0	-0.637036	 0.230604	 0.425606	 0.579673	-0.408535
1	 0.987018	-1.313152	-0.322781	-1.393266	-0.393867
2	-2.343672	 0.385606	 0.992302	-0.791528	-0.774870
3	-0.809241	-0.654219	-0.860250	 0.222547	 0.049766

hier_df.groupby(level='cty',axis=1).count()

cty	JP	US
0	2	3
1	2	3
2	2	3
3	2	3
posted on 2018-12-07 16:33  进击中的青年  阅读(310)  评论(0编辑  收藏  举报