groupby

In [1]:
import warnings
import math
import pandas as pd
import numpy as np
import matplotlib

warnings.filterwarnings('ignore')
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.set_option('max_colwidth', 500)

get_ipython().magic(u'matplotlib inline')
matplotlib.style.use('ggplot')

from matplotlib import pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

myfont = matplotlib.font_manager.FontProperties(fname=u'simsun.ttc', size=14)
In [15]:
data = pd.DataFrame({
    'age' : np.random.randint(15, 100, 100),
    'height':np.random.randint(140, 180, 100),
    'weight':np.random.randint(40, 80, 100),
    'gender':np.random.randint(0,2, 100),
    'salary':np.random.randint(3000, 30000, 100)
})
data.head()
Out[15]:
 
 ageheightweightgendersalary
0 70 153 76 0 28492
1 52 167 60 0 13457
2 60 152 56 0 19341
3 56 148 46 0 22948
4 30 171 53 1 27829
In [16]:
data.gender = data.gender.map({0:'man', 1:'women'})
data.head()
Out[16]:
 
 ageheightweightgendersalary
0 70 153 76 man 28492
1 52 167 60 man 13457
2 60 152 56 man 19341
3 56 148 46 man 22948
4 30 171 53 women 27829
In [28]:
# group 对象
group = data.groupby('gender', as_index=False)
list(group)[0]
Out[28]:
('man',     age  height  weight gender  salary
 0    70     153      76    man   28492
 1    52     167      60    man   13457
 2    60     152      56    man   19341
 3    56     148      46    man   22948
 5    43     169      78    man   24664
 10   53     155      68    man   18598
 11   78     172      67    man    7968
 12   60     148      42    man    9037
 13   29     164      71    man   18313
 14   46     166      66    man   25126
 17   61     174      51    man    3431
 18   96     159      52    man   10823
 21   96     161      78    man    4995
 26   32     140      41    man    7146
 27   98     168      59    man    5033
 30   67     155      50    man   24194
 35   84     151      78    man   19993
 36   44     148      69    man   18338
 37   79     166      54    man   11029
 39   37     175      52    man    8755
 41   90     175      47    man   15473
 42   23     147      53    man   25314
 43   73     167      73    man   17872
 44   26     168      45    man   27260
 45   50     173      40    man    5016
 46   53     142      78    man   12550
 48   94     174      53    man    7372
 49   65     151      50    man   11583
 53   84     141      79    man   26520
 56   65     147      50    man   21603
 57   94     168      61    man   13765
 58   17     159      60    man    3645
 59   78     140      44    man   19553
 60   42     144      49    man   27545
 61   50     140      59    man   18159
 62   83     179      69    man   11343
 65   47     175      59    man   17985
 66   65     171      65    man   14097
 67   82     154      74    man   15888
 69   58     155      67    man   23449
 70   98     178      40    man   11743
 73   49     165      77    man   15365
 77   74     159      46    man   28667
 79   15     144      55    man   10374
 84   19     142      41    man   21732
 86   16     143      78    man   11782
 87   91     152      57    man    8086
 91   99     147      52    man    5697
 93   29     160      54    man    3031)
In [25]:
# agg
group.agg({'age':'mean','height':'mean'})
Out[25]:
 
 genderageheight
0 man 60.612245 158.183673
1 women 59.490196 159.156863
In [33]:
# transform
data['avg_age'] = group['age'].transform('mean')
data.head()
Out[33]:
 
 ageheightweightgendersalaryavg_age
0 70 153 76 man 28492 60.612245
1 52 167 60 man 13457 60.612245
2 60 152 56 man 19341 60.612245
3 56 148 46 man 22948 60.612245
4 30 171 53 women 27829 59.490196
In [35]:
# apply
def oldest(x):
    df = x.sort_values(by='age', ascending=False)
    return df.iloc[-1,:]
group.apply(oldest)
Out[35]:
 
 ageheightweightgendersalaryavg_age
0 15 144 55 man 10374 60.612245
1 16 168 65 women 14140 59.490196
In [53]:
def age_level(age):
    return 'young' if age < 30 else ('middle' if age < 60 else 'senior')
data['level'] = data.age.map(age_level)
data.head()
Out[53]:
 
 ageheightweightgendersalaryavg_agelevel
0 70 153 76 man 28492 60.612245 senior
1 52 167 60 man 13457 60.612245 middle
2 60 152 56 man 19341 60.612245 senior
3 56 148 46 man 22948 60.612245 middle
4 30 171 53 women 27829 59.490196 middle
In [68]:
# 分组百分比
age_dist = data.groupby(['gender', 'level']).agg({'age':'count'})
age_dist
Out[68]:
 
  age
genderlevel 
manmiddle 15
senior 26
young 8
womenmiddle 20
senior 26
young 5
In [69]:
# gender_pcts 
age_dist.groupby(level = 0).apply(lambda x:x/float(x.sum()))
Out[69]:
 
  age
genderlevel 
manmiddle 0.306122
senior 0.530612
young 0.163265
womenmiddle 0.392157
senior 0.509804
young 0.098039
In [70]:
age_dist.groupby(level = 'level').apply(lambda x:x/float(x.sum()))
Out[70]:
 
  age
genderlevel 
manmiddle 0.428571
senior 0.500000
young 0.615385
womenmiddle 0.571429
senior 0.500000
young 0.384615
In [64]:
age_dist.groupby(level = 1).apply(lambda x:x/float(x.sum()))
Out[64]:
 
  age
genderlevel 
manmiddle 0.428571
senior 0.500000
young 0.615385
womenmiddle 0.571429
senior 0.500000
young 0.384615
posted @ 2020-02-13 15:29  2BiTT  阅读(225)  评论(0编辑  收藏  举报