import numpy as np
import pandas as pd
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
'key2': ['one', 'two', 'one', 'two', 'one'],
'data1': np.random.randn(5),
'data2': np.random.randn(5)})
print(df)
# data1列数据,按照key1来进行分组
g1 = df['data1'].groupby(by=df['key1'])
print(g1)
print(g1.mean())
# 一次传入多个数组,按照key1,key2分组计算
g2 = df['data1'].groupby(by=[df['key1'], df['key2']])
print(g2.sum())
# 可以将列名(可以是字符串、数字或者其他python对象)用作分组
g3 = df.groupby(by='key1')
print(g3.mean())
g4 = df.groupby(by=['key1', 'key2'])
print(g4.mean())
输出结果:
key1 key2 data1 data2
0 a one 0.374982 1.064534
1 a two -0.485860 0.071513
2 b one -0.741052 1.868335
3 b two -0.757161 -0.536510
4 a one -0.092603 -1.647309
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001720F810BA8>
key1
a -0.067827
b -0.749106
Name: data1, dtype: float64
key1 key2
a one 0.282379
two -0.485860
b one -0.741052
two -0.757161
Name: data1, dtype: float64
data1 data2
key1
a -0.067827 -0.170421
b -0.749106 0.665912
data1 data2
key1 key2
a one 0.141189 -0.291387
two -0.485860 0.071513
b one -0.741052 1.868335
two -0.757161 -0.536510
import numpy as np
import pandas as pd
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
'key2': ['one', 'two', 'one', 'two', 'one'],
'data1': np.random.randn(5),
'data2': np.random.randn(5)})
print(df)
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
# 按照states, years传入数组的方式对data1列数据分组后进行平均值计算
g1 = df['data1'].groupby(by=[states, years])
print(g1.mean())
people = pd.DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'],
index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
print(people)
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red'}
g2 = people.groupby(mapping, axis=1).sum()
print(g2)
输出结果:
key1 key2 data1 data2
0 a one -0.388889 0.083999
1 a two 0.109513 2.016369
2 b one -0.423841 0.800895
3 b two -0.899408 0.039836
4 a one -0.137741 -1.382602
California 2005 0.109513
2006 -0.423841
Ohio 2005 -0.644148
2006 -0.137741
Name: data1, dtype: float64
a b c d e
Joe 1.179284 0.289269 0.303566 1.493424 -1.432677
Steve -0.386514 -0.154331 -0.771970 -0.597251 0.004637
Wes -0.933249 0.300309 -1.659948 0.404844 -0.375680
Jim -2.427930 0.574752 0.504091 -0.996975 -1.547615
Travis -1.555477 -1.065392 -1.149300 1.326839 -1.530914
blue red
Joe 1.796990 0.035876
Steve -1.369220 -0.536208
Wes -1.255104 -1.008621
Jim -0.492884 -3.400793
Travis 0.177539 -4.151784
import numpy as np
import pandas as pd
df = pd.DataFrame({
'A': [1, 1, 2, 2],
'B': [1, 2, 3, 4],
'C': np.random.randn(4)})
print(df)
# 按照A列分组后聚合求最小值
df1 = df.groupby('A').agg('min')
print(df1)
# 按照A列分组后聚合求最小值和最大值
df2 = df.groupby('A').agg(['min', 'max'])
print(df2)
# 按照A列分组后聚合后对B列求最小值和最大值,同时对C列求和
df3 = df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
print(df3)
输出结果:
A B C
0 1 1 1.630997
1 1 2 -0.238686
2 2 3 0.841689
3 2 4 -0.532344
B C
A
1 1 -0.238686
2 3 -0.532344
B C
min max min max
A
1 1 2 -0.238686 1.630997
2 3 4 -0.532344 0.841689
B C
min max sum
A
1 1 2 1.392311
2 3 4 0.309345
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randn(4, 5), columns=list('ABCDE'))
print(df)
# 计算各列数据总和并作为新列添加到末尾
df['col_sum'] = df.apply(lambda x: x.sum(), axis=1)
print(df)
# 计算各行数据总和并作为新行添加到末尾
df.loc['row_sum'] = df.apply(lambda x: x.sum(), axis=0)
print(df)
输出结果:
A B C D E
0 -0.978040 -0.820454 0.673568 1.613058 -0.203536
1 0.879338 0.561962 0.278855 -0.481307 1.097682
2 -1.244722 1.304386 0.206882 -0.715907 -1.246526
3 1.419993 0.453622 0.237463 1.292601 -0.184714
A B C D E col_sum
0 -0.978040 -0.820454 0.673568 1.613058 -0.203536 0.284595
1 0.879338 0.561962 0.278855 -0.481307 1.097682 2.336530
2 -1.244722 1.304386 0.206882 -0.715907 -1.246526 -1.695887
3 1.419993 0.453622 0.237463 1.292601 -0.184714 3.218965
A B C D E col_sum
0 -0.978040 -0.820454 0.673568 1.613058 -0.203536 0.284595
1 0.879338 0.561962 0.278855 -0.481307 1.097682 2.336530
2 -1.244722 1.304386 0.206882 -0.715907 -1.246526 -1.695887
3 1.419993 0.453622 0.237463 1.292601 -0.184714 3.218965
row_sum 0.076569 1.499516 1.396768 1.708445 -0.537094 4.144204