# -*- coding: utf-8 -*-"""Created on Tue Feb 21 22:07:13 2017@author: zzpp220"""from pandas import DataFrame,Seriesfrom numpy import nan as NaNimport pandas as pdimport numpy as npdf=DataFrame({'k1':list('aabba'),'k2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})#分组键为Series对象grouped=df.groupby('k1')desc_group=grouped.describe()grouped_1=df['data1'].groupby([df['k1'],df['k2']]).sum()##group 是一个series对象## 有多重索引的Series和DataFrame中的相互转变##分组键为groupby前面对象中的列名df.groupby(['k1','k2']).sum().unstack()df.groupby(['k1','k2']).sum().unstack().stack([0,1])##粉组件是任何长度适当的数组states=np.array(['sd','gd','sd','fj','fj'])year=np.array([2005,2006,2005,2005,2006])df.groupby([states,year]).mean()##GroupBy 对象的数据聚合 按照不同的的聚合函数聚合tips=pd.read_csv('/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch08/tips.csv')tips['tip_pct']=tips['tip']/tips['total_bill']gro_tips=tips.groupby(['sex','smoker'])#对聚合后的1列应用1个聚合函数:gro_tips['tip_pct'].mean()gro_tips['tip_pct'].agg('mean')##同上,只不过是把函数名用字符串的形似传入#对聚合后的1列应用多个聚合函数:gro_tips['tip_pct'].agg(['mean','std','var'])##传入一组函数名,得到DataFrame的列就会以相应的函数命名gro_tips['tip_pct'].agg([('avg','mean'),('vag','std'),('gav','var')])#按照(name.funciton)传入二元元祖列表,返回的表的各列名是元祖第一个元素'''gro_tips['tip_pct'].mean()Out[94]: sex smokerFemale No 0.156921 Yes 0.182150Male No 0.160669 Yes 0.152771Name: tip_pct, dtype: float64gro_tips['tip_pct'].agg(['mean','std','var'])Out[98]: mean std varsex smoker Female No 0.156921 0.036421 0.001327 Yes 0.182150 0.071595 0.005126Male No 0.160669 0.041849 0.001751 Yes 0.152771 0.090588 0.008206gro_tips['tip_pct'].agg([('avg','mean'),('vag','std'),('gav','var')])Out[99]: avg vag gavsex smoker Female No 0.156921 0.036421 0.001327 Yes 0.182150 0.071595 0.005126Male No 0.160669 0.041849 0.001751 Yes 0.152771 0.090588 0.008206'''##设计对表中的多个列应用同样的多个函数## 对'tip_pct','total_bill' 分别计算functions中的三个统计信息functions=['count','mean','max']asix=[('name1','count'),('name2','mean'),('name3','max')]result=gro_tips['tip_pct','total_bill'].agg(functions)asix_result=gro_tips['tip_pct','total_bill'].agg(asix)'''resultOut[102]: tip_pct total_bill count mean max count mean maxsex smoker Female No 54 0.156921 0.252672 54 18.105185 35.83 Yes 33 0.182150 0.416667 33 17.977879 44.30Male No 97 0.160669 0.291990 97 19.791237 48.33 Yes 60 0.152771 0.710345 60 22.284500 50.81 asix_resultOut[105]: tip_pct total_bill name1 name2 name3 name1 name2 name3sex smoker Female No 54 0.156921 0.252672 54 18.105185 35.83 Yes 33 0.182150 0.416667 33 17.977879 44.30Male No 97 0.160669 0.291990 97 19.791237 48.33 Yes 60 0.152771 0.710345 60 22.284500 50.81 如上表,result拥有层次化的列,这相当于对各列应用不同的聚合函数后,再分别进行聚合,然后用concat进行聚合。'''##`对多个不同的列应用不同的函数,传入列名和带应用函数名的字典map={'tip_pct':['min','max','mean','std'],'size':'sum',}diff_to_diff=gro_tips.agg(map)''' diff_to_diffOut[109]: tip_pct size min max mean std sumsex smoker Female No 0.056797 0.252672 0.156921 0.036421 140 Yes 0.056433 0.416667 0.182150 0.071595 74Male No 0.071804 0.291990 0.160669 0.041849 263 Yes 0.035638 0.710345 0.152771 0.090588 150'''#-------------apply 函数,返回一个pandas对象或者标量值-----------------------------------'''定义函数得到表df按照列名为column的升序排序后的结果,选出最大5个tip_pct所在的行‘'''def top(df,n=5,column='tip_pct'):#按照 return df.sort_index(by=column)[-n:] top_col=top(tips,n=6)tips.groupby('smoker').apply(top,n=3)##返回一个层次化索引的dataframetips.groupby('smoker')['tip_pct'].describe().unstack(0)test2=DataFrame(np.arange(24).reshape(6,4),index=['row_1', 'row_2', 'row_3', 'row_4', 'row_5', 'row_6'],columns=['col_1','col_2','col_3', 'col_4'])test2['smoker']=Series(['no','yes','no','yes','yes','yes'],index=['row_1', 'row_2', 'row_3', 'row_4', 'row_5', 'row_6'])test2=test2.rename_axis({'col_1':'tip_pct'},axis=1)test2['tip_pct']=[2,5,4,1,2,6]#test2['col_2']=[7,8,5,-1,4,6]test2.groupby('smoker').apply(top,n=3)##在groupby对象上调用apply,返回一个层次化索引的dataframetest2.groupby('smoker').apply(top,n=3).unstack()##如果不想要层次化的索引,可以把分组键禁止掉test2.groupby('smoker',group_keys=False).apply(top,n=3)''' 将tips按找smoker分组,对于每一个组内都应用top 函数,应用完成后,将各个组的应用结果concat起来返回的结果就有了一个层次化的索引,内层的索引就是tip中原来的 索引值total_bill tip sex smoker day time size tip_pctsmoker No 51 10.29 2.60 Female No Sun Dinner 2 0.252672 149 7.51 2.00 Male No Thur Lunch 2 0.266312 232 11.61 3.39 Male No Sat Dinner 2 0.291990-------------------------------------------------------------------------------concat---------------Yes 67 3.07 1.00 Female Yes Sat Dinner 1 0.325733 178 9.60 4.00 Female Yes Sun Dinner 2 0.416667 172 7.25 5.15 Male Yes Sun Dinner 2 0.710345 test2.groupby('smoker').apply(top,n=3):tip_pct col_2 col_3 col_4 smokersmoker no row_1 2 7 2 3 no row_3 4 5 10 11 no-----------------------------------------------------------concat-------------------------yes row_5 2 4 18 19 yes row_2 5 8 6 7 yes row_6 6 6 22 23 yes test2.groupby('smoker',group_keys=False).apply(top,n=3): tip_pct col_2 col_3 col_4 smokerrow_1 2 7 2 3 norow_3 4 5 10 11 norow_5 2 4 18 19 yesrow_2 5 8 6 7 yesrow_6 6 6 22 23 yes 用特定于分组的值填充缺失值: '''s=Series(np.random.rand(6))s[::2]=np.nan#步长为2'''s:0 NaN1 0.6549052 NaN3 0.4755284 NaN5 0.269656dtype: float64s.fillna(s.mean())Out[208]: 0 0.4666961 0.6549052 0.4666963 0.4755284 0.4666965 0.269656dtype: float64'''s.fillna(s.mean())