groupby
# -*- coding:utf-8 -*-
# Demo script: load d:/train.csv, derive date/hour/weekday/month columns from
# the "datetime" strings, relabel "season" and "weather", cast the categorical
# columns, then count how many columns fall under each dtype name and plot it.
import calendar
import pandas as pd
from datetime import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sn
import missingno as msno

# pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Lift the display width limit so head() does not abbreviate columns with "...".
pd.set_option('display.width', None)

dailyData = pd.read_csv("d:/train.csv", encoding='gbk')
print(dailyData.shape)
print(dailyData.head())
print(dailyData.columns.tolist())

# Sample row: 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0.0 3 13 16
# "datetime" is split on whitespace: first token is the date, second the time.
dailyData["date"] = dailyData["datetime"].apply(lambda stamp: stamp.split()[0])
dailyData["hour"] = dailyData["datetime"].apply(
    lambda stamp: stamp.split()[1].split(":")[0]
)
dailyData["weekday"] = dailyData["date"].apply(
    lambda dateString: calendar.day_name[
        datetime.strptime(dateString, "%Y-%m-%d").weekday()
    ]
)
dailyData["month"] = dailyData["date"].apply(
    lambda dateString: calendar.month_name[
        datetime.strptime(dateString, "%Y-%m-%d").month
    ]
)

# Replace the numeric season / weather codes with descriptive labels.
seasonLabels = {1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"}
weatherLabels = {
    1: " Clear + Few clouds + Partly cloudy + Partly cloudy",
    2: " Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist ",
    3: " Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds",
    4: " Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog ",
}
dailyData["season"] = dailyData["season"].map(seasonLabels)
dailyData["weather"] = dailyData["weather"].map(weatherLabels)

# Cast every categorical variable to the pandas "category" dtype.
categoryVariableList = ["hour", "weekday", "month", "season", "weather", "holiday", "workingday"]
dailyData = dailyData.astype(
    {columnName: "category" for columnName in categoryVariableList}
)

dailyData = dailyData.drop(columns=["datetime"])
# Bare expression: the result is discarded when run as a script.
dailyData.head()

# One row per distinct dtype object with the number of columns that use it.
dataTypeDf = (
    pd.DataFrame(dailyData.dtypes.value_counts())
    .reset_index()
    .rename(columns={"index": "variableType", 0: "count"})
)
print(dataTypeDf)

# ------------------------------------------------------------------
# Merge rows whose dtype shares the same name into a single total.
# NOTE(review): presumably needed because each category column carries its
# own dtype object and is therefore counted separately above - confirm.
new_dic = {}
for rowLabel in dataTypeDf.index:
    dtypeName = dataTypeDf.loc[rowLabel, "variableType"].name
    rowCount = dataTypeDf.loc[rowLabel, "count"]
    if dtypeName not in new_dic:
        new_dic[dtypeName] = rowCount
    else:
        new_dic[dtypeName] = new_dic[dtypeName] + rowCount
print(new_dic)

mylist = [[dtypeName, total] for dtypeName, total in new_dic.items()]
print(mylist)

# Rebuild the table from the merged totals, reusing the earlier column names.
dataTypeDf = pd.DataFrame(mylist, columns=dataTypeDf.columns.tolist())
# --------------------------------------------------------------------------
print(dataTypeDf)

fig, ax = plt.subplots()
fig.set_size_inches(12, 5)
sn.barplot(data=dataTypeDf, x="variableType", y="count", ax=ax)
ax.set(xlabel='variableType', ylabel='Count', title="Variables DataType Count")
plt.show()

# Author's note: this figure only shows up inside a Jupyter notebook.
msno.matrix(dailyData, figsize=(12, 5))
---------------------------------------------------------------
test.csv
name,age,score
caoming,1,100
muhe,2,99
muhe221,3,9
muhe221,3,1
# -*- coding:utf-8 -*-
# Demo script: load d:/test.csv, sum the numeric columns per "name", then
# tabulate how many of the resulting columns share each dtype.
import pandas as pd

# Lift the display width limit so head() does not abbreviate columns with "...".
pd.set_option('display.width', None)

dailyData = pd.read_csv("d:/test.csv", encoding='gbk')
print(dailyData.shape)
print(dailyData.head())

# Rows with the same "name" collapse into one row holding the column sums.
sumsByName = dailyData.groupby("name").sum()
print(sumsByName)
print(type(sumsByName))

dtypeCounts = sumsByName.dtypes.value_counts()
print(dtypeCounts)

# reset_index() names the former index column "index" by default.
dtypeTable = dtypeCounts.reset_index().rename(
    columns={"index": "variableType", 0: "count"}
)
print("-------dailyData-----------")
print(dtypeTable)
print("------------")

# Pull a single cell out of the table and show its scalar type.
temp = dtypeTable.loc[0, "count"]
print(temp)
print(type(temp))
(4, 3)
      name  age  score
0  caoming    1    100
1     muhe    2     99
2  muhe221    3      9
3  muhe221    3      1
         age  score
name               
caoming    1    100
muhe       2     99
muhe221    6     10
<class 'pandas.core.frame.DataFrame'>
int64 2
dtype: int64
-------dailyData-----------
variableType count
0 int64 2
------------
2
<class 'numpy.int64'>
-----------------------------------------------------
# Demo script: relabel a DataFrame's index/columns with rename(), remove a row
# with drop(), then aggregate a second frame with groupby().sum().
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

cityCodes = ['bj', 'sh', 'gz']
df1 = DataFrame(
    np.arange(9).reshape(3, 3),
    index=cityCodes,
    columns=['a', 'b', 'c'],
)
print(df1)

# Change df1's index: the attribute can be read and also assigned to.
print(df1.index)
df1.index = Series(['beijing', 'shanghai', 'guangzhou'])
print(df1)

# rename() with a callable maps every label; it returns a new DataFrame.
df2 = df1.rename(index=str.upper, columns=str.upper)
print(df2)

# rename() with dicts changes only the listed index label and column name.
df3 = df2.rename(index={'BEIJING': 'bj'}, columns={'A': 'aa'})
print(df3)

# Drop the row labelled "bj".
df4 = df3.drop(index=["bj"])
print(df4)

print("-----------------------------")

df = pd.DataFrame(
    {
        'key': ['A', 'B', 'C', 'A', 'B'],
        'data1': [1, 2, 3, 4, 5],
    },
    columns=['key', 'data1'],
)
# Author's note: aggregate() accepts a string, a function, or a list of functions.
print(df)
print(type(df))

# Sum "data1" within each distinct "key"; the keys become the index.
df = df.groupby('key').sum()
print(type(df))
print(df)
打印结果
    a  b  c
bj  0  1  2
sh  3  4  5
gz  6  7  8
Index(['bj', 'sh', 'gz'], dtype='object')
           a  b  c
beijing    0  1  2
shanghai   3  4  5
guangzhou  6  7  8
           A  B  C
BEIJING    0  1  2
SHANGHAI   3  4  5
GUANGZHOU  6  7  8
           aa  B  C
bj          0  1  2
SHANGHAI    3  4  5
GUANGZHOU   6  7  8
           aa  B  C
SHANGHAI    3  4  5
GUANGZHOU   6  7  8
-----------------------------
  key  data1
0   A      1
1   B      2
2   C      3
3   A      4
4   B      5
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
     data1
key       
A        5
B        7
C        3

浙公网安备 33010602011771号