groupby

 

 

# -*- coding:utf-8 -*-
import calendar
import pandas as pd
from datetime import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sn
import missingno as msno


#pd.options.mode.chained_assignment = None
# Suppress DeprecationWarning messages for the rest of the script.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Set the display option so that head() output is not elided with "...".
pd.set_option('display.width', None)

# Load the training data; the file is read with the GBK codec.
dailyData = pd.read_csv("d:/train.csv", encoding='gbk')

print(dailyData.shape)
print(dailyData.head())
print(dailyData.columns.tolist())

# Example raw row:
# 2011-01-01 00:00:00  1  0  0  1  9.84  14.395  81  0.0  3  13  16
#
# The "datetime" column holds "<date> <time>" strings. Split it into a date
# string and a two-character hour string, then derive the English weekday and
# month names from the date.
dailyData["date"] = dailyData.datetime.apply(lambda x: x.split()[0])
dailyData["hour"] = dailyData.datetime.apply(lambda x: x.split()[1].split(":")[0])
dailyData["weekday"] = dailyData.date.apply(
    lambda dateString: calendar.day_name[datetime.strptime(dateString, "%Y-%m-%d").weekday()]
)
dailyData["month"] = dailyData.date.apply(
    lambda dateString: calendar.month_name[datetime.strptime(dateString, "%Y-%m-%d").month]
)

# Replace the numeric season / weather codes with descriptive labels.
# Codes that are not listed in a mapping become NaN (Series.map behaviour).
dailyData["season"] = dailyData.season.map({1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"})
dailyData["weather"] = dailyData.weather.map({
    1: " Clear + Few clouds + Partly cloudy + Partly cloudy",
    2: " Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist ",
    3: " Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds",
    4: " Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog ",
})

# Treat the discrete columns as pandas categoricals.
categoryVariableList = ["hour", "weekday", "month", "season", "weather", "holiday", "workingday"]
for var in categoryVariableList:
    dailyData[var] = dailyData[var].astype("category")

# The raw timestamp is no longer needed once its parts have been extracted.
dailyData = dailyData.drop(["datetime"], axis=1)
# A bare `dailyData.head()` only renders inside a notebook cell; print it so
# the preview is also visible when this runs as a plain script.
print(dailyData.head())

# Count how many columns share each dtype object. Categorical columns whose
# category sets differ have distinct dtype objects, so several rows of this
# table may all be named "category".
dataTypeDf = (
    pd.DataFrame(dailyData.dtypes.value_counts())
    .reset_index()
    .rename(columns={"index": "variableType", 0: "count"})
)

print(dataTypeDf)
#------------------------------------------------------------------
# Merge the rows that share a dtype *name* into one total per name,
# keeping the order in which each name first appears.
new_dic = {}
for dtype, count in zip(dataTypeDf["variableType"], dataTypeDf["count"]):
    new_dic[dtype.name] = new_dic.get(dtype.name, 0) + count
print(new_dic)

# Turn the mapping into [name, total] rows for the DataFrame constructor.
mylist = [[name, total] for name, total in new_dic.items()]
print(mylist)

# Rebuild the summary table with the same column names as before.
dataTypeDf = pd.DataFrame(mylist, columns=list(dataTypeDf))
#--------------------------------------------------------------------------
print(dataTypeDf)
# Bar chart of the number of columns per dtype name, on a 12 x 5 inch figure.
fig, ax = plt.subplots(figsize=(12, 5))
sn.barplot(x="variableType", y="count", data=dataTypeDf, ax=ax)
ax.set_xlabel('variableType')
ax.set_ylabel('Count')
ax.set_title("Variables DataType Count")
plt.show()


# Missing-value matrix. Without an explicit show() the figure only appeared
# when running inside a Jupyter notebook, so display it explicitly here.
msno.matrix(dailyData, figsize=(12, 5))
plt.show()

 

---------------------------------------------------------------

test.csv

name,age,score
caoming,1,100
muhe,2,99
muhe221,3,9
muhe221,3,1

 

# -*- coding:utf-8 -*-
import pandas as pd

# Set the display option so that head() output is not elided with "...".
pd.set_option('display.width', None)

# Load the sample file and show its shape and first rows.
rawScores = pd.read_csv("d:/test.csv", encoding='gbk')
print(rawScores.shape)
print(rawScores.head())

# Sum the remaining columns for every distinct "name".
totalsByName = rawScores.groupby("name").sum()
print(totalsByName)
print(type(totalsByName))

# Count how many of the summed columns share each dtype.
dtypeCounts = totalsByName.dtypes.value_counts()
print(dtypeCounts)

# reset_index() names the former index column "index" by default.
columnNames = {"index": "variableType", 0: "count"}
dailyData = dtypeCounts.reset_index().rename(columns=columnNames)
print("-------dailyData-----------")
print(dailyData)
print("------------")

# Read a single cell by row label and column name, and show its scalar type.
temp = dailyData.loc[0, "count"]
print(temp)
print(type(temp))

(4, 3)
      name  age  score
0  caoming    1    100
1     muhe    2     99
2  muhe221    3      9
3  muhe221    3      1
         age  score
name
caoming    1    100
muhe       2     99
muhe221    6     10
<class 'pandas.core.frame.DataFrame'>
int64    2
dtype: int64
-------dailyData-----------
  variableType  count
0        int64      2
------------
2
<class 'numpy.int64'>

 -----------------------------------------------------

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# A 3 x 3 frame holding 0..8, with short city codes as row labels.
df1 = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['bj', 'sh', 'gz'],
    columns=['a', 'b', 'c'],
)
print(df1)

# Replace df1's index: the attribute can be read (printed) and also assigned.
print(df1.index)
df1.index = Series(['beijing', 'shanghai', 'guangzhou'])
print(df1)

# rename() with a callable is applied to every label; it returns a new
# DataFrame instead of modifying df1.
df2 = df1.rename(columns=str.upper, index=str.upper)
print(df2)

# rename() with dicts changes only the listed row label and column name.
df3 = df2.rename(
    index={'BEIJING': 'bj'},
    columns={'A': 'aa'},
)
print(df3)

# Drop the row labelled "bj".
df4 = df3.drop(["bj"])
print(df4)
print("-----------------------------")

df = pd.DataFrame(
    {
        'key': ['A', 'B', 'C', 'A', 'B'],
        'data1': [1, 2, 3, 4, 5],
    },
    columns=['key', 'data1'],
)
# aggregate() accepts a string, a function, or a list of functions.
print(df)
print(type(df))

# Sum data1 per key; the result is still a DataFrame, indexed by "key".
df = df.groupby('key').sum()
print(type(df))
print(df)

打印结果

    a  b  c
bj  0  1  2
sh  3  4  5
gz  6  7  8
Index(['bj', 'sh', 'gz'], dtype='object')
           a  b  c
beijing    0  1  2
shanghai   3  4  5
guangzhou  6  7  8
           A  B  C
BEIJING    0  1  2
SHANGHAI   3  4  5
GUANGZHOU  6  7  8
           aa  B  C
bj          0  1  2
SHANGHAI    3  4  5
GUANGZHOU   6  7  8
           aa  B  C
SHANGHAI    3  4  5
GUANGZHOU   6  7  8
-----------------------------
  key  data1
0   A      1
1   B      2
2   C      3
3   A      4
4   B      5
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
     data1
key       
A        5
B        7
C        3

 

posted @ 2019-05-20 11:33  牧 天  阅读(262)  评论(0)    收藏  举报