Pandas入门教程(二)

merge合并DataFrame

import pandas as pd

left=pd.DataFrame({
    'key':['k0','k1','k2','k3','k4','k5'],
    'A':['A0','A1','A2','A3','A4','A5'],
    'B':['B0','B1','B2','B3','B4','B5']
})


right=pd.DataFrame({
    'key':['k0','k1','k2','k3','k4','k5'],
    'C':['C0','C1','C2','C3','C4','C5'],
    'D':['D0','D1','D2','D3','D4','D5']
})
print(left)
print('-'*20)
print(right)

key A B
0 k0 A0 B0
1 k1 A1 B1
2 k2 A2 B2
3 k3 A3 B3
4 k4 A4 B4
5 k5 A5 B5
--------------------
key C D
0 k0 C0 D0
1 k1 C1 D1
2 k2 C2 D2
3 k3 C3 D3
4 k4 C4 D4
5 k5 C5 D5

# 合并
res=pd.merge(left,right)
print(res)

print('-'*20)
# 指定合并的key
res=pd.merge(left,right,on='key')
print(res)

key A B C D
0 k0 A0 B0 C0 D0
1 k1 A1 B1 C1 D1
2 k2 A2 B2 C2 D2
3 k3 A3 B3 C3 D3
4 k4 A4 B4 C4 D4
5 k5 A5 B5 C5 D5
--------------------
key A B C D
0 k0 A0 B0 C0 D0
1 k1 A1 B1 C1 D1
2 k2 A2 B2 C2 D2
3 k3 A3 B3 C3 D3
4 k4 A4 B4 C4 D4
5 k5 A5 B5 C5 D5

left = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
                     'key2': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'], 
                    'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
                      'key2': ['K0', 'K1', 'K2', 'K4'],
                    'C': ['C0', 'C1', 'C2', 'C3'], 
                    'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print('-'*20)
print(right)

key1 key2 A B
0 K0 K0 A0 B0
1 K1 K1 A1 B1
2 K2 K2 A2 B2
3 K3 K3 A3 B3
--------------------
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K1 C1 D1
2 K2 K2 C2 D2
3 K3 K4 C3 D3

# 默认取交集 how='inner'
res=pd.merge(left,right,on=['key1','key2'])
print(res)

key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K1 A1 B1 C1 D1
2 K2 K2 A2 B2 C2 D2

# how='outer' 取并集
res=pd.merge(left,right,on=['key1','key2'],how='outer')
print(res)

key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K1 A1 B1 C1 D1
2 K2 K2 A2 B2 C2 D2
3 K3 K3 A3 B3 NaN NaN
4 K3 K4 NaN NaN C3 D3

# 显示合并数据中数据来自哪个表
res=pd.merge(left,right,on=['key1','key2'],how='outer',indicator=True)
print(res)

key1 key2 A B C D _merge
0 K0 K0 A0 B0 C0 D0 both
1 K1 K1 A1 B1 C1 D1 both
2 K2 K2 A2 B2 C2 D2 both
3 K3 K3 A3 B3 NaN NaN left_only
4 K3 K4 NaN NaN C3 D3 right_only

# 左链接
res=pd.merge(left,right,on=['key1','key2'],how='left')
print(res)
print('-'*30)
# 右链接
res=pd.merge(left,right,on=['key1','key2'],how='right')
print(res)

key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K1 A1 B1 C1 D1
2 K2 K2 A2 B2 C2 D2
3 K3 K3 A3 B3 NaN NaN
------------------------------
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K1 A1 B1 C1 D1
2 K2 K2 A2 B2 C2 D2
3 K3 K4 NaN NaN C3 D3

join 拼接列,主要用于索引上的合并

left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3'],
                    'key': ['K0', 'K1', 'K0', 'K1']})

right = pd.DataFrame({'C': ['C0', 'C1'],
                       'D': ['D0', 'D1']},
                       index=['K0', 'K1'])

print(left)
print('-'*15)
print(right)

A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K0
3 A3 B3 K1
---------------
C D
K0 C0 D0
K1 C1 D1

res=left.join(right,on='key')
print(res)

A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K0 C0 D0
3 A3 B3 K1 C1 D1

Pandas数据透视表

df = pd.DataFrame({'Month': ["January", "January", "January", "January", 
                                  "February", "February", "February", "February", 
                                  "March", "March", "March", "March"],
                   'Category': ["Transportation", "Grocery", "Household", "Entertainment",
                                "Transportation", "Grocery", "Household", "Entertainment",
                                "Transportation", "Grocery", "Household", "Entertainment"],
                   'Amount': [74., 235., 175., 100., 115., 240., 225., 125., 90., 260., 200., 120.]})
print(df)

Month Category Amount
0 January Transportation 74.0
1 January Grocery 235.0
2 January Household 175.0
3 January Entertainment 100.0
4 February Transportation 115.0
5 February Grocery 240.0
6 February Household 225.0
7 February Entertainment 125.0
8 March Transportation 90.0
9 March Grocery 260.0
10 March Household 200.0
11 March Entertainment 120.0

# 构造一个索引为Category 列为Month 值为Amount的表
res=df.pivot(index='Category',columns='Month',values='Amount')
print(res)

Month February January March
Category
Entertainment 125.0 100.0 120.0
Grocery 240.0 235.0 260.0
Household 225.0 175.0 200.0
Transportation 115.0 74.0 90.0

# 按列求和
res.sum(axis=0)

Month
February 705.0
January 584.0
March 670.0
dtype: float64

# 按行求和
res.sum(axis=1)

Category
Entertainment 345.0
Grocery 735.0
Household 600.0
Transportation 279.0
dtype: float64

pivot_table

df=pd.read_csv('./pandas/data/titanic.csv')
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')

#默认求平均值
res=df.pivot_table(index='Sex',columns='Pclass',values='Fare')
print(res)

Pclass 1 2 3
Sex
female 106.125798 21.970121 16.118810
male 67.226127 19.741782 12.661633

# 求最大值
res=df.pivot_table(index='Sex',columns='Pclass',values='Fare',aggfunc='max')
print(res)

Pclass 1 2 3
Sex
female 512.3292 65.0 69.55
male 512.3292 73.5 69.55

# 统计个数
res=df.pivot_table(index='Sex',columns='Pclass',values='Fare',aggfunc='count')
print(res)
print('-'*20)
# crosstab 统计个数
res=pd.crosstab(index=df['Sex'],columns=df['Pclass'])
print(res)

Pclass 1 2 3
Sex
female 94 76 144
male 122 108 347
--------------------
Pclass 1 2 3
Sex
female 94 76 144
male 122 108 347

# 求平均值
res=df.pivot_table(index='Sex',columns='Pclass',values='Fare',aggfunc='mean')
print(res)

Pclass 1 2 3
Sex
female 106.125798 21.970121 16.118810
male 67.226127 19.741782 12.661633

# 计算未成年男女存活率
df['minor']=df['Age']<=18
res=df.pivot_table(index='minor',columns='Sex',values='Survived',aggfunc='mean')
print(res)

Sex female male
minor
False 0.760163 0.167984
True 0.676471 0.338028

posted @ 2020-09-26 22:44  入门小站  阅读(91)  评论(0)    收藏  举报