#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# @Time : 2022/2/8 0008 12:25
# @Author : Tzy0425
# @File : Pandas入门.py
import pandas as pd
import numpy as np
# -----------------------------------------Series-----------------------------------------
s = pd.Series([np.nan,'Tzy',520,'Lyy',1314,np.nan])
# print(s)
# The index defaults to 0,1,2,...; to use your own labels, pass the index parameter, e.g. s = pd.Series([np.nan,'Tzy',520,'Lyy',1314,np.nan],index=[3,4,3,7,8,9])
"""
0 NaN
1 Tzy
2 520
3 Lyy
4 1314
5 NaN
dtype: object
"""
# -----------------------------------------DataFrame-----------------------------------------
# A DataFrame has both a row index and a column index; it can be thought of as a big dictionary of Series
# Generate a date range to use as the row index
dates = pd.date_range('2022-02-01',periods=6)
"""
numpy.random.randn(d0, d1, ..., dn) draws one or more samples from the standard normal distribution.
numpy.random.rand(d0, d1, ..., dn) draws uniform random samples from [0, 1).
"""
# columns = ['a','b','c','d']
# df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=columns)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df)
"""
a b c d
2022-02-01 0.146238 -1.017105 1.331782 0.486643
2022-02-02 -1.358772 -0.295397 0.181340 0.397580
2022-02-03 1.538161 1.425895 0.007177 1.705866
2022-02-04 -0.217335 0.012565 0.377702 1.270212
2022-02-05 0.409731 -2.098344 0.974976 0.986170
2022-02-06 0.249307 0.263997 -1.780088 0.422658
"""
# -----------------------------------------Data selection-----------------------------------------
# Filter rows by index label (label slicing includes both endpoints)
print(df['2022-02-02':'2022-02-06'])
# Select a single row by label
print(df.loc['2022-02-06'])
# Select columns: all rows for columns a through c
print(df.loc[:,'a':'c'])
# Select rows and columns at the same time
print(df.loc['2022-02-06',['a','b']])
# iloc selects by integer position: the value at row 4, column 2 (counting from 0)
print(df.iloc[3,1])
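# iloc also accepts position slices, e.g. rows 3-4 and columns 0-1
# (a small illustrative addition):
print(df.iloc[3:5, 0:2])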
# Boolean filtering
print(df[df.a>0])
print(df.loc[df.a>0])
# Print column b
print(df['b'])
df2 = pd.DataFrame({
'A': [1,2,3,4],
'B': pd.Timestamp('20180819'),
'C': pd.Series([1,6,9,10],dtype='float32'),
'D': np.array([3] * 4,dtype='int32'),
'E': pd.Categorical(['test','train','test','train']),
'F': 'foo'
})
print(df2)
"""
A B C D E F
0 1 2018-08-19 1.0 3 test foo
1 2 2018-08-19 6.0 3 train foo
2 3 2018-08-19 9.0 3 test foo
3 4 2018-08-19 10.0 3 train foo
"""
print(df2.index)
"""
RangeIndex(start=0, stop=4, step=1)
"""
# Selecting multiple rows or columns
# Select the first 3 rows
print(df[0:3])
# Summary statistics
print(df2.describe())
"""
A C D
count 4.000000 4.000000 4.0
mean 2.500000 6.500000 3.0 (mean)
std 1.290994 4.041452 0.0 (standard deviation)
min 1.000000 1.000000 3.0
25% 1.750000 4.750000 3.0
50% 2.500000 7.500000 3.0
75% 3.250000 9.250000 3.0
max 4.000000 10.000000 3.0
"""
# Transpose the data
print(df2.T)
# print(np.transpose(df2)) is equivalent to the line above
'''
For sort_index, axis=0 sorts by the row index labels and axis=1 sorts by the column labels.
ascending defaults to True (ascending order); ascending=False sorts in descending order.
The line below sorts the columns into ascending (alphabetical) order; a row-wise
descending example follows it.
'''
print(df2.sort_index(axis=1,ascending=True))
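# For comparison (illustrative addition): axis=0 sorts by the row index,
# here in descending order, so the rows appear from index 3 down to 0.
print(df2.sort_index(axis=0, ascending=False))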
# Sort by the values of a particular column
# Sort column C in descending order
print(df2.sort_values(by='C',ascending=False))
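# sort_values can also take several columns (illustrative addition):
# sort by E first, then break ties by C in descending order.
print(df2.sort_values(by=['E', 'C'], ascending=[True, False]))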
# -----------------------------------------Handling NaN-----------------------------------------
# Drop rows that contain NaN (dropping rows is the default behaviour)
print(df.dropna())
# Drop columns that contain NaN
print(df.dropna(
    axis=1,    # 0: operate on rows; 1: operate on columns
    how='any'  # 'any': drop if any value is NaN; 'all': drop only if all values are NaN
))
# Replace NaN values with 0 (or any other value)
print(df.fillna(value=0))
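# fillna also accepts a per-column mapping (illustrative addition; the fill
# values are arbitrary): fill column 'a' with 0 and column 'b' with its mean.
print(df.fillna(value={'a': 0, 'b': df['b'].mean()}))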
# Checking for missing data (NaN)
# Element-wise check for missing values
print(df.isnull())
# isna() is an alias of isnull(); they return the same result
print(df.isna())
# Check whether any column contains missing data
print(df.isnull().any())
# Check whether the data contains any NaN at all; returns a single True/False
print(df.isnull().values.any())
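# Counting the missing values per column is often more useful than a bare True/False
# (illustrative addition):
print(df.isnull().sum())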
# -----------------------------------------Importing data-----------------------------------------
# Read a csv file
data = pd.read_csv('student.csv')
# First three rows
print(data.head(3))
# Last three rows
print(data.tail(3))
# -----------------------------------------Exporting data-----------------------------------------
# Save the data as a pickle file
data.to_pickle('student.pickle')
# Read the pickle file back and print it
print(pd.read_pickle('student.pickle'))
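# Exporting to CSV works the same way (illustrative addition; the output file name is made up):
data.to_csv('student_copy.csv', index=False)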
# -----------------------------------------Merging data-----------------------------------------
# ----------------------concat----------------------
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
print(df1,df2,df3)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
"""
# concat: concatenate vertically (stack rows)
res = pd.concat([df1,df2,df3],axis=0)
print(res)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
"""
# In the result above the index values repeat; the index can be reset as follows
# Simply set ignore_index=True
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print(res)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
"""
# ----------------------concat with join----------------------
# Define the datasets
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
print(df1,df2)
"""
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
"""
'''
The default is join='outer'. Vertical concatenation aligns on the columns: columns shared by
both frames are stacked on top of each other, columns unique to one frame keep their own
column, and positions that have no value are filled with NaN.
'''
# Vertical "outer" concatenation of df1 and df2
res = pd.concat([df1,df2],axis=0,join='outer')
print(res)
'''
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
'''
# Same outer join, but with the index reset to 0..5 instead of keeping the original labels
res1 = pd.concat([df1,df2],axis=0,join='outer',ignore_index=True)
# join='inner' keeps only the shared columns; vertical "inner" concatenation of df1 and df2
# inner keeps only the columns common to both frames, outer keeps the union of all columns
res2 = pd.concat([df1,df2],axis=0,join='inner')
'''
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
'''
# ----------------------join_axes-style merge----------------------
# Concatenate horizontally, keeping only the rows in df1.index
# (the old join_axes=[df1.index] argument was removed in pandas 1.0; reindex gives the same result)
res = pd.concat([df1,df2],axis=1).reindex(df1.index)
print(res)
'''
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
'''
# Without the .reindex(df1.index) restriction, print the result
res = pd.concat([df1,df2],axis=1)
print(res)
'''
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
'''
# ----------------------append----------------------
# append only stacked rows vertically (there was no horizontal version); DataFrame.append()
# was removed in pandas 2.0, so use pd.concat instead, as in the sketch below
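# Equivalent of the old df1.append(df2): a row-wise concat (illustrative sketch
# using the frames defined above):
res_append = pd.concat([df1, df2], axis=0, ignore_index=True)
print(res_append)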
# ----------------------merge----------------------
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left,right)
'''
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
'''
# Merge on the key1 and key2 columns; all four join types ['left', 'right', 'outer', 'inner'] are printed below
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
'''
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(res)
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
'''
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
'''
Every row of left is kept. C and D are filled where the (key1, key2) pair also appears in right,
otherwise NaN; the pair (K1, K0) occurs twice in right, so that left row is repeated.
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
'''
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(res)
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
'''
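# merge can also report where each row came from via indicator=True, which adds a
# '_merge' column with values left_only / right_only / both (illustrative addition):
print(pd.merge(left, right, on=['key1', 'key2'], how='outer', indicator=True))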
# -----------------------------------------Resolving overlapping column names-----------------------------------------
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys,girls)
'''
age k
0 1 K0
1 2 K1
2 3 K2
age k
0 4 K0
1 5 K0
2 6 K3
'''
# Use suffixes to resolve overlapping column names
# When merging the two frames above, the age column exists in both; suffixes renames the copies so the names do not clash
res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner')
print(res)
'''
age_boy k age_girl
0 1 K0 4
1 1 K0 5
'''
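# Merging on the index instead of a key column is also possible with
# left_index/right_index (illustrative addition); suffixes again resolves the
# overlapping 'k' and 'age' columns.
print(pd.merge(boys, girls, left_index=True, right_index=True, suffixes=('_boy', '_girl')))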
# -----------------------------------------Plotting with Pandas-----------------------------------------
import matplotlib.pyplot as plt
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
print(data)
# cumsum() returns the cumulative sum
print(data.cumsum())
# data is already a Series, so it can be plotted directly
data.plot()
plt.show()
# scatter() draws a scatter plot; it needs a DataFrame with named columns
# (the Series above has no 'A'/'B' columns), so build one first
df_plot = pd.DataFrame(np.random.randn(1000,4), index=np.arange(1000), columns=list('ABCD'))
ax = df_plot.plot.scatter(x='A',y='B',color='DarkBlue',label='Class1')
# Draw the following scatter on the same axes (ax) as the one above
df_plot.plot.scatter(x='A',y='C',color='LightGreen',label='Class2',ax=ax)
plt.show()
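# Other plot kinds follow the same pattern, e.g. a histogram of a single column
# (illustrative addition, reusing df_plot from above):
df_plot['A'].plot.hist(bins=30)
plt.show()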
# Summarized from:
# https://mp.weixin.qq.com/s/uLBJc_iIize8a9B491U7VQ