10.2 The pandas Module
Why do we need the pandas module, and what is it used for?
- pandas supports reading and writing many data formats: SQL databases, HTML, JSON, pickle, CSV/plain text, Excel, SAS, Stata, HDF, and more.
The code below was written in Jupyter, which makes interactive exploration easier.
10.2.1 The Series Data Type (for awareness only)
A Series is similar to a one-dimensional array; it can only hold one-dimensional data.
import pandas as pd
pd.Series([1,2,3])
# Output
0    1
1    2
2    3
dtype: int64
import pandas as pd
import numpy as np
pd.Series(np.array([1,2,5]))
# Output
0    1
1    2
2    5
dtype: int32
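A Series pairs its values with an index of labels. As a small additional sketch (not from the original notes), the index can be supplied explicitly:
import pandas as pd
# A Series with explicit string labels instead of the default 0, 1, 2 index
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s['b']      # label-based access -> 2
s.values    # the underlying NumPy array -> array([1, 2, 3])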
10.2.2 DataFrame (the key type)
- For data with two or more dimensions, use a DataFrame; two-dimensional tables are by far the most common case.
import pandas as pd
import numpy as np  # needed for np.random.rand below
pd.DataFrame([[1,2,3],[4,5,6]])
# 1. First generate dates to serve as the index (the row labels)
dates=pd.date_range('2019-01-01',periods=4)
print(dates)
# Output
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04'], dtype='datetime64[ns]', freq='D')
# 2. Product names, used as the column labels
goods_list=['watch','book','pen','phone']
# 3. Generate the table contents (random prices)
price=np.random.rand(4,4)
print(price)
# Output
[[0.4173048 0.55868983 0.14038694 0.19810149]
[0.80074457 0.96826158 0.31342418 0.69232262]
[0.87638915 0.89460666 0.08504421 0.03905478]
[0.16983042 0.8781425 0.09834683 0.42110763]]
# 4. Assemble the DataFrame; the result is conventionally named df
df=pd.DataFrame(price,index=dates,columns=goods_list)
# df.to_excel('test.xlsx')  # the frame can be written out to a file
df
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
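Besides wrapping an existing array, a DataFrame can also be built directly from a dict of columns. A minimal sketch (the values here are made up for illustration):
import pandas as pd
# Each dict key becomes a column label; each list becomes that column's values
df_demo = pd.DataFrame(
    {'watch': [0.42, 0.80], 'book': [0.56, 0.97]},
    index=pd.date_range('2019-01-01', periods=2),
)
df_demo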
(1) dtypes
df.dtypes  # check the data type of each column
# Output
watch float64
book float64
pen float64
phone float64
dtype: object
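If a column's dtype needs to be changed, astype can convert it; a quick sketch assuming the df built above:
# Convert the 'watch' column to 32-bit floats and check the result
df.astype({'watch': 'float32'}).dtypes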
(2) index
df.index  # view the row index
# Output
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04'], dtype='datetime64[ns]', freq='D')
(3) columns
df.columns  # view the column labels
# Output
Index(['watch', 'book', 'pen', 'phone'], dtype='object')
(4) values
df.values  # view the data inside the frame, without the index or column labels
# Output
array([[0.4173048 , 0.55868983, 0.14038694, 0.19810149],
[0.80074457, 0.96826158, 0.31342418, 0.69232262],
[0.87638915, 0.89460666, 0.08504421, 0.03905478],
[0.16983042, 0.8781425 , 0.09834683, 0.42110763]])
(5) describe
df.describe()  # per-column summary statistics (count, mean, std, min/max, quartiles); only numeric columns are summarized
|       |    watch |     book |      pen |    phone |
|-------|----------|----------|----------|----------|
| count | 4.000000 | 4.000000 | 4.000000 | 4.000000 |
| mean  | 0.566067 | 0.824925 | 0.159301 | 0.337647 |
| std   | 0.331917 | 0.181764 | 0.105422 | 0.283661 |
| min   | 0.169830 | 0.558690 | 0.085044 | 0.039055 |
| 25%   | 0.355436 | 0.798279 | 0.095021 | 0.158340 |
| 50%   | 0.609025 | 0.886375 | 0.119367 | 0.309605 |
| 75%   | 0.819656 | 0.913020 | 0.183646 | 0.488911 |
| max   | 0.876389 | 0.968262 | 0.313424 | 0.692323 |
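describe also accepts a percentiles argument to control which quantiles are reported; a brief sketch assuming the df above:
# Report the 10th and 90th percentiles instead of the default quartiles
df.describe(percentiles=[0.1, 0.9])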
(6) transpose
df
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
df.transpose()  # transpose rows and columns; df.T does the same
|       | 2019-01-01 | 2019-01-02 | 2019-01-03 | 2019-01-04 |
|-------|------------|------------|------------|------------|
| watch | 0.417305   | 0.800745   | 0.876389   | 0.169830   |
| book  | 0.558690   | 0.968262   | 0.894607   | 0.878143   |
| pen   | 0.140387   | 0.313424   | 0.085044   | 0.098347   |
| phone | 0.198101   | 0.692323   | 0.039055   | 0.421108   |
(7) sort_index
df
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
df.sort_index()  # sort by the index; can sort along the row or the column axis
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
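sort_index can also sort in descending order or along the column axis; a short sketch assuming the df above:
df.sort_index(ascending=False)   # most recent date first
df.sort_index(axis=1)            # columns in alphabetical order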
(8) sort_values
df
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
df.sort_values(by='watch')
# Sort by value, ascending by default. The frame is sorted on the 'watch' column; whole rows move together, so the index follows the new order of 'watch' while the other values stay attached to their rows.
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
df.sort_values(by='watch',ascending=False)  # sort in descending order
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
df.sort_values(by=['watch','pen'],ascending=False)  # sort by 'watch' first, using 'pen' to break ties; rows (and the index) move as whole units
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
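When sorting by several keys, ascending can also be given per key; a minimal sketch assuming the df above:
# Sort by 'watch' descending, breaking ties by 'pen' ascending
df.sort_values(by=['watch','pen'], ascending=[False, True])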
10.2.3 Handling Missing Values
- Real-world data often has missing entries, which need to be handled.
- Common approaches: drop the rows or columns that contain missing values, or fill the missing entries with a value.
import pandas as pd
test_data='''
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
,,,
'''
# Load the string into memory as a file-like object
from io import StringIO
test_data=StringIO(test_data)
# The same pattern applies to .csv/.txt (read_csv) and .xlsx (read_excel)
df=pd.read_csv(test_data)  # read the "file"
# By default the first data row is treated as the header, and the index defaults to 0, 1, ...
# Assign proper column names afterwards
df.columns=['c1','c2','c3','c4']
# (alternatively, add a header row to the raw data itself)
df
|   |  c1 |  c2 |  c3 |  c4 |
|---|-----|-----|-----|-----|
| 0 | 4.9 | 3.0 | 1.4 | 0.2 |
| 1 | 4.7 | 3.2 | NaN | 0.2 |
| 2 | 7.0 | 3.2 | 4.7 | 1.4 |
| 3 | 6.4 | 3.2 | 4.5 | 1.5 |
| 4 | 6.9 | 3.1 | 4.9 | NaN |
| 5 | NaN | NaN | NaN | NaN |
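Because read_csv consumed the first data row (5.1, ...) as the header above, an alternative sketch (not part of the original notes) is to pass header=None together with names, which keeps every row:
import pandas as pd
from io import StringIO
raw = '5.1,,1.4,0.2\n4.9,3.0,1.4,0.2'
# header=None: no row is treated as a header; names supplies the column labels
pd.read_csv(StringIO(raw), header=None, names=['c1','c2','c3','c4'])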
# Handle the missing values; missing entries show up as NaN
# A value can also be set to missing manually: df.iloc[0,0]=np.nan
# 1. Drop every row that contains a missing value
df.dropna(axis=0)  # axis=0 operates on rows
|   |  c1 |  c2 |  c3 |  c4 |
|---|-----|-----|-----|-----|
| 0 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 7.0 | 3.2 | 4.7 | 1.4 |
| 3 | 6.4 | 3.2 | 4.5 | 1.5 |
# 2. Drop every column that contains a missing value
df.dropna(axis=1)
df  # dropna returns a new frame; df itself is unchanged
|   |  c1 |  c2 |  c3 |  c4 |
|---|-----|-----|-----|-----|
| 0 | 4.9 | 3.0 | 1.4 | 0.2 |
| 1 | 4.7 | 3.2 | NaN | 0.2 |
| 2 | 7.0 | 3.2 | 4.7 | 1.4 |
| 3 | 6.4 | 3.2 | 4.5 | 1.5 |
| 4 | 6.9 | 3.1 | 4.9 | NaN |
| 5 | NaN | NaN | NaN | NaN |
# 3. Keep a row only if it has at least 2 non-missing values
df.dropna(thresh=2,axis=0)  # axis=0 operates on rows (also the default when axis is omitted)
|   |  c1 |  c2 |  c3 |  c4 |
|---|-----|-----|-----|-----|
| 0 | 4.9 | 3.0 | 1.4 | 0.2 |
| 1 | 4.7 | 3.2 | NaN | 0.2 |
| 2 | 7.0 | 3.2 | 4.7 | 1.4 |
| 3 | 6.4 | 3.2 | 4.5 | 1.5 |
| 4 | 6.9 | 3.1 | 4.9 | NaN |
df
|   |  c1 |  c2 |  c3 |  c4 |
|---|-----|-----|-----|-----|
| 0 | 4.9 | 3.0 | 1.4 | 0.2 |
| 1 | 4.7 | 3.2 | NaN | 0.2 |
| 2 | 7.0 | 3.2 | 4.7 | 1.4 |
| 3 | 6.4 | 3.2 | 4.5 | 1.5 |
| 4 | 6.9 | 3.1 | 4.9 | NaN |
| 5 | NaN | NaN | NaN | NaN |
# 4. Keep a column only if it has at least 5 non-missing values
df.dropna(thresh=5,axis=1)  # axis=1 operates on columns
|   |  c1 |  c2 |
|---|-----|-----|
| 0 | 4.9 | 3.0 |
| 1 | 4.7 | 3.2 |
| 2 | 7.0 | 3.2 |
| 3 | 6.4 | 3.2 |
| 4 | 6.9 | 3.1 |
| 5 | NaN | NaN |
df
|   |  c1 |  c2 |  c3 |  c4 |
|---|-----|-----|-----|-----|
| 0 | 4.9 | 3.0 | 1.4 | 0.2 |
| 1 | 4.7 | 3.2 | NaN | 0.2 |
| 2 | 7.0 | 3.2 | 4.7 | 1.4 |
| 3 | 6.4 | 3.2 | 4.5 | 1.5 |
| 4 | 6.9 | 3.1 | 4.9 | NaN |
| 5 | NaN | NaN | NaN | NaN |
df.dropna(subset=['c3'])  # drop only the rows where column c3 is missing
|   |  c1 |  c2 |  c3 |  c4 |
|---|-----|-----|-----|-----|
| 0 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 7.0 | 3.2 | 4.7 | 1.4 |
| 3 | 6.4 | 3.2 | 4.5 | 1.5 |
| 4 | 6.9 | 3.1 | 4.9 | NaN |
df
|   |  c1 |  c2 |  c3 |  c4 |
|---|-----|-----|-----|-----|
| 0 | 4.9 | 3.0 | 1.4 | 0.2 |
| 1 | 4.7 | 3.2 | NaN | 0.2 |
| 2 | 7.0 | 3.2 | 4.7 | 1.4 |
| 3 | 6.4 | 3.2 | 4.5 | 1.5 |
| 4 | 6.9 | 3.1 | 4.9 | NaN |
| 5 | NaN | NaN | NaN | NaN |
# Fill missing values instead of dropping them
df.fillna(value=10)  # every NaN becomes 10
|   |   c1 |   c2 |   c3 |   c4 |
|---|------|------|------|------|
| 0 |  4.9 |  3.0 |  1.4 |  0.2 |
| 1 |  4.7 |  3.2 | 10.0 |  0.2 |
| 2 |  7.0 |  3.2 |  4.7 |  1.4 |
| 3 |  6.4 |  3.2 |  4.5 |  1.5 |
| 4 |  6.9 |  3.1 |  4.9 | 10.0 |
| 5 | 10.0 | 10.0 | 10.0 | 10.0 |
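Instead of a constant, missing values are often filled with a per-column statistic; a minimal sketch assuming the df above:
# Fill each column's NaN entries with that column's mean
df.fillna(df.mean())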
10.2.4 Combining Data
import numpy as np
df1=pd.DataFrame(np.zeros((3,4)))
df1
|   |   0 |   1 |   2 |   3 |
|---|-----|-----|-----|-----|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 |
df2=pd.DataFrame(np.ones((3,4)))
df2
|   |   0 |   1 |   2 |   3 |
|---|-----|-----|-----|-----|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 1 | 1.0 | 1.0 | 1.0 | 1.0 |
| 2 | 1.0 | 1.0 | 1.0 | 1.0 |
pd.concat((df1,df2))  # default axis=0: stack vertically, appending df2's rows below df1's
|   |   0 |   1 |   2 |   3 |
|---|-----|-----|-----|-----|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 |
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 1 | 1.0 | 1.0 | 1.0 | 1.0 |
| 2 | 1.0 | 1.0 | 1.0 | 1.0 |
pd.concat((df1,df2),axis=1)  # axis=1: concatenate side by side, adding columns; the axis semantics are easy to mix up, so experiment with both
|   |   0 |   1 |   2 |   3 |   0 |   1 |   2 |   3 |
|---|-----|-----|-----|-----|-----|-----|-----|-----|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
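When stacking vertically, the duplicated 0, 1, 2 row labels seen above can be replaced with a fresh index via ignore_index=True; a quick sketch:
# Rebuild a continuous 0..5 index instead of keeping the original row labels
pd.concat((df1, df2), ignore_index=True)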
10.2.5 Reading Spreadsheets
df=pd.read_excel('test.xlsx')
df
# The index that was saved to the spreadsheet has come back as an ordinary first column
|   | Unnamed: 0 |    watch |     book |      pen |    phone |
|---|------------|----------|----------|----------|----------|
| 0 | 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 1 | 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2 | 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 3 | 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
df=pd.read_excel('test.xlsx',header=0,index_col=0)  # header=0: use the first row as the header; index_col=0: use the first column as the index
df
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
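The same save-and-reload pattern works for CSV files; a hedged sketch (the file name test.csv is just an example):
df.to_csv('test.csv')                 # write the frame, index included
pd.read_csv('test.csv', index_col=0)  # read it back, using the first column as the index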
10.2.6 Selecting Values
df
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-01 | 0.417305 | 0.558690 | 0.140387 | 0.198101 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
df.loc['2019-01-01']  # select a row by its index label
# Output
watch 0.417305
book 0.558690
pen 0.140387
phone 0.198101
Name: 2019-01-01 00:00:00, dtype: float64
df.iloc[0,0]  # select the element in the first row and first column; positional indexing, like NumPy
# Output
0.417304802367127
df.iloc[0,:]  # select the entire first row
# Output
watch 0.417305
book 0.558690
pen 0.140387
phone 0.198101
Name: 2019-01-01 00:00:00, dtype: float64
df.iloc[0,:]=0  # set every element of the first row to 0
df
|            |    watch |     book |      pen |    phone |
|------------|----------|----------|----------|----------|
| 2019-01-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2019-01-02 | 0.800745 | 0.968262 | 0.313424 | 0.692323 |
| 2019-01-03 | 0.876389 | 0.894607 | 0.085044 | 0.039055 |
| 2019-01-04 | 0.169830 | 0.878143 | 0.098347 | 0.421108 |
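Rows can also be selected with a boolean condition on a column; a minimal sketch assuming the df above:
# Keep only the rows whose 'watch' value is greater than 0.5
df[df['watch'] > 0.5]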
df.to_excel('test.xlsx')
# 1. The read-process-save cycle can be wrapped in a single function
import pandas as pd
df=pd.read_excel('test.xlsx',header=0,index_col=0)
# 2. Complex processing logic: pandas + numpy
func()  # placeholder for the actual processing step
# 3. Save the result when done
df.to_excel('test.xlsx')  # note: writing fails while the file is open in Excel
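A hedged sketch of wrapping the read-process-save cycle into one function (the function name process_excel and the zeroing step are placeholders, not from the original):
import pandas as pd

def process_excel(path):
    # Read the spreadsheet, apply some processing, and write it back
    df = pd.read_excel(path, header=0, index_col=0)
    df.iloc[0, :] = 0     # placeholder for the real pandas/numpy logic
    df.to_excel(path)     # fails if the file is open in Excel
    return df

# process_excel('test.xlsx')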
10.2.7 Advanced Methods
where and apply
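As a brief preview (a sketch assuming the df from the sections above): where keeps the values that satisfy a condition and replaces the rest, while apply maps a function over each column or row.
# where: entries failing the condition are replaced by the second argument (0 here)
df.where(df > 0.5, 0)
# apply: run a function per column (axis=0) or per row (axis=1)
df.apply(lambda col: col.max() - col.min(), axis=0)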