pandas

1.pandas基础

1.1 Series

import pandas as pd
import numpy as np

创建

## 1.创建Series对象
sdata = pd.Series(np.arange(1,4), index=list('abc'))
sdata
a    1
b    2
c    3
dtype: int32

访问

## 2.Series对象访问

# 默认数字索引
print(sdata.iloc[0])
# 使用标签[a,b,c]
print(sdata['b'])
# 使用loc方式,只能使用标签
print(sdata.loc['c'])
1
2
3
## 3.获取index与value
print(sdata.index)
print(sdata.values)
Index(['a', 'b', 'c'], dtype='object')
[1 2 3]
## 4.index与value转成列表

# 将索引转成列表
sdata.index.tolist()
sdata.values.tolist()
[1, 2, 3]
# series对象访问
for item in sdata.items():
    print(item)
('a', 1)
('b', 2)
('c', 3)

1.2 dataframe

创建

# 一维数据
pd.DataFrame(data=np.arange(1,4))
0
0 1
1 2
2 3
# 多维数据 data为4X4
data = np.arange(16).reshape(4,4)
pd.DataFrame(data=data)
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
# 设置index与columns
data = np.arange(16).reshape(4,4)
pdata = pd.DataFrame(data=data, index=list('abcd'), columns=['c1','c2','c3','c4'])
pdata
c1 c2 c3 c4
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
d 12 13 14 15
# 设置index与columns
data = {'c1':[1,2,3], 'c2':[4,5,6]}
pdata = pd.DataFrame(data=data)
pdata
c1 c2
0 1 4
1 2 5
2 3 6
# 设置列标签
pdata.columns = ['t1','t2']
pdata
t1 t2
0 1 4
1 2 5
2 3 6

访问

# 取一列,返回Series对象
pdata['t1']
0    1
1    2
2    3
Name: t1, dtype: int64
# 取多列数据
pdata[['t1','t2']]
t1 t2
0 1 4
1 2 5
2 3 6

loc操作

# 获取第一行
pdata.loc[0]
t1    1
t2    4
Name: 0, dtype: int64
# 获取第一、二行指定t1,t2列
pdata.loc[[0,1],['t1','t2']]
t1 t2
0 1 4
1 2 5

DataFrame修改

data = {'c1':[1,2,3], 'c2':[4,5,6], 'c3':[7,8,9]}
pdata = pd.DataFrame(data=data)
pdata
c1 c2 c3
0 1 4 7
1 2 5 8
2 3 6 9
# 修改c1列值
pdata['c1'] = 0
pdata
c1 c2 c3
0 0 4 7
1 0 5 8
2 0 6 9
# 插入新列
pdata['c4'] = [1,1,1]
pdata
c1 c2 c3 c4
0 0 4 7 1
1 0 5 8 1
2 0 6 9 1
# 插入行
pdata.loc[3] = [2,2,2,2]
pdata
c1 c2 c3 c4
0 0 4 7 1
1 0 5 8 1
2 0 6 9 1
3 2 2 2 2

2 pandas数据导入与保存

数据导入

pd.read_csv(filepath, sep=',', delimiter=None, header='infer', names=None, index_col=None, ...) #读取CSV文件
pd.read_excel(filepath,names=None, index_col=None, usecols=None, ...) #读取Excel文件
pd.read_json(path, orient=None, dtype=None, ...) #读取JSON文件

fpath = r"D:\AI实践课\pokemon.csv"
df = pd.read_csv(fpath,names = ['妖怪','姓名','属性1','属性2','种族值','体力','物攻','防御','特攻','特防','速度'],header = 0)
df
妖怪 姓名 属性1 属性2 种族值 体力 物攻 防御 特攻 特防 速度
0 1 Bulbasaur Grass Poison 318 45 49 49 65 65 45
1 2 Ivysaur Grass Poison 405 60 62 63 80 80 60
2 3 Venusaur Grass Poison 525 80 82 83 100 100 80
3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122 120 80
4 4 Charmander Fire NaN 309 39 52 43 60 50 65
... ... ... ... ... ... ... ... ... ... ... ...
795 719 Diancie Rock Fairy 600 50 100 150 100 150 50
796 719 DiancieMega Diancie Rock Fairy 700 50 160 110 160 110 110
797 720 HoopaHoopa Confined Psychic Ghost 600 80 110 60 150 130 70
798 720 HoopaHoopa Unbound Psychic Dark 680 80 160 60 170 130 80
799 721 Volcanion Fire Water 600 80 110 120 130 90 70

800 rows × 11 columns

# 无表头指定None
pdata = pd.read_csv(fpath,header=None)
pdata
0 1 2 3 4 5 6 7 8 9 10
0 # Name Type 1 Type 2 Total HP Attack Defense Sp. Atk Sp. Def Speed
1 1 Bulbasaur Grass Poison 318 45 49 49 65 65 45
2 2 Ivysaur Grass Poison 405 60 62 63 80 80 60
3 3 Venusaur Grass Poison 525 80 82 83 100 100 80
4 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122 120 80
... ... ... ... ... ... ... ... ... ... ... ...
796 719 Diancie Rock Fairy 600 50 100 150 100 150 50
797 719 DiancieMega Diancie Rock Fairy 700 50 160 110 160 110 110
798 720 HoopaHoopa Confined Psychic Ghost 600 80 110 60 150 130 70
799 720 HoopaHoopa Unbound Psychic Dark 680 80 160 60 170 130 80
800 721 Volcanion Fire Water 600 80 110 120 130 90 70

801 rows × 11 columns

# 导入指定表头
pdata = pd.read_csv(fpath,header=1)
pdata
1 Bulbasaur Grass Poison 318 45 49 49.1 65 65.1 45.1
0 2 Ivysaur Grass Poison 405 60 62 63 80 80 60
1 3 Venusaur Grass Poison 525 80 82 83 100 100 80
2 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122 120 80
3 4 Charmander Fire NaN 309 39 52 43 60 50 65
4 5 Charmeleon Fire NaN 405 58 64 58 80 65 80
... ... ... ... ... ... ... ... ... ... ... ...
794 719 Diancie Rock Fairy 600 50 100 150 100 150 50
795 719 DiancieMega Diancie Rock Fairy 700 50 160 110 160 110 110
796 720 HoopaHoopa Confined Psychic Ghost 600 80 110 60 150 130 70
797 720 HoopaHoopa Unbound Psychic Dark 680 80 160 60 170 130 80
798 721 Volcanion Fire Water 600 80 110 120 130 90 70

799 rows × 11 columns

数据保存

pdata.to_csv(path_or_buf=None, sep=',', ...)
pdata.to_excel(excel_writer, sheet_name='Sheet1', na_rep='', ...)
pdata.to_json(path_or_buf=None, orient=None, ...)

3 缺失数据处理

s1 = [10, 10.5, None, 11]
s2 = [7, 6.9,7.5,None]
df = pd.DataFrame({'s1':s1, 's2':s2})
df
s1 s2
0 10.0 7.0
1 10.5 6.9
2 NaN 7.5
3 11.0 NaN
缺失值判断
s1 = [10, 10.5, None, 11]
s2 = [7, 6.9,7.5,None]
pdata = pd.DataFrame({'s1':s1, 's2':s2})
pd.isnull(pdata)
s1 s2
0 False False
1 False False
2 True False
3 False True
# pd.notnull,若包含缺省值,缺省值对应值为False
# np.all:若对象中包含假,返回False, 否则返回真
np.all(pd.notnull(pdata))
# 返回False, 说明包含缺省值,否则不包含缺省值
False
# isnull:缺省值对应值为True
# any:对象中包含真,返回True
np.any(pd.isnull(pdata))
# 返回False,说明不含缺省值,返回True说明包括缺省值
True

缺失值处理方式

1 缺失值过滤

bindex = np.all(pdata.notnull(), axis=1)
bindex
0     True
1     True
2    False
3    False
dtype: bool
pdata[bindex]
s1 s2
0 10.0 7.0
1 10.5 6.9

2 删除缺失值

pdata.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

  • axis : 0 或 'index':按行操作,1 或 'columns':按列操作
  • how :根据 axis 指定操作方式,any:只要有一个 Na 就删除,all:全部为 Na 才删除
  • thresh: 指定非 Na 值的数量,非 Na 数量大于等于 thresh 时不删除
  • subset: 指定操作的列子集
  • inplace: True:在原始数据中进行修改
s1 = [10, 10.5, None, 11]
s2 = [7, 6.9,7.5,None]
s3 = [7, 6.9,7.5,7]
s4 = [None, 6.9,None,7.2]
pdata = pd.DataFrame({'s1':s1, 's2':s2, 's3':s3,'s4':s4})
pdata
s1 s2 s3 s4
0 10.0 7.0 7.0 NaN
1 10.5 6.9 6.9 6.9
2 NaN 7.5 7.5 NaN
3 11.0 NaN 7.0 7.2
# 删除包含缺省值行
pdata.dropna()
s1 s2 s3 s4
1 10.5 6.9 6.9 6.9
# 删除缺失值数量大于1的行,即thresh设置为3
# thresh:指定非Na数量(非Na数量>=thresh,不删除)
pdata.dropna(thresh=3)
s1 s2 s3 s4
0 10.0 7.0 7.0 NaN
1 10.5 6.9 6.9 6.9
3 11.0 NaN 7.0 7.2
# 指定列:['s1','s4']
pdata.dropna(subset=['s1','s4'])
s1 s2 s3 s4
1 10.5 6.9 6.9 6.9
3 11.0 NaN 7.0 7.2
# 删除包含缺省值列
pdata.dropna(axis=1)
s3
0 7.0
1 6.9
2 7.5
3 7.0

3 缺失值填充

pdata.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)

  • value 填充值
  • method 填充方式:{'backfill', 'bfill', 'pad', 'ffill', None}
  • axis 指定行列:0 或 'index' 表示按行,1 或 'columns' 表示按列
  • limit 填充数量限制
pdata
s1 s2 s3 s4
0 10.0 7.0 7.0 NaN
1 10.5 6.9 6.9 6.9
2 NaN 7.5 7.5 NaN
3 11.0 NaN 7.0 7.2
# 固定值0
pdata.fillna(0)
s1 s2 s3 s4
0 10.0 7.0 7.0 0.0
1 10.5 6.9 6.9 6.9
2 0.0 7.5 7.5 0.0
3 11.0 0.0 7.0 7.2
# 使用前填充 (ffill)
pdata.ffill()
s1 s2 s3 s4
0 10.0 7.0 7.0 NaN
1 10.5 6.9 6.9 6.9
2 10.5 7.5 7.5 6.9
3 11.0 7.5 7.0 7.2
# 使用后填充 (bfill),行
pdata.bfill()      
s1 s2 s3 s4
0 10.0 7.0 7.0 6.9
1 10.5 6.9 6.9 6.9
2 11.0 7.5 7.5 7.2
3 11.0 NaN 7.0 7.2
# 使用后填充 (bfill),列
pdata.bfill(axis=1)
s1 s2 s3 s4
0 10.0 7.0 7.0 NaN
1 10.5 6.9 6.9 6.9
2 7.5 7.5 7.5 NaN
3 11.0 7.0 7.0 7.2
# 填充均值
pdata.fillna(pdata.mean())
s1 s2 s3 s4
0 10.0 7.000000 7.0 7.05
1 10.5 6.900000 6.9 6.90
2 10.5 7.500000 7.5 7.05
3 11.0 7.133333 7.0 7.20
# 填充中位数
pdata.fillna(pdata.median())
s1 s2 s3 s4
0 10.0 7.0 7.0 7.05
1 10.5 6.9 6.9 6.90
2 10.5 7.5 7.5 7.05
3 11.0 7.0 7.0 7.20

4 数据清洗

names = list('ABCD')
math = [90,100,50,80]
chinese = [89,96,58,77]
pdata = pd.DataFrame({'name':names, 'math':math, 'chinese':chinese})
pdata
name math chinese
0 A 90 89
1 B 100 96
2 C 50 58
3 D 80 77
# 需求1:数学成绩大于80的所有成绩;
bindex = pdata['math'] > 80
pdata[bindex]
name math chinese
0 A 90 89
1 B 100 96
# 需求2:获取同学A的成绩;
pdata[pdata['name']=='A']
name math chinese
0 A 90 89
# 需求3:获取数学语文都及格成绩
# 注意:两个条件要加括号
pdata[(pdata['math']>59) & (pdata['chinese']>59)]
name math chinese
0 A 90 89
1 B 100 96
3 D 80 77
# 需求4:获取数学语文有一门大于等于80分
# 注意:两个条件要加括号
pdata[(pdata['math']>=80) | (pdata['chinese']>=80)]
name math chinese
0 A 90 89
1 B 100 96
3 D 80 77
# 根据集合获取数据
bindex = pdata['math'].isin([100, 90])
pdata[bindex]
name math chinese
0 A 90 89
1 B 100 96
# 根据索引排序,降序,ascending=False
pdata.sort_index(ascending=False)
name math chinese
3 D 80 77
2 C 50 58
1 B 100 96
0 A 90 89
# 根据数学与语文成绩排序,降序,ascending=False
# sort_values中加入两列数据
pdata.sort_values(['math', 'chinese'], ascending=False)
name math chinese
1 B 100 96
0 A 90 89
3 D 80 77
2 C 50 58

5 pandas汇总与描述性统计

#最大值,最小值,四分位数,均值,数量,标准差
pdata.describe()
math chinese
count 4.000000 4.0000
mean 80.000000 80.0000
std 21.602469 16.6333
min 50.000000 58.0000
25% 72.500000 72.2500
50% 85.000000 83.0000
75% 92.500000 90.7500
max 100.000000 96.0000
#计算每个学生总分,平均分
print(pdata.iloc[:, 1:].sum(axis=1))
print(pdata.iloc[:, 1:].mean(axis=1))
0    179
1    196
2    108
3    157
dtype: int64
0    89.5
1    98.0
2    54.0
3    78.5
dtype: float64

6 索引

pdata
name math chinese
0 A 90 89
1 B 100 96
2 C 50 58
3 D 80 77
# 设置索引
# set_index:默认返回副本(新数据),不修改原数据;inplace=True时原地修改
ndata = pdata.set_index('name')
ndata
math chinese
name
A 90 89
B 100 96
C 50 58
D 80 77

7 时间与时间序列

#年月日
print(pd.Timestamp(2020, 6, 2))
#字符串
print(pd.Timestamp('2020-05-07'))
#字符串时间
print(pd.Timestamp('2020-05-07 04:02:01'))
#2017-02-01与format对应
print(pd.to_datetime('2017-02-01',format="%Y-%m-%d"))
#20170301与年月日对应
print(pd.to_datetime('20170301',format="%Y%m%d"))
2020-06-02 00:00:00
2020-05-07 00:00:00
2020-05-07 04:02:01
2017-02-01 00:00:00
2017-03-01 00:00:00
# 周期单位为Day
print(pd.date_range('2017-01-01', periods=2))
# 周期单位为hour
print(pd.date_range('2017-01-01 02', periods=2, freq='h'))
# 每个月月初
print(pd.date_range('2017-01', periods=3, freq='MS'))
# 每个月月底
print(pd.date_range('2017-01', periods=3, freq='ME'))
DatetimeIndex(['2017-01-01', '2017-01-02'], dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2017-01-01 02:00:00', '2017-01-01 03:00:00'], dtype='datetime64[ns]', freq='h')
DatetimeIndex(['2017-01-01', '2017-02-01', '2017-03-01'], dtype='datetime64[ns]', freq='MS')
DatetimeIndex(['2017-01-31', '2017-02-28', '2017-03-31'], dtype='datetime64[ns]', freq='ME')

8 数据去重与删除

ts = ['2019-03-25','2019-03-26','2019-03-26','2019-03-26','2019-03-29','2019-03-30','2019-03-31',
 '2019-04-01','2019-04-02','2019-04-03','2019-04-04','2019-05-01','2019-04-05',]
values = np.arange(len(ts))
pdata = pd.DataFrame({'ts':ts, 'values':values})
pdata
ts values
0 2019-03-25 0
1 2019-03-26 1
2 2019-03-26 2
3 2019-03-26 3
4 2019-03-29 4
5 2019-03-30 5
6 2019-03-31 6
7 2019-04-01 7
8 2019-04-02 8
9 2019-04-03 9
10 2019-04-04 10
11 2019-05-01 11
12 2019-04-05 12
# 唯一值
pdata.ts.unique()
array(['2019-03-25', '2019-03-26', '2019-03-29', '2019-03-30',
       '2019-03-31', '2019-04-01', '2019-04-02', '2019-04-03',
       '2019-04-04', '2019-05-01', '2019-04-05'], dtype=object)
# 数值出现次数
pdata.ts.value_counts()
ts
2019-03-26    3
2019-03-25    1
2019-03-29    1
2019-03-30    1
2019-03-31    1
2019-04-01    1
2019-04-02    1
2019-04-03    1
2019-04-04    1
2019-05-01    1
2019-04-05    1
Name: count, dtype: int64
# 删除指定行列
# 删除单行
print(pdata.drop(0))
            ts  values
1   2019-03-26       1
2   2019-03-26       2
3   2019-03-26       3
4   2019-03-29       4
5   2019-03-30       5
6   2019-03-31       6
7   2019-04-01       7
8   2019-04-02       8
9   2019-04-03       9
10  2019-04-04      10
11  2019-05-01      11
12  2019-04-05      12
# 删除多行
print(pdata.drop(index=[1,2,3]))
            ts  values
0   2019-03-25       0
4   2019-03-29       4
5   2019-03-30       5
6   2019-03-31       6
7   2019-04-01       7
8   2019-04-02       8
9   2019-04-03       9
10  2019-04-04      10
11  2019-05-01      11
12  2019-04-05      12
# 删除列
print(pdata.drop(columns='ts'))
    values
0        0
1        1
2        2
3        3
4        4
5        5
6        6
7        7
8        8
9        9
10      10
11      11
12      12
# 同时删除index为0的行与ts列
pdata.drop(index=0, columns='ts')
values
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
# 去重
# 保留第一次
pdata.drop_duplicates('ts')
ts values
0 2019-03-25 0
1 2019-03-26 1
4 2019-03-29 4
5 2019-03-30 5
6 2019-03-31 6
7 2019-04-01 7
8 2019-04-02 8
9 2019-04-03 9
10 2019-04-04 10
11 2019-05-01 11
12 2019-04-05 12
#保留最后一次
pdata.drop_duplicates(subset='ts', keep='last')
ts values
0 2019-03-25 0
3 2019-03-26 3
4 2019-03-29 4
5 2019-03-30 5
6 2019-03-31 6
7 2019-04-01 7
8 2019-04-02 8
9 2019-04-03 9
10 2019-04-04 10
11 2019-05-01 11
12 2019-04-05 12

9 pandas数据处理常用函数

apply

n = list('ABCD')
math = [90,80,47,69]
chinese = [95,78,96,59]
nclass = ['001','001','002','002']
df = pd.DataFrame({'name':n,'math':math, 'chinese':chinese,'class':nclass})
df
name math chinese class
0 A 90 95 001
1 B 80 78 001
2 C 47 96 002
3 D 69 59 002
# 修改name名字
name =df['name']
name.apply(lambda x:x+x)
0    AA
1    BB
2    CC
3    DD
Name: name, dtype: object
# 将成绩转成True或者False
df[['math','chinese']].apply(lambda x : x>59)
math chinese
0 True True
1 True True
2 False True
3 True False

func

def func(value):
    """Print the type of *value*, then return its mean.

    When used with ``DataFrame.apply``, each call receives one column as a
    pandas Series (as the printed output below demonstrates).
    """
    print(type(value))
    return value.mean()
df[['math','chinese']].apply(func)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>





math       71.5
chinese    82.0
dtype: float64

map

df['math'].map(lambda x: 'pass' if x > 59 else 'failed')
0      pass
1      pass
2    failed
3      pass
Name: math, dtype: object

replace

# 将[A,B]替换成*
df.replace(['A','B'], '*')
name math chinese class
0 * 90 95 001
1 * 80 78 001
2 C 47 96 002
3 D 69 59 002
# 一组数据替换
df.replace(list('ABCD'), list('abcd'))
name math chinese class
0 a 90 95 001
1 b 80 78 001
2 c 47 96 002
3 d 69 59 002
# 正则:将所有字母替换成*
df.replace(r'[A-Z]','*', regex=True)
name math chinese class
0 * 90 95 001
1 * 80 78 001
2 * 47 96 002
3 * 69 59 002

10 分组处理

classname = ['001','001','002','002','003','003']
name = ['sun','li','zhou','wang','zao','wu']
height = [169, 172,180,170,165,175]
weights = [61,53,75,64,50,58]
df = pd.DataFrame({'cname':classname, 'user':name, 'height':height, 'weights':weights})
df
cname user height weights
0 001 sun 169 61
1 001 li 172 53
2 002 zhou 180 75
3 002 wang 170 64
4 003 zao 165 50
5 003 wu 175 58

groupby分组

dfg = df.groupby('cname')
dfg.groups
{'001': [0, 1], '002': [2, 3], '003': [4, 5]}
# 分组统计
dfg.count()
user height weights
cname
001 2 2 2
002 2 2 2
003 2 2 2
# 根据多列进行分组:
dfg = df.groupby(['cname','height'])
dfg.groups
{('001', 169): [0], ('001', 172): [1], ('002', 170): [3], ('002', 180): [2], ('003', 165): [4], ('003', 175): [5]}
# 统计结果为多级索引
dfg.count()
user weights
cname height
001 169 1 1
172 1 1
002 170 1 1
180 1 1
003 165 1 1
175 1 1
posted @ 2024-07-22 09:44  yao-ziyan  阅读(19)  评论(0)    收藏  举报