「数据分析 - Pandas 函数」【数据分析全栈攻略:爬虫+处理+可视化+报告】 - 指南

- 第 105 篇 -
Date: 2025-06-05
Author: 郑龙浩/仟墨

Pandas 核心功能详解与示例

1. 数据结构基础

1.1 Series 创建与操作

Series 是 Pandas 库中的一维带标签数组,可以简单理解为 Excel 中的单列数据(但功能更强大)。它是构建 DataFrame 的基础组件,也是数据操作的核心对象之一。

创建Series:

# Constructor signature of pd.Series (for reference):
pd.Series(
    data=None,       # the data (list, dict, scalar, ...)
    index=None,      # index labels
    dtype=None,      # data type
    name=None,       # name of the Series
    copy=False,      # whether to copy the input data
    fastpath=False,  # internal optimization flag (not for general use)
)
  • data 是必须的,其他参数可选。
  • index 用于自定义标签,dtype 控制数据类型,name 用于命名 Series。
  • copy 一般不用改,除非需要避免数据被意外修改。
import pandas as pd
import numpy as np

# Create from a list — an integer index is generated automatically.
arr1 = pd.Series([1, 2, 3])
print(arr1)
'''
0 1
1 2
2 3
dtype: int64
'''

# Create with a custom (label) index.
arr2 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
print(arr2)
'''
a 1
b 2
c 3
dtype: int64
'''

# Force a float dtype.
arr3 = pd.Series([1, 2, 3], dtype=float)
print(arr3)
'''
0 1.0
1 2.0
2 3.0
dtype: float64
'''

# Create from a dict — the keys become the index automatically.
arr4 = pd.Series({'a': 1, 'b': 2, 'c': 3})
print(arr4)
'''
a 1
b 2
c 3
dtype: int64
'''

# Keyword form: data, custom index and a name for the Series.
data = [10, 20, 30]
idx = ['A', 'B', 'C']
arr5 = pd.Series(data=data, index=idx, name='名字')
print(arr5)
'''
A 10
B 20
C 30
Name: 名字, dtype: int64
'''

Series操作:

import pandas as pd
import numpy as np

# Build a named Series with a custom index (same construction as above;
# the commented-out repetition of the earlier examples has been removed).
data = [10, 20, 30]
idx = ['A', 'B', 'C']
arr5 = pd.Series(data=data, index=idx, name='名字')
print(arr5)
'''
A 10
B 20
C 30
Name: 名字, dtype: int64
'''

# Access a single element by its label.
print(arr5['B'])  # 20

# Vectorized arithmetic: every element is multiplied by 2.
print(arr5 * 2)
'''
A 20
B 40
C 60
Name: 名字, dtype: int64
'''

# Boolean indexing: keep only the elements greater than 15.
print(arr5[arr5 > 15])
'''
B 20
C 30
Name: 名字, dtype: int64
'''

1.2 DataFrame 创建与操作

DataFrame 是 Pandas 库中最核心的二维表格型数据结构,相当于 Python 中的"电子表格"或"SQL表"。它的主要作用是为数据分析和处理提供高效灵活的工具,以下是其核心作用和特点:

创建DataFrame:

import pandas as pd
import numpy as np

# Build a DataFrame from a dict of heterogeneous columns; scalars are
# broadcast to the length implied by the Series/array columns (4 rows).
df1 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('2025-06-05'),
    'C': pd.Series(1, index=list(range(4))),
    'D': np.array([3] * 4),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'aaa',
})
print(df1)
"""
A B C D E F
0 1.0 2025-06-05 1 3 test aaa
1 1.0 2025-06-05 1 3 train aaa
2 1.0 2025-06-05 1 3 test aaa
3 1.0 2025-06-05 1 3 train aaa
"""

# Build a 6x4 DataFrame from a 2-D array of random normals.
df2 = pd.DataFrame(np.random.randn(6, 4), columns=list('ABCD'))
print(df2.head(2))
"""
A B C D
0 -1.043791 -0.519411 -0.204117 -1.169345
1 -0.386409 0.978335 -0.092777 -1.832369
"""

DataFrame操作:

# Peek at the first rows.
print(df2.head(2))

# Per-column summary statistics (count/mean/std/quantiles/min/max).
print(df2.describe())
"""
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.073711 -0.431125 -0.687758 -0.233103
std 0.843157 0.922818 0.779887 0.973118
min -0.861849 -2.104569 -1.509059 -1.135632
25% -0.611510 -0.600794 -1.368714 -1.006602
50% 0.022072 -0.228039 -0.767252 -0.386671
75% 0.658444 0.041933 -0.034326 0.461706
max 1.212112 0.567020 0.276232 1.491390
"""

# Swap rows and columns.
print(df2.T)

2. 数据选择与过滤

2.1 基本选择方法

# Column selection.
print(df2['A'])          # a single column (returns a Series)
print(df2[['A', 'C']])   # several columns (returns a DataFrame)

# Label-based selection with loc (note: both endpoints are inclusive).
print(df2.loc[0:3, ['A', 'B']])
"""
A B
0 0.469112 -0.282863
1 1.212112 -0.173215
2 -0.861849 -2.104569
3 0.721555 -0.706771
"""

# Position-based selection with iloc (end is exclusive, like Python slices).
print(df2.iloc[1:3, 0:2])
"""
A B
1 1.212112 -0.173215
2 -0.861849 -2.104569
"""

2.2 布尔索引

# Simple condition: keep rows where column A is positive.
print(df2[df2['A'] > 0])
"""
A B C D
0 0.469112 -0.282863 -1.509059 -1.135632
1 1.212112 -0.173215 0.119209 -1.044236
3 0.721555 -0.706771 -1.039575 0.271860
5 0.404705 0.577046 -1.715002 -1.039268
"""

# Combined conditions: use & / | with each clause parenthesized.
print(df2[(df2['A'] > 0) & (df2['B'] < 0)])
"""
A B C D
0 0.469112 -0.282863 -1.509059 -1.135632
1 1.212112 -0.173215 0.119209 -1.044236
3 0.721555 -0.706771 -1.039575 0.271860
"""

3. 数据处理与清洗

3.1 缺失值处理

# A DataFrame that deliberately contains missing values (NaN).
df3 = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [10, 20, 30, 40],
})
print(df3)
"""
A B C
0 1.0 5.0 10
1 2.0 NaN 20
2 NaN NaN 30
3 4.0 8.0 40
"""

# Detect missing values (True where a cell is NaN).
print(df3.isnull())
"""
A B C
0 False False False
1 False True False
2 True True False
3 False False False
"""

# Drop every row that contains at least one NaN.
print(df3.dropna())
"""
A B C
0 1.0 5.0 10
3 4.0 8.0 40
"""

# Replace every NaN with a constant.
print(df3.fillna(value=0))
"""
A B C
0 1.0 5.0 10
1 2.0 0.0 20
2 0.0 0.0 30
3 4.0 8.0 40
"""

3.2 数据转换

# Apply a function to one column: square each value, mapping NaN to 0.
print(df3['A'].apply(
    lambda x: x**2 if not pd.isnull(x) else 0
))
"""
0 1.0
1 4.0
2 0.0
3 16.0
Name: A, dtype: float64
"""

# Element-wise transform over the whole frame.
# NOTE: DataFrame.applymap was deprecated in pandas 2.1 and removed in 3.0;
# DataFrame.map is the supported replacement with identical semantics.
print(df3.map(
    lambda x: '%.1f' % x if not pd.isnull(x) else 'NaN'
))
"""
A B C
0 1.0 5.0 10.0
1 2.0 NaN 20.0
2 NaN NaN 30.0
3 4.0 8.0 40.0
"""

4. 数据分组与聚合

4.1 基本分组操作

# Sample frame: two categorical key columns (A, B) and two numeric ones (C, D).
df4 = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df4)
"""
A B C D
0 foo one 0.469112 -0.861849
1 bar one -0.282863 -2.104569
2 foo two -1.509059 -0.494929
3 bar three -1.135632 1.071804
4 foo two 1.212112 0.721555
5 bar two -0.173215 -0.706771
6 foo one 0.119209 -1.039575
7 foo three -1.044236 0.271860
"""

# Group by a single column.
grouped = df4.groupby('A')
# Since pandas 2.0, mean() no longer silently drops non-numeric columns,
# so averaging over the string column 'B' would raise TypeError; request
# numeric columns explicitly (this matches the output shown below).
print(grouped.mean(numeric_only=True))
"""
C D
A
bar -0.530570 -0.579845
foo -0.150572 -0.280588
"""

# Group by several columns: 'B' moves into the index, leaving only the
# numeric columns to aggregate.
print(df4.groupby(['A', 'B']).mean())
"""
C D
A B
bar one -0.282863 -2.104569
three -1.135632 1.071804
two -0.173215 -0.706771
foo one 0.294161 -0.950712
three -1.044236 0.271860
two -0.148473 0.113313
"""

4.2 聚合函数

# Several aggregations at once.  Restrict to the numeric columns: since
# pandas 2.0, 'mean'/'std' over the string column 'B' raise TypeError
# (and the output shown below only covers C and D anyway).
print(grouped[['C', 'D']].agg(['sum', 'mean', 'std']))
"""
C D
sum mean std sum mean std
A
bar -1.591710 -0.530570 0.526860 -1.739536 -0.579845 1.591985
foo -0.752862 -0.150572 1.113308 -1.402938 -0.280588 0.739965
"""

# A different aggregation per column: one function for C, a list for D.
print(grouped.agg({
    'C': 'sum',
    'D': ['min', 'max'],
}))
"""
C D
sum min max
A
bar -1.591710 -2.104569 1.071804
foo -0.752862 -1.039575 0.721555
"""

5. 数据合并与连接

5.1 concat 连接

# Two frames with partially overlapping columns (df6 lacks 'D').
df5 = pd.DataFrame(np.random.randn(3, 4), columns=['A', 'B', 'C', 'D'])
df6 = pd.DataFrame(np.random.randn(2, 3), columns=['A', 'B', 'C'])

# Vertical (row-wise) concatenation; missing column 'D' is filled with NaN.
print(pd.concat([df5, df6]))
"""
A B C D
0 1.075770 -0.109050 1.643563 -1.469388
1 0.357021 -0.674600 -1.776904 -0.968914
2 -1.294524 0.413738 0.276662 -0.472035
0 -0.013960 -0.362543 -0.006154 NaN
1 -0.923061 0.895717 0.805244 NaN
"""

# Horizontal (column-wise) concatenation, aligned on the row index.
print(pd.concat([df5, df6], axis=1))
"""
A B C D A B C
0 1.075770 -0.109050 1.643563 -1.469388 -0.013960 -0.362543 -0.006154
1 0.357021 -0.674600 -1.776904 -0.968914 -0.923061 0.895717 0.805244
2 -1.294524 0.413738 0.276662 -0.472035 NaN NaN NaN
"""

5.2 merge 合并

# Two frames sharing a join column 'key'.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})

# Inner join on 'key' (the default how='inner').
print(pd.merge(left, right, on='key'))
"""
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
"""

# Left join: keep every row of `left`, matching from `right` where possible.
print(pd.merge(left, right, how='left', on='key'))
"""
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
"""

6. 时间序列处理

# Build a daily date range and attach a column of random integers.
date_rng = pd.date_range(start='1/1/2025', end='1/08/2025', freq='D')
df7 = pd.DataFrame(date_rng, columns=['date'])
df7['data'] = np.random.randint(0, 100, size=(len(date_rng)))
print(df7)
"""
date data
0 2025-01-01 95
1 2025-01-02 27
2 2025-01-03 63
3 2025-01-04 60
4 2025-01-05 49
5 2025-01-06 42
6 2025-01-07 96
7 2025-01-08 99
"""

# Resampling needs a DatetimeIndex, so promote 'date' to the index,
# then downsample into 3-day bins and average each bin.
df7.set_index('date', inplace=True)
print(df7.resample('3D').mean())
"""
data
date
2025-01-01 61.666667
2025-01-04 50.333333
2025-01-07 97.500000
"""

7. 文件读写

7.1 CSV文件

# Persist to CSV (the DatetimeIndex is written out as the first column).
df7.to_csv('sample.csv')

# Load it back; note the index comes back as an ordinary 'date' column.
df_read = pd.read_csv('sample.csv')
print(df_read.head())
"""
date data
0 2025-01-01 95
1 2025-01-02 27
2 2025-01-03 63
3 2025-01-04 60
4 2025-01-05 49
"""

7.2 Excel文件

# Save to an Excel workbook (requires an engine such as openpyxl).
df7.to_excel('sample.xlsx', sheet_name='Sheet1')

# Load the workbook back into a DataFrame.
df_excel = pd.read_excel('sample.xlsx')
print(df_excel.head())

7.3 JSON文件

# Serialize the frame to a JSON file.
df7.to_json('sample.json')

# Parse it back into a DataFrame.
df_json = pd.read_json('sample.json')
print(df_json.head())
posted @ 2025-07-20 16:21  wzzkaifa  阅读(20)  评论(0)    收藏  举报