数据结构
- Series
- 一种类似一维数组的对象,由一组数据与一组标签(索引)组成
- DataFrame
- 表格型的数据结构,Excel与R的data.frame;可以有行索引和列索引
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter('ignore')
# 通过list构建Series
df = pd.Series(range(10,20))
type(df)
pandas.core.series.Series
df
0 10
1 11
2 12
3 13
4 14
5 15
6 16
7 17
8 18
9 19
dtype: int64
df.values
array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
type(df.values)
numpy.ndarray
df.head(5)
0 10
1 11
2 12
3 13
4 14
dtype: int64
print(df[0])
10
print(df[8])
18
df[df>15]
6 16
7 17
8 18
9 19
dtype: int64
df>15
0 False
1 False
2 False
3 False
4 False
5 False
6 True
7 True
8 True
9 True
dtype: bool
# 通过字典构建Series
sales_data = {'a':1, 'b':2,'c':3}
df = pd.Series(sales_data)
print(df.head())
print(df.index)
a 1
b 2
c 3
dtype: int64
Index(['a', 'b', 'c'], dtype='object')
df.name = 'Series1'
df.index.name = 'idx'
df.head()
idx
a 1
b 2
c 3
Name: Series1, dtype: int64
df = pd.Series(range(8,14), index = ['a','b','c','d','e','f'])
#print(df) # 打印整个 Series
print(df.head()) # 打印前几行(默认前5行) 显示 Series/DataFrame 的前 n 行数据
print(df.tail()) # 查看后5行
a 8
b 9
c 10
d 11
e 12
dtype: int64
b 9
c 10
d 11
e 12
f 13
dtype: int64
print(df[1]) # index行索引
print('-------------')
print(df[1:3]) # 切片索引 ,前闭后开
print('-------------')
9
-------------
b 9
c 10
dtype: int64
-------------
# DataFrame的构建
array = np.random.randn(6,4)
array
array([[ 0.06736358, -0.64255316, 0.20449614, -0.32843359],
[ 0.94659344, 0.29186784, -0.60989757, -0.74899701],
[-0.91717312, 1.48430298, 1.17129331, 1.81347392],
[-3.62074396, 0.04548781, 1.65180448, 1.55219322],
[ 0.32978011, -0.71838459, -1.21471285, 0.43443465],
[-1.86554116, -1.25161324, 0.0990453 , 0.46801472]])
df_obj = pd.DataFrame(array) # 通过Ndarray构建的DataFrame,如果没有声明,默认的行index和列index都是从0开始
df_obj.head()
|
0 |
1 |
2 |
3 |
| 0 |
0.067364 |
-0.642553 |
0.204496 |
-0.328434 |
| 1 |
0.946593 |
0.291868 |
-0.609898 |
-0.748997 |
| 2 |
-0.917173 |
1.484303 |
1.171293 |
1.813474 |
| 3 |
-3.620744 |
0.045488 |
1.651804 |
1.552193 |
| 4 |
0.329780 |
-0.718385 |
-1.214713 |
0.434435 |
df = pd.DataFrame(np.random.randn(6,4),columns = ['a','b','c','d'], index = ['e','f','g','h','i','j'])
df.head(6)
|
a |
b |
c |
d |
| e |
0.583592 |
-1.501183 |
0.116741 |
-0.442168 |
| f |
-0.161299 |
0.528328 |
-0.450201 |
-1.196337 |
| g |
1.042936 |
-0.152244 |
1.489381 |
0.968527 |
| h |
0.008104 |
0.342535 |
0.447326 |
0.136234 |
| i |
-0.835837 |
-0.912627 |
-0.100602 |
0.612302 |
| j |
0.206510 |
0.232865 |
-0.320405 |
-2.312483 |
# 通过字典构建DataFrame
dict_data = {'Date':pd.Timestamp('20251226'),
'Number':pd.Series(6,index=list(range(4))),
'Course_name':pd.Series(['Python','Quant','Finance','CFA']),
'Company':'JET'}
# 显示DataFrame
df = pd.DataFrame(dict_data)
df.head()
|
Date |
Number |
Course_name |
Company |
| 0 |
2025-12-26 |
6 |
Python |
JET |
| 1 |
2025-12-26 |
6 |
Quant |
JET |
| 2 |
2025-12-26 |
6 |
Finance |
JET |
| 3 |
2025-12-26 |
6 |
CFA |
JET |
# 自定义构建DataFrame
df = pd.DataFrame([10,30,60,80], columns=['Quantity'], index=['a','b','c','d'])
df
|
Quantity |
| a |
10 |
| b |
30 |
| c |
60 |
| d |
80 |
print(type(df))
<class 'pandas.core.frame.DataFrame'>
# DataFrame的选择操作
dict_data = {'Date':pd.Timestamp('20251226'),
'Number':pd.Series(6,index=list(range(4))),
'Course_name':pd.Series(['Python','Quant','Finance','CFA']),
'Company':'JET'}
# 显示DataFrame
df = pd.DataFrame(dict_data)
df.head()
|
Date |
Number |
Course_name |
Company |
| 0 |
2025-12-26 |
6 |
Python |
JET |
| 1 |
2025-12-26 |
6 |
Quant |
JET |
| 2 |
2025-12-26 |
6 |
Finance |
JET |
| 3 |
2025-12-26 |
6 |
CFA |
JET |
df['Course_name']
0 Python
1 Quant
2 Finance
3 CFA
Name: Course_name, dtype: object
# 增加具体的列
df['Period'] = range(21,25)
df.head()
|
Date |
Number |
Course_name |
Company |
Period |
| 0 |
2025-12-26 |
6 |
Python |
JET |
21 |
| 1 |
2025-12-26 |
6 |
Quant |
JET |
22 |
| 2 |
2025-12-26 |
6 |
Finance |
JET |
23 |
| 3 |
2025-12-26 |
6 |
CFA |
JET |
24 |
# 删除列
del(df['Period'])
df.head()
|
Date |
Number |
Course_name |
Company |
| 0 |
2025-12-26 |
6 |
Python |
JET |
| 1 |
2025-12-26 |
6 |
Quant |
JET |
| 2 |
2025-12-26 |
6 |
Finance |
JET |
| 3 |
2025-12-26 |
6 |
CFA |
JET |
# 将一个完整的DataFrame作为新建的列,Pandas会自动根据index的lable进行匹配
df['Names'] = pd.DataFrame(['Alpha','Beta','Gramma','Theta']) # ,index=['4','2','3','1'] 这个没试成功
df
|
Date |
Number |
Course_name |
Company |
Names |
| 0 |
2025-12-26 |
6 |
Python |
JET |
Alpha |
| 1 |
2025-12-26 |
6 |
Quant |
JET |
Beta |
| 2 |
2025-12-26 |
6 |
Finance |
JET |
Gramma |
| 3 |
2025-12-26 |
6 |
CFA |
JET |
Theta |
df = pd.DataFrame([80.5,90,58,88],columns=['Scores'],
index=['a','b','c','d'],dtype='f')
df
|
Scores |
| a |
80.5 |
| b |
90.0 |
| c |
58.0 |
| d |
88.0 |
df.dtypes
Scores float32
dtype: object
df.columns #获得列名
Index(['Scores'], dtype='object')
df.index[1:3] # 选择index
Index(['b', 'c'], dtype='object')
df.iloc[1:3]
df.loc['c']
Scores 58.0
Name: c, dtype: float32
df.sum()
Scores 316.5
dtype: float32
df.rename(columns={'Scores':'Score'},inplace=True) #重命名列,注意重命名的时候要用字典来进行传输
df
|
Score |
| a |
80.5 |
| b |
90.0 |
| c |
58.0 |
| d |
88.0 |
df.shape
(4, 1)
#DataFrame的向量化操作
df
|
Score |
| a |
80.5 |
| b |
90.0 |
| c |
58.0 |
| d |
88.0 |
df+df
|
Score |
| a |
161.0 |
| b |
180.0 |
| c |
116.0 |
| d |
176.0 |
df - df
|
Score |
| a |
0.0 |
| b |
0.0 |
| c |
0.0 |
| d |
0.0 |
df * 3
|
Score |
| a |
241.5 |
| b |
270.0 |
| c |
174.0 |
| d |
264.0 |
df
|
Score |
| a |
80.5 |
| b |
90.0 |
| c |
58.0 |
| d |
88.0 |
df['Score'] = df['Score'].astype('int')
df.dtypes
Score int64
dtype: object
d = {
'PE':pd.Series([10,20,30,40],index=['Company a','Company b','Company c','Company d']),
'PB':pd.Series([2,3,2.5,4],index=['Company a','Company b','Company c','Company d']),
'ROE':pd.Series([0.06,0.1,0.08,0.02],index=['Company a','Company b','Company c','Company d'])
}
df = pd.DataFrame(d)
df
|
PE |
PB |
ROE |
| Company a |
10 |
2.0 |
0.06 |
| Company b |
20 |
3.0 |
0.10 |
| Company c |
30 |
2.5 |
0.08 |
| Company d |
40 |
4.0 |
0.02 |
(df.PE <25) * 1
Company a 1
Company b 1
Company c 0
Company d 0
Name: PE, dtype: int64
df[df['PE']<25]
|
PE |
PB |
ROE |
| Company a |
10 |
2.0 |
0.06 |
| Company b |
20 |
3.0 |
0.10 |
df[df.PE < 25]
|
PE |
PB |
ROE |
| Company a |
10 |
2.0 |
0.06 |
| Company b |
20 |
3.0 |
0.10 |
df[(df.PE<25) & (df.PB<2.5)]
|
PE |
PB |
ROE |
| Company a |
10 |
2.0 |
0.06 |
df[ (df.PE<25) * 1 + (df.PB < 2.5) * 1 == 2] #多个条件
|
PE |
PB |
ROE |
| Company a |
10 |
2.0 |
0.06 |
df[ (df.PE < 25) * 1 + (df.PB < 2.5) * 1 + (df.ROE > 0.07) * 1 >= 2] # 三个当中满足任意两个条件
|
PE |
PB |
ROE |
| Company a |
10 |
2.0 |
0.06 |
| Company b |
20 |
3.0 |
0.10 |
#DataFrame Apply函数
a = np.random.randn(9,6)
a.round(5)
array([[-7.87620e-01, -1.24492e+00, 1.09361e+00, -5.22550e-01,
2.12030e-01, -1.30360e+00],
[-4.30230e-01, -6.23100e-01, 1.66740e-01, -8.75420e-01,
-1.78512e+00, 1.59138e+00],
[-1.94390e+00, -1.47000e-03, 1.31750e-01, 1.13086e+00,
4.62110e-01, -7.69850e-01],
[-1.08891e+00, -1.12085e+00, -5.33600e-02, -9.80170e-01,
1.77180e-01, -1.36398e+00],
[-6.52240e-01, -6.07170e-01, 5.18720e-01, -7.06020e-01,
2.30450e-01, 9.53490e-01],
[ 6.51890e-01, -2.44220e-01, -4.47850e-01, -7.46110e-01,
6.64090e-01, -3.57600e-02],
[-1.05783e+00, -3.45810e-01, 3.88430e-01, -2.13310e-01,
-3.91820e-01, 6.51190e-01],
[-1.60257e+00, 5.14340e-01, -3.61200e-02, 2.39620e-01,
-7.57500e-01, 2.22660e-01],
[ 7.30130e-01, -3.21070e-01, 5.67360e-01, 1.83213e+00,
6.88210e-01, 2.87610e-01]])
df = pd.DataFrame(a)
df
|
0 |
1 |
2 |
3 |
4 |
5 |
| 0 |
-0.787624 |
-1.244919 |
1.093608 |
-0.522551 |
0.212032 |
-1.303601 |
| 1 |
-0.430226 |
-0.623103 |
0.166744 |
-0.875421 |
-1.785116 |
1.591384 |
| 2 |
-1.943899 |
-0.001466 |
0.131749 |
1.130863 |
0.462114 |
-0.769851 |
| 3 |
-1.088908 |
-1.120851 |
-0.053360 |
-0.980173 |
0.177176 |
-1.363977 |
| 4 |
-0.652240 |
-0.607174 |
0.518715 |
-0.706020 |
0.230450 |
0.953491 |
| 5 |
0.651890 |
-0.244218 |
-0.447855 |
-0.746109 |
0.664087 |
-0.035765 |
| 6 |
-1.057828 |
-0.345811 |
0.388428 |
-0.213307 |
-0.391816 |
0.651193 |
| 7 |
-1.602573 |
0.514335 |
-0.036118 |
0.239622 |
-0.757499 |
0.222661 |
| 8 |
0.730135 |
-0.321074 |
0.567365 |
1.832134 |
0.688206 |
0.287612 |
df.columns = ['a','b','c','d','e','f']
df
|
a |
b |
c |
d |
e |
f |
| 0 |
-0.787624 |
-1.244919 |
1.093608 |
-0.522551 |
0.212032 |
-1.303601 |
| 1 |
-0.430226 |
-0.623103 |
0.166744 |
-0.875421 |
-1.785116 |
1.591384 |
| 2 |
-1.943899 |
-0.001466 |
0.131749 |
1.130863 |
0.462114 |
-0.769851 |
| 3 |
-1.088908 |
-1.120851 |
-0.053360 |
-0.980173 |
0.177176 |
-1.363977 |
| 4 |
-0.652240 |
-0.607174 |
0.518715 |
-0.706020 |
0.230450 |
0.953491 |
| 5 |
0.651890 |
-0.244218 |
-0.447855 |
-0.746109 |
0.664087 |
-0.035765 |
| 6 |
-1.057828 |
-0.345811 |
0.388428 |
-0.213307 |
-0.391816 |
0.651193 |
| 7 |
-1.602573 |
0.514335 |
-0.036118 |
0.239622 |
-0.757499 |
0.222661 |
| 8 |
0.730135 |
-0.321074 |
0.567365 |
1.832134 |
0.688206 |
0.287612 |
dates = pd.date_range('2024-7-1',periods=9,freq='M')
dates
DatetimeIndex(['2024-07-31', '2024-08-31', '2024-09-30', '2024-10-31',
'2024-11-30', '2024-12-31', '2025-01-31', '2025-02-28',
'2025-03-31'],
dtype='datetime64[ns]', freq='ME')
df.index = dates
df
|
a |
b |
c |
d |
e |
f |
| 2024-07-31 |
-0.787624 |
-1.244919 |
1.093608 |
-0.522551 |
0.212032 |
-1.303601 |
| 2024-08-31 |
-0.430226 |
-0.623103 |
0.166744 |
-0.875421 |
-1.785116 |
1.591384 |
| 2024-09-30 |
-1.943899 |
-0.001466 |
0.131749 |
1.130863 |
0.462114 |
-0.769851 |
| 2024-10-31 |
-1.088908 |
-1.120851 |
-0.053360 |
-0.980173 |
0.177176 |
-1.363977 |
| 2024-11-30 |
-0.652240 |
-0.607174 |
0.518715 |
-0.706020 |
0.230450 |
0.953491 |
| 2024-12-31 |
0.651890 |
-0.244218 |
-0.447855 |
-0.746109 |
0.664087 |
-0.035765 |
| 2025-01-31 |
-1.057828 |
-0.345811 |
0.388428 |
-0.213307 |
-0.391816 |
0.651193 |
| 2025-02-28 |
-1.602573 |
0.514335 |
-0.036118 |
0.239622 |
-0.757499 |
0.222661 |
| 2025-03-31 |
0.730135 |
-0.321074 |
0.567365 |
1.832134 |
0.688206 |
0.287612 |
def square_fun(x):
return x ** 2
df.apply(square_fun)
|
a |
b |
c |
d |
e |
f |
| 2024-07-31 |
0.620352 |
1.549823 |
1.195979 |
0.273060 |
0.044957 |
1.699374 |
| 2024-08-31 |
0.185095 |
0.388257 |
0.027803 |
0.766361 |
3.186637 |
2.532503 |
| 2024-09-30 |
3.778741 |
0.000002 |
0.017358 |
1.278851 |
0.213550 |
0.592671 |
| 2024-10-31 |
1.185720 |
1.256308 |
0.002847 |
0.960740 |
0.031391 |
1.860434 |
| 2024-11-30 |
0.425417 |
0.368660 |
0.269066 |
0.498464 |
0.053107 |
0.909145 |
| 2024-12-31 |
0.424961 |
0.059642 |
0.200574 |
0.556678 |
0.441011 |
0.001279 |
| 2025-01-31 |
1.119000 |
0.119585 |
0.150876 |
0.045500 |
0.153520 |
0.424052 |
| 2025-02-28 |
2.568239 |
0.264541 |
0.001305 |
0.057419 |
0.573804 |
0.049578 |
| 2025-03-31 |
0.533097 |
0.103089 |
0.321903 |
3.356713 |
0.473627 |
0.082721 |
df.apply(lambda x : x ** 0.5)
|
a |
b |
c |
d |
e |
f |
| 2024-07-31 |
NaN |
NaN |
1.045757 |
NaN |
0.460469 |
NaN |
| 2024-08-31 |
NaN |
NaN |
0.408343 |
NaN |
NaN |
1.261501 |
| 2024-09-30 |
NaN |
NaN |
0.362972 |
1.063420 |
0.679790 |
NaN |
| 2024-10-31 |
NaN |
NaN |
NaN |
NaN |
0.420923 |
NaN |
| 2024-11-30 |
NaN |
NaN |
0.720219 |
NaN |
0.480052 |
0.976469 |
| 2024-12-31 |
0.807397 |
NaN |
NaN |
NaN |
0.814915 |
NaN |
| 2025-01-31 |
NaN |
NaN |
0.623240 |
NaN |
NaN |
0.806965 |
| 2025-02-28 |
NaN |
0.717172 |
NaN |
0.489512 |
NaN |
0.471870 |
| 2025-03-31 |
0.854479 |
NaN |
0.753236 |
1.353563 |
0.829582 |
0.536295 |
def find_min(x):
return x.min()
df.apply(find_min)
a -1.943899
b -1.244919
c -0.447855
d -0.980173
e -1.785116
f -1.363977
dtype: float64
df.apply(find_min,axis=1)
2024-07-31 -1.303601
2024-08-31 -1.785116
2024-09-30 -1.943899
2024-10-31 -1.363977
2024-11-30 -0.706020
2024-12-31 -0.746109
2025-01-31 -1.057828
2025-02-28 -1.602573
2025-03-31 -0.321074
Freq: ME, dtype: float64
df
|
a |
b |
c |
d |
e |
f |
| 2024-07-31 |
-0.787624 |
-1.244919 |
1.093608 |
-0.522551 |
0.212032 |
-1.303601 |
| 2024-08-31 |
-0.430226 |
-0.623103 |
0.166744 |
-0.875421 |
-1.785116 |
1.591384 |
| 2024-09-30 |
-1.943899 |
-0.001466 |
0.131749 |
1.130863 |
0.462114 |
-0.769851 |
| 2024-10-31 |
-1.088908 |
-1.120851 |
-0.053360 |
-0.980173 |
0.177176 |
-1.363977 |
| 2024-11-30 |
-0.652240 |
-0.607174 |
0.518715 |
-0.706020 |
0.230450 |
0.953491 |
| 2024-12-31 |
0.651890 |
-0.244218 |
-0.447855 |
-0.746109 |
0.664087 |
-0.035765 |
| 2025-01-31 |
-1.057828 |
-0.345811 |
0.388428 |
-0.213307 |
-0.391816 |
0.651193 |
| 2025-02-28 |
-1.602573 |
0.514335 |
-0.036118 |
0.239622 |
-0.757499 |
0.222661 |
| 2025-03-31 |
0.730135 |
-0.321074 |
0.567365 |
1.832134 |
0.688206 |
0.287612 |
#DataFrame排序
df.sort_index(ascending=False) # 降序排序
|
a |
b |
c |
d |
e |
f |
| 2025-03-31 |
0.730135 |
-0.321074 |
0.567365 |
1.832134 |
0.688206 |
0.287612 |
| 2025-02-28 |
-1.602573 |
0.514335 |
-0.036118 |
0.239622 |
-0.757499 |
0.222661 |
| 2025-01-31 |
-1.057828 |
-0.345811 |
0.388428 |
-0.213307 |
-0.391816 |
0.651193 |
| 2024-12-31 |
0.651890 |
-0.244218 |
-0.447855 |
-0.746109 |
0.664087 |
-0.035765 |
| 2024-11-30 |
-0.652240 |
-0.607174 |
0.518715 |
-0.706020 |
0.230450 |
0.953491 |
| 2024-10-31 |
-1.088908 |
-1.120851 |
-0.053360 |
-0.980173 |
0.177176 |
-1.363977 |
| 2024-09-30 |
-1.943899 |
-0.001466 |
0.131749 |
1.130863 |
0.462114 |
-0.769851 |
| 2024-08-31 |
-0.430226 |
-0.623103 |
0.166744 |
-0.875421 |
-1.785116 |
1.591384 |
| 2024-07-31 |
-0.787624 |
-1.244919 |
1.093608 |
-0.522551 |
0.212032 |
-1.303601 |
df.sort_index(axis=1,ascending=False)
|
f |
e |
d |
c |
b |
a |
| 2024-07-31 |
-1.303601 |
0.212032 |
-0.522551 |
1.093608 |
-1.244919 |
-0.787624 |
| 2024-08-31 |
1.591384 |
-1.785116 |
-0.875421 |
0.166744 |
-0.623103 |
-0.430226 |
| 2024-09-30 |
-0.769851 |
0.462114 |
1.130863 |
0.131749 |
-0.001466 |
-1.943899 |
| 2024-10-31 |
-1.363977 |
0.177176 |
-0.980173 |
-0.053360 |
-1.120851 |
-1.088908 |
| 2024-11-30 |
0.953491 |
0.230450 |
-0.706020 |
0.518715 |
-0.607174 |
-0.652240 |
| 2024-12-31 |
-0.035765 |
0.664087 |
-0.746109 |
-0.447855 |
-0.244218 |
0.651890 |
| 2025-01-31 |
0.651193 |
-0.391816 |
-0.213307 |
0.388428 |
-0.345811 |
-1.057828 |
| 2025-02-28 |
0.222661 |
-0.757499 |
0.239622 |
-0.036118 |
0.514335 |
-1.602573 |
| 2025-03-31 |
0.287612 |
0.688206 |
1.832134 |
0.567365 |
-0.321074 |
0.730135 |
df.sort_values(by='a') # 按values排序
|
a |
b |
c |
d |
e |
f |
| 2024-09-30 |
-1.943899 |
-0.001466 |
0.131749 |
1.130863 |
0.462114 |
-0.769851 |
| 2025-02-28 |
-1.602573 |
0.514335 |
-0.036118 |
0.239622 |
-0.757499 |
0.222661 |
| 2024-10-31 |
-1.088908 |
-1.120851 |
-0.053360 |
-0.980173 |
0.177176 |
-1.363977 |
| 2025-01-31 |
-1.057828 |
-0.345811 |
0.388428 |
-0.213307 |
-0.391816 |
0.651193 |
| 2024-07-31 |
-0.787624 |
-1.244919 |
1.093608 |
-0.522551 |
0.212032 |
-1.303601 |
| 2024-11-30 |
-0.652240 |
-0.607174 |
0.518715 |
-0.706020 |
0.230450 |
0.953491 |
| 2024-08-31 |
-0.430226 |
-0.623103 |
0.166744 |
-0.875421 |
-1.785116 |
1.591384 |
| 2024-12-31 |
0.651890 |
-0.244218 |
-0.447855 |
-0.746109 |
0.664087 |
-0.035765 |
| 2025-03-31 |
0.730135 |
-0.321074 |
0.567365 |
1.832134 |
0.688206 |
0.287612 |
#通用函数
df.sum(axis =1) #按行排序
2024-07-31 -2.553054
2024-08-31 -1.955737
2024-09-30 -0.990490
2024-10-31 -4.430094
2024-11-30 -0.262777
2024-12-31 -0.157969
2025-01-31 -0.969142
2025-02-28 -1.419570
2025-03-31 3.784377
Freq: ME, dtype: float64
df.mean()
a -0.686808
b -0.443809
c 0.258808
d -0.093440
e -0.055596
f 0.025905
dtype: float64
df.cumsum()
|
a |
b |
c |
d |
e |
f |
| 2024-07-31 |
-0.787624 |
-1.244919 |
1.093608 |
-0.522551 |
0.212032 |
-1.303601 |
| 2024-08-31 |
-1.217850 |
-1.868021 |
1.260352 |
-1.397972 |
-1.573084 |
0.287784 |
| 2024-09-30 |
-3.161749 |
-1.869487 |
1.392101 |
-0.267109 |
-1.110969 |
-0.482068 |
| 2024-10-31 |
-4.250657 |
-2.990339 |
1.338741 |
-1.247282 |
-0.933793 |
-1.846045 |
| 2024-11-30 |
-4.902897 |
-3.597513 |
1.857456 |
-1.953302 |
-0.703343 |
-0.892554 |
| 2024-12-31 |
-4.251007 |
-3.841731 |
1.409601 |
-2.699411 |
-0.039256 |
-0.928319 |
| 2025-01-31 |
-5.308835 |
-4.187542 |
1.798029 |
-2.912717 |
-0.431072 |
-0.277126 |
| 2025-02-28 |
-6.911407 |
-3.673207 |
1.761911 |
-2.673095 |
-1.188571 |
-0.054465 |
| 2025-03-31 |
-6.181272 |
-3.994281 |
2.329276 |
-0.840961 |
-0.500365 |
0.233147 |
df.describe()
|
a |
b |
c |
d |
e |
f |
| count |
9.000000 |
9.000000 |
9.000000 |
9.000000 |
9.000000 |
9.000000 |
| mean |
-0.686808 |
-0.443809 |
0.258808 |
-0.093440 |
-0.055596 |
0.025905 |
| std |
0.907970 |
0.540510 |
0.445095 |
0.982039 |
0.801856 |
1.009066 |
| min |
-1.943899 |
-1.244919 |
-0.447855 |
-0.980173 |
-1.785116 |
-1.363977 |
| 25% |
-1.088908 |
-0.623103 |
-0.036118 |
-0.746109 |
-0.391816 |
-0.769851 |
| 50% |
-0.787624 |
-0.345811 |
0.166744 |
-0.522551 |
0.212032 |
0.222661 |
| 75% |
-0.430226 |
-0.244218 |
0.518715 |
0.239622 |
0.462114 |
0.651193 |
| max |
0.730135 |
0.514335 |
1.093608 |
1.832134 |
0.688206 |
1.591384 |
import numpy as np
import pandas as pd
import seaborn as sns
# 创建一个包含10000个日期的时间序列,起始日期为2026年1月1日,频率为每天('D')
period = pd.date_range('2026-1-1', periods = 10000 , freq='D')
df = pd.DataFrame(np.random.randn(10000,4), columns=['Data1','Data2','Data3','Data4'], index = period)
# np.random.randn(10000,4):生成10000行×4列的随机数矩阵,数据符合标准正态分布(均值为0,标准差为1)
# columns=['Data1','Data2','Data3','Data4']:设置四列的列名
# index = period:使用前面创建的时间序列作为行索引
# 最终得到一个10000行×4列的DataFrame,每行对应一个日期,每列包含随机数据
df.head(100)
# 显示DataFrame的前5行数据(默认显示5行),用于快速预览数据结构和内容
|
Data1 |
Data2 |
Data3 |
Data4 |
| 2026-01-01 |
-0.301301 |
-0.066839 |
2.569558 |
0.782728 |
| 2026-01-02 |
-0.825641 |
-0.055080 |
-0.147181 |
0.204207 |
| 2026-01-03 |
-0.088487 |
-0.244936 |
0.548284 |
-1.444184 |
| 2026-01-04 |
0.615162 |
-2.579200 |
-1.301881 |
1.563105 |
| 2026-01-05 |
2.577470 |
0.330930 |
0.594907 |
1.097716 |
| ... |
... |
... |
... |
... |
| 2026-04-06 |
0.107950 |
0.255822 |
0.017296 |
-0.200156 |
| 2026-04-07 |
0.066923 |
0.182171 |
0.764556 |
1.502252 |
| 2026-04-08 |
1.514781 |
0.527656 |
0.823396 |
2.165366 |
| 2026-04-09 |
0.611559 |
0.394781 |
-0.538892 |
0.073134 |
| 2026-04-10 |
-1.270301 |
0.177674 |
0.939690 |
0.462134 |
100 rows × 4 columns
# 新增一列用于Group操作
df['group1'] = np.random.choice(['A','B','C','D'], 10000)
df.tail()
|
Data1 |
Data2 |
Data3 |
Data4 |
group1 |
| 2053-05-14 |
-1.247965 |
0.973246 |
0.329554 |
-1.354778 |
B |
| 2053-05-15 |
-0.248909 |
-1.274829 |
-0.033650 |
-0.924811 |
B |
| 2053-05-16 |
0.259964 |
-1.488595 |
-0.448005 |
-2.367739 |
D |
| 2053-05-17 |
0.214252 |
0.171712 |
1.113388 |
0.405221 |
C |
| 2053-05-18 |
-0.139803 |
-0.029577 |
0.003300 |
0.140935 |
A |
# 生成DataFrameGroupBy对象
grouped = df.groupby('group1')
type(grouped)
pandas.core.groupby.generic.DataFrameGroupBy
grouped.size() # 每组元素数量
group1
A 2460
B 2493
C 2491
D 2556
dtype: int64
grouped.sum()
|
Data1 |
Data2 |
Data3 |
Data4 |
| group1 |
|
|
|
|
| A |
-25.652763 |
-4.789793 |
32.223843 |
-21.093877 |
| B |
-25.363686 |
37.223025 |
88.241825 |
-65.057371 |
| C |
86.237195 |
74.036435 |
-12.382822 |
20.339041 |
| D |
-27.328884 |
-4.406674 |
-105.510381 |
5.173027 |
grouped.max()
|
Data1 |
Data2 |
Data3 |
Data4 |
| group1 |
|
|
|
|
| A |
3.285374 |
3.396309 |
3.512840 |
3.371779 |
| B |
4.068218 |
3.701382 |
3.496006 |
3.160612 |
| C |
2.872331 |
3.222222 |
3.402279 |
3.261225 |
| D |
3.059933 |
3.335675 |
3.550492 |
3.276412 |
grouped.min()
|
Data1 |
Data2 |
Data3 |
Data4 |
| group1 |
|
|
|
|
| A |
-3.596096 |
-3.114213 |
-3.614452 |
-3.653505 |
| B |
-3.716233 |
-3.300269 |
-3.657607 |
-3.295326 |
| C |
-3.663596 |
-3.078911 |
-3.691946 |
-3.363301 |
| D |
-3.558211 |
-3.736870 |
-3.157152 |
-3.381865 |
grouped.mean()
|
Data1 |
Data2 |
Data3 |
Data4 |
| group1 |
|
|
|
|
| A |
-0.010428 |
-0.001947 |
0.013099 |
-0.008575 |
| B |
-0.010174 |
0.014931 |
0.035396 |
-0.026096 |
| C |
0.034620 |
0.029722 |
-0.004971 |
0.008165 |
| D |
-0.010692 |
-0.001724 |
-0.041279 |
0.002024 |
grouped.describe() #整体数据特征概览
|
Data1 |
Data2 |
... |
Data3 |
Data4 |
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
count |
mean |
... |
75% |
max |
count |
mean |
std |
min |
25% |
50% |
75% |
max |
| group1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| A |
2460.0 |
-0.010428 |
0.990728 |
-3.596096 |
-0.678900 |
-0.007923 |
0.641929 |
3.285374 |
2460.0 |
-0.001947 |
... |
0.661997 |
3.512840 |
2460.0 |
-0.008575 |
1.012226 |
-3.653505 |
-0.673584 |
-0.023260 |
0.654113 |
3.371779 |
| B |
2493.0 |
-0.010174 |
0.996869 |
-3.716233 |
-0.652177 |
-0.025997 |
0.629116 |
4.068218 |
2493.0 |
0.014931 |
... |
0.722496 |
3.496006 |
2493.0 |
-0.026096 |
1.006106 |
-3.295326 |
-0.710430 |
-0.010635 |
0.651514 |
3.160612 |
| C |
2491.0 |
0.034620 |
1.003380 |
-3.663596 |
-0.637421 |
0.032357 |
0.713676 |
2.872331 |
2491.0 |
0.029722 |
... |
0.672693 |
3.402279 |
2491.0 |
0.008165 |
1.009963 |
-3.363301 |
-0.678624 |
0.021889 |
0.686504 |
3.261225 |
| D |
2556.0 |
-0.010692 |
0.972185 |
-3.558211 |
-0.654749 |
0.008334 |
0.633307 |
3.059933 |
2556.0 |
-0.001724 |
... |
0.626267 |
3.550492 |
2556.0 |
0.002024 |
1.000016 |
-3.381865 |
-0.680296 |
-0.011760 |
0.669893 |
3.276412 |
4 rows × 32 columns
np.transpose(grouped.describe())
|
group1 |
A |
B |
C |
D |
| Data1 |
count |
2460.000000 |
2493.000000 |
2491.000000 |
2556.000000 |
| mean |
-0.010428 |
-0.010174 |
0.034620 |
-0.010692 |
| std |
0.990728 |
0.996869 |
1.003380 |
0.972185 |
| min |
-3.596096 |
-3.716233 |
-3.663596 |
-3.558211 |
| 25% |
-0.678900 |
-0.652177 |
-0.637421 |
-0.654749 |
| 50% |
-0.007923 |
-0.025997 |
0.032357 |
0.008334 |
| 75% |
0.641929 |
0.629116 |
0.713676 |
0.633307 |
| max |
3.285374 |
4.068218 |
2.872331 |
3.059933 |
| Data2 |
count |
2460.000000 |
2493.000000 |
2491.000000 |
2556.000000 |
| mean |
-0.001947 |
0.014931 |
0.029722 |
-0.001724 |
| std |
1.019700 |
0.972062 |
0.994766 |
1.014706 |
| min |
-3.114213 |
-3.300269 |
-3.078911 |
-3.736870 |
| 25% |
-0.709019 |
-0.655433 |
-0.620272 |
-0.705857 |
| 50% |
-0.016191 |
0.018039 |
0.026946 |
-0.000068 |
| 75% |
0.688548 |
0.652524 |
0.719439 |
0.703314 |
| max |
3.396309 |
3.701382 |
3.222222 |
3.335675 |
| Data3 |
count |
2460.000000 |
2493.000000 |
2491.000000 |
2556.000000 |
| mean |
0.013099 |
0.035396 |
-0.004971 |
-0.041279 |
| std |
1.001428 |
1.006164 |
0.987258 |
0.994164 |
| min |
-3.614452 |
-3.657607 |
-3.691946 |
-3.157152 |
| 25% |
-0.649259 |
-0.669193 |
-0.659139 |
-0.711592 |
| 50% |
0.020383 |
0.026139 |
0.015575 |
-0.014482 |
| 75% |
0.661997 |
0.722496 |
0.672693 |
0.626267 |
| max |
3.512840 |
3.496006 |
3.402279 |
3.550492 |
| Data4 |
count |
2460.000000 |
2493.000000 |
2491.000000 |
2556.000000 |
| mean |
-0.008575 |
-0.026096 |
0.008165 |
0.002024 |
| std |
1.012226 |
1.006106 |
1.009963 |
1.000016 |
| min |
-3.653505 |
-3.295326 |
-3.363301 |
-3.381865 |
| 25% |
-0.673584 |
-0.710430 |
-0.678624 |
-0.680296 |
| 50% |
-0.023260 |
-0.010635 |
0.021889 |
-0.011760 |
| 75% |
0.654113 |
0.651514 |
0.686504 |
0.669893 |
| max |
3.371779 |
3.160612 |
3.261225 |
3.276412 |
grouped.get_group('A').head()
|
Data1 |
Data2 |
Data3 |
Data4 |
group1 |
| 2026-01-01 |
-0.301301 |
-0.066839 |
2.569558 |
0.782728 |
A |
| 2026-01-04 |
0.615162 |
-2.579200 |
-1.301881 |
1.563105 |
A |
| 2026-01-05 |
2.577470 |
0.330930 |
0.594907 |
1.097716 |
A |
| 2026-01-08 |
-0.969435 |
-0.086215 |
1.613460 |
-0.039984 |
A |
| 2026-01-09 |
0.929410 |
-0.513562 |
-2.225990 |
-1.548603 |
A |
# 双重Group
df['group2'] = np.random.choice(['Python','C++','Java'], 10000)
df.head()
|
Data1 |
Data2 |
Data3 |
Data4 |
group1 |
group2 |
| 2026-01-01 |
-0.301301 |
-0.066839 |
2.569558 |
0.782728 |
A |
Java |
| 2026-01-02 |
-0.825641 |
-0.055080 |
-0.147181 |
0.204207 |
D |
Java |
| 2026-01-03 |
-0.088487 |
-0.244936 |
0.548284 |
-1.444184 |
B |
Python |
| 2026-01-04 |
0.615162 |
-2.579200 |
-1.301881 |
1.563105 |
A |
C++ |
| 2026-01-05 |
2.577470 |
0.330930 |
0.594907 |
1.097716 |
A |
Python |
grouped = df.groupby(['group1','group2'])
grouped.size()
group1 group2
A C++ 861
Java 791
Python 808
B C++ 820
Java 835
Python 838
C C++ 810
Java 865
Python 816
D C++ 861
Java 848
Python 847
dtype: int64
grouped.max()
|
|
Data1 |
Data2 |
Data3 |
Data4 |
| group1 |
group2 |
|
|
|
|
| A |
C++ |
3.285374 |
3.396309 |
3.445427 |
3.305271 |
| Java |
3.156763 |
2.811295 |
3.512840 |
3.371779 |
| Python |
2.913568 |
3.320395 |
3.439421 |
3.292169 |
| B |
C++ |
2.819403 |
3.368093 |
3.496006 |
2.923737 |
| Java |
4.068218 |
3.046342 |
3.407152 |
3.160612 |
| Python |
3.170314 |
3.701382 |
2.926639 |
2.972768 |
| C |
C++ |
2.705629 |
2.912055 |
3.003783 |
3.261225 |
| Java |
2.690621 |
2.886152 |
3.402279 |
2.868535 |
| Python |
2.872331 |
3.222222 |
3.022103 |
3.192404 |
| D |
C++ |
2.704116 |
3.318817 |
3.356272 |
3.276412 |
| Java |
3.059933 |
3.182096 |
3.550492 |
2.692905 |
| Python |
2.908952 |
3.335675 |
3.330516 |
2.846557 |
grouped.agg(["mean", "sum"])
|
|
Data1 |
Data2 |
Data3 |
Data4 |
|
|
mean |
sum |
mean |
sum |
mean |
sum |
mean |
sum |
| group1 |
group2 |
|
|
|
|
|
|
|
|
| A |
C++ |
-0.001218 |
-1.048915 |
-0.026585 |
-22.889789 |
0.027946 |
24.061622 |
0.016807 |
14.470629 |
| Java |
0.022964 |
18.164195 |
0.006585 |
5.208598 |
0.028959 |
22.906719 |
-0.082633 |
-65.362462 |
| Python |
-0.052931 |
-42.768043 |
0.015955 |
12.891398 |
-0.018248 |
-14.744498 |
0.036879 |
29.797956 |
| B |
C++ |
-0.026381 |
-21.632712 |
0.028911 |
23.706887 |
0.059737 |
48.984659 |
0.023566 |
19.323968 |
| Java |
-0.025143 |
-20.994664 |
0.000074 |
0.061484 |
-0.001376 |
-1.148683 |
-0.037776 |
-31.543048 |
| Python |
0.020601 |
17.263691 |
0.016056 |
13.454654 |
0.048217 |
40.405849 |
-0.063053 |
-52.838291 |
| C |
C++ |
0.096891 |
78.481366 |
0.031978 |
25.902258 |
-0.040558 |
-32.852304 |
0.001194 |
0.966944 |
| Java |
0.035349 |
30.576985 |
0.050079 |
43.318363 |
-0.006473 |
-5.599206 |
0.028582 |
24.723654 |
| Python |
-0.027967 |
-22.821157 |
0.005902 |
4.815814 |
0.031947 |
26.068688 |
-0.006558 |
-5.351557 |
| D |
C++ |
-0.022580 |
-19.441690 |
0.007852 |
6.760476 |
-0.014059 |
-12.105210 |
0.006689 |
5.759175 |
| Java |
-0.037832 |
-32.081569 |
0.018611 |
15.782321 |
-0.036506 |
-30.957221 |
0.006046 |
5.127194 |
| Python |
0.028565 |
24.194374 |
-0.031818 |
-26.949471 |
-0.073728 |
-62.447950 |
-0.006745 |
-5.713342 |
grouped.agg({'Data1':"mean", 'Data2':"sum"})
|
|
Data1 |
Data2 |
| group1 |
group2 |
|
|
| A |
C++ |
-0.001218 |
-22.889789 |
| Java |
0.022964 |
5.208598 |
| Python |
-0.052931 |
12.891398 |
| B |
C++ |
-0.026381 |
23.706887 |
| Java |
-0.025143 |
0.061484 |
| Python |
0.020601 |
13.454654 |
| C |
C++ |
0.096891 |
25.902258 |
| Java |
0.035349 |
43.318363 |
| Python |
-0.027967 |
4.815814 |
| D |
C++ |
-0.022580 |
6.760476 |
| Java |
-0.037832 |
15.782321 |
| Python |
0.028565 |
-26.949471 |
# Pandas中的Concat、Join、merge操作
df1 = pd.DataFrame(['10','20','25','30'], index = ['a','b','c','d'], columns=['PE'])
df1
df2 = pd.DataFrame(['2.5','3.2','2.5'], index = ['a','d','g'], columns=['PB'])
df2
# Concat操作
pd.concat((df1,df2))
|
PE |
PB |
| a |
10 |
NaN |
| b |
20 |
NaN |
| c |
25 |
NaN |
| d |
30 |
NaN |
| a |
NaN |
2.5 |
| d |
NaN |
3.2 |
| g |
NaN |
2.5 |
pd.concat((df1,df2),axis=1)
|
PE |
PB |
| a |
10 |
2.5 |
| b |
20 |
NaN |
| c |
25 |
NaN |
| d |
30 |
3.2 |
| g |
NaN |
2.5 |
pd.concat((df1,df2),ignore_index=True)
|
PE |
PB |
| 0 |
10 |
NaN |
| 1 |
20 |
NaN |
| 2 |
25 |
NaN |
| 3 |
30 |
NaN |
| 4 |
NaN |
2.5 |
| 5 |
NaN |
3.2 |
| 6 |
NaN |
2.5 |
# JOIN操作
df1.join(df2) # 根据
|
PE |
PB |
| a |
10 |
2.5 |
| b |
20 |
NaN |
| c |
25 |
NaN |
| d |
30 |
3.2 |
df1.join(df2,how='left') #default how控制的是按索引合并的方法
|
PE |
PB |
| a |
10 |
2.5 |
| b |
20 |
NaN |
| c |
25 |
NaN |
| d |
30 |
3.2 |
df2.join(df1)
|
PB |
PE |
| a |
2.5 |
10 |
| d |
3.2 |
30 |
| g |
2.5 |
NaN |
df = pd.DataFrame({'PE':df1['PE'],'PB':df2['PB']}) # 使用DataFrame进行拼接操作
df
|
PE |
PB |
| a |
10 |
2.5 |
| b |
20 |
NaN |
| c |
25 |
NaN |
| d |
30 |
3.2 |
| g |
NaN |
2.5 |
df1.join(df2,how='right')
|
PE |
PB |
| a |
10 |
2.5 |
| d |
30 |
3.2 |
| g |
NaN |
2.5 |
df1.join(df2,how='inner') # 取交集,双方都有的才进行显示
df1.join(df2,how='outer') # 取并集,保留双方所有的内容
|
PE |
PB |
| a |
10 |
2.5 |
| b |
20 |
NaN |
| c |
25 |
NaN |
| d |
30 |
3.2 |
| g |
NaN |
2.5 |
#Merge操作:同等数量合并
# 按列来进行合并(也支持按index来进行合并)
df2 = pd.DataFrame(['2.5','3.2','2.5','2'], index=['a','b','c','d'], columns=['PB',])
df2
df1
df2
roe = pd.Series([0.12,0.06,0.08,0.02], index=['a','b','c','d'])
df1['ROE'] = roe
df2['ROE'] = roe
roe
a 0.12
b 0.06
c 0.08
d 0.02
dtype: float64
df1
|
PE |
ROE |
| a |
10 |
0.12 |
| b |
20 |
0.06 |
| c |
25 |
0.08 |
| d |
30 |
0.02 |
df2
|
PB |
ROE |
| a |
2.5 |
0.12 |
| b |
3.2 |
0.06 |
| c |
2.5 |
0.08 |
| d |
2 |
0.02 |
pd.merge(df1,df2)
|
PE |
ROE |
PB |
| 0 |
10 |
0.12 |
2.5 |
| 1 |
20 |
0.06 |
3.2 |
| 2 |
25 |
0.08 |
2.5 |
| 3 |
30 |
0.02 |
2 |
roe2 = pd.Series([2,0.06,0.08,0.02], index=['a','b','c','d'])
df2['ROE'] = roe2
df2
|
PB |
ROE |
| a |
2.5 |
2.00 |
| b |
3.2 |
0.06 |
| c |
2.5 |
0.08 |
| d |
2 |
0.02 |
pd.merge(df1,df2)
|
PE |
ROE |
PB |
| 0 |
20 |
0.06 |
3.2 |
| 1 |
25 |
0.08 |
2.5 |
| 2 |
30 |
0.02 |
2 |
df1.join(df2, rsuffix = '_r')
|
PE |
ROE |
PB |
ROE_r |
| a |
10 |
0.12 |
2.5 |
2.00 |
| b |
20 |
0.06 |
3.2 |
0.06 |
| c |
25 |
0.08 |
2.5 |
0.08 |
| d |
30 |
0.02 |
2 |
0.02 |
pd.merge(df1,df2,left_index=True,right_index=True,how='outer')
|
PE |
ROE_x |
PB |
ROE_y |
| a |
10 |
0.12 |
2.5 |
2.00 |
| b |
20 |
0.06 |
3.2 |
0.06 |
| c |
25 |
0.08 |
2.5 |
0.08 |
| d |
30 |
0.02 |
2 |
0.02 |
# 非同等数量的合并
df2 = pd.DataFrame(['2.5','3.2','2.5'], index=['a','b','c'], columns = ['PE'])
df2
df1 =pd.DataFrame(['10','20','25','30'], index = ['a','b','c','d'], columns=['PE'])
df1
roe=pd.Series([0.12,0.08,0.06],index=['b','d','c']) #添加一列以进行合并
df1['ROE'] = roe
df2['ROE'] = roe
df1
|
PE |
ROE |
| a |
10 |
NaN |
| b |
20 |
0.12 |
| c |
25 |
0.06 |
| d |
30 |
0.08 |
df2
|
PE |
ROE |
| a |
2.5 |
NaN |
| b |
3.2 |
0.12 |
| c |
2.5 |
0.06 |
pd.merge(df1,df2)
pd.merge(df1,df2,how='outer')
|
PE |
ROE |
| 0 |
10 |
NaN |
| 1 |
2.5 |
0.06 |
| 2 |
2.5 |
NaN |
| 3 |
20 |
0.12 |
| 4 |
25 |
0.06 |
| 5 |
3.2 |
0.12 |
| 6 |
30 |
0.08 |
#层次化索引
df = pd.Series(np.random.randn(5), index=[['a','a','b','b','b'],[1,2,1,2,3]])
df
a 1 1.238082
2 0.031972
b 1 0.518654
2 1.004512
3 -0.134055
dtype: float64
df.index
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2),
('b', 3)],
)
df['b']
1 0.518654
2 1.004512
3 -0.134055
dtype: float64
#层次化索引在数据重塑中的应用
df.unstack()
|
1 |
2 |
3 |
| a |
1.238082 |
0.031972 |
NaN |
| b |
0.518654 |
1.004512 |
-0.134055 |
df.unstack().unstack()
1 a 1.238082
b 0.518654
2 a 0.031972
b 1.004512
3 a NaN
b -0.134055
dtype: float64