Pandas

数据结构

  • Series
    • 一种类似一维数组的对象,由一组数据与一组标签(索引)组成
  • DataFrame
    • 表格型的数据结构,Excel与R的data.frame;可以有行索引和列索引
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter('ignore')

# 通过list构建Series
df = pd.Series(range(10,20))
type(df)
pandas.core.series.Series
df
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
df.values
array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
type(df.values)
numpy.ndarray
df.head(5)
0    10
1    11
2    12
3    13
4    14
dtype: int64
print(df[0])
10
print(df[8])
18
df[df>15]
6    16
7    17
8    18
9    19
dtype: int64
df>15
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool
# 通过字典构建Series
sales_data = {'a':1, 'b':2,'c':3}
df = pd.Series(sales_data)
print(df.head())
print(df.index)
a    1
b    2
c    3
dtype: int64
Index(['a', 'b', 'c'], dtype='object')
df.name = 'Series1'
df.index.name = 'idx'
df.head()
idx
a    1
b    2
c    3
Name: Series1, dtype: int64
df = pd.Series(range(8,14), index = ['a','b','c','d','e','f'])
#print(df)         # 打印整个 Series
print(df.head())  # 打印前几行(默认前5行) 显示 Series/DataFrame 的前 n 行数据
print(df.tail())  # 查看后5行
a     8
b     9
c    10
d    11
e    12
dtype: int64
b     9
c    10
d    11
e    12
f    13
dtype: int64
print(df[1]) # index行索引
print('-------------')
print(df[1:3]) # 切片索引 ,前闭后开
print('-------------')
9
-------------
b     9
c    10
dtype: int64
-------------
# DataFrame的构建
array = np.random.randn(6,4)
array
array([[ 0.06736358, -0.64255316,  0.20449614, -0.32843359],
       [ 0.94659344,  0.29186784, -0.60989757, -0.74899701],
       [-0.91717312,  1.48430298,  1.17129331,  1.81347392],
       [-3.62074396,  0.04548781,  1.65180448,  1.55219322],
       [ 0.32978011, -0.71838459, -1.21471285,  0.43443465],
       [-1.86554116, -1.25161324,  0.0990453 ,  0.46801472]])
df_obj = pd.DataFrame(array) # 通过Ndarray构建的DataFrame,如果没有声明,默认的行index和列index都是从0开始
df_obj.head()
0 1 2 3
0 0.067364 -0.642553 0.204496 -0.328434
1 0.946593 0.291868 -0.609898 -0.748997
2 -0.917173 1.484303 1.171293 1.813474
3 -3.620744 0.045488 1.651804 1.552193
4 0.329780 -0.718385 -1.214713 0.434435
df = pd.DataFrame(np.random.randn(6,4),columns = ['a','b','c','d'], index = ['e','f','g','h','i','j'])
df.head(6)
a b c d
e 0.583592 -1.501183 0.116741 -0.442168
f -0.161299 0.528328 -0.450201 -1.196337
g 1.042936 -0.152244 1.489381 0.968527
h 0.008104 0.342535 0.447326 0.136234
i -0.835837 -0.912627 -0.100602 0.612302
j 0.206510 0.232865 -0.320405 -2.312483
# 通过字典构建DataFrame
dict_data = {'Date':pd.Timestamp('20251226'),
            'Number':pd.Series(6,index=list(range(4))),
            'Course_name':pd.Series(['Python','Quant','Finance','CFA']),
            'Company':'JET'}
# 显示DataFrame
df = pd.DataFrame(dict_data)
df.head()
Date Number Course_name Company
0 2025-12-26 6 Python JET
1 2025-12-26 6 Quant JET
2 2025-12-26 6 Finance JET
3 2025-12-26 6 CFA JET
# 自定义构建DataFrame
df = pd.DataFrame([10,30,60,80], columns=['Quantity'], index=['a','b','c','d'])
df
Quantity
a 10
b 30
c 60
d 80
print(type(df))
<class 'pandas.core.frame.DataFrame'>
# DataFrame的选择操作
dict_data = {'Date':pd.Timestamp('20251226'),
            'Number':pd.Series(6,index=list(range(4))),
            'Course_name':pd.Series(['Python','Quant','Finance','CFA']),
            'Company':'JET'}
# 显示DataFrame
df = pd.DataFrame(dict_data)
df.head()
Date Number Course_name Company
0 2025-12-26 6 Python JET
1 2025-12-26 6 Quant JET
2 2025-12-26 6 Finance JET
3 2025-12-26 6 CFA JET
df['Course_name']
0     Python
1      Quant
2    Finance
3        CFA
Name: Course_name, dtype: object
# 增加具体的列
df['Period'] = range(21,25)
df.head()
Date Number Course_name Company Period
0 2025-12-26 6 Python JET 21
1 2025-12-26 6 Quant JET 22
2 2025-12-26 6 Finance JET 23
3 2025-12-26 6 CFA JET 24
# 删除列
del(df['Period'])
df.head()
Date Number Course_name Company
0 2025-12-26 6 Python JET
1 2025-12-26 6 Quant JET
2 2025-12-26 6 Finance JET
3 2025-12-26 6 CFA JET
# 将一个完整的DataFrame作为新建的列,Pandas会自动根据index的lable进行匹配
df['Names'] = pd.DataFrame(['Alpha','Beta','Gramma','Theta']) # ,index=['4','2','3','1'] 这个没试成功
df
Date Number Course_name Company Names
0 2025-12-26 6 Python JET Alpha
1 2025-12-26 6 Quant JET Beta
2 2025-12-26 6 Finance JET Gramma
3 2025-12-26 6 CFA JET Theta
df = pd.DataFrame([80.5,90,58,88],columns=['Scores'],
                  index=['a','b','c','d'],dtype='f')
df
Scores
a 80.5
b 90.0
c 58.0
d 88.0
df.dtypes
Scores    float32
dtype: object
df.columns #获得列名
Index(['Scores'], dtype='object')
df.index[1:3] # 选择index
Index(['b', 'c'], dtype='object')
df.iloc[1:3]
Scores
b 90.0
c 58.0
df.loc['c']
Scores    58.0
Name: c, dtype: float32
df.sum()
Scores    316.5
dtype: float32
df.rename(columns={'Scores':'Score'},inplace=True) #重命名列,注意重命名的时候要用字典来进行传输
df
Score
a 80.5
b 90.0
c 58.0
d 88.0
df.shape
(4, 1)
#DataFrame的向量化操作
df
Score
a 80.5
b 90.0
c 58.0
d 88.0
df+df
Score
a 161.0
b 180.0
c 116.0
d 176.0
df - df
Score
a 0.0
b 0.0
c 0.0
d 0.0
df * 3
Score
a 241.5
b 270.0
c 174.0
d 264.0
df
Score
a 80.5
b 90.0
c 58.0
d 88.0
df['Score'] = df['Score'].astype('int')
df.dtypes
Score    int64
dtype: object
d = {
    'PE':pd.Series([10,20,30,40],index=['Company a','Company b','Company c','Company d']),
    'PB':pd.Series([2,3,2.5,4],index=['Company a','Company b','Company c','Company d']),
    'ROE':pd.Series([0.06,0.1,0.08,0.02],index=['Company a','Company b','Company c','Company d'])
}
df = pd.DataFrame(d)
df
PE PB ROE
Company a 10 2.0 0.06
Company b 20 3.0 0.10
Company c 30 2.5 0.08
Company d 40 4.0 0.02
(df.PE <25) * 1
Company a    1
Company b    1
Company c    0
Company d    0
Name: PE, dtype: int64
df[df['PE']<25]
PE PB ROE
Company a 10 2.0 0.06
Company b 20 3.0 0.10
df[df.PE < 25]
PE PB ROE
Company a 10 2.0 0.06
Company b 20 3.0 0.10
df[(df.PE<25) & (df.PB<2.5)]
PE PB ROE
Company a 10 2.0 0.06
df[ (df.PE<25) * 1 + (df.PB < 2.5) * 1 == 2] #多个条件
PE PB ROE
Company a 10 2.0 0.06
df[ (df.PE < 25) * 1 + (df.PB < 2.5) * 1 + (df.ROE > 0.07) * 1 >= 2] # 三个当中满足任意两个条件
PE PB ROE
Company a 10 2.0 0.06
Company b 20 3.0 0.10
#DataFrame Apply函数
a = np.random.randn(9,6)
a.round(5)
array([[-7.87620e-01, -1.24492e+00,  1.09361e+00, -5.22550e-01,
         2.12030e-01, -1.30360e+00],
       [-4.30230e-01, -6.23100e-01,  1.66740e-01, -8.75420e-01,
        -1.78512e+00,  1.59138e+00],
       [-1.94390e+00, -1.47000e-03,  1.31750e-01,  1.13086e+00,
         4.62110e-01, -7.69850e-01],
       [-1.08891e+00, -1.12085e+00, -5.33600e-02, -9.80170e-01,
         1.77180e-01, -1.36398e+00],
       [-6.52240e-01, -6.07170e-01,  5.18720e-01, -7.06020e-01,
         2.30450e-01,  9.53490e-01],
       [ 6.51890e-01, -2.44220e-01, -4.47850e-01, -7.46110e-01,
         6.64090e-01, -3.57600e-02],
       [-1.05783e+00, -3.45810e-01,  3.88430e-01, -2.13310e-01,
        -3.91820e-01,  6.51190e-01],
       [-1.60257e+00,  5.14340e-01, -3.61200e-02,  2.39620e-01,
        -7.57500e-01,  2.22660e-01],
       [ 7.30130e-01, -3.21070e-01,  5.67360e-01,  1.83213e+00,
         6.88210e-01,  2.87610e-01]])
df = pd.DataFrame(a)
df
0 1 2 3 4 5
0 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
1 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
3 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
4 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
5 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
6 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
7 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
8 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
df.columns = ['a','b','c','d','e','f']
df
a b c d e f
0 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
1 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
3 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
4 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
5 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
6 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
7 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
8 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
dates = pd.date_range('2024-7-1',periods=9,freq='M')
dates
DatetimeIndex(['2024-07-31', '2024-08-31', '2024-09-30', '2024-10-31',
               '2024-11-30', '2024-12-31', '2025-01-31', '2025-02-28',
               '2025-03-31'],
              dtype='datetime64[ns]', freq='ME')
df.index = dates
df
a b c d e f
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
2024-08-31 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2024-09-30 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
2024-10-31 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
2024-11-30 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
2024-12-31 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
2025-01-31 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
2025-02-28 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
2025-03-31 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
def square_fun(x):
    return x ** 2
df.apply(square_fun)
a b c d e f
2024-07-31 0.620352 1.549823 1.195979 0.273060 0.044957 1.699374
2024-08-31 0.185095 0.388257 0.027803 0.766361 3.186637 2.532503
2024-09-30 3.778741 0.000002 0.017358 1.278851 0.213550 0.592671
2024-10-31 1.185720 1.256308 0.002847 0.960740 0.031391 1.860434
2024-11-30 0.425417 0.368660 0.269066 0.498464 0.053107 0.909145
2024-12-31 0.424961 0.059642 0.200574 0.556678 0.441011 0.001279
2025-01-31 1.119000 0.119585 0.150876 0.045500 0.153520 0.424052
2025-02-28 2.568239 0.264541 0.001305 0.057419 0.573804 0.049578
2025-03-31 0.533097 0.103089 0.321903 3.356713 0.473627 0.082721
df.apply(lambda x : x ** 0.5)
a b c d e f
2024-07-31 NaN NaN 1.045757 NaN 0.460469 NaN
2024-08-31 NaN NaN 0.408343 NaN NaN 1.261501
2024-09-30 NaN NaN 0.362972 1.063420 0.679790 NaN
2024-10-31 NaN NaN NaN NaN 0.420923 NaN
2024-11-30 NaN NaN 0.720219 NaN 0.480052 0.976469
2024-12-31 0.807397 NaN NaN NaN 0.814915 NaN
2025-01-31 NaN NaN 0.623240 NaN NaN 0.806965
2025-02-28 NaN 0.717172 NaN 0.489512 NaN 0.471870
2025-03-31 0.854479 NaN 0.753236 1.353563 0.829582 0.536295
def find_min(x):
    return x.min()

df.apply(find_min)
a   -1.943899
b   -1.244919
c   -0.447855
d   -0.980173
e   -1.785116
f   -1.363977
dtype: float64
df.apply(find_min,axis=1)
2024-07-31   -1.303601
2024-08-31   -1.785116
2024-09-30   -1.943899
2024-10-31   -1.363977
2024-11-30   -0.706020
2024-12-31   -0.746109
2025-01-31   -1.057828
2025-02-28   -1.602573
2025-03-31   -0.321074
Freq: ME, dtype: float64
df
a b c d e f
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
2024-08-31 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2024-09-30 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
2024-10-31 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
2024-11-30 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
2024-12-31 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
2025-01-31 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
2025-02-28 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
2025-03-31 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
#DataFrame排序
df.sort_index(ascending=False) # 降序排序
a b c d e f
2025-03-31 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
2025-02-28 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
2025-01-31 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
2024-12-31 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
2024-11-30 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
2024-10-31 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
2024-09-30 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
2024-08-31 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
df.sort_index(axis=1,ascending=False)
f e d c b a
2024-07-31 -1.303601 0.212032 -0.522551 1.093608 -1.244919 -0.787624
2024-08-31 1.591384 -1.785116 -0.875421 0.166744 -0.623103 -0.430226
2024-09-30 -0.769851 0.462114 1.130863 0.131749 -0.001466 -1.943899
2024-10-31 -1.363977 0.177176 -0.980173 -0.053360 -1.120851 -1.088908
2024-11-30 0.953491 0.230450 -0.706020 0.518715 -0.607174 -0.652240
2024-12-31 -0.035765 0.664087 -0.746109 -0.447855 -0.244218 0.651890
2025-01-31 0.651193 -0.391816 -0.213307 0.388428 -0.345811 -1.057828
2025-02-28 0.222661 -0.757499 0.239622 -0.036118 0.514335 -1.602573
2025-03-31 0.287612 0.688206 1.832134 0.567365 -0.321074 0.730135
df.sort_values(by='a') # 按values排序
a b c d e f
2024-09-30 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
2025-02-28 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
2024-10-31 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
2025-01-31 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
2024-11-30 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
2024-08-31 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2024-12-31 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
2025-03-31 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
#通用函数
df.sum(axis =1) #按行排序
2024-07-31   -2.553054
2024-08-31   -1.955737
2024-09-30   -0.990490
2024-10-31   -4.430094
2024-11-30   -0.262777
2024-12-31   -0.157969
2025-01-31   -0.969142
2025-02-28   -1.419570
2025-03-31    3.784377
Freq: ME, dtype: float64
df.mean()
a   -0.686808
b   -0.443809
c    0.258808
d   -0.093440
e   -0.055596
f    0.025905
dtype: float64
df.cumsum()
a b c d e f
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
2024-08-31 -1.217850 -1.868021 1.260352 -1.397972 -1.573084 0.287784
2024-09-30 -3.161749 -1.869487 1.392101 -0.267109 -1.110969 -0.482068
2024-10-31 -4.250657 -2.990339 1.338741 -1.247282 -0.933793 -1.846045
2024-11-30 -4.902897 -3.597513 1.857456 -1.953302 -0.703343 -0.892554
2024-12-31 -4.251007 -3.841731 1.409601 -2.699411 -0.039256 -0.928319
2025-01-31 -5.308835 -4.187542 1.798029 -2.912717 -0.431072 -0.277126
2025-02-28 -6.911407 -3.673207 1.761911 -2.673095 -1.188571 -0.054465
2025-03-31 -6.181272 -3.994281 2.329276 -0.840961 -0.500365 0.233147
df.describe()
a b c d e f
count 9.000000 9.000000 9.000000 9.000000 9.000000 9.000000
mean -0.686808 -0.443809 0.258808 -0.093440 -0.055596 0.025905
std 0.907970 0.540510 0.445095 0.982039 0.801856 1.009066
min -1.943899 -1.244919 -0.447855 -0.980173 -1.785116 -1.363977
25% -1.088908 -0.623103 -0.036118 -0.746109 -0.391816 -0.769851
50% -0.787624 -0.345811 0.166744 -0.522551 0.212032 0.222661
75% -0.430226 -0.244218 0.518715 0.239622 0.462114 0.651193
max 0.730135 0.514335 1.093608 1.832134 0.688206 1.591384
import numpy as np
import pandas as pd
import seaborn as sns

# 创建一个包含10000个日期的时间序列,起始日期为2026年1月1日,频率为每天('D')
period = pd.date_range('2026-1-1', periods = 10000 , freq='D')
df = pd.DataFrame(np.random.randn(10000,4), columns=['Data1','Data2','Data3','Data4'], index = period)
# np.random.randn(10000,4):生成10000行×4列的随机数矩阵,数据符合标准正态分布(均值为0,标准差为1)
# columns=['Data1','Data2','Data3','Data4']:设置四列的列名
# index = period:使用前面创建的时间序列作为行索引
# 最终得到一个10000行×4列的DataFrame,每行对应一个日期,每列包含随机数据
df.head(100)
# 显示DataFrame的前5行数据(默认显示5行),用于快速预览数据结构和内容
Data1 Data2 Data3 Data4
2026-01-01 -0.301301 -0.066839 2.569558 0.782728
2026-01-02 -0.825641 -0.055080 -0.147181 0.204207
2026-01-03 -0.088487 -0.244936 0.548284 -1.444184
2026-01-04 0.615162 -2.579200 -1.301881 1.563105
2026-01-05 2.577470 0.330930 0.594907 1.097716
... ... ... ... ...
2026-04-06 0.107950 0.255822 0.017296 -0.200156
2026-04-07 0.066923 0.182171 0.764556 1.502252
2026-04-08 1.514781 0.527656 0.823396 2.165366
2026-04-09 0.611559 0.394781 -0.538892 0.073134
2026-04-10 -1.270301 0.177674 0.939690 0.462134

100 rows × 4 columns

# 新增一列用于Group操作
df['group1'] = np.random.choice(['A','B','C','D'], 10000)
df.tail()
Data1 Data2 Data3 Data4 group1
2053-05-14 -1.247965 0.973246 0.329554 -1.354778 B
2053-05-15 -0.248909 -1.274829 -0.033650 -0.924811 B
2053-05-16 0.259964 -1.488595 -0.448005 -2.367739 D
2053-05-17 0.214252 0.171712 1.113388 0.405221 C
2053-05-18 -0.139803 -0.029577 0.003300 0.140935 A
# 生成DataFrameGroupBy对象
grouped = df.groupby('group1')
type(grouped)
pandas.core.groupby.generic.DataFrameGroupBy
grouped.size() # 每组元素数量
group1
A    2460
B    2493
C    2491
D    2556
dtype: int64
grouped.sum()
Data1 Data2 Data3 Data4
group1
A -25.652763 -4.789793 32.223843 -21.093877
B -25.363686 37.223025 88.241825 -65.057371
C 86.237195 74.036435 -12.382822 20.339041
D -27.328884 -4.406674 -105.510381 5.173027
grouped.max()
Data1 Data2 Data3 Data4
group1
A 3.285374 3.396309 3.512840 3.371779
B 4.068218 3.701382 3.496006 3.160612
C 2.872331 3.222222 3.402279 3.261225
D 3.059933 3.335675 3.550492 3.276412
grouped.min()
Data1 Data2 Data3 Data4
group1
A -3.596096 -3.114213 -3.614452 -3.653505
B -3.716233 -3.300269 -3.657607 -3.295326
C -3.663596 -3.078911 -3.691946 -3.363301
D -3.558211 -3.736870 -3.157152 -3.381865
grouped.mean()
Data1 Data2 Data3 Data4
group1
A -0.010428 -0.001947 0.013099 -0.008575
B -0.010174 0.014931 0.035396 -0.026096
C 0.034620 0.029722 -0.004971 0.008165
D -0.010692 -0.001724 -0.041279 0.002024
grouped.describe()  #整体数据特征概览
Data1 Data2 ... Data3 Data4
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
group1
A 2460.0 -0.010428 0.990728 -3.596096 -0.678900 -0.007923 0.641929 3.285374 2460.0 -0.001947 ... 0.661997 3.512840 2460.0 -0.008575 1.012226 -3.653505 -0.673584 -0.023260 0.654113 3.371779
B 2493.0 -0.010174 0.996869 -3.716233 -0.652177 -0.025997 0.629116 4.068218 2493.0 0.014931 ... 0.722496 3.496006 2493.0 -0.026096 1.006106 -3.295326 -0.710430 -0.010635 0.651514 3.160612
C 2491.0 0.034620 1.003380 -3.663596 -0.637421 0.032357 0.713676 2.872331 2491.0 0.029722 ... 0.672693 3.402279 2491.0 0.008165 1.009963 -3.363301 -0.678624 0.021889 0.686504 3.261225
D 2556.0 -0.010692 0.972185 -3.558211 -0.654749 0.008334 0.633307 3.059933 2556.0 -0.001724 ... 0.626267 3.550492 2556.0 0.002024 1.000016 -3.381865 -0.680296 -0.011760 0.669893 3.276412

4 rows × 32 columns

np.transpose(grouped.describe())
group1 A B C D
Data1 count 2460.000000 2493.000000 2491.000000 2556.000000
mean -0.010428 -0.010174 0.034620 -0.010692
std 0.990728 0.996869 1.003380 0.972185
min -3.596096 -3.716233 -3.663596 -3.558211
25% -0.678900 -0.652177 -0.637421 -0.654749
50% -0.007923 -0.025997 0.032357 0.008334
75% 0.641929 0.629116 0.713676 0.633307
max 3.285374 4.068218 2.872331 3.059933
Data2 count 2460.000000 2493.000000 2491.000000 2556.000000
mean -0.001947 0.014931 0.029722 -0.001724
std 1.019700 0.972062 0.994766 1.014706
min -3.114213 -3.300269 -3.078911 -3.736870
25% -0.709019 -0.655433 -0.620272 -0.705857
50% -0.016191 0.018039 0.026946 -0.000068
75% 0.688548 0.652524 0.719439 0.703314
max 3.396309 3.701382 3.222222 3.335675
Data3 count 2460.000000 2493.000000 2491.000000 2556.000000
mean 0.013099 0.035396 -0.004971 -0.041279
std 1.001428 1.006164 0.987258 0.994164
min -3.614452 -3.657607 -3.691946 -3.157152
25% -0.649259 -0.669193 -0.659139 -0.711592
50% 0.020383 0.026139 0.015575 -0.014482
75% 0.661997 0.722496 0.672693 0.626267
max 3.512840 3.496006 3.402279 3.550492
Data4 count 2460.000000 2493.000000 2491.000000 2556.000000
mean -0.008575 -0.026096 0.008165 0.002024
std 1.012226 1.006106 1.009963 1.000016
min -3.653505 -3.295326 -3.363301 -3.381865
25% -0.673584 -0.710430 -0.678624 -0.680296
50% -0.023260 -0.010635 0.021889 -0.011760
75% 0.654113 0.651514 0.686504 0.669893
max 3.371779 3.160612 3.261225 3.276412
grouped.get_group('A').head()
Data1 Data2 Data3 Data4 group1
2026-01-01 -0.301301 -0.066839 2.569558 0.782728 A
2026-01-04 0.615162 -2.579200 -1.301881 1.563105 A
2026-01-05 2.577470 0.330930 0.594907 1.097716 A
2026-01-08 -0.969435 -0.086215 1.613460 -0.039984 A
2026-01-09 0.929410 -0.513562 -2.225990 -1.548603 A
# 双重Group
df['group2'] = np.random.choice(['Python','C++','Java'], 10000)
df.head()
Data1 Data2 Data3 Data4 group1 group2
2026-01-01 -0.301301 -0.066839 2.569558 0.782728 A Java
2026-01-02 -0.825641 -0.055080 -0.147181 0.204207 D Java
2026-01-03 -0.088487 -0.244936 0.548284 -1.444184 B Python
2026-01-04 0.615162 -2.579200 -1.301881 1.563105 A C++
2026-01-05 2.577470 0.330930 0.594907 1.097716 A Python
grouped = df.groupby(['group1','group2'])
grouped.size()
group1  group2
A       C++       861
        Java      791
        Python    808
B       C++       820
        Java      835
        Python    838
C       C++       810
        Java      865
        Python    816
D       C++       861
        Java      848
        Python    847
dtype: int64
grouped.max()
Data1 Data2 Data3 Data4
group1 group2
A C++ 3.285374 3.396309 3.445427 3.305271
Java 3.156763 2.811295 3.512840 3.371779
Python 2.913568 3.320395 3.439421 3.292169
B C++ 2.819403 3.368093 3.496006 2.923737
Java 4.068218 3.046342 3.407152 3.160612
Python 3.170314 3.701382 2.926639 2.972768
C C++ 2.705629 2.912055 3.003783 3.261225
Java 2.690621 2.886152 3.402279 2.868535
Python 2.872331 3.222222 3.022103 3.192404
D C++ 2.704116 3.318817 3.356272 3.276412
Java 3.059933 3.182096 3.550492 2.692905
Python 2.908952 3.335675 3.330516 2.846557
grouped.agg(["mean", "sum"])
Data1 Data2 Data3 Data4
mean sum mean sum mean sum mean sum
group1 group2
A C++ -0.001218 -1.048915 -0.026585 -22.889789 0.027946 24.061622 0.016807 14.470629
Java 0.022964 18.164195 0.006585 5.208598 0.028959 22.906719 -0.082633 -65.362462
Python -0.052931 -42.768043 0.015955 12.891398 -0.018248 -14.744498 0.036879 29.797956
B C++ -0.026381 -21.632712 0.028911 23.706887 0.059737 48.984659 0.023566 19.323968
Java -0.025143 -20.994664 0.000074 0.061484 -0.001376 -1.148683 -0.037776 -31.543048
Python 0.020601 17.263691 0.016056 13.454654 0.048217 40.405849 -0.063053 -52.838291
C C++ 0.096891 78.481366 0.031978 25.902258 -0.040558 -32.852304 0.001194 0.966944
Java 0.035349 30.576985 0.050079 43.318363 -0.006473 -5.599206 0.028582 24.723654
Python -0.027967 -22.821157 0.005902 4.815814 0.031947 26.068688 -0.006558 -5.351557
D C++ -0.022580 -19.441690 0.007852 6.760476 -0.014059 -12.105210 0.006689 5.759175
Java -0.037832 -32.081569 0.018611 15.782321 -0.036506 -30.957221 0.006046 5.127194
Python 0.028565 24.194374 -0.031818 -26.949471 -0.073728 -62.447950 -0.006745 -5.713342
grouped.agg({'Data1':"mean", 'Data2':"sum"})
Data1 Data2
group1 group2
A C++ -0.001218 -22.889789
Java 0.022964 5.208598
Python -0.052931 12.891398
B C++ -0.026381 23.706887
Java -0.025143 0.061484
Python 0.020601 13.454654
C C++ 0.096891 25.902258
Java 0.035349 43.318363
Python -0.027967 4.815814
D C++ -0.022580 6.760476
Java -0.037832 15.782321
Python 0.028565 -26.949471
# Pandas中的Concat、Join、merge操作

df1 = pd.DataFrame(['10','20','25','30'], index = ['a','b','c','d'], columns=['PE'])
df1
PE
a 10
b 20
c 25
d 30
df2 = pd.DataFrame(['2.5','3.2','2.5'], index = ['a','d','g'], columns=['PB'])
df2
PB
a 2.5
d 3.2
g 2.5
# Concat操作
pd.concat((df1,df2))
PE PB
a 10 NaN
b 20 NaN
c 25 NaN
d 30 NaN
a NaN 2.5
d NaN 3.2
g NaN 2.5
pd.concat((df1,df2),axis=1)
PE PB
a 10 2.5
b 20 NaN
c 25 NaN
d 30 3.2
g NaN 2.5
pd.concat((df1,df2),ignore_index=True)
PE PB
0 10 NaN
1 20 NaN
2 25 NaN
3 30 NaN
4 NaN 2.5
5 NaN 3.2
6 NaN 2.5
# JOIN操作
df1.join(df2) #  根据
PE PB
a 10 2.5
b 20 NaN
c 25 NaN
d 30 3.2
df1.join(df2,how='left') #default how控制的是按索引合并的方法
PE PB
a 10 2.5
b 20 NaN
c 25 NaN
d 30 3.2
df2.join(df1)
PB PE
a 2.5 10
d 3.2 30
g 2.5 NaN
df = pd.DataFrame({'PE':df1['PE'],'PB':df2['PB']}) # 使用DataFrame进行拼接操作
df
PE PB
a 10 2.5
b 20 NaN
c 25 NaN
d 30 3.2
g NaN 2.5
df1.join(df2,how='right')
PE PB
a 10 2.5
d 30 3.2
g NaN 2.5
df1.join(df2,how='inner') # 取交集,双方都有的才进行显示
PE PB
a 10 2.5
d 30 3.2
df1.join(df2,how='outer') # 取并集,保留双方所有的内容
PE PB
a 10 2.5
b 20 NaN
c 25 NaN
d 30 3.2
g NaN 2.5
#Merge操作:同等数量合并
# 按列来进行合并(也支持按index来进行合并)
df2 = pd.DataFrame(['2.5','3.2','2.5','2'], index=['a','b','c','d'], columns=['PB',])
df2
PB
a 2.5
b 3.2
c 2.5
d 2
df1
PE
a 10
b 20
c 25
d 30
df2
PB
a 2.5
b 3.2
c 2.5
d 2
roe = pd.Series([0.12,0.06,0.08,0.02], index=['a','b','c','d'])
df1['ROE'] = roe
df2['ROE'] = roe
roe
a    0.12
b    0.06
c    0.08
d    0.02
dtype: float64
df1
PE ROE
a 10 0.12
b 20 0.06
c 25 0.08
d 30 0.02
df2
PB ROE
a 2.5 0.12
b 3.2 0.06
c 2.5 0.08
d 2 0.02
pd.merge(df1,df2)
PE ROE PB
0 10 0.12 2.5
1 20 0.06 3.2
2 25 0.08 2.5
3 30 0.02 2
roe2 = pd.Series([2,0.06,0.08,0.02], index=['a','b','c','d'])
df2['ROE'] = roe2
df2
PB ROE
a 2.5 2.00
b 3.2 0.06
c 2.5 0.08
d 2 0.02
pd.merge(df1,df2)
PE ROE PB
0 20 0.06 3.2
1 25 0.08 2.5
2 30 0.02 2
df1.join(df2, rsuffix = '_r')
PE ROE PB ROE_r
a 10 0.12 2.5 2.00
b 20 0.06 3.2 0.06
c 25 0.08 2.5 0.08
d 30 0.02 2 0.02
pd.merge(df1,df2,left_index=True,right_index=True,how='outer')
PE ROE_x PB ROE_y
a 10 0.12 2.5 2.00
b 20 0.06 3.2 0.06
c 25 0.08 2.5 0.08
d 30 0.02 2 0.02
# 非同等数量的合并
df2 = pd.DataFrame(['2.5','3.2','2.5'], index=['a','b','c'], columns = ['PE'])
df2
PE
a 2.5
b 3.2
c 2.5
df1 =pd.DataFrame(['10','20','25','30'], index = ['a','b','c','d'], columns=['PE'])
df1
PE
a 10
b 20
c 25
d 30
roe=pd.Series([0.12,0.08,0.06],index=['b','d','c']) #添加一列以进行合并
df1['ROE'] = roe
df2['ROE'] = roe
df1
PE ROE
a 10 NaN
b 20 0.12
c 25 0.06
d 30 0.08
df2
PE ROE
a 2.5 NaN
b 3.2 0.12
c 2.5 0.06
pd.merge(df1,df2)
PE ROE
pd.merge(df1,df2,how='outer')
PE ROE
0 10 NaN
1 2.5 0.06
2 2.5 NaN
3 20 0.12
4 25 0.06
5 3.2 0.12
6 30 0.08
#层次化索引

df = pd.Series(np.random.randn(5), index=[['a','a','b','b','b'],[1,2,1,2,3]])
df
a  1    1.238082
   2    0.031972
b  1    0.518654
   2    1.004512
   3   -0.134055
dtype: float64
df.index
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('b', 3)],
           )
df['b']
1    0.518654
2    1.004512
3   -0.134055
dtype: float64
#层次化索引在数据重塑中的应用
df.unstack()
1 2 3
a 1.238082 0.031972 NaN
b 0.518654 1.004512 -0.134055
df.unstack().unstack()
1  a    1.238082
   b    0.518654
2  a    0.031972
   b    1.004512
3  a         NaN
   b   -0.134055
dtype: float64
posted @ 2025-12-26 21:15  是17阿哥呀  阅读(6)  评论(0)    收藏  举报