Pandas

数据结构

  • Series
    • 一种类似一维数组的对象,由一组数据与一组标签(索引)组成
  • DataFrame
    • 表格型的数据结构,Excel与R的data.frame;可以有行索引和列索引
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter('ignore')

# 通过list构建Series
df = pd.Series(range(10,20))
type(df)
pandas.core.series.Series
df
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
df.values
array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
type(df.values)
numpy.ndarray
df.head(5)
0    10
1    11
2    12
3    13
4    14
dtype: int64
print(df[0])
10
print(df[8])
18
df[df>15]
6    16
7    17
8    18
9    19
dtype: int64
df>15
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool
# 通过字典构建Series
sales_data = {'a':1, 'b':2,'c':3}
df = pd.Series(sales_data)
print(df.head())
print(df.index)
a    1
b    2
c    3
dtype: int64
Index(['a', 'b', 'c'], dtype='object')
df.name = 'Series1'
df.index.name = 'idx'
df.head()
idx
a    1
b    2
c    3
Name: Series1, dtype: int64
df = pd.Series(range(8,14), index = ['a','b','c','d','e','f'])
#print(df)         # 打印整个 Series
print(df.head())  # 打印前几行(默认前5行) 显示 Series/DataFrame 的前 n 行数据
print(df.tail())  # 查看后5行
a     8
b     9
c    10
d    11
e    12
dtype: int64
b     9
c    10
d    11
e    12
f    13
dtype: int64
print(df[1]) # index行索引
print('-------------')
print(df[1:3]) # 切片索引 ,前闭后开
print('-------------')
9
-------------
b     9
c    10
dtype: int64
-------------
# DataFrame的构建
array = np.random.randn(6,4)
array
array([[ 0.06736358, -0.64255316,  0.20449614, -0.32843359],
       [ 0.94659344,  0.29186784, -0.60989757, -0.74899701],
       [-0.91717312,  1.48430298,  1.17129331,  1.81347392],
       [-3.62074396,  0.04548781,  1.65180448,  1.55219322],
       [ 0.32978011, -0.71838459, -1.21471285,  0.43443465],
       [-1.86554116, -1.25161324,  0.0990453 ,  0.46801472]])
df_obj = pd.DataFrame(array) # 通过Ndarray构建的DataFrame,如果没有声明,默认的行index和列index都是从0开始
df_obj.head()
0 1 2 3
0 0.067364 -0.642553 0.204496 -0.328434
1 0.946593 0.291868 -0.609898 -0.748997
2 -0.917173 1.484303 1.171293 1.813474
3 -3.620744 0.045488 1.651804 1.552193
4 0.329780 -0.718385 -1.214713 0.434435
df = pd.DataFrame(np.random.randn(6,4),columns = ['a','b','c','d'], index = ['e','f','g','h','i','j'])
df.head(6)
a b c d
e 0.583592 -1.501183 0.116741 -0.442168
f -0.161299 0.528328 -0.450201 -1.196337
g 1.042936 -0.152244 1.489381 0.968527
h 0.008104 0.342535 0.447326 0.136234
i -0.835837 -0.912627 -0.100602 0.612302
j 0.206510 0.232865 -0.320405 -2.312483
# 通过字典构建DataFrame
dict_data = {'Date':pd.Timestamp('20251226'),
            'Number':pd.Series(6,index=list(range(4))),
            'Course_name':pd.Series(['Python','Quant','Finance','CFA']),
            'Company':'JET'}
# 显示DataFrame
df = pd.DataFrame(dict_data)
df.head()
Date Number Course_name Company
0 2025-12-26 6 Python JET
1 2025-12-26 6 Quant JET
2 2025-12-26 6 Finance JET
3 2025-12-26 6 CFA JET
# 自定义构建DataFrame
df = pd.DataFrame([10,30,60,80], columns=['Quantity'], index=['a','b','c','d'])
df
Quantity
a 10
b 30
c 60
d 80
print(type(df))
<class 'pandas.core.frame.DataFrame'>
# DataFrame的选择操作
dict_data = {'Date':pd.Timestamp('20251226'),
            'Number':pd.Series(6,index=list(range(4))),
            'Course_name':pd.Series(['Python','Quant','Finance','CFA']),
            'Company':'JET'}
# 显示DataFrame
df = pd.DataFrame(dict_data)
df.head()
Date Number Course_name Company
0 2025-12-26 6 Python JET
1 2025-12-26 6 Quant JET
2 2025-12-26 6 Finance JET
3 2025-12-26 6 CFA JET
df['Course_name']
0     Python
1      Quant
2    Finance
3        CFA
Name: Course_name, dtype: object
# 增加具体的列
df['Period'] = range(21,25)
df.head()
Date Number Course_name Company Period
0 2025-12-26 6 Python JET 21
1 2025-12-26 6 Quant JET 22
2 2025-12-26 6 Finance JET 23
3 2025-12-26 6 CFA JET 24
# 删除列
del(df['Period'])
df.head()
Date Number Course_name Company
0 2025-12-26 6 Python JET
1 2025-12-26 6 Quant JET
2 2025-12-26 6 Finance JET
3 2025-12-26 6 CFA JET
# 将一个完整的DataFrame作为新建的列,Pandas会自动根据index的lable进行匹配
df['Names'] = pd.DataFrame(['Alpha','Beta','Gramma','Theta']) # ,index=['4','2','3','1'] 这个没试成功
df
Date Number Course_name Company Names
0 2025-12-26 6 Python JET Alpha
1 2025-12-26 6 Quant JET Beta
2 2025-12-26 6 Finance JET Gramma
3 2025-12-26 6 CFA JET Theta
df = pd.DataFrame([80.5,90,58,88],columns=['Scores'],
                  index=['a','b','c','d'],dtype='f')
df
Scores
a 80.5
b 90.0
c 58.0
d 88.0
df.dtypes
Scores    float32
dtype: object
df.columns #获得列名
Index(['Scores'], dtype='object')
df.index[1:3] # 选择index
Index(['b', 'c'], dtype='object')
df.iloc[1:3]
Scores
b 90.0
c 58.0
df.loc['c']
Scores    58.0
Name: c, dtype: float32
df.sum()
Scores    316.5
dtype: float32
df.rename(columns={'Scores':'Score'},inplace=True) #重命名列,注意重命名的时候要用字典来进行传输
df
Score
a 80.5
b 90.0
c 58.0
d 88.0
df.shape
(4, 1)
#DataFrame的向量化操作
df
Score
a 80.5
b 90.0
c 58.0
d 88.0
df+df
Score
a 161.0
b 180.0
c 116.0
d 176.0
df - df
Score
a 0.0
b 0.0
c 0.0
d 0.0
df * 3
Score
a 241.5
b 270.0
c 174.0
d 264.0
df
Score
a 80.5
b 90.0
c 58.0
d 88.0
df['Score'] = df['Score'].astype('int')
df.dtypes
Score    int64
dtype: object
d = {
    'PE':pd.Series([10,20,30,40],index=['Company a','Company b','Company c','Company d']),
    'PB':pd.Series([2,3,2.5,4],index=['Company a','Company b','Company c','Company d']),
    'ROE':pd.Series([0.06,0.1,0.08,0.02],index=['Company a','Company b','Company c','Company d'])
}
df = pd.DataFrame(d)
df
PE PB ROE
Company a 10 2.0 0.06
Company b 20 3.0 0.10
Company c 30 2.5 0.08
Company d 40 4.0 0.02
(df.PE <25) * 1
Company a    1
Company b    1
Company c    0
Company d    0
Name: PE, dtype: int64
df[df['PE']<25]
PE PB ROE
Company a 10 2.0 0.06
Company b 20 3.0 0.10
df[df.PE < 25]
PE PB ROE
Company a 10 2.0 0.06
Company b 20 3.0 0.10
df[(df.PE<25) & (df.PB<2.5)]
PE PB ROE
Company a 10 2.0 0.06
df[ (df.PE<25) * 1 + (df.PB < 2.5) * 1 == 2] #多个条件
PE PB ROE
Company a 10 2.0 0.06
df[ (df.PE < 25) * 1 + (df.PB < 2.5) * 1 + (df.ROE > 0.07) * 1 >= 2] # 三个当中满足任意两个条件
PE PB ROE
Company a 10 2.0 0.06
Company b 20 3.0 0.10
#DataFrame Apply函数
a = np.random.randn(9,6)
a.round(5)
array([[-7.87620e-01, -1.24492e+00,  1.09361e+00, -5.22550e-01,
         2.12030e-01, -1.30360e+00],
       [-4.30230e-01, -6.23100e-01,  1.66740e-01, -8.75420e-01,
        -1.78512e+00,  1.59138e+00],
       [-1.94390e+00, -1.47000e-03,  1.31750e-01,  1.13086e+00,
         4.62110e-01, -7.69850e-01],
       [-1.08891e+00, -1.12085e+00, -5.33600e-02, -9.80170e-01,
         1.77180e-01, -1.36398e+00],
       [-6.52240e-01, -6.07170e-01,  5.18720e-01, -7.06020e-01,
         2.30450e-01,  9.53490e-01],
       [ 6.51890e-01, -2.44220e-01, -4.47850e-01, -7.46110e-01,
         6.64090e-01, -3.57600e-02],
       [-1.05783e+00, -3.45810e-01,  3.88430e-01, -2.13310e-01,
        -3.91820e-01,  6.51190e-01],
       [-1.60257e+00,  5.14340e-01, -3.61200e-02,  2.39620e-01,
        -7.57500e-01,  2.22660e-01],
       [ 7.30130e-01, -3.21070e-01,  5.67360e-01,  1.83213e+00,
         6.88210e-01,  2.87610e-01]])
df = pd.DataFrame(a)
df
0 1 2 3 4 5
0 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
1 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
3 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
4 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
5 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
6 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
7 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
8 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
df.columns = ['a','b','c','d','e','f']
df
a b c d e f
0 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
1 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
3 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
4 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
5 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
6 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
7 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
8 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
dates = pd.date_range('2024-7-1',periods=9,freq='M')
dates
DatetimeIndex(['2024-07-31', '2024-08-31', '2024-09-30', '2024-10-31',
               '2024-11-30', '2024-12-31', '2025-01-31', '2025-02-28',
               '2025-03-31'],
              dtype='datetime64[ns]', freq='ME')
df.index = dates
df
a b c d e f
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
2024-08-31 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2024-09-30 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
2024-10-31 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
2024-11-30 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
2024-12-31 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
2025-01-31 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
2025-02-28 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
2025-03-31 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
def square_fun(x):
    return x ** 2
df.apply(square_fun)
a b c d e f
2024-07-31 0.620352 1.549823 1.195979 0.273060 0.044957 1.699374
2024-08-31 0.185095 0.388257 0.027803 0.766361 3.186637 2.532503
2024-09-30 3.778741 0.000002 0.017358 1.278851 0.213550 0.592671
2024-10-31 1.185720 1.256308 0.002847 0.960740 0.031391 1.860434
2024-11-30 0.425417 0.368660 0.269066 0.498464 0.053107 0.909145
2024-12-31 0.424961 0.059642 0.200574 0.556678 0.441011 0.001279
2025-01-31 1.119000 0.119585 0.150876 0.045500 0.153520 0.424052
2025-02-28 2.568239 0.264541 0.001305 0.057419 0.573804 0.049578
2025-03-31 0.533097 0.103089 0.321903 3.356713 0.473627 0.082721
df.apply(lambda x : x ** 0.5)
a b c d e f
2024-07-31 NaN NaN 1.045757 NaN 0.460469 NaN
2024-08-31 NaN NaN 0.408343 NaN NaN 1.261501
2024-09-30 NaN NaN 0.362972 1.063420 0.679790 NaN
2024-10-31 NaN NaN NaN NaN 0.420923 NaN
2024-11-30 NaN NaN 0.720219 NaN 0.480052 0.976469
2024-12-31 0.807397 NaN NaN NaN 0.814915 NaN
2025-01-31 NaN NaN 0.623240 NaN NaN 0.806965
2025-02-28 NaN 0.717172 NaN 0.489512 NaN 0.471870
2025-03-31 0.854479 NaN 0.753236 1.353563 0.829582 0.536295
def find_min(x):
    return x.min()

df.apply(find_min)
a   -1.943899
b   -1.244919
c   -0.447855
d   -0.980173
e   -1.785116
f   -1.363977
dtype: float64
df.apply(find_min,axis=1)
2024-07-31   -1.303601
2024-08-31   -1.785116
2024-09-30   -1.943899
2024-10-31   -1.363977
2024-11-30   -0.706020
2024-12-31   -0.746109
2025-01-31   -1.057828
2025-02-28   -1.602573
2025-03-31   -0.321074
Freq: ME, dtype: float64
df
a b c d e f
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
2024-08-31 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2024-09-30 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
2024-10-31 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
2024-11-30 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
2024-12-31 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
2025-01-31 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
2025-02-28 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
2025-03-31 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
#DataFrame排序
df.sort_index(ascending=False) # 降序排序
a b c d e f
2025-03-31 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
2025-02-28 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
2025-01-31 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
2024-12-31 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
2024-11-30 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
2024-10-31 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
2024-09-30 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
2024-08-31 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
df.sort_index(axis=1,ascending=False)
f e d c b a
2024-07-31 -1.303601 0.212032 -0.522551 1.093608 -1.244919 -0.787624
2024-08-31 1.591384 -1.785116 -0.875421 0.166744 -0.623103 -0.430226
2024-09-30 -0.769851 0.462114 1.130863 0.131749 -0.001466 -1.943899
2024-10-31 -1.363977 0.177176 -0.980173 -0.053360 -1.120851 -1.088908
2024-11-30 0.953491 0.230450 -0.706020 0.518715 -0.607174 -0.652240
2024-12-31 -0.035765 0.664087 -0.746109 -0.447855 -0.244218 0.651890
2025-01-31 0.651193 -0.391816 -0.213307 0.388428 -0.345811 -1.057828
2025-02-28 0.222661 -0.757499 0.239622 -0.036118 0.514335 -1.602573
2025-03-31 0.287612 0.688206 1.832134 0.567365 -0.321074 0.730135
df.sort_values(by='a') # 按values排序
a b c d e f
2024-09-30 -1.943899 -0.001466 0.131749 1.130863 0.462114 -0.769851
2025-02-28 -1.602573 0.514335 -0.036118 0.239622 -0.757499 0.222661
2024-10-31 -1.088908 -1.120851 -0.053360 -0.980173 0.177176 -1.363977
2025-01-31 -1.057828 -0.345811 0.388428 -0.213307 -0.391816 0.651193
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
2024-11-30 -0.652240 -0.607174 0.518715 -0.706020 0.230450 0.953491
2024-08-31 -0.430226 -0.623103 0.166744 -0.875421 -1.785116 1.591384
2024-12-31 0.651890 -0.244218 -0.447855 -0.746109 0.664087 -0.035765
2025-03-31 0.730135 -0.321074 0.567365 1.832134 0.688206 0.287612
#通用函数
df.sum(axis =1) #按行排序
2024-07-31   -2.553054
2024-08-31   -1.955737
2024-09-30   -0.990490
2024-10-31   -4.430094
2024-11-30   -0.262777
2024-12-31   -0.157969
2025-01-31   -0.969142
2025-02-28   -1.419570
2025-03-31    3.784377
Freq: ME, dtype: float64
df.mean()
a   -0.686808
b   -0.443809
c    0.258808
d   -0.093440
e   -0.055596
f    0.025905
dtype: float64
df.cumsum()
a b c d e f
2024-07-31 -0.787624 -1.244919 1.093608 -0.522551 0.212032 -1.303601
2024-08-31 -1.217850 -1.868021 1.260352 -1.397972 -1.573084 0.287784
2024-09-30 -3.161749 -1.869487 1.392101 -0.267109 -1.110969 -0.482068
2024-10-31 -4.250657 -2.990339 1.338741 -1.247282 -0.933793 -1.846045
2024-11-30 -4.902897 -3.597513 1.857456 -1.953302 -0.703343 -0.892554
2024-12-31 -4.251007 -3.841731 1.409601 -2.699411 -0.039256 -0.928319
2025-01-31 -5.308835 -4.187542 1.798029 -2.912717 -0.431072 -0.277126
2025-02-28 -6.911407 -3.673207 1.761911 -2.673095 -1.188571 -0.054465
2025-03-31 -6.181272 -3.994281 2.329276 -0.840961 -0.500365 0.233147
df.describe()
a b c d e f
count 9.000000 9.000000 9.000000 9.000000 9.000000 9.000000
mean -0.686808 -0.443809 0.258808 -0.093440 -0.055596 0.025905
std 0.907970 0.540510 0.445095 0.982039 0.801856 1.009066
min -1.943899 -1.244919 -0.447855 -0.980173 -1.785116 -1.363977
25% -1.088908 -0.623103 -0.036118 -0.746109 -0.391816 -0.769851
50% -0.787624 -0.345811 0.166744 -0.522551 0.212032 0.222661
75% -0.430226 -0.244218 0.518715 0.239622 0.462114 0.651193
max 0.730135 0.514335 1.093608 1.832134 0.688206 1.591384
posted @ 2025-12-26 21:15  是17阿哥呀  阅读(2)  评论(0)    收藏  举报