Pandas

数据结构

Series
- 一种类似一维数组的对象，由一组数据与一组标签（索引）组成
DataFrame
- 表格型的数据结构，Excel与R的data.frame;可以有行索引和列索引

import numpy as np
import pandas as pd
import warnings
warnings.simplefilter('ignore')

# 通过list构建Series
df = pd.Series(range(10,20))
type(df)

pandas.core.series.Series

df

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

df.values

array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

type(df.values)

numpy.ndarray

df.head(5)

0    10
1    11
2    12
3    13
4    14
dtype: int64

print(df[0])

print(df[8])

df[df>15]

6    16
7    17
8    18
9    19
dtype: int64

df>15

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool

# 通过字典构建Series
sales_data = {'a':1, 'b':2,'c':3}
df = pd.Series(sales_data)
print(df.head())
print(df.index)

a    1
b    2
c    3
dtype: int64
Index(['a', 'b', 'c'], dtype='object')

df.name = 'Series1'
df.index.name = 'idx'
df.head()

idx
a    1
b    2
c    3
Name: Series1, dtype: int64

df = pd.Series(range(8,14), index = ['a','b','c','d','e','f'])
#print(df)         # 打印整个 Series
print(df.head())  # 打印前几行（默认前5行） 显示 Series/DataFrame 的前 n 行数据
print(df.tail())  # 查看后5行

a     8
b     9
c    10
d    11
e    12
dtype: int64
b     9
c    10
d    11
e    12
f    13
dtype: int64

print(df[1]) # index行索引
print('-------------')
print(df[1:3]) # 切片索引 ，前闭后开
print('-------------')

9
-------------
b     9
c    10
dtype: int64
-------------

# DataFrame的构建
array = np.random.randn(6,4)
array

array([[ 0.06736358, -0.64255316,  0.20449614, -0.32843359],
       [ 0.94659344,  0.29186784, -0.60989757, -0.74899701],
       [-0.91717312,  1.48430298,  1.17129331,  1.81347392],
       [-3.62074396,  0.04548781,  1.65180448,  1.55219322],
       [ 0.32978011, -0.71838459, -1.21471285,  0.43443465],
       [-1.86554116, -1.25161324,  0.0990453 ,  0.46801472]])

df_obj = pd.DataFrame(array) # 通过Ndarray构建的DataFrame,如果没有声明，默认的行index和列index都是从0开始
df_obj.head()

	0	1	2	3
0	0.067364	-0.642553	0.204496	-0.328434
1	0.946593	0.291868	-0.609898	-0.748997
2	-0.917173	1.484303	1.171293	1.813474
3	-3.620744	0.045488	1.651804	1.552193
4	0.329780	-0.718385	-1.214713	0.434435

df = pd.DataFrame(np.random.randn(6,4),columns = ['a','b','c','d'], index = ['e','f','g','h','i','j'])
df.head(6)

	a	b	c	d
e	0.583592	-1.501183	0.116741	-0.442168
f	-0.161299	0.528328	-0.450201	-1.196337
g	1.042936	-0.152244	1.489381	0.968527
h	0.008104	0.342535	0.447326	0.136234
i	-0.835837	-0.912627	-0.100602	0.612302
j	0.206510	0.232865	-0.320405	-2.312483

# 通过字典构建DataFrame
dict_data = {'Date':pd.Timestamp('20251226'),
            'Number':pd.Series(6,index=list(range(4))),
            'Course_name':pd.Series(['Python','Quant','Finance','CFA']),
            'Company':'JET'}
# 显示DataFrame
df = pd.DataFrame(dict_data)
df.head()

	Date	Number	Course_name	Company
0	2025-12-26	6	Python	JET
1	2025-12-26	6	Quant	JET
2	2025-12-26	6	Finance	JET
3	2025-12-26	6	CFA	JET

# 自定义构建DataFrame
df = pd.DataFrame([10,30,60,80], columns=['Quantity'], index=['a','b','c','d'])
df

	Quantity
a	10
b	30
c	60
d	80

print(type(df))

<class 'pandas.core.frame.DataFrame'>

# DataFrame的选择操作
dict_data = {'Date':pd.Timestamp('20251226'),
            'Number':pd.Series(6,index=list(range(4))),
            'Course_name':pd.Series(['Python','Quant','Finance','CFA']),
            'Company':'JET'}
# 显示DataFrame
df = pd.DataFrame(dict_data)
df.head()

	Date	Number	Course_name	Company
0	2025-12-26	6	Python	JET
1	2025-12-26	6	Quant	JET
2	2025-12-26	6	Finance	JET
3	2025-12-26	6	CFA	JET

df['Course_name']

0     Python
1      Quant
2    Finance
3        CFA
Name: Course_name, dtype: object

# 增加具体的列
df['Period'] = range(21,25)
df.head()

	Date	Number	Course_name	Company	Period
0	2025-12-26	6	Python	JET	21
1	2025-12-26	6	Quant	JET	22
2	2025-12-26	6	Finance	JET	23
3	2025-12-26	6	CFA	JET	24

# 删除列
del(df['Period'])
df.head()

	Date	Number	Course_name	Company
0	2025-12-26	6	Python	JET
1	2025-12-26	6	Quant	JET
2	2025-12-26	6	Finance	JET
3	2025-12-26	6	CFA	JET

# 将一个完整的DataFrame作为新建的列，Pandas会自动根据index的lable进行匹配
df['Names'] = pd.DataFrame(['Alpha','Beta','Gramma','Theta']) # ,index=['4','2','3','1'] 这个没试成功
df

	Date	Number	Course_name	Company	Names
0	2025-12-26	6	Python	JET	Alpha
1	2025-12-26	6	Quant	JET	Beta
2	2025-12-26	6	Finance	JET	Gramma
3	2025-12-26	6	CFA	JET	Theta

df = pd.DataFrame([80.5,90,58,88],columns=['Scores'],
                  index=['a','b','c','d'],dtype='f')
df

	Scores
a	80.5
b	90.0
c	58.0
d	88.0

df.dtypes

Scores    float32
dtype: object

df.columns #获得列名

Index(['Scores'], dtype='object')

df.index[1:3] # 选择index

Index(['b', 'c'], dtype='object')

df.iloc[1:3]

	Scores
b	90.0
c	58.0

df.loc['c']

Scores    58.0
Name: c, dtype: float32

df.sum()

Scores    316.5
dtype: float32

df.rename(columns={'Scores':'Score'},inplace=True) #重命名列，注意重命名的时候要用字典来进行传输
df

	Score
a	80.5
b	90.0
c	58.0
d	88.0

df.shape

(4, 1)

#DataFrame的向量化操作
df

	Score
a	80.5
b	90.0
c	58.0
d	88.0

df+df

	Score
a	161.0
b	180.0
c	116.0
d	176.0

df - df

	Score
a	0.0
b	0.0
c	0.0
d	0.0

df * 3

	Score
a	241.5
b	270.0
c	174.0
d	264.0

df

	Score
a	80.5
b	90.0
c	58.0
d	88.0

df['Score'] = df['Score'].astype('int')

df.dtypes

Score    int64
dtype: object

d = {
    'PE':pd.Series([10,20,30,40],index=['Company a','Company b','Company c','Company d']),
    'PB':pd.Series([2,3,2.5,4],index=['Company a','Company b','Company c','Company d']),
    'ROE':pd.Series([0.06,0.1,0.08,0.02],index=['Company a','Company b','Company c','Company d'])
}
df = pd.DataFrame(d)
df

	PE	PB	ROE
Company a	10	2.0	0.06
Company b	20	3.0	0.10
Company c	30	2.5	0.08
Company d	40	4.0	0.02

(df.PE <25) * 1

Company a    1
Company b    1
Company c    0
Company d    0
Name: PE, dtype: int64

df[df['PE']<25]

	PE	PB	ROE
Company a	10	2.0	0.06
Company b	20	3.0	0.10

df[df.PE < 25]

	PE	PB	ROE
Company a	10	2.0	0.06
Company b	20	3.0	0.10

df[(df.PE<25) & (df.PB<2.5)]

	PE	PB	ROE
Company a	10	2.0	0.06

df[ (df.PE<25) * 1 + (df.PB < 2.5) * 1 == 2] #多个条件

	PE	PB	ROE
Company a	10	2.0	0.06

df[ (df.PE < 25) * 1 + (df.PB < 2.5) * 1 + (df.ROE > 0.07) * 1 >= 2] # 三个当中满足任意两个条件

	PE	PB	ROE
Company a	10	2.0	0.06
Company b	20	3.0	0.10

#DataFrame Apply函数
a = np.random.randn(9,6)
a.round(5)

array([[-7.87620e-01, -1.24492e+00,  1.09361e+00, -5.22550e-01,
         2.12030e-01, -1.30360e+00],
       [-4.30230e-01, -6.23100e-01,  1.66740e-01, -8.75420e-01,
        -1.78512e+00,  1.59138e+00],
       [-1.94390e+00, -1.47000e-03,  1.31750e-01,  1.13086e+00,
         4.62110e-01, -7.69850e-01],
       [-1.08891e+00, -1.12085e+00, -5.33600e-02, -9.80170e-01,
         1.77180e-01, -1.36398e+00],
       [-6.52240e-01, -6.07170e-01,  5.18720e-01, -7.06020e-01,
         2.30450e-01,  9.53490e-01],
       [ 6.51890e-01, -2.44220e-01, -4.47850e-01, -7.46110e-01,
         6.64090e-01, -3.57600e-02],
       [-1.05783e+00, -3.45810e-01,  3.88430e-01, -2.13310e-01,
        -3.91820e-01,  6.51190e-01],
       [-1.60257e+00,  5.14340e-01, -3.61200e-02,  2.39620e-01,
        -7.57500e-01,  2.22660e-01],
       [ 7.30130e-01, -3.21070e-01,  5.67360e-01,  1.83213e+00,
         6.88210e-01,  2.87610e-01]])

df = pd.DataFrame(a)
df

	0	1	2	3	4	5
0	-0.787624	-1.244919	1.093608	-0.522551	0.212032	-1.303601
1	-0.430226	-0.623103	0.166744	-0.875421	-1.785116	1.591384
2	-1.943899	-0.001466	0.131749	1.130863	0.462114	-0.769851
3	-1.088908	-1.120851	-0.053360	-0.980173	0.177176	-1.363977
4	-0.652240	-0.607174	0.518715	-0.706020	0.230450	0.953491
5	0.651890	-0.244218	-0.447855	-0.746109	0.664087	-0.035765
6	-1.057828	-0.345811	0.388428	-0.213307	-0.391816	0.651193
7	-1.602573	0.514335	-0.036118	0.239622	-0.757499	0.222661
8	0.730135	-0.321074	0.567365	1.832134	0.688206	0.287612

df.columns = ['a','b','c','d','e','f']
df

	a	b	c	d	e	f
0	-0.787624	-1.244919	1.093608	-0.522551	0.212032	-1.303601
1	-0.430226	-0.623103	0.166744	-0.875421	-1.785116	1.591384
2	-1.943899	-0.001466	0.131749	1.130863	0.462114	-0.769851
3	-1.088908	-1.120851	-0.053360	-0.980173	0.177176	-1.363977
4	-0.652240	-0.607174	0.518715	-0.706020	0.230450	0.953491
5	0.651890	-0.244218	-0.447855	-0.746109	0.664087	-0.035765
6	-1.057828	-0.345811	0.388428	-0.213307	-0.391816	0.651193
7	-1.602573	0.514335	-0.036118	0.239622	-0.757499	0.222661
8	0.730135	-0.321074	0.567365	1.832134	0.688206	0.287612

dates = pd.date_range('2024-7-1',periods=9,freq='M')
dates

DatetimeIndex(['2024-07-31', '2024-08-31', '2024-09-30', '2024-10-31',
               '2024-11-30', '2024-12-31', '2025-01-31', '2025-02-28',
               '2025-03-31'],
              dtype='datetime64[ns]', freq='ME')

df.index = dates
df

	a	b	c	d	e	f
2024-07-31	-0.787624	-1.244919	1.093608	-0.522551	0.212032	-1.303601
2024-08-31	-0.430226	-0.623103	0.166744	-0.875421	-1.785116	1.591384
2024-09-30	-1.943899	-0.001466	0.131749	1.130863	0.462114	-0.769851
2024-10-31	-1.088908	-1.120851	-0.053360	-0.980173	0.177176	-1.363977
2024-11-30	-0.652240	-0.607174	0.518715	-0.706020	0.230450	0.953491
2024-12-31	0.651890	-0.244218	-0.447855	-0.746109	0.664087	-0.035765
2025-01-31	-1.057828	-0.345811	0.388428	-0.213307	-0.391816	0.651193
2025-02-28	-1.602573	0.514335	-0.036118	0.239622	-0.757499	0.222661
2025-03-31	0.730135	-0.321074	0.567365	1.832134	0.688206	0.287612

def square_fun(x):
    return x ** 2

df.apply(square_fun)

	a	b	c	d	e	f
2024-07-31	0.620352	1.549823	1.195979	0.273060	0.044957	1.699374
2024-08-31	0.185095	0.388257	0.027803	0.766361	3.186637	2.532503
2024-09-30	3.778741	0.000002	0.017358	1.278851	0.213550	0.592671
2024-10-31	1.185720	1.256308	0.002847	0.960740	0.031391	1.860434
2024-11-30	0.425417	0.368660	0.269066	0.498464	0.053107	0.909145
2024-12-31	0.424961	0.059642	0.200574	0.556678	0.441011	0.001279
2025-01-31	1.119000	0.119585	0.150876	0.045500	0.153520	0.424052
2025-02-28	2.568239	0.264541	0.001305	0.057419	0.573804	0.049578
2025-03-31	0.533097	0.103089	0.321903	3.356713	0.473627	0.082721

df.apply(lambda x : x ** 0.5)

	a	b	c	d	e	f
2024-07-31	NaN	NaN	1.045757	NaN	0.460469	NaN
2024-08-31	NaN	NaN	0.408343	NaN	NaN	1.261501
2024-09-30	NaN	NaN	0.362972	1.063420	0.679790	NaN
2024-10-31	NaN	NaN	NaN	NaN	0.420923	NaN
2024-11-30	NaN	NaN	0.720219	NaN	0.480052	0.976469
2024-12-31	0.807397	NaN	NaN	NaN	0.814915	NaN
2025-01-31	NaN	NaN	0.623240	NaN	NaN	0.806965
2025-02-28	NaN	0.717172	NaN	0.489512	NaN	0.471870
2025-03-31	0.854479	NaN	0.753236	1.353563	0.829582	0.536295

def find_min(x):
    return x.min()

df.apply(find_min)

a   -1.943899
b   -1.244919
c   -0.447855
d   -0.980173
e   -1.785116
f   -1.363977
dtype: float64

df.apply(find_min,axis=1)

2024-07-31   -1.303601
2024-08-31   -1.785116
2024-09-30   -1.943899
2024-10-31   -1.363977
2024-11-30   -0.706020
2024-12-31   -0.746109
2025-01-31   -1.057828
2025-02-28   -1.602573
2025-03-31   -0.321074
Freq: ME, dtype: float64

df

	a	b	c	d	e	f
2024-07-31	-0.787624	-1.244919	1.093608	-0.522551	0.212032	-1.303601
2024-08-31	-0.430226	-0.623103	0.166744	-0.875421	-1.785116	1.591384
2024-09-30	-1.943899	-0.001466	0.131749	1.130863	0.462114	-0.769851
2024-10-31	-1.088908	-1.120851	-0.053360	-0.980173	0.177176	-1.363977
2024-11-30	-0.652240	-0.607174	0.518715	-0.706020	0.230450	0.953491
2024-12-31	0.651890	-0.244218	-0.447855	-0.746109	0.664087	-0.035765
2025-01-31	-1.057828	-0.345811	0.388428	-0.213307	-0.391816	0.651193
2025-02-28	-1.602573	0.514335	-0.036118	0.239622	-0.757499	0.222661
2025-03-31	0.730135	-0.321074	0.567365	1.832134	0.688206	0.287612

#DataFrame排序
df.sort_index(ascending=False) # 降序排序

	a	b	c	d	e	f
2025-03-31	0.730135	-0.321074	0.567365	1.832134	0.688206	0.287612
2025-02-28	-1.602573	0.514335	-0.036118	0.239622	-0.757499	0.222661
2025-01-31	-1.057828	-0.345811	0.388428	-0.213307	-0.391816	0.651193
2024-12-31	0.651890	-0.244218	-0.447855	-0.746109	0.664087	-0.035765
2024-11-30	-0.652240	-0.607174	0.518715	-0.706020	0.230450	0.953491
2024-10-31	-1.088908	-1.120851	-0.053360	-0.980173	0.177176	-1.363977
2024-09-30	-1.943899	-0.001466	0.131749	1.130863	0.462114	-0.769851
2024-08-31	-0.430226	-0.623103	0.166744	-0.875421	-1.785116	1.591384
2024-07-31	-0.787624	-1.244919	1.093608	-0.522551	0.212032	-1.303601

df.sort_index(axis=1,ascending=False)

	f	e	d	c	b	a
2024-07-31	-1.303601	0.212032	-0.522551	1.093608	-1.244919	-0.787624
2024-08-31	1.591384	-1.785116	-0.875421	0.166744	-0.623103	-0.430226
2024-09-30	-0.769851	0.462114	1.130863	0.131749	-0.001466	-1.943899
2024-10-31	-1.363977	0.177176	-0.980173	-0.053360	-1.120851	-1.088908
2024-11-30	0.953491	0.230450	-0.706020	0.518715	-0.607174	-0.652240
2024-12-31	-0.035765	0.664087	-0.746109	-0.447855	-0.244218	0.651890
2025-01-31	0.651193	-0.391816	-0.213307	0.388428	-0.345811	-1.057828
2025-02-28	0.222661	-0.757499	0.239622	-0.036118	0.514335	-1.602573
2025-03-31	0.287612	0.688206	1.832134	0.567365	-0.321074	0.730135

df.sort_values(by='a') # 按values排序

	a	b	c	d	e	f
2024-09-30	-1.943899	-0.001466	0.131749	1.130863	0.462114	-0.769851
2025-02-28	-1.602573	0.514335	-0.036118	0.239622	-0.757499	0.222661
2024-10-31	-1.088908	-1.120851	-0.053360	-0.980173	0.177176	-1.363977
2025-01-31	-1.057828	-0.345811	0.388428	-0.213307	-0.391816	0.651193
2024-07-31	-0.787624	-1.244919	1.093608	-0.522551	0.212032	-1.303601
2024-11-30	-0.652240	-0.607174	0.518715	-0.706020	0.230450	0.953491
2024-08-31	-0.430226	-0.623103	0.166744	-0.875421	-1.785116	1.591384
2024-12-31	0.651890	-0.244218	-0.447855	-0.746109	0.664087	-0.035765
2025-03-31	0.730135	-0.321074	0.567365	1.832134	0.688206	0.287612

#通用函数
df.sum(axis =1) #按行排序

2024-07-31   -2.553054
2024-08-31   -1.955737
2024-09-30   -0.990490
2024-10-31   -4.430094
2024-11-30   -0.262777
2024-12-31   -0.157969
2025-01-31   -0.969142
2025-02-28   -1.419570
2025-03-31    3.784377
Freq: ME, dtype: float64

df.mean()

a   -0.686808
b   -0.443809
c    0.258808
d   -0.093440
e   -0.055596
f    0.025905
dtype: float64

df.cumsum()

	a	b	c	d	e	f
2024-07-31	-0.787624	-1.244919	1.093608	-0.522551	0.212032	-1.303601
2024-08-31	-1.217850	-1.868021	1.260352	-1.397972	-1.573084	0.287784
2024-09-30	-3.161749	-1.869487	1.392101	-0.267109	-1.110969	-0.482068
2024-10-31	-4.250657	-2.990339	1.338741	-1.247282	-0.933793	-1.846045
2024-11-30	-4.902897	-3.597513	1.857456	-1.953302	-0.703343	-0.892554
2024-12-31	-4.251007	-3.841731	1.409601	-2.699411	-0.039256	-0.928319
2025-01-31	-5.308835	-4.187542	1.798029	-2.912717	-0.431072	-0.277126
2025-02-28	-6.911407	-3.673207	1.761911	-2.673095	-1.188571	-0.054465
2025-03-31	-6.181272	-3.994281	2.329276	-0.840961	-0.500365	0.233147

df.describe()

	a	b	c	d	e	f
count	9.000000	9.000000	9.000000	9.000000	9.000000	9.000000
mean	-0.686808	-0.443809	0.258808	-0.093440	-0.055596	0.025905
std	0.907970	0.540510	0.445095	0.982039	0.801856	1.009066
min	-1.943899	-1.244919	-0.447855	-0.980173	-1.785116	-1.363977
25%	-1.088908	-0.623103	-0.036118	-0.746109	-0.391816	-0.769851
50%	-0.787624	-0.345811	0.166744	-0.522551	0.212032	0.222661
75%	-0.430226	-0.244218	0.518715	0.239622	0.462114	0.651193
max	0.730135	0.514335	1.093608	1.832134	0.688206	1.591384

posted @ 2025-12-26 21:15 是17阿哥呀阅读(2) 评论(0) 收藏举报

刷新页面返回顶部

攀登者说

与稚子弈，吾局大溃，弃甲曳兵而走。子哂曰:"技拙当勤修"

Pandas

数据结构

公告