1 import pandas as pd
2 import numpy as np
# A Series mixing numbers with one missing value (NaN).
s = pd.Series([1, 3, 6, np.nan, 44, 1])

# A 4x5 DataFrame of random floats with default integer row/column labels.
df = pd.DataFrame(np.random.random((4, 5)))

# Commonly used DataFrame attributes.
# Each result is printed explicitly: a bare expression is silently discarded
# when this file runs as a script instead of in an interactive session.
print(df.dtypes)
print(df.index)
print(df.columns)
print(df.values)

# Commonly used DataFrame methods; df itself is not reassigned by any of them.
print(df.describe())
print(df.T)
print(df.sort_index(axis=1, ascending=False))  # columns in descending label order
print(df.sort_values(by=4))  # rows ordered by the column labelled 4
18
# Selecting data
dates = pd.date_range('20160101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

# Plain []: a slice selects rows, a list of labels selects columns.
print(df[0:3])
print(df[['A', 'D']])

# Select by label: loc
print(df.loc['20160101', :])
print(df.loc[:, ['A', 'B']])

# Select by position: iloc
print(df.iloc[[0, 2], [0, 3]])

# Mixed selection: row positions combined with column labels.
# DataFrame.ix was removed in pandas 1.0, so the column labels are translated
# to positions and the lookup goes through iloc.
print(df.iloc[[0, 2], df.columns.get_indexer(['A', 'D'])])

# Boolean indexing
print(df[df.B > 5])

# Setting data
df.iloc[2, 2] = 111
df.loc['20160101', 'D'] = 222
# One .loc call instead of the chained df.B[mask] = 0: chained assignment may
# write to a temporary copy and leave df untouched.
df.loc[df.A > 5, 'B'] = 0
print(df)

df['F'] = np.nan
df['E'] = range(6)
print(df)

# Handling missing data
# B and C hold integers; cast them to float up front so that storing NaN does
# not depend on implicit upcasting during assignment.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
print(df.dropna(axis=0, how='all'))  # how = {'any', 'all'}
print(df.fillna(value=0))
print(df.isnull().values.any())  # True if at least one cell is missing
58
# Combining DataFrames

# Concatenating
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

# axis=0 stacks the frames vertically; ignore_index renumbers the rows.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
# axis=1 places the frames side by side.
res1 = pd.concat([df1, df2, df3], axis=1)

# The join parameter
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

res = pd.concat([df1, df2], join='outer', ignore_index=True)  # union of columns
res = pd.concat([df1, df2], join='inner', ignore_index=True)  # shared columns only
print(res)

# Aligning the result on one frame's index
res = pd.concat([df1, df2], axis=1, join='inner')
# The join_axes keyword was removed from pd.concat in pandas 1.0; reindexing
# the concatenated frame on df1's index keeps exactly df1's rows instead.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
79
# Appending rows
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

# DataFrame.append was removed in pandas 2.0; pd.concat with the caller placed
# first in the list produces the same stacked result.
res = pd.concat([df1, df2, df3], ignore_index=True)
# Without ignore_index the original (repeating) row labels are kept.
res1 = pd.concat([df1, df2, df3])
print(res)
print(res1)
89
# Merging DataFrames

# Merge on a single key column
left = pd.DataFrame({
    'key': ['K1', 'K2', 'K3'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
})

right = pd.DataFrame({
    'key': ['K0', 'K1', 'K3'],
    'A': [11, 43, 53],
    'D': [12, -1, 0],
})

# how='outer' keeps keys found in either frame. Both frames have a column
# named A, so the result carries the default suffixes A_x / A_y.
res = pd.merge(left, right, on='key', how='outer')
print(res)
101
# Merge on two or more key columns
left = pd.DataFrame({
    'key0': ['K1', 'K2', 'K3'],
    'key1': ['X0', 'X2', 'X3'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
})

right = pd.DataFrame({
    'key0': ['K0', 'K1', 'K3'],
    'key1': ['X1', 'X0', 'K3'],
    'A': [11, 43, 53],
    'D': [12, -1, 0],
})

# Rows match only when every listed key column agrees; how='outer' still keeps
# the unmatched rows from both sides.
res = pd.merge(left, right, on=['key0', 'key1'], how='outer')
print(res)
114
# Merge on the row index instead of a key column
left = pd.DataFrame(
    {'A': [1, 2, 3], 'B': [4, 5, 6]},
    index=['K0', 'K1', 'K2'],
)

right = pd.DataFrame(
    {'A': [11, 43, 53], 'D': [12, -1, 0]},
    index=['K1', 'K2', 'K3'],
)

# No `how` is given, so merge's default inner join applies: only index labels
# present in both frames are kept.
res = pd.merge(left, right, left_index=True, right_index=True)
print(res)
126
# Handling overlapping column names
left = pd.DataFrame({
    'key': ['K1', 'K2', 'K3'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
})

right = pd.DataFrame({
    'key': ['K0', 'K1', 'K3'],
    'A': [11, 43, 53],
    'B': [12, -1, 0],
})

# A and B exist in both frames; `suffixes` names the left/right copies in the
# result instead of the default _x / _y.
res = pd.merge(
    left,
    right,
    on='key',
    suffixes=('_left', '_right'),
    how='outer',
)
print(res)
138
# Plotting
140 import pandas as pd
141 import numpy as np
142 import matplotlib.pyplot as plt
143
# Plot data

# Series: running total of 1000 random draws, plotted as a line.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
print(data)

# DataFrame: four columns of running totals, one line per column.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
)
print(data.head())
data = data.cumsum()
data.plot()

# Two scatter series drawn on the same axes: the second call reuses `ax`.
# The legend labels differ so the two series can be told apart.
ax = data.plot.scatter(x='A', y='C', color='Red', label='Class 1')
data.plot.scatter(x='A', y='B', color='DarkGreen', label='Class 2', ax=ax)

# Display the figures created above.
plt.show()