python学习-数据规整

1、分级索引

data = pd.Series(np.random.randn(9),
                 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1,2,3,1,3,1,2,2,3]])
data.index
data['b']
data['b':'d']
data.loc[:, 2]
data.loc[:, 1]
data.unstack()
data.unstack().stack()

frame = pd.DataFrame(np.arange(12).reshape((4, 3)), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['Ohio', 'Ohio', 'Colorado'],
                              ['Green', 'Red', 'Green']])
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame['Ohio']

frame.swaplevel('key1','key2') #交换level
frame.sort_index(level=1)  #交换level
frame.swaplevel(0, 1).sort_index(level=0)

#Summary Statistics by Level
frame.sum(level='key1')
frame.sum(level='color', axis=1)
#Indexing with a DataFrame’s columns列作为索引
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1), 'c': ['one', 'one', 'one', 'two', 'two',
'two', 'two'], 'd':[0,1,2,0,1,2,3]})
frame2 = frame.set_index(['c', 'd'])
frame.set_index(['c', 'd'], drop=False)
frame2.reset_index()

2、合并数据集

df1= pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2= pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
pd.merge(df1,df2)
pd.merge(df1, df2, on='key') #显示指定

df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey') #不同列名key内连接

pd.merge(df1, df2, how='outer') #外连接

pd.merge(df1, df2, how='outer') #inner left right output

left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'], 'key2': ['one', 'two', 'one'], 'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'], 'key2': ['one', 'one', 'one', 'two'],
'rval': [4, 5, 6, 7]}) 
pd.merge(left, right, on=['key1', 'key2'], how='outer') #key1 key2作为一个元组进行外连接

#以下是等价的 默认后缀和指定后缀
pd.merge(left, right, on='key1') 
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

#基于index索引的合并
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
pd.merge(left1, right1, left_on='key', right_index=True)
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

lefth= pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],'key2':[2000,2001, 2002, 2001, 2002], 'data': np.arange(5.)})
righth=pd.DataFrame(np.arange(12).reshape((6, 2)), index=[['Nevada', 'Nevada', 'Ohio', 'Ohio',
                       'Ohio', 'Ohio'],
                      [2001, 2000, 2000, 2000, 2001, 2002]],
               columns=['event1', 'event2'])
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True,how='outer')

left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
              columns=['Ohio', 'Nevada'])
right2= pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]], index=['b', 'c', 'd', 'e'],
               columns=['Missouri', 'Alabama'])
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
left2.join(right2, how='outer')
left1.join(right1, on='key')

another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]], index=['a', 'c', 'e', 'f'],
                       columns=['New York', 'Oregon'])
left2.join([right2, another])
left2.join([right2, another], how='outer')

另一种合并数据集

arr = np.arange(12).reshape((3, 4))
np.concatenate([arr, arr], axis=1)

s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])


pd.concat([s1,s2,s3])

pd.concat([s1, s2, s3], axis=1)

s4 = pd.concat([s1, s3]) # s4 = pd.concat([s1, s3],axis=0)

pd.concat([s1, s4], axis=1)  #外连接
pd.concat([s1, s4], axis=1,join='inner')

result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
result.unstack()
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three']) #通过指定key区分结果集

#同样的方式应用有frame
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], columns=['three', 'four'])
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
pd.concat({'level1': df1, 'level2': df2}, axis=1)
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],names=['upper', 'lower'])

df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
pd.concat([df1, df2], ignore_index=True)

覆盖的方式合并Combining Data with Overlap

a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64), index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan
np.where(pd.isnull(a), b, a)
b[:-2].combine_first(a[2:])

df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan], 'b': [np.nan, 2., np.nan, 6.],
'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2) #df1为空时 用df2对应值替换

Reshaping and Pivoting

data = pd.DataFrame(np.arange(6).reshape((2, 3)), index=pd.Index(['Ohio', 'Colorado'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'], name='number'))
result = data.stack()
result.unstack()
result.unstack('state')

s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2.unstack()
data2.unstack().stack()
data2.unstack().stack(dropna=False)

df = pd.DataFrame({'left': result, 'right': result + 5}, columns=pd.Index(['left', 'right'], name='side'))
df.unstack('state')
df.unstack('state').stack('side')

 

 

posted @ 2020-06-04 23:07  行者无疆  阅读(163)  评论(0)    收藏  举报