pandas文件保存与读入<补充>

Table of Contents

问题背景

当数据框中的元素存在列表是,使用data.to_csv,data.to_excel链接都会将其转化成字符串保存。对于后续导入后需要进行二次转换比较棘手。所以也可以考虑通过json和pickle文件进行保存。具体说明如下:

import pandas as pd
import json
import joblib#之前是from sklearn.internels import joblib
#构建数据集
data=pd.DataFrame([{"a":[1,2,3,4],"b":1},{"a":[1,2,1],"b":3}])
data

a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3

数据的保存与读入

to_csv保存
#首先通过to_csv进行保存
data.to_csv(r'test.csv')
#再导入
data1=pd.read_csv(r'test.csv',index_col=0)
data1

a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3
print(data1.loc[0,'a'],type(data1.loc[0,'a']))#默认转换成字符串

[1, 2, 3, 4] <class 'str'>

data1['a']=data1['a'].apply(lambda x:json.loads(x))
print(data1.loc[0,'a'],type(data1.loc[0,'a']))#目标格式

[1, 2, 3, 4] <class 'list'>

to_excel保存
data.to_excel(r'test.xlsx')
#再导入
data2=pd.read_csv(r'test.csv',index_col=0)
data2

a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3
print(data2.loc[0,'a'],type(data2.loc[0,'a']))#默认转换成字符串

[1, 2, 3, 4] <class 'str'>

data2['a']=data2['a'].apply(lambda x:json.loads(x))
print(data2.loc[0,'a'],type(data2.loc[0,'a']))#目标格式

[1, 2, 3, 4] <class 'list'>

josn
#将其保存成json文件
json.dump(data.to_dict(),open(r'test.json','w+'))

若data文件中特殊格式的字符如Timestamps时,可以考虑先转字符串,再保存json.dump(data.astype('str').to_dict(),open(r'test.json','w+'))

#导入
data3=pd.DataFrame(json.load(open(r'test.json','r+')))
data3

a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3
data3.loc[0,'a']

---------------------------------------------------------------------------

TypeError Traceback (most recent call last)

in
----> 1 data3.loc[0,'a']

d:\software\python\lib\site-packages\pandas\core\indexing.py in getitem(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis

d:\software\python\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
1270 def _getitem_tuple(self, tup: Tuple):
1271 try:
-> 1272 return self._getitem_lowerdim(tup)
1273 except IndexingError:
1274 pass

d:\software\python\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
1387 for i, key in enumerate(tup):
1388 if is_label_like(key) or isinstance(key, tuple):
-> 1389 section = self._getitem_axis(key, axis=i)
1390
1391 # we have yielded a scalar ?

d:\software\python\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1962
1963 # fall thru to straight lookup
-> 1964 self._validate_key(key, axis)
1965 return self._get_label(key, axis=axis)
1966

d:\software\python\lib\site-packages\pandas\core\indexing.py in _validate_key(self, key, axis)
1829
1830 if not is_list_like_indexer(key):
-> 1831 self._convert_scalar_indexer(key, axis)
1832
1833 def _is_scalar_access(self, key: Tuple) -> bool:

d:\software\python\lib\site-packages\pandas\core\indexing.py in _convert_scalar_indexer(self, key, axis)
739 ax = self.obj._get_axis(min(axis, self.ndim - 1))
740 # a scalar
--> 741 return ax._convert_scalar_indexer(key, kind=self.name)
742
743 def _convert_slice_indexer(self, key: slice, axis: int):

d:\software\python\lib\site-packages\pandas\core\indexes\base.py in _convert_scalar_indexer(self, key, kind)
2886 elif kind in ["loc"] and is_integer(key):
2887 if not self.holds_integer():
-> 2888 self._invalid_indexer("label", key)
2889
2890 return key

d:\software\python\lib\site-packages\pandas\core\indexes\base.py in _invalid_indexer(self, form, key)
3075 """
3076 raise TypeError(
-> 3077 f"cannot do {form} indexing on {type(self)} with these "
3078 f"indexers [{key}] of {type(key)}"
3079 )

TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [0] of <class 'int'>

#查看索引名称发现,其变成了字符串
data3.loc['0','a']

[1, 2, 3, 4]

print(data3.loc['0','a'],type(data3.loc['0','a']))#元素为目标格式,但数字索引被转换成了字符串

[1, 2, 3, 4] <class 'list'>

#改进,即修改索引名即可
data3.index=[i for i in range(data3.shape[0])]
print(data3.loc[0,'a'],type(data3.loc[0,'a']))#元素为目标格式,但数字索引被转换成了字符串

[1, 2, 3, 4] <class 'list'>

pickle
data.to_pickle(r'test.pkl')
#导入
data4=pd.read_pickle(r'test.pkl')
data4

a b
0 [1, 2, 3, 4] 1
1 [1, 2, 1] 3
print(data4.loc[0,'a'],type(data4.loc[0,'a']))#直接为原始文件

[1, 2, 3, 4] <class 'list'>

文件储存大小对比<后补>

posted @ 2020-08-31 10:39  LgRun  阅读(527)  评论(0)    收藏  举报