【记录】一次pd.to_datetime处理异常

Posted on 2021-11-29 11:08  呱嗒呱嗒  阅读(339)  评论(0)  编辑  收藏  举报

G-Research模拟环境下对于时间的处理异常,代码如下:

import datetime
import os,sys
import numpy as np
import pandas as pd
import gresearch_crypto

#env = gresearch_crypto.make_env()


class gresearch_guada():
    """Nearest-timestamp baseline for the G-Research crypto forecasting data.

    Holds the Kaggle input paths and the ``gresearch_crypto`` test
    environment, loads the training CSVs, and fills the ``Target`` column of
    a test batch with the ``Target`` of the closest training minute of the
    same asset.
    """

    def __init__(self):
        """Record the dataset paths and open the ``gresearch_crypto`` environment."""
        # Training set.
        self.train = '/kaggle/input/g-research-crypto-forecasting/train.csv'
        # Supplemental training data (used as a validation set).
        self.supplemental_train = '/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv'
        # Asset details; per the original note it covers 14 crypto assets.
        self.asset_details = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'
        # Example test data.
        self.example_test = '/kaggle/input/g-research-crypto-forecasting/example_test.csv'
        # Test environment and the iterator yielding (test, prediction) frames.
        self.env = gresearch_crypto.make_env()
        self.iter_test = self.env.iter_test()

    def dataReader(self, datasetName):
        """Load the ``Target``, ``Asset_ID`` and ``timestamp`` columns of a dataset.

        Args:
            datasetName: ``'train'`` or ``'supplemental_train'``.

        Returns:
            DataFrame with the three columns; ``Asset_ID`` is read as ``int8``.

        Raises:
            ValueError: if ``datasetName`` is not one of the two known names.
        """
        sources = {
            'train': self.train,
            'supplemental_train': self.supplemental_train,
        }
        if datasetName not in sources:
            # Fail with the message itself: falling through would reach the
            # return with no frame bound and die with UnboundLocalError.
            raise ValueError("ERROR [1018] - message: 数据集传入参数错误!")
        return pd.read_csv(
            sources[datasetName],
            usecols=['Target', 'Asset_ID', 'timestamp'],
            dtype={'Asset_ID': 'int8'},
        )

    def datetimeProc(self, datasetName):
        """Fill ``Target`` of the first test batch from the nearest training minute.

        Args:
            datasetName: the training DataFrame itself (not a name, despite the
                parameter), with ``timestamp`` (epoch seconds), ``Asset_ID``
                and ``Target`` columns, e.g. the result of ``dataReader``.
                It is not modified.

        Returns:
            The prediction frame of the first batch yielded by
            ``self.iter_test`` with ``Target`` filled in; rows whose asset has
            no training data in the kept window get ``0``. ``None`` when the
            iterator yields nothing.
        """
        # Index the training rows by time without adding a column to the
        # caller's frame.
        history = datasetName.drop('timestamp', axis=1)
        history.index = pd.DatetimeIndex(
            pd.to_datetime(datasetName['timestamp'].to_numpy(), unit='s'),
            name='datetime',
        )
        # Only June 2021 onwards is kept as the lookup window.
        recent = (history.index.year == 2021) & (history.index.month > 5)
        history = history[recent]

        # One regular 1-minute series per asset, gaps filled by interpolation.
        dfs = {
            asset_id: group.resample('1min').interpolate()
            for asset_id, group in history.groupby('Asset_ID')
        }
        # Drop the local reference to the filtered frame; only the per-asset
        # series are needed from here on.
        del history

        for datasetName_test, datasetName_pred in self.iter_test:
            print("---- 没有datetime的数据集:\n")
            print(datasetName_test['timestamp'])
            # The test timestamps are epoch *seconds*, like the training data.
            # Parsing them with unit='ms' maps 1623542400 to 1970-01-19 and
            # makes every nearest-minute lookup land on the wrong row.
            batch_times = pd.to_datetime(
                datasetName_test['timestamp'], unit='s', errors='raise'
            ).rename('datetime')
            print(batch_times)

            columns = zip(
                datasetName_test['Asset_ID'],
                datasetName_test['row_id'],
                batch_times,
            )
            for asset_id, row_id, moment in columns:
                try:
                    asset_history = dfs[asset_id]
                except KeyError:
                    # No training rows for this asset in the kept window; the
                    # row stays NaN and is zero-filled below.
                    print("时间处理函数遭遇异常!")
                    continue
                # get_indexer replaces Index.get_loc(method='nearest'), which
                # was removed in pandas 2.0.
                nearest = asset_history.index.get_indexer(
                    [moment], method='nearest'
                )[0]
                datasetName_pred.loc[
                    datasetName_pred['row_id'] == row_id, 'Target'
                ] = asset_history['Target'].iloc[nearest]

            datasetName_pred['Target'] = datasetName_pred['Target'].fillna(0)
            print(datasetName_pred)
            # NOTE(review): this returns after the first batch and never hands
            # the frame back to self.env; presumably a debugging shortcut.
            # Confirm before using this method for a real submission loop.
            return datasetName_pred
......

异常代码段:

        for datasetName_test, datasetName_pred in self.iter_test:
            print("---- 没有datetime的数据集:\n")
            print(datasetName_test['timestamp'])
            datasetName_test['datetime'] = pd.to_datetime(datasetName_test['timestamp'], unit='ms', errors='raise')
            print(datasetName_test['datetime'])

异常Output:unit=ms时

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
---- 没有datetime的数据集:

0     1623542400
1     1623542400
2     1623542400
3     1623542400
4     1623542400
5     1623542400
6     1623542400
7     1623542400
8     1623542400
9     1623542400
10    1623542400
11    1623542400
12    1623542400
13    1623542400
Name: timestamp, dtype: int64
0    1970-01-19 18:59:02.400
1    1970-01-19 18:59:02.400
2    1970-01-19 18:59:02.400
3    1970-01-19 18:59:02.400
4    1970-01-19 18:59:02.400
5    1970-01-19 18:59:02.400
6    1970-01-19 18:59:02.400
7    1970-01-19 18:59:02.400
8    1970-01-19 18:59:02.400
9    1970-01-19 18:59:02.400
10   1970-01-19 18:59:02.400
11   1970-01-19 18:59:02.400
12   1970-01-19 18:59:02.400
13   1970-01-19 18:59:02.400
Name: datetime, dtype: datetime64[ns]

对照Output:unit=s时(timestamp 以秒为单位,转换结果正确;上面 unit=ms 时被当作毫秒解析,因此得到 1970 年的日期)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
---- 没有datetime的数据集:

0     1623542400
1     1623542400
2     1623542400
3     1623542400
4     1623542400
5     1623542400
6     1623542400
7     1623542400
8     1623542400
9     1623542400
10    1623542400
11    1623542400
12    1623542400
13    1623542400
Name: timestamp, dtype: int64
0    2021-06-13
1    2021-06-13
2    2021-06-13
3    2021-06-13
4    2021-06-13
5    2021-06-13
6    2021-06-13
7    2021-06-13
8    2021-06-13
9    2021-06-13
10   2021-06-13
11   2021-06-13
12   2021-06-13
13   2021-06-13
Name: datetime, dtype: datetime64[ns]