# TensorFlow 时间序列数据的处理

## 窗口序列数据的获取和应用

MIN_LEN = 20 # Minimum window length; windows shorter than this are padded with the all-zero row
FEATURE_NUM = 300 # Feature columns per time step (second dimension of the feature TensorSpec)
ZERO_INDEX = 3141410 # Row index of the all-zero row used for padding
def form_indexes(data, time_range):
    """Build fixed-length row-index windows, one per (investment, time) sample.

    For every investment id in ``data`` (id 0 is skipped) and every time id
    ``t`` in the half-open range ``[time_range[0], time_range[1])`` at which
    that investment has a row, collect the index labels of up to ``MIN_LEN``
    consecutive rows ending at ``t``. Windows with fewer than ``MIN_LEN``
    rows are left-padded with ``ZERO_INDEX``.

    Args:
        data: Source DataFrame with ``investment_id`` and ``time_id`` columns.
        time_range: Pair ``(start, stop)`` of time ids; ``stop`` is exclusive.

    Returns:
        A list of windows; each window is a list of exactly ``MIN_LEN`` row
        indexes, oldest first.
    """
    id_list = sorted(data['investment_id'].unique())
    if 0 in id_list:
        id_list.remove(0)

    indexes_list = []
    for investment_id in tqdm(id_list):
        sub_data = data[data['investment_id'] == investment_id].sort_values(by=['time_id'])
        time_list = sorted(sub_data['time_id'].unique())
        # Map time id -> position in the sorted unique times so each lookup
        # is O(1) instead of a linear scan plus a second scan for .index().
        # NOTE(review): using this position as a row offset into sub_data
        # assumes one row per (investment_id, time_id) -- confirm.
        position_of = {time_id: pos for pos, time_id in enumerate(time_list)}

        for t in range(time_range[0], time_range[1]):
            i_t = position_of.get(t)
            if i_t is None:
                continue  # this investment has no row at time t
            window = sub_data.iloc[max(i_t - MIN_LEN + 1, 0):i_t + 1]
            temp = list(window.index.values)
            # Left-pad short windows so every window has MIN_LEN entries.
            indexes = [ZERO_INDEX] * (MIN_LEN - len(temp)) + temp
            # The original never stored the window, so the function always
            # returned an empty list.
            indexes_list.append(indexes)
    return indexes_list


# Load the precomputed window-index tables (train first, then validation).
train_indexset, val_indexset = (
    pd.read_parquet(parquet_name)
    for parquet_name in ('trainindex.parquet', 'valindex.parquet')
)

def gen_func(train_val_or_test):
    """Yield ``(features, label)`` samples for ``tf.data.Dataset.from_generator``.

    Each row of the selected index table holds the row positions of one
    window in the module-level ``data`` frame.

    Args:
        train_val_or_test: ``1`` iterates ``train_indexset``; ``2`` iterates
            ``val_indexset``.

    Yields:
        A ``(features, label)`` tuple: ``features`` is the window's values
        with the first four columns dropped, and ``label`` is the ``target``
        value of the window's last row.

    Raises:
        ValueError: If ``train_val_or_test`` is neither 1 nor 2. Because this
            is a generator, the error surfaces on the first iteration.
    """
    if train_val_or_test == 1:
        indexset = train_indexset
    elif train_val_or_test == 2:
        indexset = val_indexset
    else:
        # Carry the message in the exception instead of printing it and
        # raising a bare ValueError.
        raise ValueError(
            f"error input: expected 1 (train) or 2 (val), got {train_val_or_test!r}"
        )

    for _, row in indexset.iterrows():
        positions = row.values
        # Select the window once and read both features and label from it.
        window = data.iloc[positions]
        features = window.values[:, 4:]
        label = window['target'].iloc[-1]
        yield (features, label)

# Declare the shape and dtype of each element yielded by the generator:
# a (MIN_LEN, FEATURE_NUM) float32 feature window and a scalar float32 label.
featureSpec = tf.TensorSpec(shape=(MIN_LEN, FEATURE_NUM), dtype=tf.float32)
labelSpec = tf.TensorSpec(shape=(), dtype=tf.float32)

# gen_func(1) streams the training windows, gen_func(2) the validation windows.
train_data = tf.data.Dataset.from_generator(
    gen_func,
    args=(1,),
    output_signature=(featureSpec, labelSpec),
)
val_data = tf.data.Dataset.from_generator(
    gen_func,
    args=(2,),
    output_signature=(featureSpec, labelSpec),
)


# Hyperparameters for the model and training section.
MIN_LEN = 20 # Time steps per input window (first dimension of the model input shape)
FEATURE_NUM = 300 # Features per time step (second dimension of the model input shape)
BATCH_SIZE = 1000 # Samples per batch, passed to Dataset.batch
EPOCH_NUM = 50 # Number of epochs passed to model.fit

def build_RNNmodel():
    """Assemble the uncompiled stacked-LSTM model.

    Layers, in order:
      1. Masking on value 0.0 over inputs of shape (MIN_LEN, FEATURE_NUM).
      2. LSTM(1024), tanh, dropout 0.5, returning the full sequence.
      3. LSTM(256), tanh, dropout 0.5, returning only the final output.
      4. Dense(1) with relu activation.

    Returns:
        The ``tf.keras`` Sequential model; the caller compiles it.
    """
    def small_normal():
        # A fresh initializer instance for each layer that uses one.
        return tf.initializers.TruncatedNormal(stddev=0.01)

    layers = tf.keras.layers
    model = tf.keras.models.Sequential()
    model.add(layers.Masking(mask_value=0., input_shape=(MIN_LEN, FEATURE_NUM)))
    model.add(
        layers.LSTM(
            1024,
            activation='tanh',
            return_sequences=True,
            dropout=0.5,
            kernel_initializer=small_normal(),
        )
    )
    model.add(
        layers.LSTM(
            256,
            activation='tanh',
            dropout=0.5,
            kernel_initializer=small_normal(),
        )
    )
    # NOTE(review): relu restricts predictions to >= 0 -- confirm the target
    # is never negative, otherwise a linear output is needed.
    model.add(layers.Dense(1, activation='relu'))
    return model

# Batch the sample streams and prefetch upcoming batches so input preparation
# overlaps with training.
# prefetch() runs after batch(), so its buffer size counts whole batches, not
# samples: prefetch(BATCH_SIZE) would buffer up to 1000 batches in memory.
# tf.data.AUTOTUNE (TF >= 2.4, which output_signature already requires) lets
# the runtime pick the buffer size.
train_batchs = train_data.batch(batch_size=BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_batchs = val_data.batch(batch_size=BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Build and compile the model with MAE loss and Adam at learning rate 1e-4.
model = build_RNNmodel()
model.compile(loss='mae', optimizer=tf.keras.optimizers.Adam(0.0001))

# Train for EPOCH_NUM epochs, evaluating on the validation batches each epoch.
history = model.fit(train_batchs, epochs=EPOCH_NUM, validation_data=val_batchs)


posted @ 2022-04-28 11:29  多事鬼间人  阅读(485)  评论(0)  编辑  收藏  举报