import sys
# import keras
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers
import os
# Low-level device configuration
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
tf.config.experimental.set_memory_growth(physical_devices[0], True)
# Pad each (seq_len, features) sample with zero rows until the data plus zeros span
# `size` rows, then wrap it with a start row and an end row filled with 0.3,
# so the output has shape (batch, size + 2, features).
def my_padding(x,size):
result = np.array([])
bat_size = x.shape[0]
for i in range(bat_size):
zero_mat = np.zeros((size - x.shape[1],x.shape[2]))
start_mat = np.ones((1,x.shape[2])) * 0.3
end_mat = np.ones((1,x.shape[2])) * 0.3
each_x = x[i]
each_x = np.vstack([start_mat,each_x])
pad_x = np.vstack([each_x,zero_mat])
pad_x = np.vstack([pad_x,end_mat])
pad_x = pad_x.reshape(1, size+2, x.shape[2])
if result.shape[0] == 0:
result = pad_x
else:
result = np.vstack([result,pad_x])
return result
# test_x = np.array([
# [[111, 112, 113],
# [121, 122, 123],
# ],
# [[211, 212, 213],
# [221, 222, 223],
# ],
# [[311, 312, 313],
# [321, 322, 323],
# ],
# ])
# a = my_padding(test_x,6)
# print(a.shape)
# print(a)
# sys.exit(2)
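# For reference: in the commented test above, test_x has shape (3, 2, 3) and
# my_padding(test_x, 6) is expected to return shape (3, 8, 3) -- the 2 data rows plus
# 4 zero rows, wrapped by the 0.3 start and end rows.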
def get_angles(pos, i, d_model):
    # here i plays the role of both 2i and 2i+1 in the positional-encoding formula
angle_rates = 1 / np.power(10000, (2*(i // 2))/ np.float32(d_model))
return pos * angle_rates
def positional_encoding(position, d_model):
# print('position:',position)
angle_rads = get_angles(np.arange(position)[:, np.newaxis],
np.arange(d_model)[np.newaxis, :],
d_model)
    # even-indexed (2i) terms use sin
    sines = np.sin(angle_rads[:, 0::2])
    # odd-indexed (2i+1) terms use cos
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
pos_encoding = pos_encoding[np.newaxis, ...]
return tf.cast(pos_encoding, dtype=tf.float32)
# pos_encoding = positional_encoding(50, 512)
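# A minimal shape check for positional_encoding (the 50/512 values are illustrative,
# not taken from the training setup):
# demo_pos_encoding = positional_encoding(50, 512)
# print(demo_pos_encoding.shape)  # expected: (1, 50, 512)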
# def create_padding_mark(seq):
#     # flag the entries that are 0 (padding)
#     # print('seq:', seq)
#     seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
#     # expand dims so the mask can be applied to the attention matrix
#     return seq[:, np.newaxis, np.newaxis, :] # (batch_size,1,1,seq_len)
#     # print('seq_mask:',seq)
#     return seq # (batch_size,seq_len)
def create_padding_mark(targets):
    # flag positions whose feature vector is all zeros (padding rows)
# print('targets:',targets.shape)
zero_mask = np.max(targets,axis=-1)
zero_mask = tf.cast(tf.math.equal(zero_mask, 0), tf.float32)
zero_mask = tf.reshape(zero_mask,(-1,1))
one_mat = np.ones((targets.shape[0],targets.shape[0]))
# print('zero_mask:',zero_mask.shape,zero_mask)
result_mask1 = np.multiply(one_mat,zero_mask)
result_mask2 = np.multiply(one_mat, tf.transpose(zero_mask))
result_mask = tf.maximum(result_mask1,result_mask2)
# print('result_mask:', result_mask.shape, result_mask)
# sys.exit(2)
return result_mask # (batch_size,seq_len)
# mask test
# create_padding_mark([[1,2,0,0,3],[3,4,5,0,0],[0,0,0,0,0]])
# sys.exit(2)
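# A minimal sketch of the current (feature-based) padding mask. The input below is a
# hypothetical (seq_len=3, features=2) array whose last row is all-zero padding; the
# returned (3, 3) mask is 1 wherever a row or column touches that padded position:
# demo_targets = np.array([[1.0, 2.0], [3.0, 4.0], [0.0, 0.0]])
# demo_mask = create_padding_mark(demo_targets)
# print(demo_mask)  # expected: [[0,0,1],[0,0,1],[1,1,1]]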
def create_look_ahead_mark(size):
# print('size:',size)
    # 1 minus the lower triangle (band_part with num_lower=-1 keeps every sub-diagonal)
    # this yields, for each time step, a mask over the tokens that have not been predicted yet
mark = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0) # (a, numLower, numUpper)
# print('mark.shapeL:',mark.shape)
# sys.exit(2)
# mark = np.reshape(mark,())
mark = tf.cast(mark,tf.float32)
return mark # (seq_len, seq_len)
# temp = create_look_ahead_mark(3)
# print(temp)
# sys.exit(2)
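# For reference: create_look_ahead_mark(3) above is expected to print
# [[0,1,1],[0,0,1],[0,0,0]] -- the ones mask out positions that have not been predicted yet.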
# Build the masks
def create_mask(targets):
# loss_mark = np.max(targets,axis=-1)
# loss_mark = tf.cast(tf.math.equal(loss_mark, 0), tf.float32)
# loss_mark = tf.reshape(loss_mark,(-1,1))
# # print('loss_mark:',loss_mark,loss_mark.shape)
# decode_targets_padding_mask = create_padding_mark(targets)
# decode_targets_padding_mask = tf.cast(decode_targets_padding_mask,tf.float32)
    # look-ahead mask: hide the tokens that have not been predicted yet
    look_ahead_mask = create_look_ahead_mark(targets.shape[0])
    look_ahead_mask = tf.cast(look_ahead_mask , tf.float32)
    # print('look_ahead_mask:', look_ahead_mask.shape, type(look_ahead_mask.shape))
    # combine the masks for the decoder's first attention block
    # combine_mask = tf.maximum(decode_targets_padding_mask, look_ahead_mask)
    # print('combine_mask:', combine_mask.shape)
    # return combine_mask, loss_mark
return look_ahead_mask
# sys.exit(2)
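# A minimal sketch of create_mask: for a (seq_len, features) target it currently just
# returns the (seq_len, seq_len) look-ahead mask, since the padding/loss masks above
# are commented out. demo_targets below is a hypothetical placeholder:
# demo_targets = np.zeros((4, 6))
# print(create_mask(demo_targets).shape)  # expected: (4, 4)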
def scaled_dot_product_attention(q, k, v, mask):
    # multiply query and key to get the matching scores
matmul_qk = tf.matmul(q, k, transpose_b=True)
    # scale by sqrt(dk)
dk = tf.cast(tf.shape(k)[-1], tf.float32)
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    # apply the mask
if mask is not None:
scaled_attention_logits += (mask * -1e9)
    # softmax to obtain the attention weights
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    # weight the values by the attention weights
    output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth)
return output, attention_weights
# attention test
# def print_out(q, k, v):
# temp_out, temp_att = scaled_dot_product_attention(
# q, k, v, None)
# print('attention weight:')
# print(temp_att)
# print('output:')
# print(temp_out)
# np.set_printoptions(suppress=True)
# temp_k = tf.constant([[10,0,0],
# [0,10,0],
# [0,0,10],
# [0,0,10]], dtype=tf.float32) # (4, 3)
#
# temp_v = tf.constant([[ 1,0],
# [ 10,0],
# [ 100,5],
# [1000,6]], dtype=tf.float32) # (4, 2)
# # attend to the 2nd key and return the corresponding value
# temp_q = tf.constant([[0,10,0]], dtype=tf.float32)
# print_out(temp_q, temp_k, temp_v)
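# For reference: in the commented example above, the query matches only the second key,
# so the attention weights are expected to be ~[0, 1, 0, 0] and the output ~[[10, 0]].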
# Build the multi-head attention layer
class MutilHeadAttention(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads):
super(MutilHeadAttention, self).__init__()
self.num_heads = num_heads
self.d_model = d_model
        # d_model must be evenly divisible by the number of heads
assert d_model % num_heads == 0
        # per-head depth
self.depth = d_model // num_heads
self.wq = tf.keras.layers.Dense(d_model)
self.wk = tf.keras.layers.Dense(d_model)
self.wv = tf.keras.layers.Dense(d_model)
self.dense = tf.keras.layers.Dense(d_model)
def split_heads(self, x, batch_size):
        # split into heads and move the head dimension in front of seq_len
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, v, k, q, mask):
batch_size = tf.shape(q)[0]
        # linear projections (before splitting into heads) that produce q, k, v
# print('or_q:',q.shape)
q = self.wq(q) # (batch_size, seq_len, d_model)
# print('or_q:', q.shape)
# sys.exit(2)
k = self.wk(k)
v = self.wv(v)
        # split into heads
q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
k = self.split_heads(k, batch_size)
v = self.split_heads(v, batch_size)
        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        # run scaled dot-product attention
scaled_attention, attention_weights = scaled_dot_product_attention(
q, k, v, mask)
        # move the head dimension back behind seq_len
        scaled_attention = tf.transpose(scaled_attention, [0, 2, 1, 3]) # (batch_size, seq_len_q, num_heads, depth)
        # merge the heads
concat_attention = tf.reshape(scaled_attention,
(batch_size, -1, self.d_model))
        # final dense projection
output = self.dense(concat_attention)
return output, attention_weights
# temp_mha = MutilHeadAttention(d_model=512, num_heads=8)
# # y = tf.random.uniform((1, 60, 512))
# y = tf.random.uniform((2, 60, 512))
# # y = tf.random.uniform((1, 2,60, 32))
# output, att = temp_mha(y, k=y, q=y, mask=None)
# print('x:{}'.format(y.shape))
# print("out:{},att:{}".format(output.shape, att.shape))
def point_wise_feed_forward_network(d_model, diff):
return tf.keras.Sequential([
tf.keras.layers.Dense(diff, activation='relu'),
tf.keras.layers.Dense(d_model)
])
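# A minimal shape check for the feed-forward block (512/2048 are illustrative values):
# sample_ffn = point_wise_feed_forward_network(512, 2048)
# print(sample_ffn(tf.random.uniform((64, 50, 512))).shape)  # expected: (64, 50, 512)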
class LayerNormalization(tf.keras.layers.Layer):
def __init__(self, epsilon=1e-6, **kwargs):
self.eps = epsilon
super(LayerNormalization, self).__init__(**kwargs)
def build(self, input_shape):
self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
initializer=tf.ones_initializer(), trainable=True)
self.beta = self.add_weight(name='beta', shape=input_shape[-1:],
initializer=tf.zeros_initializer(), trainable=True)
super(LayerNormalization, self).build(input_shape)
def call(self, x):
mean = tf.keras.backend.mean(x, axis=-1, keepdims=True)
std = tf.keras.backend.std(x, axis=-1, keepdims=True)
return self.gamma * (x - mean) / (std + self.eps) + self.beta
def compute_output_shape(self, input_shape):
return input_shape
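# A minimal sketch of the custom LayerNormalization: mean and std are taken over the
# last axis, so each position is normalized independently (shapes below are illustrative):
# sample_ln = LayerNormalization()
# print(sample_ln(tf.random.uniform((2, 5, 8))).shape)  # expected: (2, 5, 8)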
class EncoderLayer(tf.keras.layers.Layer):
def __init__(self, d_model, n_heads, ddf, dropout_rate=0.1):
super(EncoderLayer, self).__init__()
self.mha = MutilHeadAttention(d_model, n_heads) # return (output, attention_weights)
self.ffn = point_wise_feed_forward_network(d_model, ddf)
self.layernorm1 = LayerNormalization(epsilon=1e-6)
self.layernorm2 = LayerNormalization(epsilon=1e-6)
self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    ########################################################### an outer residual connection across the whole block could be added here ##########################################################
def call(self, inputs, training, mask):
        # multi-head attention sub-layer
att_output, _ = self.mha(inputs, inputs, inputs, mask) # return (output, attention_weights)
att_output = self.dropout1(att_output, training=training)
out1 = self.layernorm1(inputs + att_output) # (batch_size, input_seq_len, d_model)
        # feed-forward sub-layer
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, d_model)
return out2
# sample_encoder_layer = EncoderLayer(512, 8, 2048) #(d_model, n_heads, ddf, dropout_rate=0.1)
# sample_encoder_layer_output = sample_encoder_layer(tf.random.uniform((64, 43, 512)), False, None)
# print('sample_encoder_layer_output.shape:',sample_encoder_layer_output.shape)
class DecoderLayer(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads, dff, drop_rate=0.1):
super(DecoderLayer, self).__init__()
self.mha1 = MutilHeadAttention(d_model, num_heads)
self.mha2 = MutilHeadAttention(d_model, num_heads)
self.ffn = point_wise_feed_forward_network(d_model, dff)
self.layernorm1 = LayerNormalization(epsilon=1e-6)
self.layernorm2 = LayerNormalization(epsilon=1e-6)
self.layernorm3 = LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(drop_rate)
self.dropout2 = layers.Dropout(drop_rate)
self.dropout3 = layers.Dropout(drop_rate)
def call(self, inputs, encode_out, training,
look_ahead_mask, padding_mask):
        # masked multi-head self-attention
att1, att_weight1 = self.mha1(inputs, inputs, inputs, look_ahead_mask) #(v, k, q)
att1 = self.dropout1(att1, training=training)
out1 = self.layernorm1(inputs + att1)
        # multi-head attention over the encoder output
        ########################################################### inputs here can be replaced by out1 ##########################################################
# att2, att_weight2 = self.mha2(encode_out, encode_out, inputs, padding_mask) #(v, k, q)
att2, att_weight2 = self.mha2(encode_out, encode_out, out1, padding_mask) # (v, k, q)
att2 = self.dropout2(att2, training=training)
out2 = self.layernorm2(out1 + att2)
ffn_out = self.ffn(out2)
ffn_out = self.dropout3(ffn_out, training=training)
out3 = self.layernorm3(out2 + ffn_out)
return out3, att_weight1, att_weight2
# sample_encoder_layer = EncoderLayer(512, 8, 2048) #(d_model, n_heads, ddf, dropout_rate=0.1)
# sample_encoder_layer_output = sample_encoder_layer(tf.random.uniform((64, 43, 512)), False, None)
# sample_decoder_layer = DecoderLayer(512, 8, 2048) # (d_model, num_heads, dff, drop_rate=0.1)
# sample_decoder_layer_output, _, _ = sample_decoder_layer(
# tf.random.uniform((64, 50, 512)), sample_encoder_layer_output,False, None, None)
# print('sample_decoder_layer_output.shape:',sample_decoder_layer_output.shape)
class Encoder(layers.Layer):
# def __init__(self, n_layers, d_model, n_heads, ddf,
# input_vocab_size, max_seq_len, drop_rate=0.1):
def __init__(self, n_layers, d_model, n_heads, ddf, drop_rate=0.1):
super(Encoder, self).__init__()
self.n_layers = n_layers
self.d_model = d_model
# self.embedding = layers.Embedding(input_vocab_size, d_model)
self.embedding = layers.Dense(d_model,activation='relu')
# self.pos_embedding = positional_encoding(max_seq_len, d_model)
self.encode_layer = [EncoderLayer(d_model, n_heads, ddf, drop_rate)
for _ in range(n_layers)]
self.dropout = layers.Dropout(drop_rate)
def call(self, inputs, training, mark):
seq_len = inputs.shape[1]
word_emb = self.embedding(inputs)
word_emb *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
# emb = word_emb + self.pos_embedding[:, :seq_len, :]
emb = word_emb + positional_encoding(seq_len, self.d_model)
x = self.dropout(emb, training=training)
for i in range(self.n_layers):
x = self.encode_layer[i](x, training, mark)
return x
# sample_encoder = Encoder(2, 512, 8, 1024) # (n_layers, d_model, n_heads, ddf, drop_rate=0.1)
# print('here')
# sample_encoder_output = sample_encoder(tf.random.uniform((64, 180,120)),False, None)
# print('sample_encoder_output.shape:',sample_encoder_output.shape)
# sys.exit(2)
class Decoder(layers.Layer):
def __init__(self, n_layers, d_model, n_heads, ddf, drop_rate=0.1):
super(Decoder, self).__init__()
self.d_model = d_model
self.n_layers = n_layers
# self.embedding = layers.Embedding(target_vocab_size, d_model)
self.embedding = layers.Dense(d_model, activation='relu')
# self.pos_embedding = positional_encoding(max_seq_len, d_model)
self.decoder_layers = [DecoderLayer(d_model, n_heads, ddf, drop_rate)
for _ in range(n_layers)]
self.dropout = layers.Dropout(drop_rate)
def call(self, inputs, encoder_out, training,
look_ahead_mark, padding_mark):
# seq_len = tf.shape(inputs)[1]
seq_len = inputs.shape[1]
attention_weights = {}
h = self.embedding(inputs)
h *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
# h += self.pos_embedding[:, :seq_len, :]
h += positional_encoding(seq_len, self.d_model)
h = self.dropout(h, training=training)
# print('--------------------\n',h, h.shape)
        # stack the decoder layers
for i in range(self.n_layers):
h, att_w1, att_w2 = self.decoder_layers[i](h, encoder_out,
training, look_ahead_mark,
padding_mark)
attention_weights['decoder_layer{}_att_w1'.format(i + 1)] = att_w1
attention_weights['decoder_layer{}_att_w2'.format(i + 1)] = att_w2
return h, attention_weights
# sample_encoder = Encoder(2, 512, 8, 1024) # (n_layers, d_model, n_heads, ddf, drop_rate=0.1)
# sample_encoder_output = sample_encoder(tf.random.uniform((64, 180,120)),False, None)
# # print('sample_encoder_output.shape:',sample_encoder_output.shape)
# sample_decoder = Decoder(2, 512,8,1024) # (n_layers, d_model, n_heads, ddf, drop_rate=0.1)
# sample_decoder_output, attn = sample_decoder(tf.random.uniform((64, 180,100)),sample_encoder_output, False, None, None)
# print(sample_decoder_output.shape, attn['decoder_layer1_att_w1'].shape)
# sys.exit(2)
class Transformer(tf.keras.Model):
def __init__(self, n_layers, d_model, n_heads, diff, target_vocab_size, drop_rate=0.1):
super(Transformer, self).__init__()
# self.bn1 = layers.BatchNormalization()
self.encoder = Encoder(n_layers, d_model, n_heads, diff,drop_rate)
self.decoder = Decoder(n_layers, d_model, n_heads, diff, drop_rate)
self.bn = tf.keras.layers.BatchNormalization()
self.final_layer = tf.keras.layers.Dense(target_vocab_size)
# self.final_layer = tf.keras.layers.Dense(target_vocab_size,activation='tanh')
# def call(self, inputs, targets, training, encode_padding_mask,
# look_ahead_mask, decode_padding_mask):
def call(self, inputs, targets, training,look_ahead_mask = None, encode_padding_mask = None,
decode_padding_mask = None):
# inputs = self.bn1(inputs)
encode_out = self.encoder(inputs, training, encode_padding_mask)
# print(encode_out.shape)
decode_out, att_weights = self.decoder(targets, encode_out, training,
look_ahead_mask, decode_padding_mask)
# print('decode_out.shape:',decode_out.shape)
decode_out = self.bn(decode_out)
final_out = self.final_layer(decode_out)
# final_out = self.final_layer(decode_out) *10
return final_out, att_weights
# sample_transformer = Transformer(n_layers=2, d_model=512, n_heads=8, diff=1024,target_vocab_size=20)
# temp_input = tf.random.uniform((64,180, 62))
# temp_target = tf.random.uniform((64, 180,26))
# fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
# encode_padding_mask=None,
# look_ahead_mask=None,
# decode_padding_mask=None,
# )
# print('fn_out.shape:',fn_out.shape)
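# For reference: in the commented test above, fn_out is expected to have shape
# (64, 180, 20), i.e. (batch, target_seq_len, target_vocab_size).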
# @tf.function
global_num = 0
global_train_acc = 0
def train_step(inputs, targets):
# print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^6")
global global_num
global global_train_acc
global_num +=1
# tar_inp = targets[:, :-1,:6]
# tar_real = targets[:, 1:,:6]
tar_inp = targets[:, :-1, 5][:,:,np.newaxis]
tar_real = targets[:, 1:, 5][:,:,np.newaxis]
# print()
with tf.GradientTape() as tape:
predictions, _ = transformer(inputs, tar_inp,training = True,look_ahead_mask=look_mask,decode_padding_mask = None)
# print("pre:{}".format(predictions[:2,:5]))
# print("True:{}".format(tar_real[:2,:5]))
# predictions, _ = transformer(inputs, tar_inp, training=True, look_ahead_mask=None,decode_padding_mask=None)
# loss = loss_fun(tar_real, predictions,loss_mask)
loss = loss_fun(tar_real, predictions)
if global_num % 10 == 0:
acc = get_acc(predictions,tar_real)
global_train_acc = acc.numpy()
# print('train_acc:{:.2f}'.format(acc.numpy()))
    # compute the gradients
gradients = tape.gradient(loss, transformer.trainable_variables)
    # backpropagate: apply the gradients
optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
mse_loss = tf.reduce_mean(tf.losses.MSE(predictions,tar_real))
train_loss(mse_loss)
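# A minimal sketch of how train_step could be driven. `transformer`, `optimizer`,
# `loss_fun`, `get_acc`, `train_loss` and `look_mask` are assumed to be defined elsewhere
# in this script; `train_dataset` and `EPOCHS` below are hypothetical placeholders:
# for epoch in range(EPOCHS):
#     train_loss.reset_states()
#     for batch_inputs, batch_targets in train_dataset:
#         train_step(batch_inputs, batch_targets)
#     print('epoch {}: mse {:.4f}, acc {:.2f}'.format(epoch + 1, train_loss.result(), global_train_acc))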