## 前言

I是图片信息矩阵，也就是[224,224,3]。通过前面的CNN（即sequence-to-sequence模型中的encoder，我用的是VGG19）得到a，这里的a其实是[14*14,512]=[196,512]，很形象吧，代表的是图片被分成了196个区域，后面就看我们生成单词时注意在哪个区域了，大家可以先这么泛泛理解。通过了本文要讲的Attention之后得到注意力权重alpha和上下文向量z：alpha是当前单词对各个图像区域的注意力概率分布（哪个区域概率最大就表示注意在哪里），z则是用alpha对各区域特征加权求和得到的上下文向量。然后z组合单词的embedding去训练。

## attention的内部结构是什么？

def _get_initial_lstm(self, features):
    """Derive the LSTM's initial memory and hidden states from the image.

    The L region features are averaged into one (N, D) summary vector,
    which is passed through two independent fully-connected + tanh layers
    to produce the initial cell state and hidden state.

    Args:
        features: annotation tensor, assumed (N, L, D) — TODO confirm shape.

    Returns:
        (c, h): initial cell state and hidden state, each of shape (N, H).
    """
    with tf.variable_scope('initial_lstm'):
        # Mean over the region axis gives a single global image descriptor.
        mean_feature = tf.reduce_mean(features, 1)

        w_h = tf.get_variable('w_h', [self.D, self.H], initializer=self.weight_initializer)
        b_h = tf.get_variable('b_h', [self.H], initializer=self.const_initializer)
        hidden_state = tf.nn.tanh(tf.matmul(mean_feature, w_h) + b_h)

        w_c = tf.get_variable('w_c', [self.D, self.H], initializer=self.weight_initializer)
        b_c = tf.get_variable('b_c', [self.H], initializer=self.const_initializer)
        memory_state = tf.nn.tanh(tf.matmul(mean_feature, w_c) + b_c)

        return memory_state, hidden_state


y向量代表的就是feature。

h（LSTM隐状态）和投影后的feature在输入到relu激活之前要分别做个全连接，代码如下。

        # Attention parameters: 'w' maps the LSTM hidden state (H) into the
        # feature space (D); 'b' is the shared bias; 'w_att' scores each
        # D-dimensional vector down to a single scalar per region.
        w = tf.get_variable('w', [self.H, self.D], initializer=self.weight_initializer)
b = tf.get_variable('b', [self.D], initializer=self.const_initializer)
w_att = tf.get_variable('w_att', [self.D, 1], initializer=self.weight_initializer)

# Broadcast the projected hidden state over all L regions and combine it
# with the projected features; note the nonlinearity here is ReLU.
h_att = tf.nn.relu(features_proj + tf.expand_dims(tf.matmul(h, w), 1) + b)    # (N, L, D)


def _project_features(self, features):
    """Linearly project the annotation vectors into the attention space.

    Applies one shared (D, D) linear map to every region feature.

    Args:
        features: annotation tensor of shape (N, L, D).

    Returns:
        Projected features of shape (N, L, D).
    """
    with tf.variable_scope('project_features'):
        proj_matrix = tf.get_variable('w', [self.D, self.D], initializer=self.weight_initializer)
        # Flatten batch and region axes so a single matmul covers all regions,
        # then restore the (N, L, D) layout.
        flat = tf.reshape(features, [-1, self.D])
        projected = tf.matmul(flat, proj_matrix)
        return tf.reshape(projected, [-1, self.L, self.D])


# Collapse (N, L, D) to (N*L, D), score every region with 'w_att', then
# reshape back so each batch row holds L scalar attention logits.
out_att = tf.reshape(tf.matmul(tf.reshape(h_att, [-1, self.D]), w_att), [-1, self.L])   # (N, L)
# Softmax over the region axis: alpha is the probability distribution
# telling how much attention each image region receives.
alpha = tf.nn.softmax(out_att)


# The context vector z: the alpha-weighted sum of the region features.
context = tf.reduce_sum(features * tf.expand_dims(alpha, 2), 1, name='context')   #(N, D)


    def _attention_layer(self, features, features_proj, h, reuse=False):
        """Soft attention over image regions (Show, Attend and Tell).

        Scores each of the L region features against the current LSTM hidden
        state, normalizes the scores with a softmax, and returns the
        attention-weighted context vector together with the weights.

        Args:
            features: raw annotation vectors, shape (N, L, D).
            features_proj: pre-projected annotation vectors, shape (N, L, D).
            h: current LSTM hidden state, shape (N, H).
            reuse: whether to reuse the variables in this scope.

        Returns:
            (context, alpha): context vector (N, D) and attention
            distribution over regions (N, L).
        """
        with tf.variable_scope('attention_layer', reuse=reuse):
            w = tf.get_variable('w', [self.H, self.D], initializer=self.weight_initializer)
            b = tf.get_variable('b', [self.D], initializer=self.const_initializer)
            w_att = tf.get_variable('w_att', [self.D, 1], initializer=self.weight_initializer)

            # Project h into feature space and broadcast it over the L regions.
            hidden_proj = tf.expand_dims(tf.matmul(h, w), 1)
            activated = tf.nn.relu(features_proj + hidden_proj + b)    # (N, L, D)

            # One scalar score per region, then softmax-normalize per image.
            flat_scores = tf.matmul(tf.reshape(activated, [-1, self.D]), w_att)
            out_att = tf.reshape(flat_scores, [-1, self.L])   # (N, L)
            alpha = tf.nn.softmax(out_att)

            # Context vector: attention-weighted sum of the raw features.
            context = tf.reduce_sum(features * tf.expand_dims(alpha, 2), 1, name='context')   #(N, D)
            return context, alpha


ok，回到我们的image_caption中，看下图

https://segmentfault.com/a/1190000011744246

posted on 2018-11-24 08:56  alexanderkun  阅读(4456)  评论(0编辑  收藏  举报