Using a VAE (variational autoencoder) for sentence embeddings/representations, or for representing any other structured data

The VAE is a remarkable algorithm. Its core idea is actually somewhat like word2vec, except that a layer of Bayesian thinking is added on top, which in turn makes it resemble LDA a bit.

Personally, I think the VAE has a lot of potential and many applications if it is explored well, because it is unsupervised in the true sense: once a sentence is represented as a vector, you can do whatever you want with it.

To introduce the VAE briefly: it uses variational inference to approximate the posterior over the latent variables, and both the input and the output are the sentence itself.
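Concretely, training maximizes the standard VAE evidence lower bound (the code below minimizes its negative). For a sentence x and latent code h, with encoder q_phi and decoder p_theta:

\mathcal{L}(\theta, \phi; x) = \mathbb{E}_{q_\phi(h \mid x)}\left[\log p_\theta(x \mid h)\right] - D_{\mathrm{KL}}\left(q_\phi(h \mid x) \,\|\, p(h)\right)

The first term is the reconstruction likelihood (g_loss in the code), and the second is the KL divergence from the Gaussian posterior N(\mu, \sigma^2 I) to the standard normal prior, which has the closed form used for e_loss: -\tfrac{1}{2}\sum_j \left(1 + \log\sigma_j^2 - \mu_j^2 - \sigma_j^2\right).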

Below is the simplest possible VAE implementation, using 1-grams: each sentence is represented as a bag-of-words count vector of size 1 * vocabulary_size.
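As a quick illustration of that input representation (the toy vocabulary and sentence below are made up for illustration; in the actual code this mapping is handled by the TextReader class):

import numpy as np

# hypothetical toy vocabulary: word -> index
vocab = {'the': 0, 'cat': 1, 'sat': 2, 'on': 3, 'mat': 4}

def to_bow(sentence, vocab):
    # turn a whitespace-tokenized sentence into a 1 x vocab_size count vector
    v = np.zeros((1, len(vocab)), dtype=np.int64)
    for word in sentence.split():
        if word in vocab:
            v[0, vocab[word]] += 1
    return v

print(to_bow('the cat sat on the mat', vocab))   # [[2 1 1 1 1]]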

The implementation is in TensorFlow. I am currently working on a seq2seq-based VAE; there are a few on GitHub, but none of the TensorFlow ones is written to my satisfaction.

#encoding=utf-8
import os
import itertools
import numpy as np
import tensorflow as tf
from reader import TextReader   # corpus reader; provides vocab_size and an iterator() over batches
import random

embed_dim = 500   # width of the encoder hidden layers
h_dim = 100       # dimension of the latent code, i.e. the sentence embedding


data_path = './n_gram/'
model_dir = './n_gram/model_dir/'
reader = TextReader(data_path)
def create_train_op(loss):
    # Adam with gradient clipping; optimize_loss also increments global_step
    train_op = tf.contrib.layers.optimize_loss(loss = loss,
        global_step = tf.contrib.framework.get_global_step(), 
        learning_rate = 0.01, 
        clip_gradients = 10.0, 
        optimizer = "Adam")
    return train_op


global_step = tf.Variable(0, name = 'global_step', trainable=False)

# bag-of-words input: one row of word counts per sentence
tx = tf.placeholder(tf.int64, [None, reader.vocab_size])
x = tf.to_float(tx)

batch_size = tf.placeholder(tf.int64)   # number of sentences in the current batch
w = tf.placeholder(tf.float32)          # weight fed as k/1000.0 in the training loop, but not used in the graph

with tf.variable_scope('encoder'):
    # two fully connected tanh layers map the bag-of-words vector to a hidden representation
    w_1 = tf.get_variable('w_1', [reader.vocab_size, embed_dim], initializer = tf.truncated_normal_initializer())
    b_1 = tf.get_variable('b_1', [embed_dim], initializer = tf.truncated_normal_initializer())

    L1 = tf.nn.bias_add(tf.matmul(x, w_1), b_1)
    L1 = tf.nn.tanh(L1)

    w_2 = tf.get_variable('w_2', [embed_dim, embed_dim], initializer = tf.truncated_normal_initializer())
    b_2 = tf.get_variable('b_2', [embed_dim], initializer = tf.truncated_normal_initializer())

    L2 = tf.nn.bias_add(tf.matmul(L1, w_2), b_2)
    L2 = tf.nn.tanh(L2)

    # linear heads for the Gaussian posterior q(h|x): mean and log-variance
    w_encoder_mu = tf.get_variable('w_encoder_mu', [embed_dim, h_dim], initializer = tf.truncated_normal_initializer(0, 0.01))
    b_encoder_mu = tf.get_variable('b_encoder_mu', [h_dim], initializer = tf.truncated_normal_initializer(0, 0.001))

    w_encoder_var = tf.get_variable('w_encoder_var', [embed_dim, h_dim], initializer = tf.truncated_normal_initializer(0, 0.01))
    b_encoder_var = tf.get_variable('b_encoder_var', [h_dim], initializer = tf.truncated_normal_initializer(0, 0.01))

    mu = tf.nn.bias_add(tf.matmul(L2, w_encoder_mu), b_encoder_mu)
    log_sigma_sq = tf.nn.bias_add(tf.matmul(L2, w_encoder_var), b_encoder_var)

    # reparameterization trick: h = mu + sigma * eps with eps ~ N(0, I)
    eps = tf.random_normal([batch_size, h_dim], 0, 1, dtype = tf.float32)
    sigma = tf.sqrt(tf.exp(log_sigma_sq))

    h = mu + sigma*eps

with tf.variable_scope('decoder') as vs:
    # softmax decoder: reconstructs a distribution over the vocabulary from the latent code
    R = tf.get_variable('R', [h_dim, reader.vocab_size], initializer = tf.truncated_normal_initializer(0, 0.0001))
    b = tf.get_variable('b', [reader.vocab_size], initializer = tf.truncated_normal_initializer(0, 0.0001))

    e = -tf.matmul(h, R) + b 
    p_x_i = tf.nn.softmax(e, -1)

# KL divergence between the Gaussian posterior N(mu, sigma^2) and the standard normal prior, per example
e_loss = -0.5 * tf.reduce_sum(1.0 + log_sigma_sq - tf.square(mu) - tf.exp(log_sigma_sq), 1)
# reconstruction loss: negative log-likelihood of the observed word counts under the softmax decoder
g_loss = -tf.reduce_sum(tf.log(p_x_i + 1e-10)*x, 1)
g_loss_stand = -tf.log(1.0/tf.reduce_sum(x, 1))*tf.reduce_sum(x, 1)
#g_loss = g_loss/tf.maximum(g_loss_stand, 1.0)


e_loss_mean = tf.reduce_mean(e_loss)
g_loss_mean = tf.reduce_mean(g_loss)

# total loss: reconstruction term plus a down-weighted KL term
# (the placeholder w looks like it was intended as this weight, but the constant 0.1 is used instead)
loss = 0.1*e_loss + g_loss
loss = tf.reduce_mean(loss)

# split the variables so the encoder and decoder could be optimized separately
encoder_var_list = []
decoder_var_list = []
for var in tf.trainable_variables():
    if 'encoder' in var.name:
        encoder_var_list.append(var)
    elif 'decoder' in var.name:
        decoder_var_list.append(var)


# separate optimizers for the two halves (not used in the training loop below; train_op updates everything)
optim_e = tf.train.AdamOptimizer(learning_rate=0.05).minimize(e_loss, global_step=global_step, var_list=encoder_var_list)
optim_g = tf.train.AdamOptimizer(learning_rate=0.05).minimize(g_loss, global_step=global_step, var_list=decoder_var_list)
train_op = create_train_op(loss)

saver = tf.train.Saver()
with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state(model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        print('the model being restored is')
        print(ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        print('successfully restored the session')

    count = global_step.eval()
    # the loop bound is the number of training steps; it is 0 here, so no training runs as posted
    for k in range(0, 0):
        data, length = reader.iterator()
        em, gm, lm, _= sess.run([e_loss_mean, g_loss_mean, loss, train_op], feed_dict = {tx: data, 
            batch_size:length,
            w:k/1000.0})
        print('After ' + str(global_step.eval()) + ' steps, loss\t' + str(lm) + '\tKL loss\t' + str(em) + '\tdecoder loss\t' + str(gm))
        # keep global_step in sync with the local counter (optimize_loss increments it as well)
        global_step.assign(count).eval()
        if k%10 == 0:
            saver.save(sess, model_dir + 'model.ckpt', global_step = global_step)
        count += 1
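Once the model is trained, the sentence embedding is simply the posterior mean mu. A minimal sketch of how the vectors might be read back out in a fresh session, assuming a checkpoint was saved by the loop above (only the tx feed is needed, since mu does not depend on the sampling noise):

# encode a batch of sentences with the trained model; the rows of `embeddings` are the sentence vectors
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint(model_dir))
    data, length = reader.iterator()
    embeddings = sess.run(mu, feed_dict = {tx: data})
    print(embeddings.shape)   # (number of sentences in the batch, h_dim)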

 
