CS224n assignment1 Q3 word2vec

(a) 求解预测词向量 v_c 所对应的梯度。



(b) 求解输出词向量 u_w 的梯度（包括 u_o 在内）


(c) 梯度求解







(e) 完成word2vec模型


def normalizeRows(x):
    """Row normalization function.

    Scale each row of a matrix so that it has unit Euclidean length.

    Arguments:
    x -- 2-D numpy array with one vector per row. The division is done in
         place, so the array must have a floating-point dtype and the
         caller's array is modified.

    Return:
    x -- the same array object, with every row divided by its L2 norm.
    """
    # keepdims=True leaves the norms as an (n, 1) column so they broadcast
    # across the columns of each row.
    row_norms = np.sqrt(np.sum(x ** 2, axis=1, keepdims=True))
    # The tiny constant avoids a division by zero for all-zero rows.
    x /= row_norms + 1e-30
    return x


def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """Softmax cost function for word2vec models.

    Compute the cross-entropy cost and its gradients for one predicted word
    vector and one target word, using a softmax over all output vectors.
    This is one forward/backward pass for a single (predicted, target) pair.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the other word
            vectors (same shape as outputVectors)
    """
    # Score of every token against the predicted vector, then softmax.
    scores = np.dot(outputVectors, predicted)
    probs = softmax(scores)

    cost = -np.log(probs[target])

    # d(cost)/d(scores) = probs - one_hot(target)
    delta = probs.copy()
    delta[target] -= 1.0

    grad = np.outer(delta, predicted)
    gradPred = np.dot(outputVectors.T, delta)
    return cost, gradPred, grad


def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """Negative sampling cost function for word2vec models.

    Compute the cost and gradients for one predicted word vector and one
    target word using the negative sampling technique. K is the number of
    negative samples drawn.

    NOTE(review): the signature was truncated in this listing; the default
    K=10 follows the CS224n starter code -- confirm.

    Arguments:
    predicted -- numpy ndarray, predicted word vector
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- passed to getNegativeSamples to draw the negative samples
    K -- number of negative samples

    Return:
    cost -- negative sampling cost
    gradPred -- the gradient with respect to the predicted word vector
    grad -- the gradient with respect to the output vectors (same shape as
            outputVectors; only the target row and sampled rows are touched)
    """
    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    grad = np.zeros(outputVectors.shape)

    # Positive (target) term: -log(sigmoid(u_o . v_hat))
    pos = sigmoid(np.dot(outputVectors[target], predicted))
    cost = -np.log(pos)
    grad[target] += predicted * (pos - 1.0)
    gradPred = outputVectors[target] * (pos - 1.0)

    # Negative terms: -log(1 - sigmoid(u_k . v_hat)) for each sampled k.
    # A sample drawn more than once accumulates into the same grad row.
    for sample in indices[1:]:
        neg = sigmoid(np.dot(outputVectors[sample], predicted))
        cost -= np.log(1.0 - neg)
        grad[sample] += predicted * neg
        gradPred += outputVectors[sample] * neg
    return cost, gradPred, grad


def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
             dataset, word2vecCostAndGradient=softmaxCostAndGradient):
    """Skip-gram model in word2vec.

    Use the center word's input vector to predict every context word and
    accumulate the cost and gradients over the window.

    Arguments:
    currentWord -- a string of the current center word
    C -- integer, context size (not used in the body)
    contextWords -- list of no more than 2*C strings, the context words
    tokens -- a dictionary that maps words to their indices in
              the word vector list
    inputVectors -- "input" word vectors (as rows) for all tokens
    outputVectors -- "output" word vectors (as rows) for all tokens
    dataset -- forwarded unchanged to word2vecCostAndGradient
    word2vecCostAndGradient -- the cost and gradient function for
                               a prediction vector given the target
                               word vectors, could be one of the two
                               cost functions implemented above.

    Return:
    cost -- the cost function value for the skip-gram model
    gradIn -- the gradient with respect to the input word vectors
    gradOut -- the gradient with respect to the output word vectors
    """
    cost = 0.0
    gradIn = np.zeros(inputVectors.shape)
    gradOut = np.zeros(outputVectors.shape)

    center_idx = tokens[currentWord]
    v_hat = inputVectors[center_idx]

    # One prediction per word in the window.
    for word in contextWords:
        target_idx = tokens[word]  # index of the word to be predicted
        step_cost, step_grad_pred, step_grad_out = word2vecCostAndGradient(
            v_hat, target_idx, outputVectors, dataset)
        # Every context word contributes to the cost and to the gradient of
        # the single center-word input vector.
        cost += step_cost
        gradOut += step_grad_out
        gradIn[center_idx] += step_grad_pred
    return cost, gradIn, gradOut



def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
        PRINT_EVERY=10):
    """Stochastic Gradient Descent.

    Repeatedly evaluate `f`, step against its gradient, apply the optional
    postprocessing, and halve the step size every ANNEAL_EVERY iterations.

    NOTE(review): the signature was truncated in this listing; the default
    PRINT_EVERY=10 follows the CS224n starter code -- confirm.

    Arguments:
    f -- the function to optimize, it should take a single
         argument and yield two outputs, a cost and the gradient
         with respect to the arguments
    x0 -- the initial point to start SGD from; if it is a numpy array it is
          updated in place
    step -- the step size for SGD
    iterations -- total iterations to run SGD for
    postprocessing -- postprocessing function for the parameters
                      if necessary. In the case of word2vec we will need to
                      normalize the word vectors to have unit length.
    useSaved -- if true, resume from load_saved_params() and checkpoint with
                save_params() every SAVE_PARAMS_EVERY iterations
    PRINT_EVERY -- specifies how many iterations to output loss

    Return:
    x -- the parameter value after SGD finishes
    """
    # Anneal learning rate every several iterations
    ANNEAL_EVERY = 20000

    if useSaved:
        start_iter, oldx, state = load_saved_params()
        if start_iter > 0:
            x0 = oldx
            # Floor division: the step is halved once per *completed*
            # annealing period, matching the loop below. True division
            # would give a fractional exponent under Python 3.
            step *= 0.5 ** (start_iter // ANNEAL_EVERY)

        if state:
            # NOTE(review): the body of this branch was missing from the
            # listing; restoring the RNG state follows the CS224n starter
            # code -- confirm against load_saved_params().
            import random
            random.setstate(state)
    else:
        start_iter = 0

    x = x0

    if not postprocessing:
        postprocessing = lambda x: x

    # Exponentially smoothed cost, used only for progress output.
    expcost = None

    for it in range(start_iter + 1, iterations + 1):
        cost, grad = f(x)
        x -= step * grad
        # Apply the postprocessing after every update.
        x = postprocessing(x)

        if it % PRINT_EVERY == 0:
            if expcost is None:
                expcost = cost
            else:
                expcost = .95 * expcost + .05 * cost
            print("iter %d: %f" % (it, expcost))

        # useSaved is checked first so the checkpoint interval is only
        # looked up when checkpointing is enabled.
        if useSaved and it % SAVE_PARAMS_EVERY == 0:
            save_params(it, x)

        if it % ANNEAL_EVERY == 0:
            step *= 0.5

    return x




def cbow(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
         dataset, word2vecCostAndGradient=softmaxCostAndGradient):
    """CBOW model in word2vec.

    Continuous bag-of-words: sum the input vectors of the context words and
    use that sum to predict the center word.

    Arguments/Return specifications: same as the skip-gram model
    (C is not used in the body; dataset is forwarded unchanged to
    word2vecCostAndGradient).

    Return:
    cost -- the cost function value for the CBOW model
    gradIn -- the gradient with respect to the input word vectors
    gradOut -- the gradient with respect to the output word vectors
    """
    context_indices = [tokens[word] for word in contextWords]

    # The prediction vector is the sum of the context words' input vectors.
    predicted = np.zeros((inputVectors.shape[1],))
    for idx in context_indices:
        predicted += inputVectors[idx, :]

    cost, grad_pred, gradOut = word2vecCostAndGradient(
        predicted, tokens[currentWord], outputVectors, dataset)

    # Each context occurrence receives the full gradient of the summed
    # vector; a word that appears several times accumulates it each time.
    gradIn = np.zeros(inputVectors.shape)
    for idx in context_indices:
        gradIn[idx, :] += grad_pred
    return cost, gradIn, gradOut

参考: https://blog.csdn.net/longxinchen_ml/article/details/51765418

