Code

from numpy import sum as np_sum


# `train_cbow_pair` is defined alongside this function in gensim's word2vec module.
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    """
    Update CBOW model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from `Word2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.
    """
    result = 0
    for sentence in sentences:
        # Keep only in-vocabulary words that survive frequency subsampling
        # (sample_int stores each word's keep-probability as a 32-bit threshold).
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            # Context window around the current word, shrunk by `reduced_window` on each side.
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
            # Sum the context word vectors; average them instead when cbow_mean is set.
            l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
            if word2_indices and model.cbow_mean:
                l1 /= len(word2_indices)
            train_cbow_pair(model, word, word2_indices, l1, alpha)
        result += len(word_vocabs)
    return result
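
As the docstring notes, `train_batch_cbow` is not called directly; it runs inside `Word2Vec.train()` when the model is trained in CBOW mode. Below is a minimal usage sketch (not from the original listing), assuming the pre-4.0 gensim API that this code targets (it still uses `model.wv.syn0` and `model.wv.vocab`):

from gensim.models import Word2Vec

sentences = [["the", "quick", "brown", "fox"],
             ["jumps", "over", "the", "lazy", "dog"]]

# sg=0 selects CBOW; cbow_mean=1 averages the context vectors instead of summing them,
# matching the `model.cbow_mean` branch above. Training runs during construction.
model = Word2Vec(sentences, size=100, window=2, min_count=1, sg=0, cbow_mean=1)

print(model.wv["fox"].shape)  # (100,)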
