Code

February 22, 2017

def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    """
    Update CBOW model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from `Word2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.

    Parameters
    ----------
    model
        Object exposing `wv.vocab`, `wv.syn0`, `random`, `window` and `cbow_mean`,
        all of which are read here.
    sentences
        Iterable of sentences; each sentence is an iterable of tokens.
    alpha
        Forwarded unchanged to `train_cbow_pair` (presumably the learning rate --
        confirm against that function).
    work, neu1
        Not used by this implementation.

    Returns
    -------
    int
        Total number of tokens, summed over all sentences, that were found in the
        vocabulary and whose `sample_int` exceeded a random draw scaled by 2**32.
    """
    total_words = 0

    for sentence in sentences:
        vocab = model.wv.vocab

        # Keep only known tokens that pass the random sampling check.
        # The random draw happens only for tokens present in the vocabulary.
        kept = []
        for token in sentence:
            if token not in vocab:
                continue
            entry = vocab[token]
            if entry.sample_int > model.random.rand() * 2**32:
                kept.append(entry)

        for center_pos, center in enumerate(kept):
            # `b` in the original word2vec code: randomly shrinks the window.
            shrink = model.random.randint(model.window)
            first = max(0, center_pos - model.window + shrink)
            last = center_pos + model.window + 1 - shrink

            # Indices of the surrounding words, excluding the center word itself.
            context_indices = []
            for ctx_pos, ctx in enumerate(kept[first:last], first):
                if ctx is None or ctx_pos == center_pos:
                    continue
                context_indices.append(ctx.index)

            # Sum of the context rows of syn0 (noted as 1 x vector_size in the original).
            hidden = np_sum(model.wv.syn0[context_indices], axis=0)
            if context_indices and model.cbow_mean:
                # Average instead of sum when the model asks for it.
                hidden /= len(context_indices)

            train_cbow_pair(model, center, context_indices, hidden, alpha)

        total_words += len(kept)

    return total_words

Report content on this page

Report Page