I don't know how to build a TensorFlow/Keras CBOW word-embedding model. I'm having trouble writing the data-generation function. Here is my preprocessing code:
from keras.preprocessing.text import Tokenizer

# Remove sentences with fewer than 3 words
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]
# Strip punctuation and fit the tokenizer on the entire corpus
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + "'")
tokenizer.fit_on_texts(corpus)
# Convert each sentence to a sequence of integer word IDs
tokenized_corpus = tokenizer.texts_to_sequences(corpus)
n_samples = sum(len(s) for s in tokenized_corpus)  # total number of words in the corpus
V = len(tokenizer.word_index) + 1  # vocabulary size (+1 for the reserved padding index 0)
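For reference, a toy example of what this preprocessing yields (the sentences and IDs below are illustrative; the Tokenizer assigns IDs by descending word frequency):

corpus = ["the quick brown fox", "jumps over the lazy dog", "the dog barks"]
# After the code above:
# tokenized_corpus -> [[1, 3, 4, 5], [6, 7, 1, 8, 2], [1, 2, 9]]
# V                -> 10  (9 unique words + 1 for padding)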
I have implemented skip-gram like this:
import numpy as np
from keras.utils import to_categorical

def generate_data_skipgram(corpus, window_size, V):
    all_in = []
    all_out = []
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            # Window boundaries around the current center word
            p = index - window_size
            n = index + window_size + 1
            for i in range(p, n):
                if i != index and 0 <= i < L:
                    # Input is the center word...
                    all_in.append(word)
                    # ...and the target is a one-hot vector of the context word
                    all_out.append(to_categorical(words[i], V))
    return np.array(all_in), np.array(all_out)
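A quick sanity check of the shapes this returns (window_size=2 is just an illustrative value):

X, Y = generate_data_skipgram(tokenized_corpus, window_size=2, V=V)
print(X.shape)  # (n_pairs,)   one center-word ID per (center, context) pair
print(Y.shape)  # (n_pairs, V) one-hot vector of the context word for each pair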
How should I create a generate_data_CBOW function analogous to the skip-gram one? I tried searching for something similar, but every result I found is from 2016 and out of date.
Edit: here is the model I use to run skip-gram, in case it helps:
from keras.models import Sequential
from keras.layers import Embedding, Reshape, Dense
from keras.utils import plot_model

dim = 50

# Create the skip-gram architecture: embed the center word, then predict
# the context word with a softmax over the whole vocabulary
model = Sequential()
model.add(Embedding(input_dim=V, output_dim=dim, input_length=1,
                    embeddings_initializer='glorot_uniform'))
model.add(Reshape((dim,)))
model.add(Dense(V, kernel_initializer='glorot_uniform', activation='softmax'))
# Compile the model
model.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['accuracy'])
# Summarize the model
print(model.summary())
plot_model(model, show_shapes=True, show_layer_names=False)
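Training this model on the arrays from generate_data_skipgram would look roughly like this (batch_size and epochs are placeholder values, not tuned):

X, Y = generate_data_skipgram(tokenized_corpus, window_size=2, V=V)
model.fit(X, Y, batch_size=128, epochs=5)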
Answer 0 (score: 0)
In CBOW the direction is reversed: the window of context words is the input and the center word is the one-hot target. A generator in the same style pads each context window to a fixed length and yields (context, target) pairs:

import numpy as np
from keras.preprocessing import sequence
from keras.utils import np_utils

def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1
            # Gather the (up to) 2*window_size words around the center word
            context_words.append([words[i]
                                  for i in range(start, end)
                                  if 0 <= i < sentence_length and i != index])
            # The center word itself is the prediction target
            label_word.append(word)
            # Pad short windows (at sentence edges) with 0s to a fixed length
            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)
# Test this out for some samples.
# Note: wids is the tokenized corpus (the question's corpus after
# texts_to_sequences) and id2word maps IDs back to words, e.g. tokenizer.index_word
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]],
              '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
        if i == 10:
            break
        i += 1
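To train on this generator you also need a CBOW architecture: unlike the skip-gram model above, which takes a single word, the input here is a padded window of window_size*2 context IDs whose embeddings are averaged before the softmax. A minimal sketch (reusing V, dim, and window_size from the question; the epoch count is arbitrary):

from keras import backend as K
from keras.models import Sequential
from keras.layers import Embedding, Lambda, Dense

cbow = Sequential()
cbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size * 2))
# Average the embeddings of all context words into a single vector
cbow.add(Lambda(lambda t: K.mean(t, axis=1), output_shape=(dim,)))
cbow.add(Dense(V, activation='softmax'))
cbow.compile(optimizer='adadelta', loss='categorical_crossentropy')

# The generator yields one (context window, one-hot target) pair at a time
for epoch in range(1, 6):
    loss = 0.0
    for x, y in generate_context_word_pairs(tokenized_corpus, window_size, V):
        loss += cbow.train_on_batch(x, y)
    print('Epoch:', epoch, '\tLoss:', loss)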