我正在尝试使用keras实现skip-gram。添加Dot()层时出错。以下实现受Dipanjan代码的启发。我做了一些与不同版本的keras相关的修改。
这是我到目前为止所执行的:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 200
%matplotlib inline
from nltk.corpus import gutenberg
from string import punctuation
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
def normalize_document(doc):
# lower case and remove special characters\whitespaces
doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
doc = doc.lower()
doc = doc.strip()
# tokenize document
tokens = wpt.tokenize(doc)
# filter stopwords out of document
filtered_tokens = [token for token in tokens if token not in stop_words]
# re-create document from filtered tokens
doc = ' '.join(filtered_tokens)
return doc
normalize_corpus = np.vectorize(normalize_document)
bible = gutenberg.sents('bible-kjv.txt')
remove_terms = punctuation + '0123456789'
remove_terms
norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
norm_bible = filter(None, normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]
from keras.preprocessing import text
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}
vocab_size = len(word2id) + 1
embed_size = 100
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]
from keras.preprocessing.sequence import skipgrams
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=10) for wid in wids]
from keras.layers import Add
from keras.layers.core import Dense, Reshape
from keras.layers import Input
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers.merge import Dot
from keras.models import Model
word_input = Input(shape=(1,), name="word_input")
word = Embedding(vocab_size, embed_size,embeddings_initializer="glorot_uniform",input_length=1)(word_input)
word = Reshape(target_shape=(embed_size,))(word)
context_input = Input(shape=(1,), name="context_input")
context = Embedding(vocab_size, embed_size,embeddings_initializer="glorot_uniform",input_length=1)(context_input)
context = Reshape(target_shape=(embed_size,))(context)
model = Sequential()
model.add(Dot(axes=1)([word, context]))
最后一行之后,出现类似-
的错误The added layer must be an instance of class Layer. Found: Tensor("dot_6/MatMul:0", shape=(?, 100, 100), dtype=float32)
想知道为什么会这样吗?