Question

我正在尝试为二进制分类训练模型。这是对推文的情感分析，但是模型在时期1之后提示错误。输入的大小必须是输入的大小，但无法准确确定是什么输入引起了问题。任何帮助将不胜感激。

非常感谢！

我已经尝试了许多不同大小的实例，问题仍然存在，

import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense


df = pd.read_csv('twitter-sentiment-analysis2/train.csv',encoding='latin-1')
df.drop(['ItemID'], axis=1, inplace=True)
label=list(df.Sentiment)
text=list(df.SentimentText)
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=' ')
tokenizer.fit_on_texts(text)
vocab = tokenizer.word_index
X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.1,random_state=42)

X_train_word_ids = tokenizer.texts_to_sequences(X_train)
X_test_word_ids = tokenizer.texts_to_sequences(X_test)
x_train = pad_sequences(X_train_word_ids, maxlen=50)
x_test= pad_sequences(X_test_word_ids, maxlen=50)

glove_dir = 'glove6b100dtxt/'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


embedding_dim = 100 #data comes from my GloVe
max_words=50
maxlen=50
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in vocab.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history = model.fit(x_train, y_train,epochs=10,batch_size=32,validation_split=0.1,shuffle=True)
model.save_weights('pre_trained_glove_model.h5')

有人可以给我一些建议去看哪里吗？再次感谢！

这是错误：

File "HM3.py", line 58, in <module>
    history = model.fit(x_train, y_train,epochs=10,batch_size=32,validation_split=0.1,shuffle=True)
  File "/usr/local/lib/python3.6/dist-packages/keras/engine/training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "/usr/local/lib/python3.6/dist-packages/keras/engine/training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
    fetched = self._callable_fn(*array_vals)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1439, in __call__
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: indices[26,39] = 31202 is not in [0, 50)
     [[{{node embedding_1/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_1/embeddings/read, embedding_1/Cast, embedding_1/embedding_lookup/axis)]]

Answer 1

max_words=50
...
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))

您创建了只能嵌入50个不同单词的嵌入，但是在训练数据中，您索引了所有出现的单词。该错误告诉您在大小为[0，50）的嵌入中找不到索引为31202的单词。

一种解决方案是扩大“嵌入”输入，以覆盖训练集中出现的所有单词。另一种方法是使用零索引并嵌入零，然后将索引大于等于50的所有训练词重新映射到该零。

深度学习模型在第一个时期后提示错误

1 个答案: