I get an error when I try to pass in my own validation set via validation_data.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-53-e2816bdbad19> in <module>
      2                      np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
      3                     np.array(y_tr).reshape(len(y_tr), max_len, 1),
----> 4                     batch_size=32, epochs=10, validation_data=[X_word_te, y_te], verbose=1)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[ 7993, 30540, 29051, ...,     0,     0,     0],
       [ 9571, 24132, 14066, ...,     0,     0,     0],
       [19338, 15304,  7322, ...,     0,     0,     0],
       ...,
       [ 5062,  2713...
This is an export of my Jupyter notebook; it is a copy of the example published in this blog. The NER dataset is taken from https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
# In[11]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
# In[13]:
data = data.fillna(method="ffill")
# In[15]:
words = list(set(data["Word"].values))
n_words = len(words); n_words
# In[16]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags
# In[17]:
class SentenceGetter(object):
    """Groups the token-level rows of the dataframe into sentences."""
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # aggregate each sentence into a list of (word, POS, tag) tuples
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        # return the next sentence, or None once the corpus is exhausted
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except KeyError:
            return None
# In[18]:
getter = SentenceGetter(data)
# In[19]:
sent = getter.get_next()
# In[21]:
sentences = getter.sentences
# In[22]:
max_len = 75
max_len_char = 10
# In[23]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}
# In[25]:
from keras.preprocessing.sequence import pad_sequences
X_word = [[word2idx[w[0]] for w in s] for s in sentences]
# In[26]:
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')
# In[27]:
max_len_char
# In[28]:
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print(n_chars)
# In[29]:
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0
# In[30]:
# build a (max_len, max_len_char) matrix of character indices per sentence
X_char = []
for sentence in sentences:
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            # pad both short sentences (i out of range) and short words (j out of range)
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except IndexError:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))
# In[31]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]
# In[32]:
y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"], padding='post', truncating='post')
# In[33]:
from sklearn.model_selection import train_test_split
# In[34]:
X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.1, random_state=2018)
X_char_tr, X_char_te, _, _ = train_test_split(X_char, y, test_size=0.1, random_state=2018)
# In[35]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
# In[37]:
# input and embedding for words
word_in = Input(shape=(max_len,))
emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                     input_length=max_len, mask_zero=True)(word_in)
# input and embeddings for characters
char_in = Input(shape=(max_len, max_len_char,))
emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                                     input_length=max_len_char, mask_zero=True))(char_in)
# character LSTM to get word encodings by characters
char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                recurrent_dropout=0.5))(emb_char)
# main LSTM
x = concatenate([emb_word, char_enc])
x = SpatialDropout1D(0.3)(x)
main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6))(x)
out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)
model = Model([word_in, char_in], out)
# In[38]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])
# In[39]:
model.summary()
# In[52]:
history = model.fit([X_word_tr,
                     np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
                    np.array(y_tr).reshape(len(y_tr), max_len, 1),
                    batch_size=32, epochs=10, validation_data=(X_word_te, y_te), verbose=1)
EDIT: adding the model summary
model.summary()
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
input_2 (InputLayer)            (None, 75, 10)       0
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 75)           0
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 75, 10, 10)   1000        input_2[0][0]
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 75, 20)       703600      input_1[0][0]
__________________________________________________________________________________________________
time_distributed_2 (TimeDistrib (None, 75, 20)       2480        time_distributed_1[0][0]
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 75, 40)       0           embedding_1[0][0]
                                                                 time_distributed_2[0][0]
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 75, 40)       0           concatenate_1[0][0]
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 75, 100)      36400       spatial_dropout1d_1[0][0]
__________________________________________________________________________________________________
time_distributed_3 (TimeDistrib (None, 75, 18)       1818        bidirectional_1[0][0]
==================================================================================================
Total params: 745,298
Trainable params: 745,298
Non-trainable params: 0
Answer 0 (score: 0)
The problem probably lies in the validation_data argument of model.fit(). It should be

validation_data=([X_word_te, X_char_te], y_te)

to match the inputs the model expects.
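For concreteness, here is a minimal sketch of the corrected call, assuming the validation arrays are reshaped the same way as their training counterparts (the model has two inputs, so the validation inputs must also be a two-array list):

# sketch: validation inputs mirror the training inputs -- [words, chars], labels
history = model.fit([X_word_tr,
                     np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
                    np.array(y_tr).reshape(len(y_tr), max_len, 1),
                    batch_size=32, epochs=10, verbose=1,
                    validation_data=([X_word_te,
                                      np.array(X_char_te).reshape((len(X_char_te), max_len, max_len_char))],
                                     np.array(y_te).reshape(len(y_te), max_len, 1)))

Note that y_te is reshaped here as well, since with sparse_categorical_crossentropy the validation labels should have the same trailing dimension of 1 as the training labels.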