I get an error when I try to pass in my own validation set via validation_data.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-53-e2816bdbad19> in <module>
      2                      np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
      3                     np.array(y_tr).reshape(len(y_tr), max_len, 1),
----> 4                     batch_size=32, epochs=10, validation_data=[X_word_te, y_te], verbose=1)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[ 7993, 30540, 29051, ...,     0,     0,     0],
       [ 9571, 24132, 14066, ...,     0,     0,     0],
       [19338, 15304,  7322, ...,     0,     0,     0],
       ...,
       [ 5062,  2713...
This is an export of my Jupyter notebook; it is a copy of the example published in this blog. The NER dataset is taken from https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
# In[11]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
# In[13]:
data = data.fillna(method="ffill")
# In[15]:
words = list(set(data["Word"].values))
n_words = len(words); n_words
# In[16]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags
# In[17]:
class SentenceGetter(object):
    """Groups the token-level rows of the dataframe into sentences."""
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # aggregate each sentence into a list of (word, POS, tag) tuples
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        # return the next sentence, or None once the corpus is exhausted
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except KeyError:
            return None
# In[18]:
getter = SentenceGetter(data)
# In[19]:
sent = getter.get_next()
# In[21]:
sentences = getter.sentences
# In[22]:
max_len = 75
max_len_char = 10
# In[23]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}
# In[25]:
from keras.preprocessing.sequence import pad_sequences
X_word = [[word2idx[w[0]] for w in s] for s in sentences]
# In[26]:
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')
# In[27]:
max_len_char
# In[28]:
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print(n_chars)
# In[29]:
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0
# In[30]:
# build a (max_len, max_len_char) matrix of character indices per sentence
X_char = []
for sentence in sentences:
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            # pad both short sentences (i out of range) and short words (j out of range)
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except IndexError:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))
# In[31]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]
# In[32]:
y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"], padding='post', truncating='post')
# In[33]:
from sklearn.model_selection import train_test_split
# In[34]:
X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.1, random_state=2018)
X_char_tr, X_char_te, _, _ = train_test_split(X_char, y, test_size=0.1, random_state=2018)
# In[35]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
# In[37]:
# input and embedding for words
word_in = Input(shape=(max_len,))
emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                     input_length=max_len, mask_zero=True)(word_in)
# input and embeddings for characters
char_in = Input(shape=(max_len, max_len_char,))
emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                                     input_length=max_len_char, mask_zero=True))(char_in)
# character LSTM to get word encodings by characters
char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                recurrent_dropout=0.5))(emb_char)
# main LSTM
x = concatenate([emb_word, char_enc])
x = SpatialDropout1D(0.3)(x)
main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6))(x)
out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)
model = Model([word_in, char_in], out)
# In[38]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])
# In[39]:
model.summary()
# In[52]:
history = model.fit([X_word_tr,
                     np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
                    np.array(y_tr).reshape(len(y_tr), max_len, 1),
                    batch_size=32, epochs=10, validation_data=(X_word_te, y_te), verbose=1)
EDIT: adding the model summary
model.summary()
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
input_2 (InputLayer)            (None, 75, 10)       0
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 75)           0
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 75, 10, 10)   1000        input_2[0][0]
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 75, 20)       703600      input_1[0][0]
__________________________________________________________________________________________________
time_distributed_2 (TimeDistrib (None, 75, 20)       2480        time_distributed_1[0][0]
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 75, 40)       0           embedding_1[0][0]
                                                                 time_distributed_2[0][0]
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 75, 40)       0           concatenate_1[0][0]
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 75, 100)      36400       spatial_dropout1d_1[0][0]
__________________________________________________________________________________________________
time_distributed_3 (TimeDistrib (None, 75, 18)       1818        bidirectional_1[0][0]
==================================================================================================
Total params: 745,298
Trainable params: 745,298
Non-trainable params: 0
Answer 0 (score: 0)
The problem probably lies in the validation_data argument of model.fit(). It should be

validation_data=([X_word_te, X_char_te], y_te)

to match the inputs the model expects.
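For concreteness, here is a minimal sketch of the corrected call, assuming the validation arrays are reshaped the same way as their training counterparts (the model has two inputs, so the validation inputs must also be a two-array list):

# sketch: validation inputs mirror the training inputs -- [words, chars], labels
history = model.fit([X_word_tr,
                     np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
                    np.array(y_tr).reshape(len(y_tr), max_len, 1),
                    batch_size=32, epochs=10, verbose=1,
                    validation_data=([X_word_te,
                                      np.array(X_char_te).reshape((len(X_char_te), max_len, max_len_char))],
                                     np.array(y_te).reshape(len(y_te), max_len, 1)))

Note that y_te is reshaped here as well, since with sparse_categorical_crossentropy the validation labels should have the same trailing dimension of 1 as the training labels.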