我正在尝试运行 BiLSTM Keras 模型中的一部分代码,目前运行的是下面这段:
def get_pad_train_test_val(data_group, data):
    """Pad the token/tag index sequences and split them into train/val/test.

    Every sequence is post-padded to the length of the longest token
    sequence, the tag sequences are one-hot encoded, and the result is split
    90/10 into (train + validation)/test, then 75/25 into train/validation.

    Args:
        data_group: Object indexed by ``'Word_idx'`` and ``'Tag_idx'``; each
            must expose ``.tolist()`` yielding one index sequence per
            sentence (presumably a pandas DataFrame -- confirm with caller).
        data: Object indexed by ``'Word'`` whose ``.to_list()`` yields every
            word; used only to count the distinct words.

    Returns:
        The tuple ``(train_tokens, val_tokens, test_tokens, train_tags,
        val_tags, test_tags)``.

    Raises:
        ValueError: If ``data_group['Word_idx']`` contains no sequences
            (``max`` of an empty iterable).

    Note:
        Reads the module-level ``tag2idx`` mapping for the ``"O"`` tag index
        and for the number of tag classes.
    """
    # Number of distinct words; ``n_token - 1`` is used as the padding value
    # below (presumably the index reserved for a padding token -- confirm
    # against the code that builds the word index).
    n_token = len(set(data['Word'].to_list()))

    # Pad tokens (X var) to the longest sentence.
    tokens = data_group['Word_idx'].tolist()
    maxlen = max(len(s) for s in tokens)
    pad_tokens = pad_sequences(
        tokens, maxlen=maxlen, dtype='int32', padding='post',
        value=n_token - 1,
    )

    # Pad tags (y var) with the "O" tag and convert them to one-hot encoding.
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(
        tags, maxlen=maxlen, dtype='int32', padding='post',
        value=tag2idx["O"],
    )
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    # Split off the test set first, then carve validation out of the rest.
    # The fixed random_state keeps both splits reproducible.
    tokens_, test_tokens, tags_, test_tags = train_test_split(
        pad_tokens, pad_tags, test_size=0.1, train_size=0.9,
        random_state=2020,
    )
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(
        tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020,
    )

    print(
        'train_tokens length:', len(train_tokens),
        # Fixed: this entry used to repeat the train_tokens label and value.
        '\ntrain_tags:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags
# Build the padded train/validation/test splits from the grouped data.
(
    train_tokens,
    val_tokens,
    test_tokens,
    train_tags,
    val_tags,
    test_tags,
) = get_pad_train_test_val(data_group, data)
但是我不想把数据集划分为训练集和测试集,因为我已经单独准备好了测试数据集。所以我想在不使用 train_test_split 的情况下,分别计算训练集和测试集的标记和标签数量,我该怎么做?