I am trying to compute recall, precision, and the F1 score for multi-label classification with the code below, and I get an error on the classification_report statement:
ValueError: Shape of passed values is (1617, 1), indices imply (1617, 6)
Wrong number of items passed 1, placement implies 6
Code:
from sklearn.metrics import classification_report
history = model.fit(X_train, y_train, batch_size=128, epochs=1, verbose=1,validation_data=(X_test, y_test), validation_split=0.3)
pred = model.predict(X_test, batch_size=128, verbose=1)
predicted = np.argmax(pred, axis=1)
report = classification_report(np.argmax(y_test, axis=1), predicted, comments_labels)  #,comments_labels)
print(report)
My full code is:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
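# "data" is the comments DataFrame loaded earlier (not shown in the question)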
comments_labels = data[["l1", "l2", "l3", "l4", "l5", "l6"]]
nltk.download('punkt')
all_words = []
for sent in data['comment_text']:
    tokenize_word = word_tokenize(sent)
    for word in tokenize_word:
        all_words.append(word)
unique_words = set(all_words)
print(len(unique_words))
def preprocess_text(sen):
    # Remove punctuation and numbers
    #sentence = re.sub('[^a-zA-Z]', ' ', sen)
    # Single character removal
    #sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sen)
    return sentence
X = []
sentences = list(data['comment_text'])
for sen in sentences:
    X.append(preprocess_text(sen))
y = comments_labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
tokenizer = Tokenizer(num_words=15784)  # was 5000
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1
maxlen = 50  # was 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
from numpy import array
from numpy import asarray
from numpy import zeros
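The compile call further down references recall_m, precision_m, and custom_f1, which the question never defines. A minimal sketch of what such Keras-backend metrics commonly look like (these exact definitions are an assumption, not the asker's code):

from keras import backend as K

def recall_m(y_true, y_pred):
    # true positives / all actual positives, with epsilon to avoid division by zero
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def precision_m(y_true, y_pred):
    # true positives / all predicted positives
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())

def custom_f1(y_true, y_pred):
    # harmonic mean of precision and recall
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * (precision * recall) / (precision + recall + K.epsilon())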
deep_inputs = Input(shape=(maxlen,))
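# note: trainable=False with no pretrained weights loaded means the
# embedding vectors below stay at their random initialization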
embedding_layer = Embedding(vocab_size, 100, trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(6, activation='sigmoid')(LSTM_Layer_1)  # softplus, selu
model = Model(inputs=deep_inputs, outputs=dense_layer_1)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',recall_m,precision_m,custom_f1])
print(model.metrics_names)
from sklearn.metrics import classification_report
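# note: Keras ignores validation_split when validation_data is supplied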
history = model.fit(X_train, y_train, batch_size=128, epochs=1, verbose=1,validation_data=(X_test, y_test), validation_split=0.3)
pred = model.predict(X_test, batch_size=128, verbose=1)
predicted = np.argmax(pred, axis=1)
report = classification_report(np.argmax(y_test, axis=1), predicted, comments_labels)  #,comments_labels)
print(report)
test_output = model.predict(X_test, verbose=0)
Answer 0 (score: 0):
Did you one-hot encode your target labels? I think each label has dimension 1, and because of the Dense(6) output you need to convert the predictions to a single label so the shapes match.
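Building on that, a minimal sketch of two consistent ways to call classification_report here (assuming pred holds the sigmoid outputs from model.predict and y_test is the 6-column label DataFrame; the 0.5 threshold and the target_names choice are illustrative, not the asker's code):

from sklearn.metrics import classification_report
import numpy as np

# Option 1: keep the multi-label framing -- threshold the sigmoid outputs
# so truth and prediction are both (n_samples, 6) indicator arrays.
pred_binary = (pred > 0.5).astype(int)
print(classification_report(y_test.values, pred_binary,
                            target_names=list(comments_labels.columns)))

# Option 2: collapse both sides to one label per sample,
# so truth and prediction are matching 1-D arrays.
y_true_single = np.argmax(y_test.values, axis=1)
y_pred_single = np.argmax(pred, axis=1)
print(classification_report(y_true_single, y_pred_single,
                            target_names=list(comments_labels.columns)))

Either way, the first two arguments must have matching shapes. The pandas shape error itself most likely comes from calling np.argmax directly on the y_test DataFrame, which pandas then tries to wrap back into a 6-column frame; converting with y_test.values first avoids that.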