在 Keras 中计算多标签分类的召回率、精确率和 F1 分数

时间:2020-06-12 15:14:33

标签: python machine-learning keras scikit-learn

我正在尝试使用下面的代码为多标签分类计算召回率、精确率和 F1 分数。

但调用 classification_report 的那条语句出现了如下错误:

ValueError: Shape of passed values is (1617, 1), indices imply (1617, 6)

Wrong number of items passed 1, placement implies 6

代码:

from sklearn.metrics import classification_report

# Train for a single epoch. Both validation_data and validation_split are
# passed; per the Keras documentation validation_data takes precedence, so
# validation_split=0.3 has no effect here.
history = model.fit(X_train, y_train, batch_size=128, epochs=1, verbose=1,
                    validation_data=(X_test, y_test), validation_split=0.3)
pred = model.predict(X_test, batch_size=128, verbose=1)
# Index of the highest-scoring output column for each test sample.
predicted = np.argmax(pred, axis=1)
# This is the statement reported to raise the ValueError quoted above.
# NOTE(review): y_test is presumably a pandas DataFrame; np.argmax on a
# DataFrame tries to re-wrap its 1-D result with the original six columns,
# which matches the quoted shape error. Passing y_test.values would avoid
# that - confirm.
# NOTE(review): comments_labels is passed positionally, so it binds to the
# `labels` parameter; target_names with the column names was probably
# intended - confirm.
report = classification_report(np.argmax(y_test, axis=1), predicted, comments_labels)
print(report)

我的完整代码如下:

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt


# The six label columns selected from the data set.
comments_labels = data[["l1", "l2", "l3", "l4", "l5", "l6"]]


# word_tokenize relies on the 'punkt' tokenizer models.
nltk.download('punkt')

# Flatten every comment into one running list of tokens.
all_words = []
for sent in data['comment_text']:
    all_words.extend(word_tokenize(sent))

# Print the number of distinct tokens seen across all comments.
unique_words = set(all_words)
print(len(unique_words))

def preprocess_text(sen):
    """Collapse every run of whitespace in *sen* into a single space.

    Only whitespace is normalised; punctuation, digits and single
    characters are left untouched.

    Args:
        sen: The raw text to clean.

    Returns:
        The text with each whitespace run replaced by one space.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', sen)

# Clean every comment; X keeps the texts in their original order.
sentences = list(data['comment_text'])
X = [preprocess_text(sen) for sen in sentences]

# Targets are the label columns selected earlier.
y = comments_labels


# Hold out 20% of the samples for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)



# Fit the tokenizer on the training texts only.
# num_words is 15784 (5000 was noted next to it in an earlier comment).
tokenizer = Tokenizer(num_words=15784)
tokenizer.fit_on_texts(X_train)

# One more than the number of known words: Keras word indices start at 1.
vocab_size = len(tokenizer.word_index) + 1

# Fixed sequence length fed to the model (previously 100).
maxlen = 50

# Convert texts to integer sequences, then pad/truncate to maxlen with
# padding added at the end of each sequence.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(X_train), padding='post', maxlen=maxlen
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(X_test), padding='post', maxlen=maxlen
)



from numpy import array
from numpy import asarray
from numpy import zeros

# Functional-API model: token ids -> embedding -> LSTM -> 6 sigmoid outputs.
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(
    input_dim=vocab_size, output_dim=100, trainable=False
)(deep_inputs)
LSTM_Layer_1 = LSTM(units=128)(embedding_layer)
# Sigmoid output, one unit per label (softplus and selu were noted as
# alternatives in an earlier comment).
dense_layer_1 = Dense(units=6, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

# NOTE(review): recall_m, precision_m and custom_f1 are not defined in the
# visible code; they are expected to be provided elsewhere - confirm.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', recall_m, precision_m, custom_f1],
)


print(model.metrics_names)

from sklearn.metrics import classification_report

# Train for a single epoch. Both validation_data and validation_split are
# passed; per the Keras documentation validation_data takes precedence, so
# validation_split=0.3 has no effect here.
history = model.fit(X_train, y_train, batch_size=128, epochs=1, verbose=1,
                    validation_data=(X_test, y_test), validation_split=0.3)
pred = model.predict(X_test, batch_size=128, verbose=1)
# Index of the highest-scoring output column for each test sample.
predicted = np.argmax(pred, axis=1)
# This is the statement reported to raise the ValueError quoted at the top.
# NOTE(review): y_test is a slice of comments_labels, presumably a pandas
# DataFrame; np.argmax on a DataFrame tries to re-wrap its 1-D result with
# the original six columns, which matches the quoted shape error. Passing
# y_test.values would avoid that - confirm.
# NOTE(review): comments_labels is passed positionally, so it binds to the
# `labels` parameter; target_names with the column names was probably
# intended - confirm.
report = classification_report(np.argmax(y_test, axis=1), predicted, comments_labels)
print(report)


# Predict on the test set again, this time without progress output.
test_output = model.predict(X_test, verbose=0)

1 个答案:

答案 0 :(得分:0)

您对目标标签进行了独热(one-hot)编码吗?

我认为每个标签的维度均为 1,而模型的输出层是 Dense(6),因此您需要将其转换为单标签。