如何查看哪些测试数据分类不正确

时间:2018-03-31 09:58:37

标签: python neural-network keras classification imdb

输出的准确率不是100%,说明有些文本被网络错误分类了。如何在网络训练完成后查看这些被错误分类的文本?

import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from keras import optimizers
from keras.layers import Conv1D, GlobalMaxPooling1D

# Seed NumPy's global RNG before any data handling or model construction.
np.random.seed(42)

# Configuration constants. None of them is referenced again in this script;
# the vocabulary size, batch size and epoch count used below are literals.
max_features, maxlen = 10000, 400
batch_size, embedding_dims = 64, 200
filters, kernel_size = 150, 5
hidden_dims, epochs = 50, 5

# IMDB reviews as sequences of word indices, restricted to 1000 word indices.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)

print(x_train.shape)
print(x_test.shape)
print(x_train[0])
print(y_train[0])

# Turn each index sequence into a fixed-width binary matrix row
# (the first Dense layer below expects 1000 input features).
tokenizer = Tokenizer(num_words=1000)
x_train, x_test = (
    tokenizer.sequences_to_matrix(x_train, mode='binary'),
    tokenizer.sequences_to_matrix(x_test, mode='binary'),
)
print(x_train[0])

# Expand the labels into two columns, one per class.
num_classes = 2
y_train, y_test = (
    keras.utils.to_categorical(y_train, num_classes),
    keras.utils.to_categorical(y_test, num_classes),
)
print(y_train.shape)
print(y_test.shape)

# Fully connected stack 512 -> 128 -> 64 -> 32 with 20% dropout after each
# hidden layer, finished by a sigmoid layer with one unit per class.
model = Sequential([
    Dense(512, input_dim=1000, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='sigmoid'),
])
model.summary()

# Adam with an explicit learning rate of 0.1.
opt = optimizers.Adam(
    lr=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.0,
)
model.compile(optimizer=opt, loss="binary_crossentropy", metrics=['accuracy'])

# Train for 5 epochs; the test split is passed as validation data.
clf = model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_data=(x_test, y_test),
)

# evaluate() returns [loss, accuracy] for the compiled metrics.
score = model.evaluate(x_test, y_test, verbose=1)
print("Accuracy: ", score[1])

我尝试了这段代码,但收到了错误

# Attempt to select the misclassified test rows (this is the snippet that
# raises the IndexError quoted below).
# y_pred holds the outputs of the final sigmoid Dense(num_classes) layer,
# i.e. floating-point scores with one column per class, not class labels.
y_pred = model.predict(x_test)

# boolean mask
# Element-wise comparison of the score matrix with the one-hot y_test: the
# result has one column per class (2), not one entry per sample.
mask = y_pred != y_test

# print rows that were classified incorrectly
# x_test has 1000 columns, so indexing it with a 2-column boolean mask fails
# along dimension 1.
print(x_test[mask])
  

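`print(x_test[mask])` 处抛出的错误:`IndexError: boolean index did not match indexed array along dimension 1; dimension is 1000 but corresponding boolean dimension is 2`(布尔索引与被索引数组在第 1 维上不匹配:该维的长度是 1000,而对应的布尔维度是 2)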

1 个答案:

答案 0 :(得分:0)

我修改了您的完整代码,使它只使用一个输出类别(因为这是一个二分类问题),这样您就可以查看被错误分类的样本。结果表明,您使用的模型完全不适合这项任务。

import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from keras import optimizers
from keras.layers import Conv1D, GlobalMaxPooling1D
import pandas as pd

# Train a dense binary classifier on bag-of-words IMDB reviews, then collect
# the test samples whose predicted label differs from the true label.

# Seed NumPy's global RNG before any data handling or model construction.
np.random.seed(42)

# NOTE: these constants are not referenced anywhere below in this script;
# they are kept only so the names stay defined.
max_features = 10000
maxlen = 400
batch_size = 64
embedding_dims = 200
filters = 150
kernel_size = 5
hidden_dims = 50
epochs = 5

# IMDB reviews as sequences of word indices, restricted to 1000 word indices.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)

print(x_train.shape)
print(x_test.shape)
print(x_train[0])
print(y_train[0])

# Turn each index sequence into a fixed-width binary matrix row
# (the first Dense layer below expects 1000 input features).
tokenizer = Tokenizer(num_words=1000)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print(x_train[0])

# Binary problem: a single sigmoid output unit is used, so the labels are
# left as a flat 0/1 vector instead of being one-hot encoded.
num_classes = 1
print(y_train.shape)
print(y_test.shape)

model = Sequential()

model.add(Dense(512, input_dim=1000, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='sigmoid'))
model.summary()

# Adam with its default settings.
opt = optimizers.Adam()
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=['accuracy'])

clf = model.fit(x_train, y_train, batch_size=128, epochs=5, validation_data=(x_test, y_test))

score = model.evaluate(x_test, y_test, verbose=1)
print("Accuracy: ", score[1])

# The final layer is a sigmoid, so predict() yields floating-point scores in
# (0, 1) -- one column per sample here -- not 0/1 labels.
y_pred = model.predict(x_test)

# Convert the scores to hard labels. Comparing the raw float scores with the
# integer labels would flag practically every row as "wrong", because a
# sigmoid output is almost never exactly 0 or 1.
y_pred_label = (np.asarray(y_pred).ravel() > 0.5).astype(int)

# One row per test sample: the 1000 input features, the true label, the raw
# score ('pred') and the thresholded label ('pred_label').
df_test_pred = pd.concat(
    [
        pd.DataFrame(x_test),
        pd.DataFrame(y_test, columns=['test']),
        pd.DataFrame(y_pred, columns=['pred']),
        pd.DataFrame(y_pred_label, columns=['pred_label']),
    ],
    axis=1,
)

# Misclassified samples: the true label differs from the thresholded prediction.
df_wrong = df_test_pred[df_test_pred['test'] != df_test_pred['pred_label']]
print(len(df_wrong), "of", len(df_test_pred), "test samples were misclassified")
print(df_wrong.head())