import keras
import keras.backend as K
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.core import Activation
from keras.preprocessing.text import Tokenizer # for
tokenizing text
from keras.preprocessing.sequence import pad_sequences # for
padding sentences with zeros. To make the sentence length same
from keras.utils import to_categorical # for one-
hot encoding of the labels
from keras.layers import Dense, Input, Flatten, Dropout,
BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential
from sklearn.model_selection import train_test_split
MAX_SEQUENCE_LENGTH = 300
MAX_NB_WORDS = 20000
#Reading the data
raw_data=pd.read_csv("/home/riaz.k/Desktop/TRAIN.csv")
raw_data.head()
# create training and testing vars
train, test = train_test_split(raw_data, test_size=0.3)
train.head()
test.head()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train.Procedure)
train_sequences = tokenizer.texts_to_sequences(train.Procedure)
test_sequences = tokenizer.texts_to_sequences(test.Procedure)
word_index = tokenizer.word_index
containing words and their index
# print(tokenizer.word_index)
print('Found %s unique tokens.' % len(word_index))
train_data = pad_sequences(train_sequences,
maxlen=MAX_SEQUENCE_LENGTH)
train
test_data=pad_sequences(test_sequences,maxlen=MAX_SEQUENCE_LENGTH)
test
print(train_data.shape)
print(test_data.shape)
print (word_index)
train_labels = train['dxcode']
test_labels = test['dxcode']
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() # converts the character
array to numeric array.
Assigns levels to unique labels.
le.fit(train_labels)
le.fit(test_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)
print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))
le.inverse_transform(1)
labels_train = to_categorical(np.asanyarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)
EMBEDDING_DIM = 100
print(MAX_SEQUENCE_LENGTH)
print('Training model.')
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
EMBEDDING_DIM,
input_length=MAX_SEQUENCE_LENGTH
))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(23, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['acc'],)
model.fit(train_data, labels_train,
batch_size=32,
epochs=10,
validation_data=(test_data, labels_test))
model.evaluate(test_data, labels_test)
pred = model.predict(test_data)
pred
# print(model.layers)
for layer in model.layers:
print(layer)
import keras.backend as K
emd = K.function(inputs=[model.layers[0].input],
outputs=[model.layers[0].output])
rbind = np.concatenate((train_data, test_data), axis=0)
print(rbind.shape)
### Submissions file
test_results = model.predict_classes(rbind)
#print(test_results)
test_labels = le.inverse_transform(test_results)
#test_labels = [le.inverse_transform(i) for i in test_results]
submissions_CNN =
pd.DataFrame({'id':raw_data['Claimno'],"label":test_labels})
submissions_CNN.to_csv("/home/riaz.k/Desktop/submissions.csv",index=False)
文本文档可以标记多个标签,那么如何对此数据集进行多标签分类?我已经阅读了很多来自sklearn的文档,但我似乎无法找到进行多标签分类的正确方法。提前感谢您的帮助。
答案 0 :(得分:0)
你是否收到了这一行的错误:
train_labels = le.transform(train_labels)
如果是,那么它是因为在它上面的行中,你正在这样做:
le.fit(test_labels)
这样做会忘记之前的数据(之前对其上方的fit()
的调用)并且只记住test_labels
中的数据。因此,当一个新标签(出现在火车中而不是在测试中)出现时,它会抛出这个错误。
你需要重新排列:
le.fit(train_labels)
le.fit(test_labels)
用这个:
# I am using .tolist() because I observe that your
# train_labels, test_labels are pandas Series objects
le.fit(train_labels.tolist() + test_labels.tolist())