多标签文本分类。我有一个文本/标签csv。文本是纯文本,标签是字母数字

时间:2018-04-24 07:25:31

标签: python-3.x machine-learning conv-neural-network

    import numpy as np                                      # used below: np.unique, np.asarray, np.concatenate
    import pandas as pd                                     # used below: pd.read_csv, pd.DataFrame

    import keras
    import keras.backend as K
    from keras.optimizers import Adam
    from keras.models import Sequential
    from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
    from keras.layers import Conv1D, MaxPooling1D, Embedding
    from keras.layers.core import Activation
    from keras.preprocessing.text import Tokenizer          # for tokenizing text
    from keras.preprocessing.sequence import pad_sequences  # for padding sentences with zeros, to make all sentences the same length
    from keras.utils import to_categorical                  # for one-hot encoding of the labels

    from sklearn.model_selection import train_test_split


    MAX_SEQUENCE_LENGTH = 300   # every padded sequence has exactly this length
    MAX_NB_WORDS = 20000        # keep only the 20k most frequent words in the vocabulary

    # Read the data: one text column ('Procedure') and one label column ('dxcode').
    raw_data = pd.read_csv("/home/riaz.k/Desktop/TRAIN.csv")
    raw_data.head()

    # Create training and testing sets (30% held out for validation).
    train, test = train_test_split(raw_data, test_size=0.3)
    train.head()
    test.head()

    # Fit the tokenizer on the TRAIN texts only, then map both splits to
    # integer sequences with the same vocabulary.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train.Procedure)
    train_sequences = tokenizer.texts_to_sequences(train.Procedure)
    test_sequences = tokenizer.texts_to_sequences(test.Procedure)

    word_index = tokenizer.word_index   # dict mapping each word to its integer index
    # print(tokenizer.word_index)
    print('Found %s unique tokens.' % len(word_index))

    # Pad/truncate every sequence to MAX_SEQUENCE_LENGTH (zeros on the left).
    train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    print(train_data.shape)
    print(test_data.shape)
    print(word_index)

    train_labels = train['dxcode']
    test_labels = test['dxcode']

    from sklearn import preprocessing
    from sklearn.preprocessing import LabelEncoder

    # Fit ONCE on the union of train and test labels. Calling .fit() twice
    # (once per split) would discard the first fit, so any label present in
    # train but absent from test would make .transform(train_labels) raise.
    le = LabelEncoder()
    le.fit(train_labels.tolist() + test_labels.tolist())
    train_labels = le.transform(train_labels)
    test_labels = le.transform(test_labels)

    print(le.classes_)
    print(np.unique(train_labels, return_counts=True))
    print(np.unique(test_labels, return_counts=True))

    le.inverse_transform([1])   # inverse_transform expects an array-like, not a bare int

    # One-hot targets to match the softmax/categorical_crossentropy model below.
    labels_train = to_categorical(np.asarray(train_labels))
    labels_test = to_categorical(np.asarray(test_labels))
    print('Shape of data tensor:', train_data.shape)
    print('Shape of label tensor:', labels_train.shape)
    print('Shape of label tensor:', labels_test.shape)

    EMBEDDING_DIM = 100   # size of the learned word-embedding vectors
    print(MAX_SEQUENCE_LENGTH)

    print('Training model.')

    # 1D-CNN text classifier: embedding -> two conv/pool stages -> dense softmax.
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS,
                        EMBEDDING_DIM,
                        input_length=MAX_SEQUENCE_LENGTH))
    model.add(Dropout(0.2))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    # Derive the output size from the one-hot targets instead of hard-coding 23:
    # a mismatch with the actual number of classes breaks training.
    model.add(Dense(labels_train.shape[1], activation='softmax'))
    # NOTE(review): for true MULTI-LABEL targets (several labels per text),
    # use Dense(n_labels, activation='sigmoid') with loss='binary_crossentropy'
    # and multi-hot label vectors (e.g. sklearn's MultiLabelBinarizer) instead
    # of LabelEncoder + to_categorical, which assume exactly one label per row.


    # Compile with cross-entropy loss and track accuracy on the held-out split.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['acc'])

    # Train for 10 epochs, validating on the test split after each epoch.
    model.fit(train_data,
              labels_train,
              epochs=10,
              batch_size=32,
              validation_data=(test_data, labels_test))

    # Final held-out evaluation, then class-probability predictions.
    model.evaluate(test_data, labels_test)
    pred = model.predict(test_data)
    pred
    # print(model.layers)
    for layer in model.layers:
        print(layer)

    import keras.backend as K

    # Backend function returning the embedding layer's output for padded inputs.
    emd = K.function(inputs=[model.layers[0].input],
                     outputs=[model.layers[0].output])

    rbind = np.concatenate((train_data, test_data), axis=0)
    print(rbind.shape)

    ### Submissions file
    # rbind stacks the SHUFFLED train split followed by the test split, so the
    # ids must come from the same concatenation: raw_data['Claimno'] is in the
    # original pre-split order and would misalign ids with predictions.
    ids = pd.concat([train, test], axis=0)['Claimno'].reset_index(drop=True)
    test_results = model.predict_classes(rbind)
    # print(test_results)
    # Use a fresh name: reusing `test_labels` would clobber the encoded test
    # labels still needed elsewhere.
    predicted_labels = le.inverse_transform(test_results)
    submissions_CNN = pd.DataFrame({'id': ids, "label": predicted_labels})
    submissions_CNN.to_csv("/home/riaz.k/Desktop/submissions.csv", index=False)

文本文档可以标记多个标签,那么如何对此数据集进行多标签分类?我已经阅读了很多来自sklearn的文档,但我似乎无法找到进行多标签分类的正确方法。提前感谢您的帮助。

1 个答案:

答案 0 :(得分:0)

你是否收到了这一行的错误:

train_labels = le.transform(train_labels)

如果是,那么它是因为在它上面的行中,你正在这样做:

le.fit(test_labels)

这样做会丢弃之前拟合的数据(即上一行对 fit() 的调用结果),只记住 test_labels 中出现过的标签。因此,当遇到一个只出现在训练集、而没有出现在测试集中的标签时,就会抛出这个错误。

你需要把这两行:

le.fit(train_labels)
le.fit(test_labels)

替换为下面这一行:

# I am using .tolist() because I observe that your 
# train_labels, test_labels are pandas Series objects
le.fit(train_labels.tolist() + test_labels.tolist())