我为不平衡的类分类数据建立了文本分类模型。我使用googlenews word2vec向量作为嵌入层中的基线,而不是使用keras词向量。
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Embedding, SpatialDropout1D, Bidirectional, LSTM, Input, concatenate, Conv1D, GlobalMaxPooling1D, BatchNormalization
from keras.optimizers import SGD, Adam
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import keras.backend as K
from keras import backend as K
from keras import metrics
import numpy as np
from itertools import chain
from collections import Counter
from sklearn.utils import shuffle
import nltk
import gensim
from gensim.models import KeyedVectors
from sklearn.utils import class_weight
dat = pd.read_csv('/home/data.csv',encoding='latin',delimiter='\t')
dat = shuffle(dat)
dat.reset_index(drop=True,inplace=True)
由于这是类不平衡问题,所以我使用了f1度量标准。
def f1_metric(y_true, y_pred):
def recall(y_true, y_pred):
"""Recall metric.
Only computes a batch-wise average of recall.
Computes the recall, a metric for multi-label classification of
how many relevant items are selected.
"""
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def precision(y_true, y_pred):
"""Precision metric.
Only computes a batch-wise average of precision.
Computes the precision, a metric for multi-label classification of
how many selected items are relevant.
"""
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
precision = precision(y_true, y_pred)
recall = recall(y_true, y_pred)
return 2*((precision*recall)/(precision+recall+K.epsilon()))
我处理了文本并创建了如下的单词向量
def preprocess(dat):
return [nltk.word_tokenize(row) for row in dat]
x_train, x_test, y_train, y_test= train_test_split(dat.text,dat.labels,test_size=0.20)
X = preprocess(x_train)
model = KeyedVectors.load_word2vec_format('/home/user/Downloads/GoogleNews-vectors-negative300.bin', binary=True,limit=100000)
我使用此功能将文本数组转换为word2vec模型的数字值。
def word2idx(word):
return model.wv.vocab[word].index
vocab_size, emdedding_size = model.wv.syn0.shape
pretrained_weights = model.wv.syn0
print(vocab_size, emdedding_size)
100000 300
我创建了矩阵
max_sentence_len = 50
train_x = np.zeros([len(X), max_sentence_len], dtype=np.int32)
然后用word2vec模型的索引值将0替换为对应的 标记词,最多50个词。
for i in range(len(X)):
for j in range(len(X[i])):
try:
train_x[i][j] = word2idx(X[i][j])
except:
pass
我使用sklearn函数计算了班级权重,因为这是班级不平衡的问题。
class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train),y_train)
这是创建multiConvnet模型的功能。
def model_architecture(vocab_size,emdedding_size,pretrained_weights):
# vector-space embedding:
n_dim = 64
n_unique_words = 5000
max_review_length = 50
pad_type = trunc_type = 'pre'
drop_embed = 0.2
# convolutional layer architecture:
n_conv_1 = n_conv_2 = n_conv_3 = n_conv_4= 256
k_conv_1 = 3
k_conv_2 = 2
k_conv_3 = 4
k_conv_4 = 5
# dense layer architecture:
n_dense = 256
dropout = 0.2
input_layer = Input(shape=(max_review_length,), dtype='int16', name='input') # supports integers +/- 32.7k
# embedding_layer = Embedding(n_unique_words, n_dim, input_length=max_review_length, name='embedding')(input_layer)
embedding_layer = Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights], name='embedding')(input_layer)
drop_embed_layer = SpatialDropout1D(drop_embed, name='drop_embed')(embedding_layer)
conv_1 = Conv1D(n_conv_1, k_conv_1, activation='relu', name='conv_1')(drop_embed_layer)
maxp_1 = GlobalMaxPooling1D(name='maxp_1')(conv_1)
conv_2 = Conv1D(n_conv_2, k_conv_2, activation='relu', name='conv_2')(drop_embed_layer)
maxp_2 = GlobalMaxPooling1D(name='maxp_2')(conv_2)
conv_3 = Conv1D(n_conv_3, k_conv_3, activation='relu', name='conv_3')(drop_embed_layer)
maxp_3 = GlobalMaxPooling1D(name='maxp_3')(conv_3)
concat = concatenate([maxp_1, maxp_2, maxp_3])
dense_layer = Dense(n_dense, activation='relu', name='dense')(concat)
drop_dense_layer = Dropout(dropout, name='drop_dense')(dense_layer)
dense_2 = Dense(64, activation='relu', name='dense_2')(drop_dense_layer)
dropout_2 = Dropout(dropout, name='drop_dense_2')(dense_2)
predictions = Dense(units=1, activation='sigmoid', name='output')(dropout_2)
model = Model(input_layer, predictions)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_metric])
return model
我的模特在下面
mod_keras = model_architecture(vocab_size,emdedding_size,pretrained_weights)
mod_keras.fit(train_x,y_train,batch_size=32,epochs=2,verbose=1,validation_split=0.2,class_weight=class_weights)
运行此命令时,我遇到了错误。
Train on 287895 samples, validate on 71974 samples
Epoch 1/2
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-25-fcb6fa008311> in <module>
----> 1 mod_Access.fit(train_x,y_train_Access,batch_size=32,epochs=2,verbose=1,validation_split=0.2,class_weight=class_weights)
~/.local/lib/python3.5/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
1037 initial_epoch=initial_epoch,
1038 steps_per_epoch=steps_per_epoch,
-> 1039 validation_steps=validation_steps)
1040
1041 def evaluate(self, x=None, y=None,
~/.local/lib/python3.5/site-packages/keras/engine/training_arrays.py in fit_loop(model, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
197 ins_batch[i] = ins_batch[i].toarray()
198
--> 199 outs = f(ins_batch)
200 outs = to_list(outs)
201 for l, o in zip(out_labels, outs):
~/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
2713 return self._legacy_call(inputs)
2714
-> 2715 return self._call(inputs)
2716 else:
2717 if py_any(is_tensor(x) for x in inputs):
~/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in _call(self, inputs)
2673 fetched = self._callable_fn(*array_vals, run_metadata=self.run_metadata)
2674 else:
-> 2675 fetched = self._callable_fn(*array_vals)
2676 return fetched[:len(self.outputs)]
2677
~/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py in __call__(self, *args, **kwargs)
1437 ret = tf_session.TF_SessionRunCallable(
1438 self._session._session, self._handle, args, status,
-> 1439 run_metadata_ptr)
1440 if run_metadata:
1441 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
~/.local/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
526 None, None,
527 compat.as_text(c_api.TF_Message(self.status.status)),
--> 528 c_api.TF_GetCode(self.status.status))
529 # Delete the underlying status object from memory otherwise it stays alive
530 # as there is a reference to status from this from the traceback due to
InvalidArgumentError: indices[26,0] = -3338 is not in [0, 100000)
[[{{node embedding/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@training/Adam/Assign_2"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding/embeddings/read, embedding/Cast, training/Adam/gradients/embedding/embedding_lookup_grad/concat/axis)]]
我确实读了这篇帖子InvalidArgumentError (see above for traceback): indices[1] = 10 is not in [0, 10)
根据这篇文章,我需要设置词汇表。就我而言,这正是我通过使用参数vocab_size
完成的操作。