So I'm building a basic Keras text classifier, but no matter what I do, I can't get the validation accuracy above 49-50% (and sometimes it's lower). The training accuracy climbs normally: it starts at about 50% and reaches roughly 80% after 4-5 epochs.
Here's a sample of the output:
Epoch 1/3
- 54s - loss: 0.6982 - acc: 0.5064 - val_loss: 0.6932 - val_acc: 0.4950
Epoch 2/3
- 57s - loss: 0.6560 - acc: 0.6580 - val_loss: 0.7324 - val_acc: 0.4950
Epoch 3/3
- 60s - loss: 0.5359 - acc: 0.7047 - val_loss: 0.7339 - val_acc: 0.4955
Here's my code:
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
import numpy as np
import os
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras import optimizers

np.random.seed(7)

class TextClassifier:
    def __init__(self):
        self.tokenizer = Tokenizer(num_words=5000)
        self.top_words = 5000   # vocabulary size
        self.max_words = 500    # padded sequence length
        # Embedding -> Conv1D -> MaxPooling -> Dense binary classifier
        self.model = model = Sequential()
        model.add(Embedding(self.top_words, 64, input_length=self.max_words))
        model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(0.5))
        model.add(Flatten())
        model.add(Dense(250, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    def train(self, X_train, y_train, X_test, y_test):
        self.model.fit(X_train, y_train, shuffle=True,
                       validation_data=(X_test, y_test), epochs=3, batch_size=512, verbose=2)

    def predict(self, X):
        return self.model.predict(X)

    def init_tokenizer(self, X):
        self.tokenizer.fit_on_texts(X)

    def eval(self, X, y):
        return self.model.evaluate(X, y, verbose=0)

    def proccess_text(self, X):
        # Map texts to integer sequences and pad/truncate to max_words
        vocab_text = self.tokenizer.texts_to_sequences(X)
        vocab_text = sequence.pad_sequences(vocab_text, maxlen=500)
        return vocab_text

    def organize_text(self, pos_path, neg_path):
        # Build a {'label': [...], 'text': [...]} dict; pos -> 0, neg -> 1
        data = {'label': [], 'text': []}
        pos_texts = self.text_to_array(pos_path)
        neg_texts = self.text_to_array(neg_path)
        for i in pos_texts:
            data['label'].append(0)
            data['text'].append(i)
        for i in neg_texts:
            data['label'].append(1)
            data['text'].append(i)
        return data

    def text_to_array(self, path):
        '''Takes a path argument and retrieves all the text lines from
        within a folder'''
        name_list = []
        texts = []
        for file_ in os.listdir(path):
            name_list.append(file_)
        for i in name_list:
            file_ = open(path + i)
            texts.append(file_.read().splitlines())
        return texts
And here's the script that runs it:

from TextClassifier import *

path = './data/train/'
test_path = './data/test/'

model = TextClassifier()
data = model.organize_text(path + 'pos/', path + 'neg/')
tests = model.organize_text(test_path + 'pos/', test_path + 'neg/')
model.init_tokenizer(data['text'])
model.init_tokenizer(tests['text'])
X_train = np.array(model.proccess_text(data['text']))
X_test = np.array(model.proccess_text(tests['text']))
y_train = data['label']
y_test = tests['label']
model.train(X_train, y_train, X_test, y_test)
scores = model.eval(X_test, y_test)
input_ = model.proccess_text(['It was very good! Awesome! Enjoyable!'])
print("Predict: ")
predict = model.predict(input_)
print(predict)
print("Accuracy: %.2f%%" % (scores[1] * 100))
My corpus comes from here: http://ai.stanford.edu/~amaas/data/sentiment/ — I just split the first 1000 reviews of each class off as validation data. (They're unordered, AFAIK.)
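For reference, the split was done roughly along these lines (a hypothetical sketch, not my exact script; the paths match the ones used above):

import os
import shutil

# Hypothetical sketch: move the first 1000 files of each class out of
# train/ into a held-out test/ directory. Which 1000 shouldn't matter,
# since the reviews are unordered.
for label in ('pos', 'neg'):
    src = './data/train/' + label + '/'
    dst = './data/test/' + label + '/'
    os.makedirs(dst, exist_ok=True)
    for fname in sorted(os.listdir(src))[:1000]:
        shutil.move(src + fname, dst + fname)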
Answer 0 (score: 0)
I'm not sure how much Dropout(0.5) is helping your model there; BatchNormalization() might be a good substitute. You may also want to consider an LSTM layer after the embedding layer, and you can add dropout directly to the LSTM, e.g.:
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
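In context, a minimal sketch of how that variant could replace your conv/pooling stack (lstm_out is an assumed free parameter here, and this is untested against your data):

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

lstm_out = 100  # assumed size; tune as needed
model = Sequential()
model.add(Embedding(5000, 64, input_length=500))  # same vocab size/length as yours
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])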
There's a nice complete example of this kind of model that I'd invite you to look at: https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras
Hope this helps, and good luck!
Edit: as per my comment below
I think your model may be training on non-word tokens, because when I use this text_to_array(self, path) function instead, the validation accuracies come out at .79, .85 and .87:
import re

def text_to_array(self, path):
    '''Takes a path argument and retrieves all the text lines from
    within a folder'''
    name_list = []
    texts = []
    for file_ in os.listdir(path):
        name_list.append(file_)
    for i in name_list:
        file_ = open(path + i)
        for line in file_.read().splitlines():
            # I just preprocessed your text here: lowercase it and strip
            # everything that isn't alphanumeric or whitespace
            texts.append(re.sub(r'[^a-zA-Z0-9\s]', '', line.lower()))
    return texts
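The important difference is that texts now collects one plain string per line instead of one list of lines per file. When the Tokenizer is handed an element that is itself a list, recent Keras versions treat it as already tokenized, so every whole line becomes a single "word". A quick illustration of that failure mode (assuming a Keras version whose Tokenizer accepts pre-tokenized lists):

from keras.preprocessing.text import Tokenizer

# One "text" that is a list of lines: each entire line is kept as a
# single token, so the vocabulary ends up being lines, not words.
tok = Tokenizer(num_words=5000)
tok.fit_on_texts([['a great movie', 'loved it']])
print(tok.word_index)   # e.g. {'a great movie': 1, 'loved it': 2}

# Plain strings are tokenized into words, as intended.
tok2 = Tokenizer(num_words=5000)
tok2.fit_on_texts(['a great movie', 'loved it'])
print(tok2.word_index)  # e.g. {'a': 1, 'great': 2, 'movie': 3, ...}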
Let me know if that helps.
Answer 1 (score: 0)
I would suggest moving the dropout onto the dense layer and adding more conv layers; part of learning is playing around. Something along the lines of the sketch below, for instance.
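A hypothetical sketch of that suggestion (a second conv/pool block, with the dropout moved after the dense layer; the filter sizes are guesses and this is untested):

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D

model = Sequential()
model.add(Embedding(5000, 64, input_length=500))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))  # extra conv block
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.5))  # dropout moved to the dense layer
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])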