I put my current data into the post.embedding file and tested it with the Bidirectional_LSTM, but the accuracy is not satisfactory. If I want to improve the accuracy, is it better to retrain post.embedding, or to train the Bidirectional_LSTM further?
Below is the code I am currently using.
createEmbedding.py
from gensim.models import Word2Vec
import csv

# loadCsv is my own helper (defined elsewhere); each row it returns is [token_list, label].
token = loadCsv("test_data")

# Collect the token list of every post into one corpus for Word2Vec.
embeddingmodel = []
for i in range(len(token)):
    temp_embeddingmodel = []
    for k in range(len(token[i][0])):
        temp_embeddingmodel.append(token[i][0][k])
    embeddingmodel.append(temp_embeddingmodel)

# gensim < 4.0 parameter names (size/iter); skip-gram, 300-dimensional vectors.
embedding = Word2Vec(embeddingmodel, size=300, window=5, min_count=3,
                     iter=100, sg=1, workers=4, max_vocab_size=360000000)
embedding.save('post.embedding')
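To check whether the embedding itself looks reasonable, this is the kind of quick inspection I can run on the saved model (a minimal sketch using the same gensim 3.x API as above; 'some_word' is just a placeholder token from my corpus):

from gensim.models import Word2Vec

check = Word2Vec.load('post.embedding')
print(len(check.wv.vocab))                 # how many words survived min_count=3
print(check.wv.most_similar('some_word'))  # nearest neighbours of a placeholder token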
Bi_LSTM_train_csv.py
# -*- coding: utf-8 -*-
import time
import tensorflow as tf
import numpy as np
import Bi_LSTM as Bi_LSTM
import Word2Vec as Word2Vec
import csv
import os

W2V = Word2Vec.Word2Vec()  # helper class from my Word2Vec.py (One_hot, Convert2Vec, Zero_padding)

tokens = loadCsv("test_data")  # my own CSV loader; each row is [token_list, label]
train_X = tokens[:, 0]
train_Y = tokens[:, 1]
train_Y_ = W2V.One_hot(train_Y)  # convert the labels to one-hot vectors
train_X_ = W2V.Convert2Vec("Data/post.embedding", train_X)  # look up the word2vec model trained above
Batch_size = 64
Total_size = len(train_X)
Vector_size = 300
seq_length = [len(x) for x in train_X]
Maxseq_length = max(seq_length)
learning_rate = 0.001
lstm_units = 128
num_class = 2
training_epochs = 5
keep_prob = 0.75
X = tf.placeholder(tf.float32, shape = [None, Maxseq_length, Vector_size], name = 'X')
Y = tf.placeholder(tf.float32, shape = [None, num_class], name = 'Y')
seq_len = tf.placeholder(tf.int32, shape = [None])
BiLSTM = Bi_LSTM.Bi_LSTM(lstm_units, num_class, keep_prob)
with tf.variable_scope("loss", reuse = tf.AUTO_REUSE):
    logits = BiLSTM.logits(X, BiLSTM.W, BiLSTM.b, seq_len)
    loss, optimizer = BiLSTM.model_build(logits, Y, learning_rate)
prediction = tf.nn.softmax(logits)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
init = tf.global_variables_initializer()
total_batch = int(Total_size / Batch_size)
print("Start training!")
saver = tf.train.Saver()
ckpt = tf.train.get_checkpoint_state("")  # look for an existing checkpoint in the current directory
print(ckpt.model_checkpoint_path)         # this assumes a checkpoint already exists (ckpt is None otherwise)
with tf.Session() as sess:

    start_time = time.time()
    saver.restore(sess, ckpt.model_checkpoint_path)  # resume from the saved checkpoint
    graph = tf.get_default_graph()
    train_writer = tf.summary.FileWriter('Bidirectional_LSTM', sess.graph)
    i = 0

    for epoch in range(training_epochs):
        avg_acc, avg_loss = 0., 0.
        for step in range(total_batch):
            # slice the current mini-batch
            train_batch_X = train_X_[step*Batch_size : step*Batch_size+Batch_size]
            train_batch_Y = train_Y_[step*Batch_size : step*Batch_size+Batch_size]
            batch_seq_length = seq_length[step*Batch_size : step*Batch_size+Batch_size]
            # pad every sequence in the batch to Maxseq_length
            train_batch_X = W2V.Zero_padding(train_batch_X, Batch_size, Maxseq_length, Vector_size)

            sess.run(optimizer, feed_dict={X: train_batch_X, Y: train_batch_Y, seq_len: batch_seq_length})
            # accumulate average loss and accuracy over the epoch
            loss_ = sess.run(loss, feed_dict={X: train_batch_X, Y: train_batch_Y, seq_len: batch_seq_length})
            avg_loss += loss_ / total_batch
            acc = sess.run(accuracy, feed_dict={X: train_batch_X, Y: train_batch_Y, seq_len: batch_seq_length})
            avg_acc += acc / total_batch
            print("epoch : {:02d} step : {:04d} loss = {:.6f} accuracy = {:.6f}".format(epoch+1, step+1, loss_, acc))

        # write one summary per epoch
        summary = sess.run(BiLSTM.graph_build(avg_loss, avg_acc))
        train_writer.add_summary(summary, i)
        i += 1

    duration = time.time() - start_time
    minute = int(duration / 60)
    second = int(duration) % 60
    print("%d minutes %d seconds" % (minute, second))

    save_path = saver.save(sess, "/Users/aa/Downloads/Bi_LSTM/Data/Bi_LSTM")

train_writer.close()
print('save_path', save_path)
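Right now I only see accuracy on the training batches. This is roughly how I would measure accuracy on a held-out split with the same graph (a sketch meant to run inside the tf.Session() block above; valid_X_, valid_Y_ and valid_seq_length are hypothetical arrays prepared the same way as the training ones, with no sequence longer than Maxseq_length):

val_batches = int(len(valid_X_) / Batch_size)
val_acc = 0.
for step in range(val_batches):
    val_batch_X = valid_X_[step*Batch_size : (step+1)*Batch_size]
    val_batch_Y = valid_Y_[step*Batch_size : (step+1)*Batch_size]
    val_batch_len = valid_seq_length[step*Batch_size : (step+1)*Batch_size]
    val_batch_X = W2V.Zero_padding(val_batch_X, Batch_size, Maxseq_length, Vector_size)
    # reuse the accuracy tensor already defined for training
    val_acc += sess.run(accuracy, feed_dict={X: val_batch_X, Y: val_batch_Y, seq_len: val_batch_len}) / val_batches
print("validation accuracy = {:.6f}".format(val_acc))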
I still do not really understand what post.embedding and the Bidirectional_LSTM are each doing. How can I improve the accuracy of this pipeline? Should I train the Bidirectional_LSTM further, should I retrain post.embedding, or should I do both?
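In case retraining the embedding is the better option, my understanding is that I could keep training the saved post.embedding on additional tokenised posts roughly like this (a sketch assuming the gensim 3.x API used in createEmbedding.py; more_sentences is a hypothetical list of token lists):

from gensim.models import Word2Vec

embedding = Word2Vec.load('post.embedding')
embedding.build_vocab(more_sentences, update=True)  # add any new words to the vocabulary
embedding.train(more_sentences, total_examples=len(more_sentences), epochs=100)  # same number of passes as before
embedding.save('post.embedding')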