next epoch: 0.5456549558716741 ('time point 0')
next epoch: 892.5030143482156 (~14:30min)
next epoch: 1757.6139726727963 (~14:30min)
Time elapsed: 2622.23 seconds (~14min, total of 44min)
Compared with the actual corpus size of ~13k words, these timings were taken with vocab_size set to only 5000 {...}
I am working on a word-embedding task in TensorFlow that uses the King James Bible as its corpus, so it has a fairly large vocabulary of about 13,000 unique words.
On my machine (a GTX 970) the run does not even reach the print for the next epoch, while on my friend's CPU the whole process finishes within a few minutes.
Can someone shed some light on this mystery?
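For reference, a minimal TF 1.x sketch (assuming a GPU-enabled TensorFlow 1.x install; none of these names come from my code) to check whether ops are actually being placed on the GTX 970:

import tensorflow as tf

# Log every op's device assignment so CPU-only placement is easy to spot.
config = tf.ConfigProto(log_device_placement=True)
with tf.Session(config=config) as sess:
    print("GPU visible to TF:", tf.test.is_gpu_available())
    a = tf.random_uniform([1000, 1000])
    # The matmul should be reported on /device:GPU:0 if the GPU is being used.
    print(sess.run(tf.reduce_sum(tf.matmul(a, a))))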
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from book_helper import Book
from collections import Counter
from math import sqrt
import random as rnd
import time
%matplotlib notebook
book = Book('./pg10.txt')
vocab_size = len(Counter(book._book_text)) # 13079
book.create_dictionaries(vocab_size)
samples = [x for x in rnd.sample(book._book_text, 10) if x.isalpha()]
print('Sample words:\t' + ', '.join(samples))
print('Words2ids:\t' + ', '.join([str(book.words2ids(samples))]))
print('Ids2words:\t' + ', '.join(book.ids2words(book.words2ids(samples))))
# hyperparameters
batch_size = 128
epochs = 3
lrate = 1
embedding_size = 64
skip_window = 2
noise_samples = 64
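# X is fed a batch of word ids ("targets" in the training loop below); `desired`
# holds the matching context ids as a [batch_size, 1] column, the label shape
# tf.nn.nce_loss expects.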
X = tf.placeholder(tf.int32, [batch_size], "Input_Placeholder")
desired = tf.placeholder(tf.int32, [batch_size, 1], "DesiredOutput_Placeholder")
with tf.variable_scope("embeddings"):
# create word embedding
ru_init = tf.random_uniform_initializer(-1.0, 1.0)
embeddings = tf.get_variable("embedding",
[vocab_size, embedding_size],
initializer=ru_init)
# retrieve word ids from embedding
embed = tf.nn.embedding_lookup(embeddings, X)
with tf.variable_scope("output_layer"):
weights1 = tf.get_variable("weights",
[vocab_size, embedding_size],
initializer=tf.truncated_normal_initializer(1 / sqrt(embedding_size)))
biases1 = tf.get_variable("bias", initializer=tf.zeros([vocab_size]))
drive = tf.matmul(embed, tf.transpose(weights1)) + biases1
context = tf.nn.softmax(drive)
tf.summary.histogram("drive", drive)
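    # Note: `context` above is never consumed anywhere else in the graph; only
    # `drive` is kept alive through the histogram summary.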
with tf.variable_scope("nce_loss"):
nce_weights = tf.get_variable("weights",
[vocab_size, embedding_size],
initializer=tf.truncated_normal_initializer(1 / sqrt(embedding_size)))
nce_bias = tf.get_variable("bias", initializer=tf.zeros([vocab_size]))
nce_loss = tf.reduce_mean(tf.nn.nce_loss(weights= nce_weights,
biases= nce_bias,
labels= desired,
inputs= embed,
num_sampled= noise_samples,
num_classes= vocab_size))
tf.summary.scalar("nce_loss", nce_loss)
training_step = tf.train.GradientDescentOptimizer(lrate).minimize(nce_loss)
merged_summaries = tf.summary.merge_all()
train_writer = tf.summary.FileWriter("./summaries/train", tf.get_default_graph())
with tf.Session() as session:
    start_time = time.clock()
    step = 0
    session.run(tf.global_variables_initializer())
    for _epoch in range(epochs):
        print("next epoch: " + str(time.clock() - start_time))
        for targets, concepts in book.get_training_batch(batch_size, skip_window):
            summaries, _ = session.run([merged_summaries, training_step], feed_dict={X: targets, desired: concepts})
            train_writer.add_summary(summaries, step)
            step += 1
    duration = time.clock() - start_time
    print("Time elapsed: {0:.2f} seconds".format(duration))