Based on this tutorial, to visualize embeddings with the Embedding Projector I need a meta.tsv file containing the vocabulary of the training data. Am I right?
My question is: when using a Keras layer, how do I get the vocabulary or metadata from a pretrained TF Hub model?
Answer 0 (score: 0)
You can do this by slightly modifying the code mentioned in this Embedding Tutorial. Working code that creates the files vecs.tsv and meta.tsv with a Keras Hub layer is shown below:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
tfds.disable_progress_bar()
# Toy example: a vocabulary of 1,000 words, each mapped to a 5-dimensional embedding
embedding_layer = layers.Embedding(1000, 5)
result = embedding_layer(tf.constant([1, 2, 3]))
print(result)  # the embedding tensor for tokens 1, 2, 3
print(result.numpy())
result = embedding_layer(tf.constant([[0, 1, 2], [3, 4, 5]]))
print(result.shape)  # (2, 3, 5): batch of 2 sequences, 3 tokens each, 5-dim embeddings
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True, as_supervised=True)
encoder = info.features['text'].encoder
print(encoder.subwords[:20])  # the first 20 subwords in the vocabulary
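# Quick check (my addition, not part of the original answer): the encoder
# round-trips text, so its subwords are exactly the vocabulary for meta.tsv.
sample_string = 'Hello TensorFlow.'
encoded_string = encoder.encode(sample_string)
print('Encoded string is {}'.format(encoded_string))
print('The original string: "{}"'.format(encoder.decode(encoded_string)))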
padded_shapes = ([None], ())
# The first argument of padded_batch is the batch size: 10 for both train and test.
train_batches = train_data.shuffle(1000).padded_batch(10, padded_shapes=padded_shapes)
test_batches = test_data.shuffle(1000).padded_batch(10, padded_shapes=padded_shapes)
train_batch, train_labels = next(iter(train_batches))
print('train_batch.numpy() = ', train_batch.numpy())
print('train_labels.numpy() = ', train_labels.numpy())
embedding_dim = 16
model = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(model, output_shape=[20], input_shape=[],
dtype=tf.string, trainable=True)
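# Sanity check (my addition, not in the original answer): the hub layer maps
# raw strings straight to 20-dimensional embedding vectors.
print(hub_layer(tf.constant(["the movie was great", "a terrible plot"])).shape)  # (2, 20)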
model = keras.Sequential([
    # Maps each of the encoder's 8,185 subword ids to a 16-dimensional vector
    layers.Embedding(encoder.vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
model.summary()
# Compile and train the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_batches, epochs=10, validation_data=test_batches, validation_steps=20, verbose=2)
# With this approach our model reaches a validation accuracy of around 88% (note the model is overfitting, training accuracy is significantly higher).
import matplotlib.pyplot as plt
history_dict = history.history
print(history_dict)
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
plt.figure(figsize=(12,9))
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
plt.figure(figsize=(12,9))
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim((0.5,1))
plt.show()
# Retrieve the learned embeddings
# Next, let's retrieve the word embeddings learned during training. This will be a matrix of shape (vocab_size, embedding_dim).
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)
# We will now write the weights to disk. To use the [Embedding Projector](http://projector.tensorflow.org), we will upload two files in tab separated format: a file of vectors (containing the embedding), and a file of meta data (containing the words).
import io
encoder = info.features['text'].encoder
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for num, word in enumerate(encoder.subwords):
    vec = weights[num + 1]  # skip index 0, it is reserved for padding
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()
The generated files will be located in the same directory as the Python file containing this code.
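To address the original question directly: hub.KerasLayer does not expose the module's vocabulary as a public attribute, but text-embedding modules such as gnews-swivel-20dim ship their token list as an asset file inside the downloaded SavedModel. Below is a minimal sketch of how one might recover it; it assumes a recent tensorflow_hub (for hub.resolve) and a one-token-per-line asset file, and the filename tokens.txt is a guess, so list the assets directory first:
import os
import tensorflow_hub as hub

# hub.resolve() downloads the module if needed and returns its local cache path.
module_path = hub.resolve("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1")
assets_dir = os.path.join(module_path, "assets")
print(os.listdir(assets_dir))  # inspect which asset files the module actually ships

# Assumption: the vocabulary is a plain-text, one-token-per-line file.
# "tokens.txt" is a hypothetical name; substitute whatever os.listdir() printed.
vocab_path = os.path.join(assets_dir, "tokens.txt")
with open(vocab_path, encoding='utf-8') as f:
    hub_vocab = [line.strip() for line in f]
print(len(hub_vocab), hub_vocab[:10])
Such a token list plays the same role as encoder.subwords above: written out one token per line, it becomes the meta.tsv for the pretrained embeddings.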