我目前正在尝试将300维的词向量可视化到二维平面上。我尝试了使用不同参数的t-SNE,并阅读了 https://distill.pub/2016/misread-tsne/ 上的文章,但到目前为止还没有得到有用的结果。
我希望得到的可视化结果能够反映几个选定词向量的最近邻关系,但实际得到的二维可视化结果却杂乱无章。
是否不适合使用TSNE解决我的问题?
from sklearn.manifold import TSNE
# Project the category embeddings to 2-D with t-SNE and draw one labelled
# point per category.
# NOTE(review): assumes category_embeddings maps a label to an indexable
# whose first element is that label's embedding vector - confirm with caller.
arr = [category_embeddings[category][0] for category in category_embeddings.keys()]

# t-SNE hyper-parameters.
# NOTE(review): scikit-learn requires perplexity to be smaller than the
# number of samples - confirm there are more than 30 categories.
perplex = 30
tsne_steps = 50000
lr = 10

fig_tsne = plt.figure(figsize=(18, 18), dpi=800)
# NOTE(review): recent scikit-learn releases rename n_iter to max_iter -
# confirm against the installed version.
tsne = TSNE(perplexity=perplex,
            n_components=2,
            init='pca',
            n_iter=tsne_steps,
            learning_rate=lr,
            method="exact")

# Not used below; kept because code outside this snippet may read it.
plot_only = len(category_embeddings.keys())

low_dim_embs = tsne.fit_transform(np.asarray(arr))

# Rows of low_dim_embs follow the iteration order of category_embeddings,
# so enumerate() pairs each label with its projected point.
for i, title in enumerate(category_embeddings.keys()):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(
        title,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom')
答案 0 :(得分:0)
好,解决了。
先创建距离矩阵,再将该矩阵输入TSNE,可以得到更好的二维可视化效果。
from sklearn.metrics.pairwise import cosine_distances
# Create distance matrix: a nested mapping
#   {c1: {c2: cosine distance between the embeddings of c1 and c2}}
# covering every ordered pair of categories, including each with itself.
# NOTE(review): assumes category_embeddings[c] is a 2-D array-like with the
# embedding in row 0, as cosine_distances expects 2-D input - confirm.
c1_c2_cos_dist = {}
for c1 in category_embeddings.keys():
    # A fresh dict is built per outer iteration, so no defensive copy is
    # needed when it is stored.
    tmp = {}
    for c2 in category_embeddings.keys():
        cos_dis = cosine_distances(category_embeddings[c1],
                                   category_embeddings[c2])
        # Keep only the distance between the first rows of the two inputs.
        tmp[c2] = cos_dis[0][0]
    c1_c2_cos_dist[c1] = tmp
# ---
from sklearn.manifold import TSNE
# Run t-SNE on the precomputed cosine-distance table and draw one labelled
# point per category.

# t-SNE hyper-parameters.
# NOTE(review): scikit-learn requires perplexity to be smaller than the
# number of samples - confirm there are more than 30 categories.
perplex = 30
tsne_steps = 50000
lr = 10

fig_tsne = plt.figure(figsize=(18, 18), dpi=800)
# metric="precomputed" cannot be combined with PCA initialisation, which is
# the default in newer scikit-learn releases, so request random init
# explicitly (this was the default in older releases).
# NOTE(review): recent scikit-learn releases rename n_iter to max_iter -
# confirm against the installed version.
tsne = TSNE(perplexity=perplex,
            n_components=2,
            metric="precomputed",
            init="random",
            n_iter=tsne_steps,
            learning_rate=lr)

# Flatten the nested distance mapping into a square matrix. Rows and columns
# are both ordered by the labels of category_embeddings so that row i of the
# t-SNE output corresponds to the i-th label plotted below.
labels = list(category_embeddings.keys())
distMatrix = np.asarray(
    [[c1_c2_cos_dist[row][col] for col in labels] for row in labels])

low_dim_embs = tsne.fit_transform(distMatrix)

# Not used below; kept because code outside this snippet may read it.
plot_only = len(category_embeddings.keys())

for i, title in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(
        title,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom')