我正在尝试使用词嵌入(word embeddings)进行分类,但是遇到了 TypeError 错误。
# glove word embeddings
import numpy as np
# Build a dict mapping each word to its GloVe vector (float32 array).
embeddings_index = {}
# The GloVe text files are UTF-8; pass the encoding explicitly so parsing
# does not depend on the platform's default encoding.
with open('glove.6B/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First field is the word; the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# transform text (a title) to an embedding by averaging word embeddings
def get_mean_embeddings(docs, embeddings):
    """Turn each document into one vector by averaging its word embeddings.

    Each document is split with the module-level ``tokenizer``. Tokens that
    are missing from ``embeddings`` contribute a zero vector to the average.

    Args:
        docs: Iterable of texts (e.g. titles) accepted by
            ``tokenizer.tokenize``.
        embeddings: Non-empty mapping of word -> 1-D vector. All vectors are
            assumed to share the length of the first one.

    Returns:
        A NumPy array with one averaged vector per document. A document
        that yields no tokens maps to the zero vector.

    Raises:
        ValueError: If ``embeddings`` is empty.
    """
    if not embeddings:
        raise ValueError("embeddings must contain at least one vector")
    # In Python 3, dict.values() is a view and cannot be indexed with [0];
    # peek at the first value through an iterator instead.
    dim = len(next(iter(embeddings.values())))
    zero_vector = np.zeros(dim)
    means = []
    for doc in docs:
        words = tokenizer.tokenize(doc)
        vectors = [embeddings.get(w, zero_vector) for w in words]
        # np.mean over an empty list returns NaN, which would break the
        # rectangular shape of the result; fall back to the zero vector.
        means.append(np.mean(vectors, axis=0) if vectors else zero_vector)
    return np.array(means)
def get_mean_embeddings(docs, embeddings):
    """Average the embeddings of the known words in each document.

    Each document is split with the module-level ``tokenizer``. Tokens that
    are not keys of ``embeddings`` are ignored. A document with no known
    tokens maps to the zero vector.

    Args:
        docs: Iterable of texts (e.g. titles) accepted by
            ``tokenizer.tokenize``.
        embeddings: Non-empty mapping of word -> 1-D vector. All vectors are
            assumed to share the length of the first one.

    Returns:
        A NumPy array with one averaged vector per document.

    Raises:
        ValueError: If ``embeddings`` is empty.
    """
    if not embeddings:
        raise ValueError("embeddings must contain at least one vector")
    # In Python 3, dict.values() is a view and cannot be indexed with [0];
    # peek at the first value through an iterator instead.
    dim = len(next(iter(embeddings.values())))
    # Used when a document has no in-vocabulary tokens (empty list is falsy).
    fallback = [np.zeros(dim)]
    return np.array([
        np.mean(
            [embeddings[w] for w in tokenizer.tokenize(doc) if w in embeddings]
            or fallback,
            axis=0,
        )
        for doc in docs
    ])
import sklearn.svm as svm
from sklearn.metrics import f1_score
# Fit and score one RBF-kernel SVM per genre, using mean title embeddings
# as features, then report the per-genre and average F1 scores.
clf = svm.SVC(kernel='rbf')
f1_scores = []
for g in genres:
    genre_data = balanced_data[g]
    # 60% of this genre's data is used for training, the rest for testing.
    train, test = train_test_split(genre_data, train_size=0.6)
    train_feature_matrix = get_mean_embeddings(train['title'], embeddings)
    test_feature_matrix = get_mean_embeddings(test['title'], embeddings)
    # The label is read from the entry keyed by the genre name itself.
    clf.fit(train_feature_matrix, train[g])
    y_pred = clf.predict(test_feature_matrix)
    genre_f1 = f1_score(test[g], y_pred)
    f1_scores.append(genre_f1)
    print('for "%s" , f1 score = %.2f' % (g, genre_f1))
print('average f1 score over all genres : %.2f ' % (np.mean(f1_scores)))
预期和实际结果:
for "sci-fi" , f1 score = 0.70
for "horror" , f1 score = 0.68
for "fantasy" , f1 score = 0.62
for "adventure" , f1 score = 0.66
for "thriller" , f1 score = 0.63
for "mystery" , f1 score = 0.58
for "romance" , f1 score = 0.62
for "crime" , f1 score = 0.56
for "drama" , f1 score = 0.59
for "action" , f1 score = 0.67
for "comedy" , f1 score = 0.62
for "documentary" , f1 score = 0.64
for "war" , f1 score = 0.65
average f1 score over all genres : 0.63
错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-33-7c91ab021935> in <module>
6 genre_data = balanced_data[g]
7 train,test = train_test_split(genre_data,train_size = 0.6)
----> 8 train_feature_matrix = get_mean_embeddings(train['title'],embeddings)
9 test_feature_matrix = get_mean_embeddings(test['title'],embeddings)
10 clf.fit(train_feature_matrix,train[g])
<ipython-input-25-0a52cf917522> in get_mean_embeddings(docs, embeddings)
1 def get_mean_embeddings(docs,embeddings):
----> 2 dim = len(embeddings.values()[0])
3 return np.array([
4 np.mean([embeddings[w]
5 for w in tokenizer.tokenize(doc) if w in embeddings] or
TypeError: 'dict_values' object is not subscriptable
答案 0 :(得分:0)
问题在于:在 Python 3 中,dict_values 只是一个视图(view),而不是列表,因此不能用下标索引。
如果要获得第一个元素的长度,则必须将
dim = len(embeddings.values()[0])
替换为:
dim = len(list(embeddings.values())[0])
有关更多信息,请参见:Python: how to convert a dictionary into a subscriptable array?