虽然这可能是一个非常简单的问题,但我无法存储输出。他是我的代码。我为文本相似性度量做了tf-idf:
data=pd.read_csv('name.csv')
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt') # if necessary...
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
def stem_tokens(tokens):
return [stemmer.stem(item) for item in tokens]
'''remove punctuation, lowercase, stem'''
def normalize(text):
return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
def cosine_sim(text1, text2):
tfidf = vectorizer.fit_transform([text1, text2])
return ((tfidf * tfidf.T).A)[0,1]
cnt=np.array(np.arange(0,13091))
indx=[]
for i in cnt:
indx.append(cosine_sim(data['names'][i], data['names'][i+1]))
执行代码后,我发现以下错误:
KeyError Traceback (most recent call last)
<ipython-input-93-14badf13ad3b> in <module>()
25
26 for i in cnt:
---> 27 indx.append(cosine_sim(data['names'][i], data['names'][i+1]))
/anaconda2/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
621 key = com._apply_if_callable(key, self)
622 try:
--> 623 result = self.index.get_value(self, key)
624
625 if not is_scalar(result):
/anaconda2/lib/python2.7/site-packages/pandas/core/indexes/base.pyc in get_value(self, series, key)
2558 try:
2559 return self._engine.get_value(s, k,
-> 2560 tz=getattr(series.dtype, 'tz', None))
2561 except KeyError as e1:
2562 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
但是,如果我写print语句,我得到输出:
for i in cnt:
print cosine_sim(data['names'][i], data['names'][i+1])
我不知道我在哪里犯错误。任何帮助将受到高度赞赏。