如何将以下 pandas 数据帧(其中包含若干文档中每个单词的 tf-idf 分数)转换为名为“tfdif”的矩阵,以便我可以实现
import {Directive, Attribute, ElementRef, DynamicComponentLoader} from 'angular2/core';
import {Router, RouterOutlet, ComponentInstruction} from 'angular2/router';
/**
 * Router outlet that inspects the `jwt` entry in `localStorage` every time a
 * route component is about to be activated, and navigates the parent router
 * to the `Login` route when no usable token is present.
 *
 * NOTE(review): `tokenNotExpired` is not imported in the visible part of this
 * file — confirm where it is brought into scope.
 */
@Directive({
  selector: 'router-outlet'
})
export class LoggedInRouterOutlet extends RouterOutlet {
  // Declared but never assigned or read in this class.
  publicRoutes: any;
  // Kept so `activate` can trigger the redirect to the Login route.
  private parentRouter: Router;

  /**
   * Forwards every argument unchanged to the base `RouterOutlet` and keeps a
   * reference to the parent router.
   *
   * @param _elementRef   passed straight through to the base outlet
   * @param _loader       passed straight through to the base outlet
   * @param _parentRouter passed to the base outlet and stored for redirects
   * @param nameAttr      value of the host element's `name` attribute
   */
  constructor(
    _elementRef: ElementRef,
    _loader: DynamicComponentLoader,
    _parentRouter: Router,
    @Attribute('name') nameAttr: string
  ) {
    super(_elementRef, _loader, _parentRouter, nameAttr);
    this.parentRouter = _parentRouter;
  }

  /**
   * Runs the token check, redirects to `Login` when it fails, and then
   * delegates to the base outlet.
   *
   * The requested instruction is still handed to `super.activate` after a
   * redirect has been issued.
   *
   * @param instruction the component instruction being activated
   * @returns whatever the base `RouterOutlet.activate` returns
   */
  activate(instruction: ComponentInstruction) {
    // A session counts as valid only when a token is stored AND it has not
    // expired; the expiry helper is not consulted when nothing is stored.
    const sessionIsValid =
      Boolean(localStorage.getItem('jwt')) && tokenNotExpired('jwt');

    if (!sessionIsValid) {
      // Public Routes does not work with Hash Location Strategy, need to
      // come up with something else.
      // TODO: redirect to Login, maybe there is a better way?
      const staleToken = localStorage.getItem('jwt');
      if (staleToken) {
        // Drop the token that failed the expiry check.
        localStorage.removeItem('jwt');
      }
      this.parentRouter.navigate(['Login']);
    }

    return super.activate(instruction);
  }
}
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
答案 0 :(得分:0)
您需要先使用原始文档拟合(fit)TfidfVectorizer,
然后才能使用它来转换新文档。
如果您无法访问原始文档,则可以通过构建字典来恢复每个单词的 idf 权重:
idfs[word] = log{(# documents) / (# documents where word has non-zero tf-idf weight)}
稍后您可以使用该字典计算新句子的tf-idf权重:
from collections import Counter

# Split the new sentence on whitespace and count how often each token occurs.
# `sentence` and `idfs` are expected to be defined before this snippet runs.
words = sentence.split()
s_tfs = Counter(words)

# idf weight for every token of the sentence; tokens missing from `idfs` get 0.
s_idfs = dict((token, idfs.get(token, 0)) for token in words)

# tf-idf weight for every word known to `idfs`; words that do not occur in the
# sentence end up with weight 0.
s_tfidf = {
    term: s_tfs.get(term, 0) * s_idfs.get(term, 0)
    for term in idfs
}