将 pandas DataFrame 中的 tf-idf 分数转换为 tf-idf 矩阵

时间:2016-02-21 07:20:49

标签: python pandas tf-idf

如何将下面这个包含多个文档中每个单词 tf-idf 分数的 pandas 数据帧(DataFrame)转换为名为“tfdif”的矩阵,以便我可以实现:

import {Directive, Attribute, ElementRef, DynamicComponentLoader} from 'angular2/core';
import {Router, RouterOutlet, ComponentInstruction} from 'angular2/router';

/**
 * A `router-outlet` directive that checks for a JWT in `localStorage`
 * before activating a route.
 *
 * When the `jwt` entry is missing, or `tokenNotExpired('jwt')` returns a
 * falsy value, any stored token is removed and the parent router is told to
 * navigate to the `Login` route.
 *
 * NOTE(review): `tokenNotExpired` is not imported anywhere in the visible
 * snippet; it must be brought into scope by the surrounding project
 * (presumably from a JWT helper library — confirm).
 */
@Directive({
  selector: 'router-outlet'
})
export class LoggedInRouterOutlet extends RouterOutlet {
  // Declared but never read or written inside this class.
  publicRoutes: any;
  // Kept separately so `activate` can trigger the redirect.
  private parentRouter: Router;

  /**
   * Forwards all arguments to `RouterOutlet` and remembers the parent router.
   *
   * @param _elementRef   passed through to the base outlet
   * @param _loader       passed through to the base outlet
   * @param _parentRouter router used for the `Login` redirect
   * @param nameAttr      value of the element's `name` attribute
   */
  constructor(_elementRef: ElementRef, _loader: DynamicComponentLoader,
              _parentRouter: Router, @Attribute('name') nameAttr: string) {
    super(_elementRef, _loader, _parentRouter, nameAttr);
    this.parentRouter = _parentRouter;
  }

  /**
   * Redirects to `Login` when no valid token is stored, then delegates to
   * the base outlet's `activate`.
   *
   * @param instruction the component instruction being activated
   * @returns whatever `RouterOutlet.activate` returns
   */
  activate(instruction: ComponentInstruction) {
    if (!localStorage.getItem('jwt') || !tokenNotExpired('jwt')) {
      // Public routes do not work with the hash location strategy; something
      // else is needed here.
      // TODO: redirect to Login; there may be a better way.
      if (localStorage.getItem('jwt')) {
        // A token is present but did not pass the expiry check: drop it.
        localStorage.removeItem('jwt');
      }
      this.parentRouter.navigate(['Login']);
    }
    // NOTE(review): the requested instruction is still activated after the
    // redirect has been issued — confirm this is intended.
    return super.activate(instruction);
  }
}

// The two Python import statements below were fused onto the end of the same
// line as the TypeScript snippet above; they are Python, not TypeScript:
//   from sklearn.feature_extraction.text import TfidfVectorizer
//   from nltk.stem.porter import PorterStemmer

enter image description here

1 个答案:

答案 0 :(得分:0)

您需要先用原始文档拟合(fit)TfidfVectorizer,然后才能用它来转换新文档。

如果您无法访问原始文档,则可以通过构建一个字典来恢复每个单词的 idf 权重:

idfs[word] = log{(# documents) / (# documents where word has non-zero tf-idf weight)}

稍后您可以使用该字典计算新句子的tf-idf权重:

from collections import Counter

# Split the new sentence into whitespace-separated tokens.
words = sentence.split()
# Raw term frequency: number of occurrences of each token in the sentence.
s_tfs = Counter(words)
# idf weight for every token of the sentence; tokens missing from `idfs` get 0.
s_idfs = dict((token, idfs.get(token, 0)) for token in words)
# tf-idf weight for every word known to `idfs`; words that do not occur in the
# sentence have a term frequency of 0 and therefore a weight of 0.
s_tfidf = dict(
    (term, s_tfs.get(term, 0) * s_idfs.get(term, 0)) for term in idfs.keys()
)