TF-IDF scores for my documents are all 0

Date: 2013-04-21 07:55:43

Tags: python tf-idf

I got this tf-idf script from yebrahim, but somehow the results in my output file are all 0. Is there something wrong with it? Example output: hippo 0.0, hip 0.0, hip 0.0, hint 0.0, hindsight 0.0, hill 0.0, hilarious 0.0

Thanks for any help.
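For reference, the script computes idf = log((1 + num_docs) / (1 + document frequency)), so any term that occurs in every input document gets an idf of exactly 0. A minimal sketch of that arithmetic (hypothetical helper, not part of the script):

import math

def idf(num_docs, doc_freq):
    # same formula as in the tf-idf loop at the end of the script
    return math.log(float(1 + num_docs) / float(1 + doc_freq))

print(idf(1, 1))   # 0.0   -> with a single input document every term scores 0
print(idf(3, 1))   # ~0.69 -> terms missing from some documents score above 0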

# a list of (words-freq) pairs for each document
global_terms_in_doc = {}
# list to hold occurrences of terms across documents
global_term_freq    = {}
num_docs            = 0
lang        = 'english'
lang_dictionary     = {}
top_k               = -1
supported_langs     = ('english', 'french')
from django.utils.encoding import smart_str, smart_unicode
# support for custom language if needed
def loadLanguageLemmas(filePath):
    print('loading language from file: ' + filePath)
    f = open(filePath)
    for line in f:
        words = line.split()
        if words[1] == '=' or words[0] == words[1]:
            continue
        lang_dictionary[words[0]] = words[1]

def remove_diacritic(words):
    for i in range(len(words)):
        w = unicode(words[i], 'ISO-8859-1')
        w = unicodedata.normalize('NFKD', w).encode('ASCII', 'ignore')
        words[i] = w.lower()
    return words
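# (illustrative note, not part of the original script) remove_diacritic decodes
# ISO-8859-1 bytes, strips accents via NFKD normalization and lowercases, e.g.
# remove_diacritic(['Caf\xe9']) returns ['cafe'], so accented and plain spellings
# collapse to the same term.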

# function to tokenize text, and put words back to their roots
def tokenize(text):

    text = ' '.join(text)
    tokens = PunktWordTokenizer().tokenize(text)

    # lemmatize words. try both noun and verb lemmatizations
    lmtzr = WordNetLemmatizer()
    for i in range(0,len(tokens)):
        #tokens[i] = tokens[i].strip("'")
        if lang != 'english':
            if tokens[i] in lang_dictionary:
                tokens[i] = lang_dictionary[tokens[i]]
        else:
            res = lmtzr.lemmatize(tokens[i])
            if res == tokens[i]:
                tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
            else:
                tokens[i] = res

    # don't return any single letters
    tokens = [t for t in tokens if len(t) > 1 and not t.isdigit()]
    return tokens
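# (illustrative note, not part of the original script) the lemmatizer tries the
# noun form first and falls back to the verb form only when the token is unchanged:
#   WordNetLemmatizer().lemmatize('churches')      # -> 'church'
#   WordNetLemmatizer().lemmatize('running')       # -> 'running' (noun form, unchanged)
#   WordNetLemmatizer().lemmatize('running', 'v')  # -> 'run'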

def remove_stopwords(text):

    # remove punctuation
    chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&',
            '*', '(', ')', ' - ', '_', '+' ,'=', '@', ':', '\\', ',',
            ';', '~', '`', '<', '>', '|', '[', ']', '{', '}']
    for c in chars:
        text = smart_str(text.replace(c, ' '))

    text = text.split()

    import nltk
    if lang == 'english':
        stopwords = nltk.corpus.stopwords.words('english')
    else:
        stopwords = open(lang + '_stopwords.txt', 'r').read().split()
    content = [w for w in text if w.lower().strip() not in stopwords]
    return content
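# (illustrative note, not part of the original script) nltk.corpus.stopwords.words
# needs the stopwords corpus to be installed once, e.g. via nltk.download('stopwords');
# if it is missing, the call raises a LookupError rather than returning an empty list.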

# __main__ execution

import sys, re, math, unicodedata
from optparse import OptionParser

parser = OptionParser(usage='usage: %prog [options] input_file')
parser.add_option('-l', '--language', dest='language',
    help='language to use in tokenizing and lemmatizing. supported\
            languages: {english, french}', metavar='LANGUAGE')
parser.add_option('-k', '--top-k', dest='top_k',
    help='output only the top k terms by tf-idf score')
parser.add_option('-m', '--mode', dest='mode',
    help='display mode. can be either "both" or "term"')
(options, args) = parser.parse_args()

if options.language:
    if options.language not in supported_langs:
        print 'only ', supported_langs, ' are supported in this version.'
        quit()
    if options.language != 'english':
        lang = options.language
        loadLanguageLemmas(options.language + '_lemmas.txt')
if options.top_k:
    top_k = int(options.top_k)
display_mode = 'both'
if options.mode:
    if options.mode == 'both' or options.mode == 'term':
        display_mode = options.mode
else:
    parser.print_help()

if not args:
    parser.print_help()
    quit()
reader = open(args[0])
all_files = reader.read().splitlines()

num_docs  = len(all_files)

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.punkt import PunktWordTokenizer

print('initializing..')
for f in all_files:

    # local term frequency map
    terms_in_doc = {}

    doc_words    = open(f).read().lower()
    #print 'words:\n', doc_words
    doc_words    = remove_stopwords(doc_words)
    #print 'after stopwords:\n', doc_words
    doc_words    = tokenize(doc_words)
    #print 'after tokenize:\n', doc_words

    #quit()

    # increment local count
    for word in doc_words:
        if word in terms_in_doc:
            terms_in_doc[word] += 1
        else:
            terms_in_doc[word]  = 1

    # increment global frequency
    for (word,freq) in terms_in_doc.items():
        if word in global_term_freq:
            global_term_freq[word] += 1
        else:
            global_term_freq[word]  = 1

    global_terms_in_doc[f] = terms_in_doc
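# note (added for clarity): global_term_freq stores the document frequency of each
# term, i.e. how many documents contain it at least once, not its total number of
# occurrences; this is the count fed into the idf formula below.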

print('working through documents.. ')
for f in all_files:

    writer = open(f + '_final', 'w')
    result = []
    # iterate over terms in f, calculate their tf-idf, put in new list
    max_freq = 0
    for (term,freq) in global_terms_in_doc[f].items():
        if freq > max_freq:
            max_freq = freq
    for (term,freq) in global_terms_in_doc[f].items():
        idf = math.log(float(1 + num_docs) / float(1 + global_term_freq[term]))
        tfidf = float(freq) / float(max_freq) * float(idf)
        result.append([tfidf, term])

    # sort result on tfidf and write them in descending order
    result = sorted(result, reverse=True)
    for (tfidf, term) in result[:top_k]:
        if display_mode == 'both':
            writer.write(term + '\t' + str(tfidf) + '\n')
        else:
            writer.write(term + '\n')

print('success, with ' + str(num_docs) + ' documents.')
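As a quick sanity check of the same formulas, here is a minimal sketch (hypothetical two-document input, independent of the file handling above) showing that terms which do not occur in every document get non-zero scores:

import math

docs = {'doc1': ['hippo', 'hill'], 'doc2': ['hill', 'hint']}
num_docs = len(docs)

# document frequency of each term
df = {}
for words in docs.values():
    for term in set(words):
        df[term] = df.get(term, 0) + 1

for name, words in docs.items():
    counts = {}
    for term in words:
        counts[term] = counts.get(term, 0) + 1
    max_freq = max(counts.values())
    for term, freq in counts.items():
        tfidf = float(freq) / float(max_freq) * math.log(float(1 + num_docs) / float(1 + df[term]))
        print(name + '\t' + term + '\t' + str(tfidf))

# 'hill' appears in both documents, so its idf and tf-idf are 0;
# 'hippo' and 'hint' each appear in one document and score log(3/2) ~ 0.405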

0 Answers:

There are no answers yet.