如何使用 Python 创建表来求文档的均值(mean)

时间:2013-12-01 12:41:44

标签: python mean corpus

我有一个包含语料库文本文件的目录,我想创建一个表来统计每个文档中的单词数:表的每一行对应一个文档(文档编号),每一列对应一个唯一的单词,单元格中是该单词在该文档中出现的次数……所有这些都应该用 Python 完成……请帮忙……谢谢……

表格应如下所示:

          word1   word2   word3  ...
doc1      14      5       45
doc2      6       1       0
 .
 .
 .

import nltk
import collections
import os.path

def cleanDoc(doc):
    """Return the stemmed, lower-cased tokens of *doc*.

    The text is tokenized with NLTK's WordPunctTokenizer.  Tokens of one or
    two characters and tokens whose lower-cased form is in NLTK's English
    stopword list are discarded; the remaining tokens are lower-cased and
    stemmed with the Porter stemmer.  Order and duplicates are preserved.
    """
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    porter = nltk.PorterStemmer()

    def keep(token):
        # Length is measured on the token as tokenized; the stopword test
        # is case-insensitive.
        return len(token) > 2 and token.lower() not in english_stopwords

    kept = filter(keep, nltk.WordPunctTokenizer().tokenize(doc))
    return [porter.stem(token.lower()) for token in kept]

# Directory holding the corpus documents file0.txt, file1.txt, ...
path = "c://Users/Desktop/corpus files"
# Directory that receives one words<N>.txt file per document.
words_path = "c://Users/Desktop/words"

# Documents are addressed by position (file<i>.txt), so the directory listing
# is only used to decide how many of them to process.
for i, _ in enumerate(os.listdir(path)):
    # Context managers close both files even if cleaning or writing fails.
    with open(os.path.join(path, "file%d.txt" % i), 'r') as f:
        data = f.read()
    words = cleanDoc(data)
    fd = collections.Counter(words)

    # One right-aligned 15-character cell for the row label plus one cell per
    # cleaned token of this document.
    row_format = "{:>15}" * (len(words) + 1)
    print(row_format.format("document %d" % i, *words))

    with open(os.path.join(words_path, "words%d.txt" % i), 'w') as fw:
        # The counter and the raw token list are written back to back.
        fw.write(str(fd))
        fw.write(str(words))

1 个答案:

答案 0 :(得分:0)

我认为这即使不完全是你想要的,也已经相当接近了。如果不是,我也尽量让代码易于修改。

为了生成表,所需的处理分两个阶段完成。在第一阶段,找出每个 file<document-number>.txt 形式的文档文件中的唯一单词,将其保存到相应的 words<document-number>.txt 文件中,并把它们添加到一个包含所有文档文件中出现过的所有唯一单词的集合里。生成表的列需要这个集合(列由所有文件中的全部唯一单词组成),这就是需要两个处理阶段的原因。

在第二阶段,单词文件被重新读入并转换回字典,用于填写正在打印的表中相应的列。

import ast
import collections
import nltk
import re
import os

# Account name substituted into the corpus path below.
# NOTE(review): "UserName" is presumably a placeholder to be replaced with a
# real account name before running.
user_name = "UserName"
# Directory that is scanned for the file<N>.txt corpus documents.
path = "c://Users/%s/Desktop/corpus files" % user_name

def cleanDoc(doc):
    """Tokenize *doc* and return its cleaned, stemmed words as a list.

    Tokens come from NLTK's WordPunctTokenizer.  A token is kept only if it
    is longer than two characters and its lower-cased form is not in NLTK's
    English stopword list.  Kept tokens are lower-cased and passed through
    the Porter stemmer; order and repetitions are preserved.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stem = nltk.PorterStemmer().stem
    stems = []
    for raw_token in nltk.WordPunctTokenizer().tokenize(doc):
        if len(raw_token) <= 2:
            continue  # too short to be informative
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue  # common English word
        stems.append(stem(lowered))
    return stems

# phase 1 -- find unique words, create word files, update overall unique word set

# Corpus documents are named file<N>.txt.  The dot is escaped and the pattern
# is anchored at the end so that names such as "file1_txt" or "file1.txt.bak"
# are not mistaken for corpus documents.
corpus_file_pattern = re.compile(r"file(\d+)\.txt$")
unique_words = set()   # every distinct cleaned word seen in any document
longest_filename = 0   # length of the longest corpus file name seen
document_nums = []     # document numbers, in directory-listing order

for filename in os.listdir(path):
    corpus_file_match = corpus_file_pattern.match(filename)
    if not corpus_file_match:  # not a corpus text file
        continue
    longest_filename = max(longest_filename, len(filename))
    document_num = int(corpus_file_match.group(1))
    document_nums.append(document_num)
    with open(os.path.join(path, filename)) as corpus_file:
        data = corpus_file.read()
    words = cleanDoc(data)
    unique_words.update(words)
    fd = collections.Counter(words)
    # The per-document counts go into words<N>.txt in the same directory.
    words_filename = "words%d.txt" % document_num
    with open(os.path.join(path, words_filename), mode='wt') as fw:
        # Written as a plain dict literal so it can be parsed back with
        # ast.literal_eval.
        fw.write(repr(dict(fd)) + '\n')

# phase 2 -- create table using unique_words and data in word files

unique_words_list = sorted(unique_words)
# Template row with a zero count for every word, so each document's row has
# an entry for words that only occur in other documents.
unique_words_empty_counter = collections.Counter({word: 0 for word
                                                            in unique_words})
document_nums = sorted(document_nums)
padding = 2  # spaces between columns
min_col_width = 5
col_headings = ["Document"] + unique_words_list
col_widths = [max(min_col_width, len(word))+padding for word in col_headings]
# First col is a special case: it holds the words<N>.txt row labels, so it is
# sized from the heading and those labels rather than from a word.
row_labels = ["words%d.txt" % document_num for document_num in document_nums]
col_widths[0] = max(len(label)
                    for label in col_headings[:1] + row_labels) + padding

# print table headings (first column left-aligned, the rest right-aligned);
# cells are separated by one space on top of each column's own padding
heading_cells = ["{:{align}{width}}".format(word, align='>' if i else '<',
                                            width=col_widths[i])
                 for i, word in enumerate(col_headings)]
print(" ".join(heading_cells))

for filename in row_labels:
    # read word in document dictionary back in
    file_words = unique_words_empty_counter.copy()
    with open(os.path.join(path, filename)) as words_file:
        data = words_file.read()
    # convert data read into dict and update with file word counts;
    # literal_eval only accepts Python literals, it does not execute code
    file_words.update(ast.literal_eval(data))
    # print row of data
    row_cells = ["{:<{width}}".format(filename, width=col_widths[0])]
    row_cells.extend("{:>{width}n}".format(file_words[word],
                                           width=col_widths[i])
                     for i, word in enumerate(col_headings[1:], 1))
    print(" ".join(row_cells))