我有一个包含语料库文本文件的目录，我想创建一个表格：每行对应一个文档（以文档编号标识），每列对应一个唯一的单词；单元格中是该单词在该文档中出现的次数……全部需要用 Python 完成……请帮忙……谢谢……
表格应如下所示:
word1 word2 word3 ...
doc1 14 5 45
doc2 6 1 0
.
.
.
import nltk
import collections
import os.path
def cleanDoc(doc):
    """Tokenize *doc*, drop English stopwords and short tokens, then stem.

    A token is kept when its lower-cased form is not an NLTK English
    stopword and the original token is longer than two characters.
    Returns the list of lower-cased, Porter-stemmed tokens.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    porter = nltk.PorterStemmer()
    kept = []
    for token in nltk.WordPunctTokenizer().tokenize(doc):
        lowered = token.lower()
        if lowered not in stop_words and len(token) > 2:
            kept.append(porter.stem(lowered))
    return kept
path = "c://Users/Desktop/corpus files"

# Process every file the directory scan actually returns.  The original
# version ignored the listed name and opened "file%d.txt" by a running
# counter, which breaks as soon as the numbering has a gap or the
# directory contains any other file.  Sorting makes the order stable.
i = 0
for name in sorted(os.listdir(path)):
    # Read the listed corpus file; `with` guarantees it is closed even
    # if cleanDoc() raises.
    with open(os.path.join(path, name), 'r') as f:
        data = f.read()
    words = cleanDoc(data)
    fd = collections.Counter(words)

    # One output file of word counts (and the token list) per document.
    with open("c://Users/Desktop/words/words%d.txt" % i, 'w') as fw:
        fw.write(str(fd))
        fw.write(str(words))

    # Echo the document label followed by its (stemmed) tokens, each in
    # a right-aligned 15-character column.  Single-argument print(...)
    # works identically under Python 2 and 3.
    row_format = "{:>15}" * (len(words) + 1)
    print(row_format.format("document %d" % i, *words))

    i = i + 1
答案 0（得分：0）
我认为这与你想要的相当接近,如果不完全的话。如果不是,我试图让事情变得容易改变。
为了生成表格，所需的处理分两个阶段完成。在第一阶段，程序在每个形如 file<document-number>.txt 的文档文件中找出唯一单词，把它们保存到对应的 words<document-number>.txt 文件中，并将它们加入一个包含所有文档中出现过的全部唯一单词的集合。需要这个集合来生成涵盖所有文件中全部唯一单词的表格列，这正是处理需要分两个阶段的原因。
在第二阶段,单词文件被读回并转回用于填写正在打印的表的相应列的字典。
import ast
import collections
import nltk
import re
import os
# Corpus location: the "corpus files" directory on the named user's
# Desktop.  Change user_name to match the local account.
user_name = "UserName"
path = "c://Users/%s/Desktop/corpus files" % user_name
def cleanDoc(doc):
    """Return the lower-cased, stemmed content tokens of *doc*.

    Tokens whose lower-cased form is an English stopword, or whose
    original form is at most two characters long, are discarded
    before Porter stemming.
    """
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    pairs = [(tok, tok.lower())
             for tok in nltk.WordPunctTokenizer().tokenize(doc)]
    return [stemmer.stem(low) for tok, low in pairs
            if low not in stopset and len(tok) > 2]
# phase 1 -- find unique words, create word files, update overall unique word set
corpus_file_pattern = re.compile(r"""file(\d+).txt""")
unique_words = set()
longest_filename = 0
document_nums = []

for filename in os.listdir(path):
    match = corpus_file_pattern.match(filename)
    if not match:
        continue  # not a corpus text file -- ignore it

    # Track the longest corpus filename; used later for the first column width.
    longest_filename = max(longest_filename, len(filename))
    doc_num = int(match.group(1))
    document_nums.append(doc_num)

    with open(os.path.join(path, filename)) as corpus_file:
        doc_words = cleanDoc(corpus_file.read())

    unique_words |= set(doc_words)
    counts = collections.Counter(doc_words)

    # Persist the per-document counts as a dict repr so phase 2 can
    # read them back with ast.literal_eval.
    with open(os.path.join(path, "words%d.txt" % doc_num), mode = 'wt') as out:
        out.write(repr(dict(counts)) + '\n')
# phase 2 -- create table using unique_words and data in word files
unique_words_list = sorted(unique_words)
# Counter pre-seeded with 0 for every known word, so words absent from a
# document still get a column value of 0.
unique_words_empty_counter = collections.Counter(dict.fromkeys(unique_words, 0))
document_nums = sorted(document_nums)

padding = 2  # spaces between columns
min_col_width = 5
col_headings = ["Document"] + unique_words_list
col_widths = [max(min_col_width, len(heading)) + padding
              for heading in col_headings]
col_widths[0] = longest_filename + padding  # first col holds filenames

# print table headings: first column left-aligned, the rest right-aligned.
header_cells = ["{:{align}{width}}".format(word, align='>' if i else '<',
                                           width=col_widths[i])
                for i, word in enumerate(col_headings)]
print(' '.join(header_cells))

for document_num in document_nums:
    # read word-count dictionary of this document back in
    filename = "words%d.txt" % document_num
    file_words = unique_words_empty_counter.copy()
    with open(os.path.join(path, filename)) as saved:
        data = saved.read()
    # convert data read into dict and update with file word counts
    file_words.update(ast.literal_eval(data))

    # assemble and print one row: filename, then a count per word column
    cells = ["{:<{width}}".format(filename, width=col_widths[0])]
    cells.extend("{:>{width}n}".format(file_words[word], width=col_widths[i])
                 for i, word in enumerate(col_headings[1:], 1))
    print(' '.join(cells))