我在word_count_directory()函数中使用build_dict()函数来创建目录中三个文件的字数统计字典。我想创建三个字典(每个文件一次一个)并更新以前的字典。我的代码创建了一个单独的字典(word_count),它同时组合了所有三个字典。我想知道如何做到这一点?
def build_dict(filename):
    """Count how often each word occurs in a text file.

    Words are whitespace-separated tokens and are compared
    case-insensitively (each token is lower-cased before counting).

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping each lower-cased word to its number of occurrences.
        An empty file gives an empty dict.
    """
    count = {}
    # Plain 'r' instead of 'rU': split() already treats every newline style
    # as whitespace, and the 'U' flag is rejected by Python 3.11+.
    with open(filename, 'r') as f:
        # The with-block closes the file even if reading raises.
        for word in f.read().split():
            word = word.lower()
            count[word] = count.get(word, 0) + 1
    return count
## print build_dict("C:\\Users\\Phil2040\\Desktop\\word_count\\news1.txt")
import os
import os.path
def word_count_directory(directory):
    """Build one combined word-count dict for the files in a directory.

    Each file is counted on its own with build_dict(), and that per-file
    result is then added to the running totals.

    Args:
        directory: Path of the directory to scan (sub-directories are not
            descended into).

    Returns:
        A dict mapping each word to its total count across all files.
        An empty directory gives an empty dict.
    """
    wordcount = {}
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        file_count = build_dict(path)  # counts for this file only
        # Add to the previous totals instead of replacing them: a plain
        # assignment here kept only the last file's counts.
        for word, occurrences in file_count.items():
            wordcount[word] = wordcount.get(word, 0) + occurrences
    return wordcount
# Parenthesised single-argument form prints the same on Python 2 and is
# valid syntax on Python 3, unlike the bare print statement.
print(word_count_directory("C:\\Users\\Phil2040\\Desktop\\Word_count"))
答案 0 :(得分:2)
示例文件:
/tmp/foo.txt
hello world
hello world
foo bar
foo bar baz
/tmp/bar.txt
hello world
hello world
foo bar
foo bar baz
foo foo foo
您可以为每个文件创建一个 Counter,然后把它们相加!
from collections import Counter
def word_count(filename):
    """Count the words in a text file.

    Args:
        filename: Path of the file to read.

    Returns:
        A Counter mapping each whitespace-separated word to the number of
        times it occurs in the file.
    """
    c = Counter()
    with open(filename, 'r') as f:
        for line in f:
            # split() with no argument collapses runs of whitespace and
            # yields nothing for a blank line; split(' ') produced empty
            # strings there, which were then counted as words.
            c.update(line.split())
    return c
# Paths of the two sample files.
files = ['/tmp/foo.txt', '/tmp/bar.txt']
# One Counter per file, in the same order as `files`.
counters = list(map(word_count, files))
# counters content (example):
# [Counter({'world': 2, 'foo': 2, 'bar': 2, 'hello': 2, 'baz': 1}),
# Counter({'foo': 5, 'world': 2, 'bar': 2, 'hello': 2, 'baz': 1})]
# Merge the per-file counts into one Counter. sum() starts from the int 0
# by default, so an empty Counter is passed as the start value instead.
total = sum(counters, Counter())
# total content (example):
# Counter({'foo': 7, 'world': 4, 'bar': 4, 'hello': 4, 'baz': 2})
答案 1 :(得分:1)
def word_count_directory(directory):
    """Return a list with one word-count dict per entry in *directory*.

    Every name returned by os.listdir() is joined onto *directory* and
    handed to build_dict(); the results are collected in listing order.
    """
    per_file_counts = []
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        per_file_counts.append(build_dict(full_path))
    return per_file_counts
这将返回一个字典列表,每个文件都有一个字典。
如果你想一个接一个地得到每个文件的wordcount,你可以使用yield:
def word_count_directory(directory):
    """Yield the word-count dict of each entry in *directory*, one by one.

    This is a generator: the directory is listed when iteration starts,
    and build_dict() runs for an entry only when that item is requested.
    """
    for entry in os.listdir(directory):
        yield build_dict(os.path.join(directory, entry))
# Calling the generator function only creates a generator object; nothing
# is counted until it is advanced. Create it once and step it with next()
# (calling the function again would start over with a fresh generator).
counts = word_count_directory(".")
next(counts)  # gets the wordcount of the first file
next(counts)  # . . . the second file
对于您的第一个函数,您应该看看 collections 模块中的 Counter 类。