我想比较两个文本文件f1.txt和f2.txt,从f2.txt中删除两个文件中的常用词,然后按频率降序对新的f2.txt进行排序
我的方法:
# Count word frequencies in sys.argv[2], excluding the stop words listed
# in sys.argv[1], and print them in descending order of frequency.
with open(sys.argv[1]) as f1, open(sys.argv[2]) as f2:
    passage = f2.read()
    common = f1.read()

words = re.findall(r'\w+', passage)
common_words = re.findall(r'\w+', common)
passage_text = [w.lower() for w in words]
common_set = set(common_words)

# BUG FIX: Counter(set(passage) - set(common)) sees each unique word
# exactly once, so every count is 1.  Count the full *filtered* word
# list instead, then print in descending frequency order.
word_count = Counter(w for w in passage_text if w not in common_set)
for word, count in word_count.most_common():
    print(word, ":", count)
我希望输出如下:
Foo: 12
Bar: 11
Baz: 3
Longword: 1
但是我得到每个单词的计数频率为1
答案 0(得分:0)
您的 final 集合只包含去重后的单词(每个单词只保留一个),所以 Counter 对每个单词都只统计到 1 次。正确做法是:先用这组单词过滤 passage_text,再把过滤后的完整列表传给 Counter:
import re
from collections import Counter

passage = '''
Foo and Bar and Baz or Longword
Bar or Baz
Foo foo foo
'''
common = '''and or'''

# Lower-case every token in the passage, keep the stop words separate.
common_words = re.findall(r'\w+', common)
passage_text = [token.lower() for token in re.findall(r'\w+', passage)]

# The words that survive after removing the stop words.
final_set = set(passage_text) - set(common_words)

# Count occurrences in the *filtered* word list, not in the set,
# then print in descending order of frequency.
word_count = Counter(token for token in passage_text if token in final_set)
for word, count in sorted(word_count.items(), key=lambda item: item[1], reverse=True):
    print(word, ":", count)
打印:
foo : 4
bar : 2
baz : 2
longword : 1
答案 1(得分:0)
有两种方法可以计算文本文件中的单词。
from re import split
def process_line(words, word_dict):
    """Tally every word in *words* into the running count dict *word_dict*."""
    for token in words:
        # Missing keys start from 0; existing ones are incremented.
        word_dict[token] = word_dict.get(token, 0) + 1
def process_dict(word_dict):
    """Return the counts as a list of (count, word) pairs, sorted ascending."""
    return sorted((count, word) for word, count in word_dict.items())
def format_print(input_list, reverse, word_num):
    """Print (count, word) pairs as a table; most frequent first if *reverse*.

    NOTE: sorts *input_list* in place when *reverse* is set.
    """
    if reverse:
        input_list.sort(reverse=True)
    header = ("[Unique Words: " + str(word_num) + "]").center(35, "=")
    print("\n", header)
    print("-"*35 + "\n", "%-16s %s %16s" % ("Word", "|", "Count"), "\n", "-"*35)
    for count, word in input_list:
        print("%-16s %s %16d" % (word, "|", count))
def word_count(_file, max_to_min=False):
    """Count the words in the text file *_file* and print a frequency table.

    Words are runs of letters/apostrophes, compared case-insensitively.
    Set *max_to_min* to print the most frequent words first.
    """
    word_dict = {}
    # BUG FIX: the original used open(_file, "rU"); the "U" mode flag was
    # removed in Python 3.11.  A context manager also guarantees the
    # handle is closed even if parsing raises.
    with open(_file) as txt:
        for line in txt:
            # Skip whitespace-only lines.  (The original test
            # `!= ("\n" or None)` only ever compared against "\n",
            # since `("\n" or None)` evaluates to "\n".)
            if line.strip():
                process_line(filter(None, split("[^a-zA-Z']+", line.lower())), word_dict)
    final_list = process_dict(word_dict)
    format_print(final_list, max_to_min, len(word_dict))
# Demo invocation: print the word-frequency table for Test.txt,
# most frequent words first.
word_count("C:\\your_path_here\\Test.txt", True)
#########################################################
from collections import Counter
import re
def openfile(filename):
    """Return the entire contents of the text file *filename* as one string.

    Fixes in review: use a context manager so the handle is closed even on
    error, open in plain read mode (the original's "r+" write access was
    never used), and avoid shadowing the builtin `str`.
    """
    with open(filename) as fh:
        return fh.read()
def removegarbage(str):
    """Collapse every run of non-word characters into one space, lower-cased.

    (Parameter name kept for interface compatibility, though it shadows
    the builtin `str`.)
    """
    # \W+ matches one or more non-alphanumeric/underscore characters.
    return re.sub(r'\W+', ' ', str).lower()
def getwordbins(words):
    """Return a Counter mapping each word in *words* to its frequency.

    Counter's constructor already tallies an iterable, so the original
    hand-rolled `for word in words: cnt[word] += 1` loop is unnecessary.
    """
    return Counter(words)
def main(filename, topwords):
    """Print the *topwords* most common words of *filename* with counts."""
    txt = openfile(filename)
    txt = removegarbage(txt)
    # BUG FIX: txt.split(' ') yields empty strings for the leading/trailing
    # spaces removegarbage leaves behind, and those '' entries were counted
    # as a word.  split() with no argument drops empty fields.
    words = txt.split()
    bins = getwordbins(words)
    for key, value in bins.most_common(topwords):
        print(key, value)
# Demo invocation: print up to the 500 most common words of Test.txt.
main('C:\\your_path_here\\Test.txt', 500)
这里是比较两个文本文件并保留共同元素的一种方法。
# Write the lines that appear in BOTH text files to the output file.
# (Lines are compared verbatim, trailing newline included.)
with open('C:\\your_path_here\\text1.txt', 'r') as file1, \
        open('C:\\your_path_here\\text2.txt', 'r') as file2:
    same = set(file1).intersection(file2)

same.discard('\n')

with open('C:\\your_path_here\\some_output_file.txt', 'w') as file_out:
    file_out.writelines(same)
# For differences, use the code below:
# Write the lines that appear in exactly ONE of the two text files.
with open('C:\\your_path_here\\text1.txt', 'r') as file1, \
        open('C:\\your_path_here\\text2.txt', 'r') as file2:
    same = set(file1).symmetric_difference(file2)

same.discard('\n')

with open('C:\\your_path_here\\some_output_file.txt', 'w') as file_out:
    file_out.writelines(same)