我有5个文本文件,分别名为1500.txt、1600.txt……1900.txt。我使用Python从1500.txt中选出了200个最常用的单词,并将这些单词与其他文本文件进行了比较(统计它们在各文件中的出现次数)。我的代码如下:
# Count the words of 1500.txt, skipping English stop words and any word of
# three characters or fewer. The result feeds the top-200 comparison below.
common_words_1500 = Counter()
with open('E:\\Book\\1500.txt', "r", encoding='ISO-8859-1') as File_1500:
    stop_words = set(nltk.corpus.stopwords.words('english'))
    words = word_tokenize(File_1500.read())
    # First pass: drop tokens that are stop words exactly as tokenized.
    filtered_sentence = [w for w in words if w.lower() not in stop_words]
    for token in filtered_sentence:
        for match in re.finditer(r'\w+', token.lower()):
            word = match.group()
            # \w also matches '_', so trim leading/trailing underscores
            # (presumably left over from markup such as _emphasis_).
            st = word.strip('_')
            # Second pass: a token like "_that_" or "with-out" is not a stop
            # word as a whole, but the normalized piece can be. Re-check here
            # so stop words cannot leak into the counts.
            if len(st) > 3 and st not in stop_words:
                common_words_1500[st] += 1
# Count the words of 1600.txt that are longer than three characters.
common_words_1600 = Counter()
with open('E:\\Book\\1600.txt', "r", encoding='ISO-8859-1') as File_1600:
    for line in File_1600:
        for match in re.finditer(r'\w+', line.lower()):
            # \w also matches '_'. Trim underscores the same way the 1500.txt
            # count does, so "_word_" is stored under the key "word" that the
            # comparison loop looks up.
            word = match.group().strip('_')
            if len(word) > 3:
                common_words_1600[word] += 1
# Count the words of 1700.txt that are longer than three characters.
common_words_1700 = Counter()
with open('E:\\Book\\1700.txt', "r", encoding='ISO-8859-1') as File_1700:
    for line in File_1700:
        for match in re.finditer(r'\w+', line.lower()):
            # \w also matches '_'. Trim underscores the same way the 1500.txt
            # count does, so "_word_" is stored under the key "word" that the
            # comparison loop looks up.
            word = match.group().strip('_')
            if len(word) > 3:
                common_words_1700[word] += 1
# Count the words of 1800.txt that are longer than three characters.
common_words_1800 = Counter()
with open('E:\\Book\\1800.txt', "r", encoding='ISO-8859-1') as File_1800:
    for line in File_1800:
        for match in re.finditer(r'\w+', line.lower()):
            # \w also matches '_'. Trim underscores the same way the 1500.txt
            # count does, so "_word_" is stored under the key "word" that the
            # comparison loop looks up.
            word = match.group().strip('_')
            if len(word) > 3:
                common_words_1800[word] += 1
# Count the words of 1900.txt that are longer than three characters.
common_words_1900 = Counter()
with open('E:\\Book\\1900.txt', "r", encoding='ISO-8859-1') as File_1900:
    for line in File_1900:
        for match in re.finditer(r'\w+', line.lower()):
            # \w also matches '_'. Trim underscores the same way the 1500.txt
            # count does, so "_word_" is stored under the key "word" that the
            # comparison loop looks up.
            word = match.group().strip('_')
            if len(word) > 3:
                common_words_1900[word] += 1
# Print one tab-separated row per word for the 200 most frequent words of
# 1500.txt: the word, its count in 1500.txt, then its counts in 1600.txt,
# 1700.txt, 1800.txt and 1900.txt.
for (word, count) in common_words_1500.most_common(200):
    # A Counter returns 0 for a missing key instead of raising KeyError, so a
    # word absent from a later file needs no exception handling.
    count_in_file2 = common_words_1600[word]
    count_in_file3 = common_words_1700[word]
    count_in_file4 = common_words_1800[word]
    count_in_file5 = common_words_1900[word]
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(word, count, count_in_file2,
          count_in_file3, count_in_file4, count_in_file5))
现在的问题是,我想针对每个文本文件计算这200个单词的TF-IDF。希望大家能理解我的问题。