I am working on a project in which I have to preprocess every file in a folder and write the preprocessed data back into the same file. Here I store each file path in the path variable and print it, and all of the paths stored in this variable are displayed correctly. However, when I open the file for writing by passing the path variable to file_write, it does not write the preprocessed data; when I give file_write the correct path directly instead of the variable, it does write the preprocessed data.
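For reference, the two forms behave differently (a minimal sketch using the path variable from the code below):

file_write = open(path, "w")    # opens the file that the path variable points to
file_write = open("path", "w")  # opens a file literally named "path" in the working directory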
import os
import codecs
import glob
from collections import defaultdict

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
file_list = glob.glob('/home/ravi/Downloads/sem1/NLC/Project/gutenberg/*.txt')
file = list(enumerate(file_list))  # a list of (index, filename) pairs
s = RegexpTokenizer(r'\w+')
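# Map the first letter of each Penn Treebank POS tag to a WordNet POS; default to NOUN.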
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
lmtzr = WordNetLemmatizer()
pstem = PorterStemmer()
stop_words = set(stopwords.words('english'))  # build the stop word set once, not per file
for index, path in file:
    print(path)
    if os.path.exists(path):
        # Read the whole file before reopening it for writing: opening the
        # same path in "w" mode first would truncate it to an empty file.
        with codecs.open(path, 'r', encoding='utf-8', errors='ignore') as f:
            file_read = f.read()
        tokens = s.tokenize(file_read.lower())
        # Rebuild the filtered list for every file so words do not accumulate across files.
        filtered_sentence = [w for w in tokens if w not in stop_words]
        # Pass the path variable itself, not a quoted name: open("ex_path", "w")
        # creates a file literally named ex_path in the working directory. The with
        # block also closes the file, which flushes the buffered writes to disk.
        with open(path, 'w') as file_write:
            for token, tag in pos_tag(filtered_sentence):
                lemma = lmtzr.lemmatize(token, tag_map[tag[0]])
                stm = pstem.stem(lemma)
                #print(token, "=>", lemma, "=>", stm)
                file_write.write(" " + lemma + " ")
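To confirm that the write actually happened, a minimal check (assuming the same file_list as above) that rereads the first processed file:

with codecs.open(file_list[0], 'r', encoding='utf-8', errors='ignore') as f:
    print(f.read()[:200])  # first 200 characters of the lemmatized output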