我这里有一段代码,用于遍历目录中的 10 个文件,每个文件可能有数千行。代码会逐行从这些文件中过滤掉一些单词。我知道这可能需要一段时间,但是否可以通过某种方式改进我的代码,使这个过程更快?我是否在某处犯了编码错误,从而造成了瓶颈?任何帮助或建议,我都将不胜感激!这是我的代码:
import os
def remove_stop_words(string, stopwords_list):
    """Return ``string`` with stop words removed and a newline appended.

    The input is split on whitespace. A word is dropped when its lowercase
    form equals the lowercase form of any entry in ``stopwords_list``. The
    remaining words are joined with single spaces and a newline character
    is appended to the result.

    Args:
        string: Text to filter (``main`` passes one line of a file).
        stopwords_list: Iterable of stop words; matching is case-insensitive.

    Returns:
        The filtered text followed by a single newline character.
    """
    # Lowercase the stop words once into a set so every membership test is a
    # constant-time hash lookup. The previous version re-lowercased and
    # linearly rescanned the entire stop-word list for each word of the input.
    lowered_stopwords = {word.lower() for word in stopwords_list}
    kept_words = [
        word for word in string.split()
        if word.lower() not in lowered_stopwords
    ]
    return ' '.join(kept_words) + '\n'
def get_stop_words_list(stopwords_path):
    """Load the whitespace-separated stop words stored in a text file.

    Args:
        stopwords_path: Path of the stop-word file to read.

    Returns:
        A list of the words found in the file, in file order. An empty
        file yields an empty list.
    """
    collected = []
    with open(stopwords_path, 'r') as handle:
        # Newlines are whitespace too, so splitting line by line produces
        # the same tokens as splitting the whole content at once.
        for row in handle:
            collected.extend(row.split())
    return collected
def main():
    """Write a stop-word-filtered copy of every file under the input folder.

    Walks ``input_location`` recursively. For each file found, every line is
    passed through ``remove_stop_words`` and the result is appended to
    ``<output_location>/<name>_filtered``.
    """
    input_location = 'C:/Users/User/Desktop/mini_mouse'
    output_location = 'C:/Users/User/Desktop/test/'
    stop_words_path = 'C:/Users/User/Desktop/NLTK-stop-word-list.txt'
    stopwords = get_stop_words_list(stop_words_path)

    for root, _dirs, files in os.walk(input_location):
        for name in files:
            # Full path of the input file inside the directory being walked.
            file_path = os.path.join(root, name)
            # The output path depends only on the file name, so compute it
            # once per file instead of once per line.
            new_file_path = os.path.join(output_location, name) + '_filtered'
            with open(file_path, 'r') as source_file:
                # Open the output a single time per input file. Reopening it
                # for every line was the dominant cost of the old loop.
                # Append mode is kept, so existing output files are extended
                # rather than truncated, exactly as before.
                with open(new_file_path, 'a') as output_file:
                    output_file.writelines(
                        remove_stop_words(line, stopwords)
                        for line in source_file
                    )
# Run the filtering job only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
答案 0 :(得分:2)
这是一种逐个文件而不是逐行写入文件的解决方案:
{{1}}