I am trying to clean the Enron email data set. I can read and display all of the files with the code below:
import os
# Folder to scan; normpath converts the separators to the platform's form.
directory = os.path.normpath("C:/Users/cool_/Desktop/Enron/Enron1/ham")

# Walk the folder tree and print the contents of every .txt file found.
for subdir, dirs, files in os.walk(directory):
    for file in files:
        if file.endswith(".txt"):
            # The context manager closes the file even if read() raises.
            with open(os.path.join(subdir, file), 'r') as f:
                a = f.read()
            print(a)
I have tried to clean the text with this function:
import re
def clean_html(file):
    """Return the text in *file* with HTML markup removed.

    Args:
        file: The text to clean, as a string (not a file object or path).

    Returns:
        The text with script/style elements, HTML comments and remaining
        tags removed, ``&nbsp;`` entities turned into spaces, commas and
        doubled single quotes deleted, and runs of spaces collapsed to one.
        Line breaks in the text are kept.
    """
    # First remove inline JavaScript/CSS together with its contents; the
    # backreference ties the closing tag to whichever tag was opened.
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?</\1>", "", file)
    # Then remove html comments. This must happen before stripping ordinary
    # tags because a comment may itself contain '>' characters.
    cleaned = re.sub(r"(?s)<!--.*?-->\n?", "", cleaned)
    # Next remove the remaining tags, leaving a space so that words on
    # either side of a tag do not run together.
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally deal with whitespace and stray punctuation.
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"''|,", "", cleaned)
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned
But it only cleans the last file in the folder. Could you please help me clean all of the files and save the cleaned copies in a new location, classified as spam or ham?
I am trying to use it this way:
# Root of the data set. Assumes the messages live in "ham" and "spam"
# sub-folders of this directory -- adjust the labels below if the layout differs.
source_root = r"C:/Users/cool_/Desktop/Enron/enron1"
# Cleaned copies are written to pre/<label>/<file name>, relative to the
# current working directory.
output_root = 'pre'

for label in ("ham", "spam"):
    label_dir = os.path.join(source_root, label)
    if not os.path.isdir(label_dir):
        # Nothing to process for this class.
        continue

    # create dirs for preprocess file
    pre_path = os.path.join(output_root, label)
    if not os.path.isdir(pre_path):
        os.makedirs(pre_path)

    file_list = os.listdir(label_dir)
    for i in file_list:
        if not i.endswith(".txt"):
            continue
        # os.listdir returns bare names, so join each one to its folder;
        # opening the loop variable (not a leftover name from earlier code)
        # is what makes every file get processed.
        with open(os.path.join(label_dir, i), 'r') as f:
            cleaned_html = clean_html(f.read())
        # write preprocess files into pre/ directories
        with open(os.path.join(pre_path, i), 'w') as f:
            f.write(cleaned_html)
Regards