Question

I am trying to clean the Enron email data set. I can read and display all of the files with the code below:

 import os
 # Walk the ham folder and print the content of every .txt message in it.
 directory = os.path.normpath("C:/Users/cool_/Desktop/Enron/Enron1/ham")
 for subdir, dirs, files in os.walk(directory):
     for file in files:
         if file.endswith(".txt"):
             # The context manager closes the file even when read() or
             # print() raises, which the manual open()/close() pair did not.
             with open(os.path.join(subdir, file), 'r') as f:
                 a = f.read()
             print(a)

I have tried to clean the text with this function:

import re
def clean_html(file: str) -> str:
    """Strip HTML markup from a message and tidy the remaining text.

    Args:
        file: The raw text of one message (the content, not a path).

    Returns:
        The text with script/style elements, HTML comments and tags removed,
        ``&nbsp;`` entities turned into spaces, commas and ``''`` pairs
        dropped, and runs of spaces collapsed to a single space. Line breaks
        are kept, apart from one directly following a removed comment.
    """
    # Drop <script>/<style> elements together with their bodies; the
    # backreference makes the match end at the matching closing tag.
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?</\1>", "", file)
    # Remove comments before the generic tag pass, because a comment may
    # itself contain '>' characters.
    cleaned = re.sub(r"(?s)<!--.*?-->\n?", "", cleaned)
    # Every remaining tag becomes a space so adjacent words do not fuse.
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally deal with whitespace and stray punctuation.
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"''|,", "", cleaned)
    # Collapse whole runs of spaces, not just pairs, in a single pass.
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned

But it only cleans the last file in the folder. Could you please help me clean all of the files and save the cleaned files in a new location, classified as spam or ham?

I am trying to use it this way:

# Clean every .txt message under the data set root and write the result to a
# mirrored folder tree under pre/, so each message stays in the sub-folder
# (e.g. ham/ or spam/) it was read from.
source_root = os.path.normpath(r"C:/Users/cool_/Desktop/Enron/enron1")
output_root = "pre"
for subdir, dirs, files in os.walk(source_root):
    # Rebuild this folder's position relative to the root below pre/.
    target_dir = os.path.normpath(
        os.path.join(output_root, os.path.relpath(subdir, source_root)))
    for name in files:
        if not name.endswith(".txt"):
            continue
        # os.walk yields bare file names, so join them with their folder
        # before opening; each file is read and cleaned inside the loop.
        with open(os.path.join(subdir, name), 'r') as src:
            cleaned_html = clean_html(src.read())
        # exist_ok tolerates re-runs without swallowing other OSErrors.
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, name), 'w') as f:
            f.write(cleaned_html)

Regards

Enron email data set spam classification

0 个答案: