如何在 Python 中将字典保存到文件

时间:2019-05-30 16:45:32

标签: python-3.x machine-learning pickle

实际上,我正在构建一个垃圾邮件过滤器。从数据创建字典耗时太长,如果执行过程中出现错误,就必须重新等待字典生成。因此,是否可以把字典保存下来,这样再次运行程序时就不必花费这么长时间?

我尝试了 pickle 模块,但没有成功。

'''

import os
import pickle 
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

def make_dictionary(root_dir, vocab_size=3000):
    """Build a word-frequency vocabulary from every file in *root_dir*.

    Each file is read line by line and split on whitespace. Tokens that are
    not purely alphabetic, or that are a single character long, are dropped
    before ranking.

    Args:
        root_dir: Directory whose entries are all opened as text files
            (using the platform default encoding).
        vocab_size: Maximum number of words to keep. Defaults to 3000.

    Returns:
        A list of ``(word, count)`` tuples for the ``vocab_size`` most
        frequent remaining words, most frequent first.
    """
    word_counts = Counter()
    for name in os.listdir(root_dir):
        with open(os.path.join(root_dir, name)) as mail:
            for line in mail:
                word_counts.update(line.split())

    # Filter into a fresh Counter instead of deleting keys one at a time;
    # insertion order is preserved, so ties rank the same way as before.
    kept = Counter({
        word: freq
        for word, freq in word_counts.items()
        if word.isalpha() and len(word) > 1
    })
    return kept.most_common(vocab_size)

def extract_feature(mail_dir, vocabulary=None):
    """Build a word-count feature matrix and label vector for *mail_dir*.

    Only the third line (index 2) of each file is used; every other line is
    ignored.  NOTE(review): presumably the corpus stores the message body on
    that line -- confirm against the data set.

    Args:
        mail_dir: Directory whose entries are all opened as text files.
        vocabulary: Sequence of ``(word, count)`` pairs whose position gives
            the feature column of each word. When omitted, the module-level
            ``dictionary`` is used, matching the original behaviour.

    Returns:
        ``(feature_matrix, train_labels)`` where ``feature_matrix`` has shape
        ``(number_of_files, 3000)`` and holds per-file word counts, and
        ``train_labels`` is 1 for files whose name starts with ``"spmsg"``
        and 0 otherwise.
    """
    if vocabulary is None:
        # Backward-compatible default: fall back to the global vocabulary.
        vocabulary = dictionary

    # One dict lookup per word instead of scanning the vocabulary each time.
    word_index = {entry[0]: col for col, entry in enumerate(vocabulary)}

    files = [os.path.join(mail_dir, f) for f in os.listdir(mail_dir)]
    # The column count is fixed at 3000, so the vocabulary must not be longer.
    feature_matrix = np.zeros((len(files), 3000))
    train_labels = np.zeros(len(files))

    for doc_id, path in enumerate(files):
        with open(path) as mail:
            for line_no, line in enumerate(mail):
                if line_no == 2:
                    for word, occurrences in Counter(line.split()).items():
                        col = word_index.get(word)
                        if col is not None:
                            feature_matrix[doc_id, col] = occurrences
                    break  # nothing after the third line is used

        # basename works with any path separator; splitting on '\\' only
        # isolated the file name on Windows.
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1

    return feature_matrix, train_labels

train_dir = 'D:/python prog/train-mails'
test_dir = 'D:/python prog/test-mails'

# Building the vocabulary is the slow step, so it is cached on disk and
# reused on later runs. Delete the cache file whenever the training mails
# change, otherwise a stale vocabulary will be loaded.
dict_path = 'dict.pickle'
if os.path.exists(dict_path):
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this script wrote itself, never one from an untrusted source.
    with open(dict_path, 'rb') as pickle_in:
        dictionary = pickle.load(pickle_in)
else:
    dictionary = make_dictionary(train_dir)
    # The context manager closes the file even if dumping fails.
    with open(dict_path, 'wb') as pickle_out:
        pickle.dump(dictionary, pickle_out)

# Turn both mail folders into word-count matrices using the same vocabulary.
feature_matrix, labels = extract_feature(train_dir)
test_feature_matrix, test_labels = extract_feature(test_dir)

# Fit a multinomial naive Bayes classifier and predict the test set labels.
model = MultinomialNB()

model.fit(feature_matrix, labels)

predicted_labels = model.predict(test_feature_matrix)

0 个答案:

没有答案