Question

我正在Windows OS的python3中进行机器学习项目。我正在尝试实施评估问题。

我尝试使用enrondataset.pkl，然后通过训练和测试数据集读取此文件，并通过在功能中使用此文件向我展示此文件，并通过输入数据进行标签测试。

evaluation.py

import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

content = ''
outsize = 0
with open("enrondataset.pkl" , "rb") as f:
    content=f.read()
    data_dict=pickle.load(f)
    try:
        data = unpickler.load(f)
    except EOFError:
        data = list() 

    features_list = ["poi", "salary"]

    data = featureFormat(data_dict, features_list)
    labels, features = targetFeatureSplit(data)

    clf = DecisionTreeClassifier()
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)
    clf.fit(features_train, labels_train)

    pred = clf.predict(features_test)
    print ("[Q1] How many POIs are predicted for the test set for your POI identifier?")
    print ("[A1]", sum(pred))
    print ("[Q2] How many people total are in your test set?")
    print ("[A2]", len(pred))
    print ("[Q3] If your identifier predicted 0. (not POI) for everyone in the test set, what would its accuracy be?")
    print ("[A3]", pred.tolist().count(0) / float(len(pred)))
    print ("[Q4] Do you get any true positives? (In this case, we define a true positive as a case where both the actual label and the predicted label are 1))")
    true_positives = 0
    for i in range(len(pred)):
        if (pred[i] == labels_test[i]) and labels_test[i] == 1:
            true_positives += 1
            print ("[A3]", true_positives)
            print ("Precision score:", precision_score(pred, labels_test))
            print ("Recall score:", recall_score(pred, labels_test))

feature_format.py

""" 
    A general tool for converting data from the
    dictionary format to an (n x k) python list that's 
    ready for training an sklearn algorithm

    n--no. of key-value pairs in dictonary
    k--no. of features being extracted

    dictionary keys are names of persons in dataset
    dictionary values are dictionaries, where each
        key-value pair in the dict is the name
        of a feature, and its value for that person

    In addition to converting a dictionary to a numpy 
    array, you may want to separate the labels from the
    features--this is what targetFeatureSplit is for

    so, if you want to have the poi label as the target,
    and the features you want to use are the person's
    salary and bonus, here's what you would do:

    feature_list = ["poi", "salary", "bonus"] 
    data_array = featureFormat( data_dictionary, feature_list )
    label, features = targetFeatureSplit(data_array)

    the line above (targetFeatureSplit) assumes that the
    label is the _first_ item in feature_list--very important
    that poi is listed first!
"""


import numpy as np

def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """ convert dictionary to numpy array of features
        remove_NaN = True will convert "NaN" string to 0.0
        remove_all_zeroes = True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes = True will omit any data points for which
            any of the features you seek are 0.0
        sort_keys = True sorts keys by alphabetical order. Setting the value as
            a string opens the corresponding pickle file with a preset key
            order (this is used for Python 3 compatibility, and sort_keys
            should be left as False for the course mini-projects).
        NOTE: first feature is assumed to be 'poi' and is not checked for
            removal for zero or missing values.
    """


    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        keys = pickle.load(open(sort_keys, "rb"))
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                dictionary[key][feature]
            except KeyError:
                print ("error: key ", feature, " not present")
                return
            value = dictionary[key][feature]
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # Logic for deciding whether or not to add the data point.
        append = True
        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = False
            for item in test_list:
                if item != 0 and item != "NaN":
                    append = True
                    break
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes:
            if 0 in test_list or "NaN" in test_list:
                append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)


def targetFeatureSplit( data ):
    """ 
        given a numpy array like the one returned from
        featureFormat, separate out the first feature
        and put it into its own list (this should be the 
        quantity you want to predict)

        return targets and features as separate lists

        (sklearn can generally handle both lists and numpy arrays as 
        input formats when training/predicting)
    """

    target = []
    features = []
    for item in data:
        target.append( item[0] )
        features.append( item[1:] )

    return target, features

回溯（最近通话最近）：在第14行的文件“ evaluation.py”中 data_dict = pickle.load（f） EOFError：超出输入范围

如何在窗口OS中解决python3中的EOF错误

0 个答案: