调用负责为朴素贝叶斯分类器生成特征的函数时,Python返回None

时间:2017-06-19 12:01:11

标签: python nltk naivebayes

我写了一个Python脚本来提取文本行中的特征。我正在尝试用标签B和I执行句子分段任务,其中B代表开始(Begin),I代表内部(Inside)。因此我训练了一个朴素贝叶斯分类器。其中data_object是如下形式的列表:

[["Awards & Recognition"]
 ["Awards", "&" ,"Recognitions"]
 ["Award", "&", "Recognition"]
 ["NNP", "CC", "NNS"]
 ["B"]]

而cue1、cue2、cue3、cue4、cue5、cue6、cue7、cue8和tab都是单词列表。

def feature_extractor(data_object):
    """Build the feature dict for one line of text.

    ``data_object`` is indexed positionally:

    * ``data_object[0]`` -- the raw line; it is ``.split()`` below, so it is
      assumed to be a string (TODO confirm against the caller).
    * ``data_object[1]`` -- the tokens of the line.
    * ``data_object[2]`` -- the stemmed tokens of the line.

    The cue lists (``cue1`` .. ``cue8``, ``tab``, ``tab1``) as well as
    ``lmtzr``, ``wn`` and ``nltk`` are read from the enclosing module.

    Returns a dict mapping feature name to feature value.

    NOTE(review): most feature names embed the token itself, so a given name
    only exists for lines that contain that token.  NLTK's naive Bayes
    trainer appears to record a feature that is missing from a training
    instance with the value ``None``, which would explain the ``... = None``
    rows printed by ``show_most_informative_features`` -- confirm against the
    NLTK documentation.
    """
    features = {}
    line = data_object[0]
    tokens = data_object[1]
    stems = data_object[2]

    # Length bucket, judged on the number of stemmed tokens.
    stem_count = len(stems)
    if stem_count <= 2:
        features["Length"] = "small"
    elif stem_count < 8:
        features["Length"] = "medium"
        # Whole-line cue lookups are only emitted for medium-length lines.
        features["contains(%s in cue3 )" % line] = (line in cue3)
        features["contains(%s in cue4)" % line] = (line in cue4)
        features["contains(%s in cue 5)" % line] = (line in cue5)
        features["contains(%s in cue6)" % line] = (line in cue6)
    else:
        features["Length"] = "big"

    # For each token in the line: is the surface form a cue word?
    for token in tokens:
        features["contains(%s in cue1 wrking)" % token] = (token in cue1)
        features["contains(%s in tab wrking)" % token] = (token in tab)
        features["contains(%s in cue7 wrking)" % token] = (token in cue7)
        features["contains(%s in cue8 wrking)" % token] = (token in cue8)

    # For each token in the line: is the stemmed form a cue word?
    for stem in stems:
        features["contains(%s in cue 1 stemmed)" % stem] = (stem in cue1)
        features["contains(%s in tab stemmed)" % stem] = (stem in tab)
        features["contains(%s in cue7 stemmed)" % stem] = (stem in cue7)
        features["contains(%s in cue8 stemmed)" % stem] = (stem in cue8)

    # Adjacent word pairs of the raw line, followed by their lemmatized
    # forms, without repetitions.
    words = line.split()
    phrases = [" ".join(pair) for pair in zip(words, words[1:])]
    lemmas = [
        lmtzr.lemmatize(phrase, wn.NOUN).encode('ascii', 'ignore')
        for phrase in phrases
    ]
    for lemma in lemmas:
        if lemma not in phrases:
            phrases.append(lemma)

    token_count = len(tokens)
    for phrase in phrases:
        features["contains(%s in cue2 check)" % phrase] = (phrase in cue2)
        features["contains(%s in tab1 check)" % phrase] = (phrase in tab1)
        features["contains(%s in cue3 check )" % phrase] = (phrase in cue3)
        features["contains(%s in cue4 check)" % phrase] = (phrase in cue4)
        features["contains(%s in cue5 check)" % phrase] = (phrase in cue5)
        features["contains(%s in cue6 check)" % phrase] = (phrase in cue6)
        # Topic continuation.
        features["contains(%s in cue7 check)" % phrase] = (phrase in cue7)
        features["contains(%s in cue8 check)" % phrase] = (phrase in cue8)
        # Total number of tokens in the line.
        features["length(%s of data_object check)" % phrase] = token_count

    # Part-of-speech tag for each stemmed token.  A single constant key was
    # overwritten on every iteration, so only the last tag survived; each
    # token now gets its own key.
    for stem in stems:
        for word, tag in nltk.pos_tag(stem.split()):
            features["postag(%s)" % word] = tag
            # Legacy key: still ends up holding the last tag, as before.
            features["posttag"] = tag

    return features

当我使用上述特征提取函数,并通过以下代码训练朴素贝叶斯分类器时:

# Train a naive Bayes classifier on train_set, print its accuracy on
# test_set, then list up to 3000 of the most informative features.
nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
print(
    "the accuracy of the classifier is %s "
    % nltk.classify.accuracy(nb_classifier, test_set)
)
nb_classifier.show_most_informative_features(3000)

我得到了这个:

contains(job in cue7 stemmed) = False               I : B      =      1.4 : 1.0
contains(job in cue8 stemmed) = False               I : B      =      1.4 : 1.0
contains(job in cue8 wrking) = False               I : B      =      1.4 : 1.0
contains(job in tab stemmed) = False               I : B      =      1.4 : 1.0
contains(knowledge in cue7 stemmed) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue8 stemmed) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue1 wrking) = False               I : B      =      1.3 : 1.0
contains(knowledge in tab stemmed) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue 1 stemmed) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue8 wrking) = False               I : B      =      1.3 : 1.0
contains(knowledge in tab wrking) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue7 wrking) = False               I : B      =      1.3 : 1.0
             posttag = 'VBG'               B : I      =      1.3 : 1.0
contains(and in tab stemmed) = None                B : I      =      1.3 : 1.0
contains(and in cue7 stemmed) = None                B : I      =      1.3 : 1.0
contains(and in cue1 wrking) = None                B : I      =      1.3 : 1.0
contains(and in cue7 wrking) = None                B : I      =      1.3 : 1.0
contains(and in cue8 wrking) = None                B : I      =      1.3 : 1.0
contains(and in tab wrking) = None                B : I      =      1.3 : 1.0
contains(and in cue 1 stemmed) = None                B : I      =      1.3 : 1.0
contains(and in cue8 stemmed) = None                B : I      =      1.3 : 1.0
contains(executive in cue1 wrking) = False               B : I      =      1.3 : 1.0
contains(executive in tab wrking) = False               B : I      =      1.3 : 1.0
contains(executive in cue8 wrking) = False               B : I      =      1.3 : 1.0
contains(executive in cue7 wrking) = False               B : I      =      1.3 : 1.0
contains(technology in tab wrking) = False               B : I      =      1.2 : 1.0
contains(technology in cue1 wrking) = False               B : I      =      1.2 : 1.0
contains(technology in cue8 wrking) = False               B : I      =      1.2 : 1.0
contains(technology in cue7 wrking) = False               B : I      =      1.2 : 1.0
contains(of in tab stemmed) = None                B : I      =      1.2 : 1.0
contains(of in cue7 wrking) = None                B : I      =      1.2 : 1.0
contains(of in tab wrking) = None                B : I      =      1.2 : 1.0
contains(of in cue7 stemmed) = None                B : I      =      1.2 : 1.0
contains(of in cue8 wrking) = None                B : I      =      1.2 : 1.0
contains(of in cue8 stemmed) = None                B : I      =      1.2 : 1.0
contains(of in cue 1 stemmed) = None                B : I      =      1.2 : 1.0
contains(of in cue1 wrking) = None                B : I      =      1.2 : 1.0
contains(in in cue7 wrking) = None                B : I      =      1.2 : 1.0
contains(in in cue7 stemmed) = None                B : I      =      1.2 : 1.0
contains(in in tab wrking) = None                B : I      =      1.2 : 1.0
contains(in in cue1 wrking) = None                B : I      =      1.2 : 1.0
contains(in in cue8 stemmed) = None                B : I      =      1.2 : 1.0
contains(in in tab stemmed) = None                B : I      =      1.2 : 1.0
contains(in in cue 1 stemmed) = None                B : I      =      1.2 : 1.0
contains(in in cue8 wrking) = None                B : I      =      1.2 : 1.0
             posttag = 'NN'                B : I      =      1.2 : 1.0

0 个答案:

没有答案