Question

我写了一个 Python 脚本来提取文本行中的特征。我正在尝试用标签 B 和 I 执行句子分段任务，其中 B 代表 Begin（开始），I 代表 Inside（内部）。因此我训练了一个朴素贝叶斯分类器。其中 data_object 是如下所示的列表：

[["Awards & Recognition"],
 ["Awards", "&", "Recognitions"],
 ["Award", "&", "Recognition"],
 ["NNP", "CC", "NNS"],
 ["B"]]

而 cue1、cue2、cue3、cue4、cue5、cue6、cue7、cue8 以及 tab 都是单词列表。

def feature_extractor(data_object):
    """Build the feature dictionary for one line of text.

    ``data_object`` is read positionally:

    * ``data_object[0]`` -- the raw line; it is interpolated into feature
      names and split on whitespace, so it is assumed to be a string
      (TODO confirm with the caller).
    * ``data_object[1]`` -- iterable of the line's tokens.
    * ``data_object[2]`` -- iterable of the tokens' stemmed forms (as the
      "stemmed" feature names suggest -- confirm with the caller).

    The module-level collections ``cue1`` .. ``cue8``, ``tab`` and ``tab1``
    are used for membership tests; ``lmtzr``, ``wn``, ``izip`` and ``nltk``
    must also be in scope.

    Returns a dict mapping feature names to values (booleans for the
    ``contains(...)`` features, a size label for ``"Length"``, an int for
    the ``length(...)`` features and a tag string for ``"posttag"``).
    """
    features = {}
    line_text = data_object[0]
    tokens = data_object[1]
    stems = data_object[2]

    # Bucket the line by its number of stemmed tokens. Only "medium" lines
    # (3 to 7 entries) additionally get the whole-line cue lookups.
    stem_count = len(stems)
    if stem_count <= 2:
        features["Length"] = "small"
    elif stem_count < 8:
        features["contains(%s in cue3 )" % line_text] = (line_text in cue3)
        features["contains(%s in cue4)" % line_text] = (line_text in cue4)
        features["contains(%s in cue 5)" % line_text] = (line_text in cue5)
        features["contains(%s in cue6)" % line_text] = (line_text in cue6)
        features["Length"] = "medium"
    else:
        features["Length"] = "big"

    # For each token in the line: is the surface form a cue word?
    for token in tokens:
        features["contains(%s in cue1 wrking)" % token] = (token in cue1)
        features["contains(%s in tab wrking)" % token] = (token in tab)
        features["contains(%s in cue7 wrking)" % token] = (token in cue7)
        features["contains(%s in cue8 wrking)" % token] = (token in cue8)

    # For each token in the line: is the stemmed form a cue word?
    for stem in stems:
        features["contains(%s in cue 1 stemmed)" % stem] = (stem in cue1)
        features["contains(%s in tab stemmed)" % stem] = (stem in tab)
        features["contains(%s in cue7 stemmed)" % stem] = (stem in cue7)
        features["contains(%s in cue8 stemmed)" % stem] = (stem in cue8)

    # Adjacent-word pairs of the raw line, followed by their lemmatized
    # forms, without repetitions.
    words = line_text.split()
    phrases = [" ".join(pair) for pair in izip(words, words[1:])]
    # Iterate over a snapshot so that appending does not extend the loop.
    for phrase in list(phrases):
        lemma = lmtzr.lemmatize(phrase, wn.NOUN).encode('ascii', 'ignore')
        if lemma not in phrases:
            phrases.append(lemma)

    token_count = len(tokens)
    for phrase in phrases:
        features["contains(%s in cue2 check)" % phrase] = (phrase in cue2)
        features["contains(%s in tab1 check)" % phrase] = (phrase in tab1)
        features["contains(%s in cue3 check )" % phrase] = (phrase in cue3)
        features["contains(%s in cue4 check)" % phrase] = (phrase in cue4)
        features["contains(%s in cue5 check)" % phrase] = (phrase in cue5)
        features["contains(%s in cue6 check)" % phrase] = (phrase in cue6)
        # Topic continuation cues.
        features["contains(%s in cue7 check)" % phrase] = (phrase in cue7)
        features["contains(%s in cue8 check)" % phrase] = (phrase in cue8)
        # Total number of tokens in the line.
        features["length(%s of data_object check)" % phrase] = token_count

    # Part-of-speech tag of the stemmed tokens.
    # NOTE(review): every iteration writes the same "posttag" key, so only
    # the tag of the last tagged word survives. Kept as-is so the feature
    # names stay unchanged; confirm whether one feature per token was meant.
    for stem in stems:
        for tagged in nltk.pos_tag(stem.split()):
            features["posttag"] = tagged[1]

    return features
#print temps

当我使用上述特征提取函数，并通过以下代码训练朴素贝叶斯分类器时：

# Train a Naive Bayes classifier on the training feature sets.
nb_classifier = nltk.NaiveBayesClassifier.train(train_set)

# Report accuracy on the held-out set; the parenthesised form prints the
# same single string under the Python 2 print statement.
accuracy = nltk.classify.accuracy(nb_classifier, test_set)
print("the accuracy of the classifier is %s " % accuracy)

# List up to 3000 of the most informative features.
nb_classifier.show_most_informative_features(3000)

我得到了这个：

contains(job in cue7 stemmed) = False               I : B      =      1.4 : 1.0
contains(job in cue8 stemmed) = False               I : B      =      1.4 : 1.0
contains(job in cue8 wrking) = False               I : B      =      1.4 : 1.0
contains(job in tab stemmed) = False               I : B      =      1.4 : 1.0
contains(knowledge in cue7 stemmed) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue8 stemmed) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue1 wrking) = False               I : B      =      1.3 : 1.0
contains(knowledge in tab stemmed) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue 1 stemmed) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue8 wrking) = False               I : B      =      1.3 : 1.0
contains(knowledge in tab wrking) = False               I : B      =      1.3 : 1.0
contains(knowledge in cue7 wrking) = False               I : B      =      1.3 : 1.0
             posttag = 'VBG'               B : I      =      1.3 : 1.0
contains(and in tab stemmed) = None                B : I      =      1.3 : 1.0
contains(and in cue7 stemmed) = None                B : I      =      1.3 : 1.0
contains(and in cue1 wrking) = None                B : I      =      1.3 : 1.0
contains(and in cue7 wrking) = None                B : I      =      1.3 : 1.0
contains(and in cue8 wrking) = None                B : I      =      1.3 : 1.0
contains(and in tab wrking) = None                B : I      =      1.3 : 1.0
contains(and in cue 1 stemmed) = None                B : I      =      1.3 : 1.0
contains(and in cue8 stemmed) = None                B : I      =      1.3 : 1.0
contains(executive in cue1 wrking) = False               B : I      =      1.3 : 1.0
contains(executive in tab wrking) = False               B : I      =      1.3 : 1.0
contains(executive in cue8 wrking) = False               B : I      =      1.3 : 1.0
contains(executive in cue7 wrking) = False               B : I      =      1.3 : 1.0
contains(technology in tab wrking) = False               B : I      =      1.2 : 1.0
contains(technology in cue1 wrking) = False               B : I      =      1.2 : 1.0
contains(technology in cue8 wrking) = False               B : I      =      1.2 : 1.0
contains(technology in cue7 wrking) = False               B : I      =      1.2 : 1.0
contains(of in tab stemmed) = None                B : I      =      1.2 : 1.0
contains(of in cue7 wrking) = None                B : I      =      1.2 : 1.0
contains(of in tab wrking) = None                B : I      =      1.2 : 1.0
contains(of in cue7 stemmed) = None                B : I      =      1.2 : 1.0
contains(of in cue8 wrking) = None                B : I      =      1.2 : 1.0
contains(of in cue8 stemmed) = None                B : I      =      1.2 : 1.0
contains(of in cue 1 stemmed) = None                B : I      =      1.2 : 1.0
contains(of in cue1 wrking) = None                B : I      =      1.2 : 1.0
contains(in in cue7 wrking) = None                B : I      =      1.2 : 1.0
contains(in in cue7 stemmed) = None                B : I      =      1.2 : 1.0
contains(in in tab wrking) = None                B : I      =      1.2 : 1.0
contains(in in cue1 wrking) = None                B : I      =      1.2 : 1.0
contains(in in cue8 stemmed) = None                B : I      =      1.2 : 1.0
contains(in in tab stemmed) = None                B : I      =      1.2 : 1.0
contains(in in cue 1 stemmed) = None                B : I      =      1.2 : 1.0
contains(in in cue8 wrking) = None                B : I      =      1.2 : 1.0
             posttag = 'NN'                B : I      =      1.2 : 1.0

Python 在调用负责为我的朴素贝叶斯分类器生成特征的函数时返回 None

0 个答案: