我写了一个Python脚本来提取文本行中的特征。我正在尝试用标签B和I执行句子分段任务,其中B代表Begin(开始),I代表Inside(内部)。因此我训练了一个朴素贝叶斯分类器,其中data_object是如下形式的列表:
[["Awards & Recognition"],
["Awards", "&", "Recognitions"],
["Award", "&", "Recognition"],
["NNP", "CC", "NNS"],
["B"]]
其中cue1、cue2、cue3、cue4、cue5、cue6、cue7、cue8以及tab都是单词列表。特征提取函数如下:
def feature_extractor(data_object):
    """Build the feature dict for one text line, for an NLTK classifier.

    ``data_object`` is read positionally: ``data_object[0]`` is used as a
    single string (it is ``split()`` into words and looked up whole in the
    cue lists), while ``data_object[1]`` and ``data_object[2]`` are iterated
    item by item.  Judging by the feature names and the original comments,
    ``[1]`` holds the surface tokens and ``[2]`` the stemmed tokens --
    TODO confirm against the code that builds ``data_object``.

    The word lists ``cue1`` .. ``cue8``, ``tab`` and ``tab1`` as well as
    ``lmtzr``, ``wn`` and ``nltk`` are resolved at module level.

    Returns a dict mapping feature names to values (booleans for the cue
    look-ups, a size bucket under ``"Length"``, a token count per phrase and
    a POS tag under ``"posttag"``).

    NOTE(review): most feature names embed the token itself, so different
    lines produce different key sets.  NLTK's Naive Bayes trainer appears to
    treat a feature that is absent from a feature set as having the value
    ``None``, which would explain ``... = None`` rows in
    ``show_most_informative_features`` -- confirm against the NLTK docs.
    """
    features = {}
    line = data_object[0]
    tokens = data_object[1]
    stems = data_object[2]

    # Bucket the line by its number of stemmed tokens.  Only lines with
    # 3..7 stems additionally get the whole-line cue look-ups.
    stem_count = len(stems)
    if stem_count <= 2:
        features["Length"] = "small"
    elif stem_count < 8:
        features["contains(%s in cue3 )" % line] = (line in cue3)
        features["contains(%s in cue4)" % line] = (line in cue4)
        features["contains(%s in cue 5)" % line] = (line in cue5)
        features["contains(%s in cue6)" % line] = (line in cue6)
        features["Length"] = "medium"
    else:
        features["Length"] = "big"

    # For each token in the line: is its surface form a cue word?
    for token in tokens:
        features["contains(%s in cue1 wrking)" % token] = (token in cue1)
        features["contains(%s in tab wrking)" % token] = (token in tab)
        features["contains(%s in cue7 wrking)" % token] = (token in cue7)
        features["contains(%s in cue8 wrking)" % token] = (token in cue8)

    # For each token in the line: is its stem a cue word?
    for stem in stems:
        features["contains(%s in cue 1 stemmed)" % stem] = (stem in cue1)
        features["contains(%s in tab stemmed)" % stem] = (stem in tab)
        features["contains(%s in cue7 stemmed)" % stem] = (stem in cue7)
        features["contains(%s in cue8 stemmed)" % stem] = (stem in cue8)

    # Adjacent word pairs of the raw line, followed by their lemmatized
    # forms, without repetitions.
    words = line.split()
    phrases = [" ".join(pair) for pair in zip(words, words[1:])]
    # NOTE(review): ``.encode`` yields ``str`` on Python 2 but ``bytes`` on
    # Python 3; revisit this line if the script is ever ported.
    lemmas = [lmtzr.lemmatize(phrase, wn.NOUN).encode('ascii', 'ignore')
              for phrase in phrases]
    for lemma in lemmas:
        if lemma not in phrases:
            phrases.append(lemma)

    token_count = len(tokens)
    for phrase in phrases:
        features["contains(%s in cue2 check)" % phrase] = (phrase in cue2)
        features["contains(%s in tab1 check)" % phrase] = (phrase in tab1)
        features["contains(%s in cue3 check )" % phrase] = (phrase in cue3)
        features["contains(%s in cue4 check)" % phrase] = (phrase in cue4)
        features["contains(%s in cue5 check)" % phrase] = (phrase in cue5)
        features["contains(%s in cue6 check)" % phrase] = (phrase in cue6)
        # Topic continuation cues.
        features["contains(%s in cue7 check)" % phrase] = (phrase in cue7)
        features["contains(%s in cue8 check)" % phrase] = (phrase in cue8)
        # Total number of tokens in the line.
        # NOTE(review): the original indentation was lost; this assignment
        # is assumed to belong inside this loop because its key is built
        # from the loop variable -- confirm.
        features["length(%s of data_object check)" % phrase] = token_count

    # Part-of-speech tag.
    # NOTE(review): every iteration writes the same "posttag" key, so only
    # the tag of the last tagged item survives.  If a tag per token is
    # wanted, the key needs to include the token -- left unchanged here so
    # the produced feature set stays the same.
    for stem in stems:
        for tagged_word in nltk.pos_tag(stem.split()):
            features["posttag"] = tagged_word[1]

    return features
当我使用上述特征提取函数,并通过以下代码训练朴素贝叶斯分类器时:
# Train a Naive Bayes classifier on train_set, print its accuracy on
# test_set, then list up to 3000 of its most informative features.
nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(nb_classifier, test_set)
print("the accuracy of the classifier is %s " % accuracy)
nb_classifier.show_most_informative_features(3000)
我得到了这个:
contains(job in cue7 stemmed) = False I : B = 1.4 : 1.0
contains(job in cue8 stemmed) = False I : B = 1.4 : 1.0
contains(job in cue8 wrking) = False I : B = 1.4 : 1.0
contains(job in tab stemmed) = False I : B = 1.4 : 1.0
contains(knowledge in cue7 stemmed) = False I : B = 1.3 : 1.0
contains(knowledge in cue8 stemmed) = False I : B = 1.3 : 1.0
contains(knowledge in cue1 wrking) = False I : B = 1.3 : 1.0
contains(knowledge in tab stemmed) = False I : B = 1.3 : 1.0
contains(knowledge in cue 1 stemmed) = False I : B = 1.3 : 1.0
contains(knowledge in cue8 wrking) = False I : B = 1.3 : 1.0
contains(knowledge in tab wrking) = False I : B = 1.3 : 1.0
contains(knowledge in cue7 wrking) = False I : B = 1.3 : 1.0
posttag = 'VBG' B : I = 1.3 : 1.0
contains(and in tab stemmed) = None B : I = 1.3 : 1.0
contains(and in cue7 stemmed) = None B : I = 1.3 : 1.0
contains(and in cue1 wrking) = None B : I = 1.3 : 1.0
contains(and in cue7 wrking) = None B : I = 1.3 : 1.0
contains(and in cue8 wrking) = None B : I = 1.3 : 1.0
contains(and in tab wrking) = None B : I = 1.3 : 1.0
contains(and in cue 1 stemmed) = None B : I = 1.3 : 1.0
contains(and in cue8 stemmed) = None B : I = 1.3 : 1.0
contains(executive in cue1 wrking) = False B : I = 1.3 : 1.0
contains(executive in tab wrking) = False B : I = 1.3 : 1.0
contains(executive in cue8 wrking) = False B : I = 1.3 : 1.0
contains(executive in cue7 wrking) = False B : I = 1.3 : 1.0
contains(technology in tab wrking) = False B : I = 1.2 : 1.0
contains(technology in cue1 wrking) = False B : I = 1.2 : 1.0
contains(technology in cue8 wrking) = False B : I = 1.2 : 1.0
contains(technology in cue7 wrking) = False B : I = 1.2 : 1.0
contains(of in tab stemmed) = None B : I = 1.2 : 1.0
contains(of in cue7 wrking) = None B : I = 1.2 : 1.0
contains(of in tab wrking) = None B : I = 1.2 : 1.0
contains(of in cue7 stemmed) = None B : I = 1.2 : 1.0
contains(of in cue8 wrking) = None B : I = 1.2 : 1.0
contains(of in cue8 stemmed) = None B : I = 1.2 : 1.0
contains(of in cue 1 stemmed) = None B : I = 1.2 : 1.0
contains(of in cue1 wrking) = None B : I = 1.2 : 1.0
contains(in in cue7 wrking) = None B : I = 1.2 : 1.0
contains(in in cue7 stemmed) = None B : I = 1.2 : 1.0
contains(in in tab wrking) = None B : I = 1.2 : 1.0
contains(in in cue1 wrking) = None B : I = 1.2 : 1.0
contains(in in cue8 stemmed) = None B : I = 1.2 : 1.0
contains(in in tab stemmed) = None B : I = 1.2 : 1.0
contains(in in cue 1 stemmed) = None B : I = 1.2 : 1.0
contains(in in cue8 wrking) = None B : I = 1.2 : 1.0
posttag = 'NN' B : I = 1.2 : 1.0