我一直在按照 NLTK Cookbook 中基于分类器的分块(classifier-based chunking)示例进行操作,在尝试评估我的分类器时遇到了下面的错误。
导致此错误的全部代码都贴在回溯信息(traceback)之后。
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-64-201b22386c9f> in <module>()
1 chunker = ClassifierChunker(train_chunks)
----> 2 score = chunker.evaluate(test_chunks)
3 score.accuracy()
//anaconda/lib/python2.7/site-packages/nltk/chunk/api.pyc in evaluate(self, gold)
47 chunkscore = ChunkScore()
48 for correct in gold:
---> 49 chunkscore.score(correct, self.parse(correct.leaves()))
50 return chunkscore
51
//anaconda/lib/python2.7/site-packages/nltk/chunk/api.pyc in parse(self, tokens)
32 :rtype: Tree
33 """
---> 34 raise NotImplementedError()
35
36 def evaluate(self, gold):
NotImplementedError:
#from chunkers import TagChunker
from nltk.corpus import treebank_chunk
train_chunks = treebank_chunk.chunked_sents()[:3000]
test_chunks = treebank_chunk.chunked_sents()[3000:]
import nltk.chunk
from nltk.tag import ClassifierBasedTagger
def chunk_trees2train_chunks(chunk_sents):
tag_sents = [nltk.chunk.tree2conlltags(sent) for sent in chunk_sents]
return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]
def prev_next_pos_iob(tokens, index, history):
    '''Build the feature dict for the token at position ``index``.

    ``tokens`` is indexed as a sequence of ``(word, pos)`` pairs and
    ``history`` is indexed for the tag at ``index - 1`` (used as the
    previous IOB tag). ``'<START>'`` fills the "previous" features for the
    first token and ``'<END>'`` fills the "next" features for the last one.
    '''
    word, pos = tokens[index]
    is_first = index == 0
    is_last = index == len(tokens) - 1

    # Left context: sentinel values at the start of the sentence.
    prevword, prevpos = ('<START>', '<START>') if is_first else tokens[index - 1]
    previob = '<START>' if is_first else history[index - 1]

    # Right context: sentinel values at the end of the sentence.
    nextword, nextpos = ('<END>', '<END>') if is_last else tokens[index + 1]

    return {
        'word': word,
        'pos': pos,
        'nextword': nextword,
        'nextpos': nextpos,
        'prevword': prevword,
        'prevpos': prevpos,
        'previob': previob,
    }
class ClassifierChunker(nltk.chunk.ChunkParserI):
    '''Chunk parser that assigns IOB chunk tags with a ClassifierBasedTagger.'''

    def __init__(self, train_sents, feature_detector=prev_next_pos_iob,
                 **kwargs):
        '''Train the underlying tagger on chunked sentences.

        :param train_sents: chunk trees; converted to ``((word, pos), iob)``
            training data by ``chunk_trees2train_chunks``.
        :param feature_detector: callable taking ``(tokens, index, history)``
            and returning a feature dict. A falsy value selects
            ``prev_next_pos_iob``.
        :param kwargs: forwarded unchanged to ``ClassifierBasedTagger``.
        '''
        if not feature_detector:
            # This class defines no ``feature_detector`` attribute, so the
            # fallback must be the module-level default rather than
            # ``self.feature_detector`` (which would raise AttributeError).
            feature_detector = prev_next_pos_iob

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector,
                                            **kwargs)

    # NOTE: ``parse`` has to be indented inside the class body. Defined at
    # module level it does not override ``ChunkParserI.parse``, and
    # ``evaluate()`` then hits the base implementation, which raises
    # NotImplementedError.
    def parse(self, tagged_sent):
        '''Chunk a list of ``(word, pos)`` tokens and return the chunk tree.

        Returns ``None`` when ``tagged_sent`` is empty.
        '''
        if not tagged_sent:
            return None
        chunks = self.tagger.tag(tagged_sent)
        return nltk.chunk.conlltags2tree(
            [(w, t, c) for ((w, t), c) in chunks])
#the following is copy/pasted from chunkers.py
import nltk.tag
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger
#from .transforms import node_label
#####################
## tree conversion ##
#####################
def chunk_trees2train_chunks(chunk_sents):
tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]
def conll_tag_chunks(chunk_sents):
'''Convert each chunked sentence to list of (tag, chunk_tag) tuples,
so the final result is a list of lists of (tag, chunk_tag) tuples.
>>> from nltk.tree import Tree
>>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
>>> conll_tag_chunks([t])
[[('DT', 'B-NP'), ('NN', 'I-NP')]]
'''
tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    '''Convert an entity-labelled tree into ``(word, pos, iob)`` triples.

    :param tree: tree whose ``pos()`` yields ``(word, label)`` pairs, where
        the label is that of the word's parent subtree.
    :param tag: callable that takes the sequence of words and returns
        ``(word, pos)`` pairs; defaults to ``nltk.tag.pos_tag``.
    :return: list of ``(word, pos, iob)`` tuples; an empty list when the
        tree has no leaves.
    '''
    # tree.pos() flattens the tree and produces [(word, label)] where label is
    # from the word's parent tree label. words in a chunk therefore get the
    # chunk tag, while words outside a chunk get the same tag as the tree's
    # top label
    leaves = tree.pos()

    if not leaves:
        # zip(*[]) yields nothing to unpack, so handle the empty tree here.
        return []

    words, ents = zip(*leaves)

    # The top label is read directly from the tree instead of through the
    # ``node_label`` helper, whose import is commented out in this module.
    # Newer NLTK exposes label(); older releases expose the .node attribute.
    try:
        top_label = tree.label()
    except AttributeError:
        top_label = tree.node

    iobs = []
    prev = None
    # construct iob tags from entity names
    for ent in ents:
        # any entity that is the same as the tree's top label is outside a chunk
        if ent == top_label:
            iobs.append('O')
            prev = None
        # have a previous entity that is equal so this is inside the chunk
        elif prev == ent:
            iobs.append('I-%s' % ent)
        # no previous equal entity in the sequence, so this is the beginning of
        # an entity chunk
        else:
            iobs.append('B-%s' % ent)
            prev = ent

    # get tags for each word, then construct 3-tuple for conll tags
    words, tags = zip(*tag(words))
    # Materialise the result so it is a reusable list on Python 3 as well.
    return list(zip(words, tags, iobs))
#################
## tag chunker ##
#################
class TagChunker(ChunkParserI):
    '''Chunks tagged tokens using Ngram Tagging.'''

    # The default is a tuple rather than a list so the shared default value
    # cannot be mutated between calls; any iterable of tagger classes works.
    def __init__(self, train_chunks, tagger_classes=(UnigramTagger, BigramTagger)):
        '''Train Ngram taggers on chunked sentences.

        :param train_chunks: chunk trees; reduced to ``(tag, chunk_tag)``
            sentences by ``conll_tag_chunks``.
        :param tagger_classes: tagger classes instantiated in order, each
            one receiving the previously built tagger as its ``backoff``.
            If empty, ``self.tagger`` stays ``None``.
        '''
        train_sents = conll_tag_chunks(train_chunks)
        self.tagger = None

        for cls in tagger_classes:
            self.tagger = cls(train_sents, backoff=self.tagger)

    def parse(self, tagged_sent):
        '''Parse tagged tokens into a parse Tree of chunks.

        Returns ``None`` when ``tagged_sent`` is empty.
        '''
        if not tagged_sent:
            return None

        words, tags = zip(*tagged_sent)
        # The tagger was trained on POS tags only, so it is fed the tags and
        # returns (tag, chunk_tag) pairs.
        chunks = self.tagger.tag(tags)
        # create conll triples for tree parsing
        return conlltags2tree([(w, t, c) for (w, (t, c)) in zip(words, chunks)])
########################
## classifier chunker ##
########################
def prev_next_pos_iob(tokens, index, history):
    '''Build the feature dict for the token at position ``index``.

    ``tokens`` is indexed as a sequence of ``(word, pos)`` pairs and
    ``history`` is indexed for the tag at ``index - 1`` (used as the
    previous IOB tag). ``'<START>'`` fills the "previous" features for the
    first token and ``'<END>'`` fills the "next" features for the last one.
    '''
    word, pos = tokens[index]
    is_first = index == 0
    is_last = index == len(tokens) - 1

    # Left context: sentinel values at the start of the sentence.
    prevword, prevpos = ('<START>', '<START>') if is_first else tokens[index - 1]
    previob = '<START>' if is_first else history[index - 1]

    # Right context: sentinel values at the end of the sentence.
    nextword, nextpos = ('<END>', '<END>') if is_last else tokens[index + 1]

    return {
        'word': word,
        'pos': pos,
        'nextword': nextword,
        'nextpos': nextpos,
        'prevword': prevword,
        'prevpos': prevpos,
        'previob': previob,
    }
class ClassifierChunker(ChunkParserI):
    '''Chunk parser that assigns IOB chunk tags with a ClassifierBasedTagger.'''

    def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
        '''Train the underlying tagger on chunked sentences.

        :param train_sents: chunk trees; converted to ``((word, pos), iob)``
            training data by ``chunk_trees2train_chunks``.
        :param feature_detector: callable taking ``(tokens, index, history)``
            and returning a feature dict. A falsy value selects
            ``prev_next_pos_iob``.
        :param kwargs: forwarded unchanged to ``ClassifierBasedTagger``.
        '''
        if not feature_detector:
            # This class defines no ``feature_detector`` attribute, so the
            # fallback must be the module-level default rather than
            # ``self.feature_detector`` (which would raise AttributeError).
            feature_detector = prev_next_pos_iob

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector,
                                            **kwargs)

    def parse(self, tagged_sent):
        '''Chunk a list of ``(word, pos)`` tokens and return the chunk tree.

        Returns ``None`` when ``tagged_sent`` is empty.
        '''
        if not tagged_sent:
            return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])
#############
## pattern ##
#############
class PatternChunker(ChunkParserI):
    '''Chunk parser that delegates chunking to ``pattern.en.parse``.'''

    def parse(self, tagged_sent):
        '''Chunk ``(word, tag)`` tokens and return a tree, or ``None``.

        Only the words are passed on; they are joined with single spaces
        and handed to ``pattern.en.parse``. ``None`` is returned when that
        call yields no sentences.
        '''
        # Imported lazily so that loading this module does not fail when
        # pattern is not installed.
        from pattern.en import parse as pattern_parse

        words = [word for word, tag in tagged_sent]
        text = ' '.join(words)
        # not tokenizing ensures that the number of tagged tokens returned is
        # the same as the number of input tokens
        sents = pattern_parse(text, tokenize=False).split()

        if sents:
            first_sent = sents[0]
            return conlltags2tree([(w, t, c) for w, t, c, p in first_sent])
        return None
答案 0 :(得分:2)
你需要自己定义 parse 方法;从源代码中可以看到,基类并没有实现它:
class ChunkParserI(ParserI):
    """
    A processing interface for identifying non-overlapping groups in
    unrestricted text.  Typically, chunk parsers are used to find base
    syntactic constituents, such as base noun phrases.  Unlike
    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
    will always generate a parse.
    """
    def parse(self, tokens):
        """
        Return the best chunk structure for the given tokens
        and return a tree.

        :param tokens: The list of (word, tag) tokens to be chunked.
        :type tokens: list(tuple)
        :rtype: Tree
        :raises NotImplementedError: always, in this base implementation.
        """
        # Interface stub: a subclass has to provide its own ``parse``.
        raise NotImplementedError()
实际上你已经定义了 parse,我认为问题出在缩进上:
class ClassifierChunker(nltk.chunk.ChunkParserI):
    '''Chunk parser that assigns IOB chunk tags with a ClassifierBasedTagger.'''

    def __init__(self, train_sents, feature_detector=prev_next_pos_iob,
                 **kwargs):
        '''Train the underlying tagger on chunked sentences.

        :param train_sents: chunk trees; converted to ``((word, pos), iob)``
            training data by ``chunk_trees2train_chunks``.
        :param feature_detector: callable taking ``(tokens, index, history)``
            and returning a feature dict. A falsy value selects
            ``prev_next_pos_iob``.
        :param kwargs: forwarded unchanged to ``ClassifierBasedTagger``.
        '''
        if not feature_detector:
            # This class defines no ``feature_detector`` attribute, so the
            # fallback must be the module-level default rather than
            # ``self.feature_detector`` (which would raise AttributeError).
            feature_detector = prev_next_pos_iob

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector,
                                            **kwargs)

    def parse(self, tagged_sent):  # indent inside the class
        '''Chunk a list of ``(word, pos)`` tokens and return the chunk tree.

        Returns ``None`` when ``tagged_sent`` is empty.
        '''
        if not tagged_sent:
            return None
        chunks = self.tagger.tag(tagged_sent)
        return nltk.chunk.conlltags2tree(
            [(w, t, c) for ((w, t), c) in chunks])
你的 parse 没有定义在 class 内部,所以就 nltk.chunk.ChunkParserI 而言,你并没有实现 parse 方法。
另外,nltk.chunk.conlltags2tree 实际定义在 nltk.chunk.util 中,因此也可以这样写:
return nltk.chunk.util.conlltags2tree([(w,t,c) for ((w,t),c) in
chunks])