`nltk` CoreNLPParser:防止在POS标记器中的连字符处分裂

时间:2018-09-24 14:47:11

标签: python parsing nlp nltk stanford-nlp

我正在按照 this answer 中的做法,将 nltk 的 CoreNLPParser 与 Stanford NLP 服务器配合使用,进行 POS(词性)标注。

此标注器会把带有连字符的单词视为多个单词,例如将 2007-08 之类的日期标注为 CD, :, CD。但是,我的模型把带连字符的单词当作一个标记(token)。是否可以让 CoreNLPParser 不在连字符处进行拆分?

1 个答案:

答案 0 :(得分:1)


from nltk.parse.corenlp import GenericCoreNLPParser

class CoreNLPParser(GenericCoreNLPParser):
    """CoreNLP client that tags pre-tokenized sentences without re-splitting.

    ``tag`` and ``tag_sents`` send ``{'tokenize.whitespace': 'true'}`` when the
    caller supplies no properties, so tokens are separated only at the spaces
    used to join them (a hyphenated token such as ``'09-12-2050'`` is sent as
    one whitespace-delimited word).

    ``self.tagtype`` and ``self.api_call`` are not defined here; they are
    assumed to be provided by ``GenericCoreNLPParser``.
    """

    _OUTPUT_FORMAT = 'penn'
    parser_annotator = 'parse'

    def make_tree(self, result):
        """Build a ``Tree`` from the ``'parse'`` entry of *result*.

        :param result: Mapping holding a bracketed parse string under
            the key ``'parse'``.
        """
        # Imported here because nothing else in this snippet brings ``Tree``
        # into scope.
        from nltk.tree import Tree

        return Tree.fromstring(result['parse'])

    def tag_sents(self, sentences, properties=None):
        """Tag multiple sentences.

        Takes multiple sentences as a list where each sentence is a list of
        tokens.

        :param sentences: Input sentences to tag
        :type sentences: list(list(str))
        :param properties: Properties forwarded to ``raw_tag_sents``. When
            ``None``, ``{'tokenize.whitespace': 'true'}`` is used.
        :type properties: dict or None
        :rtype: list(list(tuple(str, str)))
        """
        # Converting list(list(str)) -> list(str)
        joined = (' '.join(words) for words in sentences)
        if properties is None:
            properties = {'tokenize.whitespace': 'true'}
        # raw_tag_sents yields a list of tagged sentences per input string;
        # only the first one is kept for each input.
        return [tagged[0] for tagged in self.raw_tag_sents(joined, properties)]

    def tag(self, sentence, properties=None):
        """Tag a list of tokens.

        :param sentence: Tokens of a single sentence.
        :type sentence: list(str)
        :param properties: Properties forwarded to ``tag_sents``.
        :type properties: dict or None
        :rtype: list(tuple(str, str))

        >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
        >>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        >>> parser.tag(tokens)
        [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
        ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]

        >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        >>> tokens = "What is the airspeed of an unladen swallow ?".split()
        >>> parser.tag(tokens)
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
        ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
        ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        """
        return self.tag_sents([sentence], properties)[0]

    def raw_tag_sents(self, sentences, properties=None):
        """Tag multiple sentences.

        Takes multiple sentences as a list where each sentence is a string.
        This is a generator: one ``api_call`` is made per input string as the
        result is iterated.

        :param sentences: Input sentences to tag
        :type sentences: list(str)
        :param properties: Extra properties; they override the defaults
            ``ssplit.isOneSentence`` and ``annotators`` when the keys collide.
        :type properties: dict or None
        :rtype: list(list(list(tuple(str, str))))
        """
        default_properties = {
            'ssplit.isOneSentence': 'true',
            'annotators': 'tokenize,ssplit,',
        }

        default_properties.update(properties or {})

        # Supports only 'pos' or 'ner' tags.
        assert self.tagtype in ['pos', 'ner']
        # The tag type doubles as the name of the annotator to request and as
        # the per-token key read from the response below.
        default_properties['annotators'] += self.tagtype
        for sentence in sentences:
            tagged_data = self.api_call(sentence, properties=default_properties)
            yield [
                [(token['word'], token[self.tagtype])
                 for token in tagged_sentence['tokens']]
                for tagged_sentence in tagged_data['sentences']
            ]

# Tagger pointed at http://localhost:9000 and configured for POS tags.
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
# Pre-tokenized input; the hyphenated date is supplied as a single token.
sent = ['My', 'birthday', 'is', 'on', '09-12-2050']


[('My', 'PRP$'), ('birthday', 'NN'), ('is', 'VBZ'), ('on', 'IN'), ('09-12-2050', 'CD')]

