NLTK 词性标注器在处理大量文本时标注错误

时间:2019-06-25 15:58:58

标签: python nltk pos-tagger



def process_file(_file, tagger, stemmer, stopwords, filename, printinfo):
    """Collect proper nouns and stemmed word counts from lines of text.

    Args:
        _file: Iterable of text lines (e.g. an open file). It is read but
            not closed here.
        tagger: Object whose ``tag(tokens)`` returns ``(token, tag)`` pairs.
        stemmer: Object whose ``stem(word)`` returns the stemmed word.
        stopwords: Collection of words that are left out of the counts.
        filename: Name shown in the progress line.
        printinfo: Label shown in the progress line.

    Returns:
        A ``(proper_nouns, counts)`` tuple: the set of tokens tagged
        ``NNP``/``NNPS`` and a dict mapping each stemmed word to the number
        of times it occurred.
    """
    # Split every line into sentences and every sentence into tokens.
    # NOTE(review): the body of this loop was missing; word tokenization is
    # assumed because each sentence is handed to tagger.tag(), which works on
    # a list of tokens -- confirm against the intended pipeline.
    sentences = [
        nltk.tokenize.word_tokenize(sentence)
        for line in _file
        for sentence in nltk.tokenize.sent_tokenize(line)
    ]

    _nnp = set()
    words = dict()
    # Build the lookup set once so the membership test is not a list scan.
    stop_set = frozenset(stopwords)
    total = len(sentences)

    # Counting from 1 makes the progress display reach 100% on the last
    # sentence; the loop body never runs when there are no sentences, so the
    # division below cannot hit zero.
    for sent_count, sentence in enumerate(sentences, start=1):
        for token, pos in tagger.tag(sentence):
            if pos in ("NNP", "NNPS"):
                # NOTE(review): proper nouns are assumed to go into the
                # returned set and everything else into the word counts --
                # the returned set was otherwise never populated.
                _nnp.add(token)
            elif token not in stop_set:
                stemmed_word = stemmer.stem(token)
                # First occurrence counts as 1, later ones increment.
                words[stemmed_word] = words.get(stemmed_word, 0) + 1
        print("\r[{0}] Reading file '{1}'[{2:>3.1%}] ".format(printinfo, filename, sent_count / total), end='')
    return _nnp, words


# Aggregated results across all files: stemmed word -> total count, and the
# union of every per-file set returned by process_file.
# NOTE(review): `files` and `start` are not defined in this section; both
# must already be set before this code runs.
dictionary = dict()
nnp = set()

# Initialize tagger and stemmer
pos_tagger = nltk.tag.PerceptronTagger()
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
file_count = 0

# For each file, extract words and named entities
for file_count, file in enumerate(files, start=1):
    # The context manager closes each file as soon as it has been processed.
    with open(file, 'r', encoding='utf-8') as _file:
        _nnp, _dictionary = process_file(_file, pos_tagger, ps, stopwords, file, str(file_count)+"/"+str(len(files)))

    # Extend dictionary: add this file's counts to the running totals,
    # inserting words that have not been seen before.
    for word, count in _dictionary.items():
        dictionary[word] = dictionary.get(word, 0) + count

    # Join sets
    nnp = nnp.union(_nnp)
end = time.time()
print("COMPLETED: Step 2 completed in {0:.3f}s".format(end-start))


In slow motion, afraid of what he was about to witness, Langdon rotated the fax 180 degrees. He looked at the word upside down.

Instantly, the breath went out of him. It was like he had been hit by a truck. Barely able to believe his eyes, he rotated the fax again, reading the brand right-side up and then upside down.

"Illuminati," he whispered.

0 个答案:
