我试图通过输入正确的正则表达式来匹配python中这个doctest的结果。
这是我到目前为止所做的:
import re
import nltk
import collections
nltk.download('punkt')
nltk.download('gutenberg')
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
regexp = re.compile('[0-9]+$')
def annotateOD(listoftokens):
"""Annotate the ordinal numbers in the list of tokens
>>> annotateOD("the second tooth".split())
[('the', ''), ('second', 'OD'), ('tooth', '')]
"""
result = []
for t in listoftokens:
if regexp.match(t):
result.append((t, 'OD'))
else:
result.append((t, ''))
return result
# DO NOT MODIFY THE CODE BELOW
def compute_f1(result, tagged):
assert len(result) == len(tagged) # This is a check that the length of the result and tagged are equal
correct = [result[i][0] for i in range(len(result)) if result[i][1][:2] == 'OD' and tagged[i][1][:2] == 'OD']
numbers_result = [result[i][0] for i in range(len(result)) if result[i][1][:2] == 'OD']
numbers_tagged = [tagged[i][0] for i in range(len(tagged)) if tagged[i][1][:2] == 'OD']
if len(numbers_tagged) > 0:
r = len(correct)/len(numbers_tagged)
else:
r = 0.0
if len(numbers_result) > 0:
p = len(correct)/len(numbers_result)
else:
p = 0.0
return 2*r*p/(r+p)
if __name__ == "__main__":
import doctest
doctest.testmod()
nltk.download('brown')
tagged = nltk.corpus.brown.tagged_words(categories='news')
words = [t for t, w in tagged]
result = annotateOD(words)
f1 = compute_f1(result, tagged)
print("F1 score:", f1)
我的结果是:
Expected :[('the', ''), ('second', 'OD'), ('tooth', '')]
Actual :[('the', ''), ('second', ''), ('tooth', '')]
然后我尝试了另一种方法并尝试了这个正则表达式:
regexp = re.compile('.*(st|nd|rd|th)$')
得到了这个结果:
Expected :[('the', ''), ('second', 'OD'), ('tooth', '')]
Actual :[('the', ''), ('second', 'OD'), ('tooth', 'OD')]
我花了很长时间试图通过doctest,但总有一个不匹配。