我正在编写以下两个函数:
def sent_transform(sent_string):
stemmer = PorterStemmer()
tokens = word_tokenize(sent_string)
tokens = [stemmer.stem(token.lower()) for token in tokens]
tokens = ['num' if token.translate({ord(c): None for c in ",.-"}).isdigit() else token for token in tokens]
return tokens
def make_ngram_tuples(samples):
def get_ngram(i):
if 2 > 1:
return (tuple([samples[i-j] for j in range(2-1,0,-1)]),samples[i])
return map(get_ngram,range(2-1,len(samples)))
class BiGramModel:
def __init__(self, trainfiles):
self.trainfiles=trainfiles
wordlist = re.findall(r'\b\w+\b', self.trainfiles)
self.wordlist = Counter(wordlist)
self.list1=[]
for word, count in self.wordlist.items():
if count == 1:
self.trainfiles = re.sub(r'\b{}\b'.format(word), '<unk>', self.trainfiles)
self.list1.append('<unk>')
else:
self.list1.append(word)
self.l2=list(set(self.list1))
print self.l2
print self.trainfiles
self.m = [' '.join(i) for s2 in self.trainfiles.split('.') for i in zip(s2.split(),s2.split()[1:])]
print self.m
def logprob(self,context,event):
str = context + " " + event
list=[]
list.append(str)
c1 = self.m.count(str)
v=len(self.l2)
c2=self.trainfiles.count(context)
c3=(c1+1.0)/(c2+v)
self.prob=math.log(c3,2)
return self.prob
def getppl(self, testfile):
ppl = 0.0
t = 0.0
with open(testfile) as f:
text = f.read()
sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)
print sentences
for sentence in sentences:
tokens = sent_transform(sentence)
l = make_ngram_tuples(tokens)
for p in l:
t += 1
ppl += self.logprob(p[0], p[1])
return math.pow(2, -ppl/t)
def main():
bi=BiGramModel("STOP to be or not to be STOP")
l = bi.getppl("/Users/abc/Desktop/example.txt")
print l
main()
sent_transform用句子来代替句子,用make替换数字,而make_ngram_tuples用它来克,但是当我运行它时,我得到以下错误:
编辑:现在出现此错误:
Traceback (most recent call last):
File "/Users/abc/Downloads/stanford-corenlp-2012-07-09/code.py", line 133, in <module>
main()
File "/Users/abc/Downloads/stanford-corenlp-2012-07-09/code.py", line 131, in main
l = bi.getppl("/Users/abc/Desktop/example.txt")
File "/Users/abc/Downloads/stanford-corenlp-2012-07-09/code.py", line 121, in getppl
ppl += self.logprob(p[0], p[1])
File "/Users/abc/Downloads/stanford-corenlp-2012-07-09/code.py", line 88, in logprob
str = context + " " + event
TypeError: can only concatenate tuple (not "str") to tuple
如何解决这个问题? 谢谢!
答案 0 :(得分:1)
sent_transform
应该返回令牌,而不是print
令牌。
def sent_transform(sent_string):
stemmer = PorterStemmer()
tokens = word_tokenize(sent_string)
tokens = [stemmer.stem(token.lower()) for token in tokens]
tokens = ['num' if token.translate({ord(c): None for c in ",.-"}).isdigit() else token for token in tokens]
return tokens
如上所述,sent_transform
将始终返回 None
1 ,这意味着tokens
在None
中getppl
{1}}。这将make_ngram_tuples
传递给samples
,这会导致您在尝试获取samples
的长度时看到的例外。
1 没有返回值的函数总是返回None
。