Question

我正在编写以下两个函数：

def sent_transform(sent_string):
    """Tokenize a sentence and normalize each token.

    Every token is lowercased and stemmed; a stemmed token that consists
    only of digits once ',', '.' and '-' are removed is replaced by the
    placeholder 'num'. Returns the list of normalized tokens.
    """
    stem = PorterStemmer().stem
    # Deletion table for the separators that may appear inside a number.
    strip_separators = {ord(ch): None for ch in ",.-"}
    normalized = []
    for raw in word_tokenize(sent_string):
        stemmed = stem(raw.lower())
        if stemmed.translate(strip_separators).isdigit():
            normalized.append('num')
        else:
            normalized.append(stemmed)
    return normalized

def make_ngram_tuples(samples):
    """Pair every token after the first with its one-token left context.

    Returns the result of ``map`` over positions 1 .. len(samples) - 1.
    Each item has the form ``((previous_token,), token)``: the context is
    a 1-tuple, not a plain string.
    """
    def bigram_at(pos):
        history = (samples[pos - 1],)
        return (history, samples[pos])

    positions = range(1, len(samples))
    return map(bigram_at, positions)

# Bigram language model built from a training string. Words that occur only
# once are replaced by '<unk>', and log-probabilities are computed as
# (bigram count + 1) / (context count + vocabulary size).
# NOTE(review): the print statements below are Python 2 syntax.
class BiGramModel:
    # trainfiles: despite the name, this value is used directly as the
    # training text (the regexes below run on the string itself).
    def __init__(self, trainfiles):
        self.trainfiles=trainfiles
        # Count every \w+ word in the training text.
        wordlist = re.findall(r'\b\w+\b', self.trainfiles)
        self.wordlist = Counter(wordlist)
        self.list1=[]
        for word, count in self.wordlist.items():
            if count == 1:
                # Singleton word: rewrite it to '<unk>' in the stored text too.
                self.trainfiles = re.sub(r'\b{}\b'.format(word), '<unk>', self.trainfiles)
                self.list1.append('<unk>')
            else:
                self.list1.append(word)
        # Vocabulary: the distinct entries of list1 (all singletons collapse to one '<unk>').
        self.l2=list(set(self.list1))
        print self.l2
        print self.trainfiles
        # "w1 w2" strings for each pair of adjacent words inside every '.'-separated segment.
        self.m = [' '.join(i) for s2 in self.trainfiles.split('.') for i in zip(s2.split(),s2.split()[1:])]
        print self.m


   # NOTE(review): this def is indented 3 spaces while __init__ uses 4, so the
   # indentation levels do not match -- presumably a paste artifact.
   #
   # Returns the base-2 log of (c1 + 1) / (c2 + v), where c1 is the number of
   # entries in self.m equal to "context event", c2 is the substring count of
   # context in the training text and v is the vocabulary size.
   # The value is also stored on self.prob.
   def logprob(self,context,event):
        # NOTE(review): getppl passes p[0] from make_ngram_tuples, which is a
        # 1-tuple, so this concatenation is the line reported in the TypeError
        # traceback. The local names `str` and `list` shadow the builtins.
        str = context + " " + event
        # `list` is filled here but never read afterwards.
        list=[]
        list.append(str)
        c1 = self.m.count(str)
        v=len(self.l2)

        c2=self.trainfiles.count(context)
        c3=(c1+1.0)/(c2+v)
        self.prob=math.log(c3,2)
        return self.prob

   # Reads testfile, splits its text into sentences, sums logprob over every
   # (context, word) pair produced by make_ngram_tuples and returns
   # 2 ** (-sum / number_of_pairs).
   def getppl(self, testfile):
        ppl = 0.0
        t = 0.0
        with open(testfile) as f:
            text = f.read()

        # Split after '.', '?' or '!' plus any closing quotes/brackets and surrounding spaces.
        sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)
        print sentences
        for sentence in sentences:
            tokens = sent_transform(sentence)
            l = make_ngram_tuples(tokens)
            for p in l:
                t += 1
                ppl += self.logprob(p[0], p[1])
        # NOTE(review): t is still 0.0 when no pair was produced, so this division would raise ZeroDivisionError.
        return math.pow(2, -ppl/t)

   def main():
       # Train on a tiny inline corpus, then print the perplexity of the example file.
       model = BiGramModel("STOP to be or not to be STOP")
       perplexity = model.getppl("/Users/abc/Desktop/example.txt")
       print(perplexity)
   main()

sent_transform 对句子进行分词和词干提取，并用 'num' 替换数字；make_ngram_tuples 再把得到的 tokens 组合成 n-gram（这里是二元组）。但是当我运行它时，出现了以下错误：

编辑：现在出现此错误：

Traceback (most recent call last):
  File "/Users/abc/Downloads/stanford-corenlp-2012-07-09/code.py", line 133, in <module>
    main()
  File "/Users/abc/Downloads/stanford-corenlp-2012-07-09/code.py", line 131, in main
    l = bi.getppl("/Users/abc/Desktop/example.txt")
  File "/Users/abc/Downloads/stanford-corenlp-2012-07-09/code.py", line 121, in getppl
    ppl += self.logprob(p[0], p[1])
  File "/Users/abc/Downloads/stanford-corenlp-2012-07-09/code.py", line 88, in logprob
    str = context + " " + event
TypeError: can only concatenate tuple (not "str") to tuple

如何解决这个问题？谢谢！

Answer 1

sent_transform 应该返回（return）tokens，而不是把它们 print 出来。

def sent_transform(sent_string):
    """Return the normalized tokens of a sentence.

    Tokens are lowercased and stemmed; any stemmed token made up solely of
    digits after dropping ',', '.' and '-' becomes 'num'.
    """
    stemmer = PorterStemmer()
    separators = {ord(c): None for c in ",.-"}

    def normalize(token):
        stemmed = stemmer.stem(token.lower())
        return 'num' if stemmed.translate(separators).isdigit() else stemmed

    return [normalize(token) for token in word_tokenize(sent_string)]

按照问题中原先的写法（只 print 而没有 return），sent_transform 将始终返回 None ¹，这意味着在 getppl 中 tokens 的值是 None。这个 None 随后作为 samples 传给 make_ngram_tuples，于是在尝试获取 samples 的长度时就会出现您看到的异常。

¹ 没有返回值的函数总是返回 None。

获取TypeError：只能将元组（不是“str”）连接到元组。怎么解决？

1 个答案: