TypeError: cannot use a string pattern on a bytes-like object

Time: 2017-09-28 16:27:03

Tags: python jupyter-notebook tokenize

I am getting the error "cannot use a string pattern on a bytes-like object". I tried converting the text to bytes with b, but I ran into the same error. Should I be using the .decode method somewhere in my code instead? Is there a modification to the code that would fix this error?
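For reference, here is a minimal illustration (not my actual code) of what I understand the error to mean: in Python 3, a str regex pattern cannot be matched against a bytes object.

import re

# Matching a str pattern against a str works:
print(re.search('[a-zA-Z]', 'hello'))

# Matching the same str pattern against bytes raises the error from my traceback:
try:
    re.search('[a-zA-Z]', b'hello')
except TypeError as e:
    print(e)  # cannot use a string pattern on a bytes-like object

My actual functions are below.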

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
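Just to show how I intend these helpers to be used, here is a sketch (not my actual notebook setup; I am assuming an NLTK stemmer such as SnowballStemmer for the stemmer variable) where they are called on plain str input:

import re
import nltk
from nltk.stem.snowball import SnowballStemmer

# assumption: any NLTK stemmer works here; my notebook defines a variable named `stemmer`
stemmer = SnowballStemmer('english')

sample = "Here is a sentence. And another one!"
print(tokenize_only(sample))       # lowercased word tokens that contain letters
print(tokenize_and_stem(sample))   # the same letter-containing tokens, run through the stemmer

The error happens in the loop below, where I strip non-ASCII characters before tokenizing.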

totalvocab_stemmed = []
totalvocab_tokenized = []
for i in text:
    i = i.encode('ascii', errors='ignore')
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)

    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)
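Concretely, is something like the following the right change (just my guess): decoding back to str after stripping the non-ASCII characters, so that nltk.sent_tokenize receives a str instead of bytes?

totalvocab_stemmed = []
totalvocab_tokenized = []
for i in text:
    # strip non-ASCII characters, but decode back to str so the tokenizers
    # receive a str rather than a bytes object
    i = i.encode('ascii', errors='ignore').decode('ascii')
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)

    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)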

Error traceback:

TypeError                                 Traceback (most recent call last)
<ipython-input-45-2592662d1420> in <module>()
      3 for i in text:
      4     i = i.encode('ascii', errors='ignore')
----> 5     allwords_stemmed = tokenize_and_stem(i)
      6     totalvocab_stemmed.extend(allwords_stemmed)
      7 
<ipython-input-44-51638fce7bd8> in tokenize_and_stem(text)
      1 def tokenize_and_stem(text):
      2     # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
----> 3     tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
      4     filtered_tokens = []
      5     # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in sent_tokenize(text, language)
     95     """
     96     tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
---> 97     return tokenizer.tokenize(text)
     98 
     99 # Standard word tokenizer.

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in tokenize(self, text, realign_boundaries)
   1233         Given a text, returns a list of the sentences in that text.
   1234         """
-> 1235         return list(self.sentences_from_text(text, realign_boundaries))
   1236 
   1237     def debug_decisions(self, text):

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in sentences_from_text(self, text, realign_boundaries)
   1281         follows the period.
   1282         """
-> 1283         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1284 
   1285     def _slices_from_text(self, text):

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in span_tokenize(self, text, realign_boundaries)
   1272         if realign_boundaries:
   1273             slices = self._realign_boundaries(text, slices)
-> 1274         return [(sl.start, sl.stop) for sl in slices]
   1275 
   1276     def sentences_from_text(self, text, realign_boundaries=True):

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in <listcomp>(.0)
   1272         if realign_boundaries:
   1273             slices = self._realign_boundaries(text, slices)
-> 1274         return [(sl.start, sl.stop) for sl in slices]
   1275 
   1276     def sentences_from_text(self, text, realign_boundaries=True):

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _realign_boundaries(self, text, slices)
   1312         """
   1313         realign = 0
-> 1314         for sl1, sl2 in _pair_iter(slices):
   1315             sl1 = slice(sl1.start + realign, sl1.stop)
   1316             if not sl2:

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _pair_iter(it)
    310     """
    311     it = iter(it)
--> 312     prev = next(it)
    313     for el in it:
    314         yield (prev, el)

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _slices_from_text(self, text)
   1285     def _slices_from_text(self, text):
   1286         last_break = 0
-> 1287         for match in self._lang_vars.period_context_re().finditer(text):
   1288             context = match.group() + match.group('after_tok')
   1289             if self.text_contains_sentbreak(context):

TypeError: cannot use a string pattern on a bytes-like object

The full traceback is included above.

0 Answers:

There are no answers yet.