I'm getting the error "cannot use a string pattern on a bytes-like object". I tried converting the text to bytes with b, but I ran into the same error. Should I try using the .decode method in my code instead? What modification to the code would correct this error? My code is below:
import re
import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")  # the stemmer used below (defined earlier in my notebook)

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

totalvocab_stemmed = []
totalvocab_tokenized = []
for i in text:
    i = i.encode('ascii', errors='ignore')  # str.encode returns a bytes object
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)
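For what it's worth, I can reproduce what looks like the same error with a minimal snippet, which makes me think the bytes object returned by encode is the trigger:

import re

# a str pattern applied to a bytes object raises the same TypeError
re.search('[a-zA-Z]', b'hello')
# TypeError: cannot use a string pattern on a bytes-like object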
Error traceback:
TypeError                                 Traceback (most recent call last)
<ipython-input-45-2592662d1420> in <module>()
      3 for i in text:
      4     i = i.encode('ascii', errors='ignore')
----> 5     allwords_stemmed = tokenize_and_stem(i)
      6     totalvocab_stemmed.extend(allwords_stemmed)
      7

<ipython-input-44-51638fce7bd8> in tokenize_and_stem(text)
      1 def tokenize_and_stem(text):
      2     # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
----> 3     tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
      4     filtered_tokens = []
      5     # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in sent_tokenize(text, language)
     95     """
     96     tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
---> 97     return tokenizer.tokenize(text)
     98
     99 # Standard word tokenizer.

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in tokenize(self, text, realign_boundaries)
   1233         Given a text, returns a list of the sentences in that text.
   1234         """
-> 1235         return list(self.sentences_from_text(text, realign_boundaries))
   1236
   1237     def debug_decisions(self, text):

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in sentences_from_text(self, text, realign_boundaries)
   1281         follows the period.
   1282         """
-> 1283         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1284
   1285     def _slices_from_text(self, text):

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in span_tokenize(self, text, realign_boundaries)
   1272         if realign_boundaries:
   1273             slices = self._realign_boundaries(text, slices)
-> 1274         return [(sl.start, sl.stop) for sl in slices]
   1275
   1276     def sentences_from_text(self, text, realign_boundaries=True):

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in <listcomp>(.0)
   1272         if realign_boundaries:
   1273             slices = self._realign_boundaries(text, slices)
-> 1274         return [(sl.start, sl.stop) for sl in slices]
   1275
   1276     def sentences_from_text(self, text, realign_boundaries=True):

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _realign_boundaries(self, text, slices)
   1312         """
   1313         realign = 0
-> 1314         for sl1, sl2 in _pair_iter(slices):
   1315             sl1 = slice(sl1.start + realign, sl1.stop)
   1316             if not sl2:

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _pair_iter(it)
    310     """
    311     it = iter(it)
--> 312     prev = next(it)
    313     for el in it:
    314         yield (prev, el)

C:\Users\mike\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _slices_from_text(self, text)
   1285     def _slices_from_text(self, text):
   1286         last_break = 0
-> 1287         for match in self._lang_vars.period_context_re().finditer(text):
   1288             context = match.group() + match.group('after_tok')
   1289             if self.text_contains_sentbreak(context):

TypeError: cannot use a string pattern on a bytes-like object
The full traceback is included above.
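In case it clarifies what I'm asking: this is the .decode variant I was considering, where I encode to ASCII to drop non-ASCII characters and then immediately decode back so the tokenizers receive str rather than bytes. Is this the right fix?

totalvocab_stemmed = []
totalvocab_tokenized = []
for i in text:
    # encode to drop non-ASCII characters, then decode back to str,
    # since the NLTK tokenizers expect a string rather than bytes
    i = i.encode('ascii', errors='ignore').decode('ascii')
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)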