Hi, I'm trying to use nltk to tokenize some text and generate POS tags, but despite importing nltk I still get an error:
import nltk
from bs4 import BeautifulSoup
# web is the HTTP response fetched earlier (e.g. with requests)

bs = BeautifulSoup(web.text, 'html.parser')
print(bs)
tokes = nltk.word_tokenize(bs)   # <-- raises the TypeError below
tags = nltk.pos_tag(tokes)
TypeError Traceback (most recent call last)
<ipython-input-71-f1434047d3f5> in <module>()
1 bs=BeautifulSoup(web.text, 'html.parser')
2 print (bs)
----> 3 tokes=nltk.word_tokenize (bs)
4 tags= nltk.pos_tag(tokes)
5 tags
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in word_tokenize(text, language)
104 :param language: the model name in the Punkt corpus
105 """
--> 106 return [token for sent in sent_tokenize(text, language)
107 for token in _treebank_word_tokenize(sent)]
108
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in sent_tokenize(text, language)
89 """
90 tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
---> 91 return tokenizer.tokenize(text)
92
93 # Standard word tokenizer.
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in tokenize(self, text, realign_boundaries)
1224 Given a text, returns a list of the sentences in that text.
1225 """
-> 1226 return list(self.sentences_from_text(text, realign_boundaries))
1227
1228 def debug_decisions(self, text):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in sentences_from_text(self, text, realign_boundaries)
1272 follows the period.
1273 """
-> 1274 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1275
1276 def _slices_from_text(self, text):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in span_tokenize(self, text, realign_boundaries)
1263 if realign_boundaries:
1264 slices = self._realign_boundaries(text, slices)
-> 1265 return [(sl.start, sl.stop) for sl in slices]
1266
1267 def sentences_from_text(self, text, realign_boundaries=True):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in <listcomp>(.0)
1263 if realign_boundaries:
1264 slices = self._realign_boundaries(text, slices)
-> 1265 return [(sl.start, sl.stop) for sl in slices]
1266
1267 def sentences_from_text(self, text, realign_boundaries=True):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _realign_boundaries(self, text, slices)
1302 """
1303 realign = 0
-> 1304 for sl1, sl2 in _pair_iter(slices):
1305 sl1 = slice(sl1.start + realign, sl1.stop)
1306 if not sl2:
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _pair_iter(it)
308 """
309 it = iter(it)
--> 310 prev = next(it)
311 for el in it:
312 yield (prev, el)
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _slices_from_text(self, text)
1276 def _slices_from_text(self, text):
1277 last_break = 0
-> 1278 for match in self._lang_vars.period_context_re().finditer(text):
1279 context = match.group() + match.group('after_tok')
1280 if self.text_contains_sentbreak(context):
TypeError: expected string or bytes-like object
Can anyone help me figure out where exactly my syntax is going wrong?
Answer (score: 1)
You are passing the whole BeautifulSoup object bs to the tokenizer, but nltk.word_tokenize expects a string. Pass bs.text (the plain text extracted from the parsed page) to the tokenize function instead.