当我调用以下函数时
def get_words(self):
    """Collect the cleaned word groups of every non-separator block.

    Reads the already-parsed document from ``self.soup``; if that attribute
    has not been assigned, the first statement raises ``AttributeError``.

    Returns:
        A list with one cleaned result per block, leaving out blocks whose
        cleaned result is ``[[]]`` or ``[]``.
    """
    blocks = self.soup.find_all("block", {"blockType": lambda x: x not in ('Separator', 'SeparatorsBox')})
    collected = []
    for block in blocks:
        # Table blocks have their own extractor; every other block type is
        # handled by the text extractor.
        if block['blockType'] == 'Table':
            extract = self._get_words_from_block_table
        else:
            extract = self._get_words_from_block_text
        raw_words = extract(block)
        words = self._cleanup_word(raw_words)
        has_content = words != [[]] and words != []
        if has_content:
            collected.append(words)
    return collected
我收到以下错误
in get_words
blocks = self.soup.find_all("block", {"blockType": lambda x: x not in ('Separator', 'SeparatorsBox')})
AttributeError: 'AbbyExtractor' object has no attribute 'soup'
错误指向函数体的第一行:
blocks = self.soup.find_all("block", {"blockType": lambda x: x not in ('Separator', 'SeparatorsBox')})
出了什么问题?
答案 0 :(得分:1)
您需要先创建 `soup` 对象。将从网页检索到的 HTML 代码作为参数传递给 `get_words` 方法,在方法内部用它构建 `soup`,然后再完成其余的处理。
def get_words(self, html):
    """Parse *html* and return the cleaned word groups of each content block.

    The markup is parsed into ``self.soup`` at the start of the call, so the
    method works even when that attribute was never set on the instance.

    Args:
        html: Markup text containing ``<block blockType="...">`` elements.

    Returns:
        A list with one cleaned result per block, leaving out blocks whose
        cleaned result is ``[[]]`` or ``[]``.
    """
    # Parse as XML: Beautiful Soup's HTML parsers (including "lxml") lowercase
    # attribute names, which would turn `blockType` into `blocktype` and break
    # both the filter and the lookup below. The "xml" parser keeps the case
    # and is provided by the same lxml package.
    self.soup = BeautifulSoup(html, "xml")
    blocks = self.soup.find_all(
        "block",
        {"blockType": lambda x: x not in ('Separator', 'SeparatorsBox')},
    )
    wrds_blcks = []
    for block in blocks:
        # .get() rather than [] so that a block without the attribute is
        # handled as text instead of raising KeyError.
        if block.get('blockType') == 'Table':
            rslt = self._get_words_from_block_table(block)
        else:
            rslt = self._get_words_from_block_text(block)
        rslt = self._cleanup_word(rslt)
        # Skip blocks that yielded no words after cleanup.
        if rslt != [[]] and rslt != []:
            wrds_blcks.append(rslt)
    return wrds_blcks