我正在使用BeautifulSoup创建一些东西,直到它返回5个段落之前,它都会进行抓取,但是现在它只能工作到某个点。我不确定问题出在哪里。为什么说“未知的网址类型”?
我的代码的主要部分是
def parse_page(page_url):
count = 0
article = soup.find('div', attrs = {'id':'mw-content-text'})
if article is None:
return
else:
paragraphs = article.find_all('p')
extracted_this_page = []
links_this_page = article.find_all('a', href = re.compile('(/wiki/)+'))
for paragraph in paragraphs:
text = paragraph.text
if user in text:
record = {
'text':text,
}
if record != None:
print(record)
count = count + 1
extracted_this_page.append(record)
return extracted_this_page, count, links_this_page
def call_parse_page(links):
print('Start')
tracker = 0
my_set_count = 0
my_links_this_page = 'my_links_this_page'
for link in links:
url = link['href']
print(url)
if url.startswith('http') == False:
url = 'https://en.wikipedia.org' + url
my_extracted_this_page, my_count, my_links_this_page = parse_page(url)
extracted.append(my_extracted_this_page)
tracker = tracker + 1
my_set_count = my_set_count + my_count
print(my_set_count)
if my_set_count >= 5:
print('Success!')
return extracted
elif my_set_count < 5 and len(links) != tracker:
continue
else:
parse_page(my_links_this_page)
正在运行:
Parsing...
{'text': "At the end of 2012, Flamengo elected Eduardo Bandeira de Mello as club president for three years. The goal of his term was to improve the club's finances after an independent audit assessed Flamengo's debt at R$750\xa0million.[37] After a typical series of managerial changes, Jayme de Almeida was appointed as interim manager during which he fought off relegation and won the 2013 Copa do Brasil final against Atlético Paranaense. It was Flamengo's third Copa title, after 1990 and 2006.\n"}
4
/wiki/Club_Atl%C3%A9tico_River_Plate
错误:
Parsing...
4
Traceback (most recent call last):
File "wikipedia_pt3.py", line 76, in <module>
print(call_parse_page(my_links))
File "wikipedia_pt3.py", line 74, in call_parse_page
parse_page(my_links_this_page)
File "wikipedia_pt3.py", line 20, in parse_page
request = urllib.request.Request(page_url, headers={'User-Agent':user_agent})
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 328, in __init__
self.full_url = url
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 354, in full_url
self._parse()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '[<a href="/wiki/Template:Months" title="Template:Months"><abbr style=";;background:none transparent;border:none;-moz-box-shadow:none;-webkit-box-shadow:none;box-shadow:none; padding:0;" title="View this template">v</abbr></a>, <a href=