在函数get_links中,我正在获取URL的链接。在Scrape函数中,我使用text_from_html函数(不在代码中)获取每个URL的内容。我想将url
和visible_text
附加到两个包含URL和每个URL的visible_text的列表中。在此列表中仅包含一项,而上一项将被替换。我也想保留以前的值。
我得到的输出为:
['https://www.scrapinghub.com']
['https://www.goodreads.com/quotes']
我需要它们在一个列表中。
def get_links(url):
visited_list.append(url)
try:
source_code = requests.get(url)
except Exception:
get_links(fringe.pop(0))
plain_text = source_code.text
soup = BeautifulSoup(plain_text,"lxml")
for link in soup.findAll(re.compile(r'(li|a)')):
href = link.get('href')
if (href is None) or (href in visited_list) or (href in fringe) or (('http://' not in href) and ('https://' not in href)):
continue
else:
subs = href.split('/')[2]
fstr = repr(fringe)
if subs in fstr:
continue
else:
if('blah' in href):
if('www' not in href):
href = href.split(":")[0] + ':' + "//" + "www." + href.split(":")[1][2:]
fringe.append(href)
else:
fringe.append(href)
return fringe
def test(url):
try:
res = requests.get(url)
plain_text = res.text
soup = BeautifulSoup(plain_text,"lxml")
visible_text = text_from_html(plain_text)
URL.append(url)
paragraph.append(visible_text)
except Exception:
print("CHECK the URL {}".format(url))
if __name__ == "__main__":
p = Pool(10)
p.map(test,fringe)
p.terminate()
p.join()