我试图通过递归调用all_pages函数获取网站的所有唯一网址,但此功能未提供网站的所有网址。
我想要做的就是使用网页源代码获取网站的所有唯一网址。我的代码如下所示:
// oEmbed endpoint URLs to fetch embed HTML from (Instagram, Twitter, Facebook).
var oEmbedUrls = [
  'https://api.instagram.com/oembed?url=https://www.instagram.com/p/5456544654565/',
  'https://publish.twitter.com/oembed?url=https://twitter.com/coca/status/546664465342324',
  'https://publish.twitter.com/oembed?url=https://twitter.com/muse/status/65353453F',
  'https://api.instagram.com/oembed?url=https://www.instagram.com/p/cftzezeker5/',
  'https://www.facebook.com/plugins/post/oembed.json/?url=https://www.facebook.com/cocacola/posts/fdgyez556Yds'
];
var oEmbedsHtml = [];   // embed HTML snippets, stored in arrival order
var doneCount = 0;      // number of requests that have settled (success or error)
var currentSlideIndex;  // index of the slide currently rendered

// Fire one JSONP request per oEmbed URL. Each successful response appends an
// empty carousel item placeholder and stores its HTML for later rendering.
$.each(oEmbedUrls, function (i, url) {
  $.ajax({
    url: url,
    dataType: "jsonp",
    cache: false,
    success: function (data) {
      // NOTE(review): responses can arrive out of order, so itemIndex
      // reflects arrival order, not the order of oEmbedUrls — confirm
      // that is intended.
      var itemIndex = oEmbedsHtml.length;
      oEmbedsHtml.push(data.html);
      $('#embeds-carousel .carousel-inner').append('<div class="item" id="item' + itemIndex + '"></div>');
      oEmbedUrlResponded();
    },
    error: function () {
      console.warn('oEmbed URL could not be loaded: ', url);
      oEmbedUrlResponded();
    }
  }); // BUG FIX: the original never closed the $.ajax( call, a syntax error
});
// Render the embed at the given position into its carousel item and
// remember it as the currently shown slide.
function renderEmbedSlide(index) {
  var slideHtml = oEmbedsHtml[index];
  currentSlideIndex = index;
  $('#item' + index).html(slideHtml);
}
// Called once per oEmbed request when it settles (success or failure).
// Once every URL has responded we render the first slide; the carousel
// component should be activated at the marked spot below.
function oEmbedUrlResponded() {
  doneCount++;
  // Check if all the URLs have been loaded (successfully or not) when we can now render the carousel and the first slide as well
  // BUG FIX: the original compared against the undefined variable `urls`;
  // the request list is named `oEmbedUrls`. Also use strict equality.
  if (doneCount === oEmbedUrls.length) {
    renderEmbedSlide(0); // Render the first embed
    /*** CALL HERE THE CODE TO ACTIVATE THE CAROUSEL COMPONENT ***/
  }
}
答案 0(得分:1)
使用response.text
代替response.content
此外,您需要在某个时候返回。此外,不是将unique_urls作为列表,而是将其设置为一个集合,它们将始终是唯一的。
另外,你的方法是递归的,python有一个max recursion depth,所以也许你应该这样做:
base_url = "http://www.readings.com.pk/"


def all_pages(base_url):
    """Iteratively crawl the site starting at ``base_url`` and return the
    set of unique URLs discovered.

    Uses sets (not lists) so URLs are automatically de-duplicated, and a
    ``while`` loop instead of recursion so large sites cannot exceed
    Python's maximum recursion depth.
    """
    response = requests.get(base_url)
    unique_urls = {base_url}
    # BUG FIX: `{}` creates an empty dict, not a set — `.add()` would raise
    # AttributeError. An empty set is spelled `set()`.
    visited_urls = set()
    # Keep going while there are discovered-but-unvisited URLs.
    # (BUG FIX: the original `while` header was missing its colon.)
    while len(unique_urls) > len(visited_urls):
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all("a"):
            try:
                url = link["href"]
            except KeyError:
                # <a> tag without an href attribute — nothing to follow.
                continue
            # BUG FIX: plain string concatenation mangles absolute hrefs and
            # root-relative paths; urljoin resolves both correctly.
            unique_urls.add(urljoin(base_url, url))
        # Pick an arbitrary URL we have not visited yet. (The popped URL is
        # already in unique_urls, so re-adding it — as the original did — is
        # unnecessary.)
        unvisited_url = (unique_urls - visited_urls).pop()
        visited_urls.add(unvisited_url)
        # NOTE(review): this also follows off-site links collected above and
        # may re-fetch base_url once — confirm whether the crawl should be
        # restricted to the starting domain.
        response = requests.get(unvisited_url)
    return unique_urls


all_pages(base_url)