我正在尝试使用selenium来抓取动态javascript加载的网页,但是我想要抓取的表实际上从未在源中加载而只是显示javascript。我试图运行的代码如下:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from bs4 import BeautifulSoup
url = ""
browser = webdriver.Firefox(executable_path=r'/Users/brendanbernstein/Downloads/geckodriver')
browser.get(url)
WebDriverWait(browser, 10).until(
EC.visibility_of_element_located((By.ID, "maintable")))
html_page = browser.page_source
browser.quit()
soup = BeautifulSoup(html_page)
不幸的是,即使使用selenium,javascript也没有执行。执行并生成表格的脚本我认为如下所示,并且是我在扫描的源代码中看到的所有内容:
<script type="text/javascript">
var hash = window.location.hash.substring(1);
$(document).ready(function () {
if (hash != '') {
activaTab(hash);
} else {
loadIframeSource3();
};
//$('i').tooltip({ placement: 'bottom', trigger: 'manual' }).tooltip('show');
});
function activaTab(tab) {
if (tab.indexOf('comment') >= 0) {
tab = 'comments';
loaddisqus();
} else if (tab.indexOf('readContract') >= 0) {
loadIframeSource();
} else if (tab.indexOf('balances') >= 0) {
loadIframeSource2();
};
$('.nav-tabs a[href="#' + tab + '"]').tab('show');
};
function updatehash(strhash) {
if (strhash == '') {
history.replaceState("", document.title, window.location.pathname);
} else {
var baseUrl = window.location.href.split('#')[0];
history.replaceState("", document.title, baseUrl + '#' + strhash);
}
}
var disqusloaded = false;
function loaddisqus() {
if (disqusloaded == false) {
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
disqusloaded = true;
}
updatehash('comments');
}
var readContractLoaded = false;
function loadIframeSource() {
if (readContractLoaded == false) {
readContractLoaded = true;
document.getElementById('readcontractiframe').src = '/readContract?a=0xe94327d07fc17907b4db788e5adf2ed424addff6&v=0xe94327d07fc17907b4db788e5adf2ed424addff6';
}
}
var token_holders_loaded = false;
function loadIframeSource2() {
if (token_holders_loaded == false) {
token_holders_loaded = true;
document.getElementById('tokeholdersiframe').src = '/token/generic-tokenholders2?a=0xe94327d07fc17907b4db788e5adf2ed424addff6&s=11000000000000000000000000';
}
}
var token_transactions_loaded = false;
function loadIframeSource3() {
if (token_transactions_loaded == false) {
token_transactions_loaded = true;
document.getElementById('tokentxnsiframe').src = '/token/generic-tokentxns2?contractAddress=0xe94327d07fc17907b4db788e5adf2ed424addff6&a=&mode=';
}
}
</script>
有什么建议吗?