我有一个使用selenium和beautifulsoup的网络抓取python脚本,并使用apscheduler定期调度Python代码以提取信息。提取的信息先存入JSON文件,再传递到html文件并通过AJAX显示。能否让python脚本持续自动执行,而无需我手动运行它?例如,我的最终目标是托管这个网页,并希望网页抓取工具能够在后台不断运行。
我创建了python脚本,该脚本从站点收集信息,并且目前我在网页上显示所有信息。每当我想更新信息时,我都必须手动执行python脚本。
# grabs all the trending quotes for that day
def getTrendingQuotes(browser):
    """Return the href of every trending-quote link on the current page.

    Waits up to 10 seconds for the links to be present before reading them.
    """
    # wait until trending links appear, not really needed only for example
    trending_links = WebDriverWait(browser, 10).until(
        lambda drv: drv.find_elements_by_css_selector('#trendingQuotes a')
    )
    hrefs = []
    for anchor in trending_links:
        hrefs.append(anchor.get_attribute('href'))
    return hrefs
def getStockDetails(url, browser):
    """Visit one quote page, print its details, and queue them for JSON output."""
    print(url)
    browser.get(url)
    # All three fields live inside the quote-wrapper container.
    wrapper = browser.find_element_by_css_selector('div.quote-wrapper')
    name_el = wrapper.find_element_by_class_name("quote-name")
    name = name_el.find_element_by_tag_name('h2').text
    price = wrapper.find_element_by_class_name("quote-price").text
    volume = wrapper.find_element_by_class_name("quote-volume").text
    print("\n")
    print("Quote Name: " + name)
    print("Quote Price: " + price)
    print("Quote Volume: " + volume)
    print("\n")
    # Hand the record off to the shared in-memory accumulator.
    convertToJson(name, price, volume, url)
# Accumulates one dict per scraped quote; flushed to disk by trendingBot().
quotesArr = []

# Convert to a JSON file
def convertToJson(quote_name, quote_price, quote_volume, url):
    """Append one quote record (url, name, price, volume) to quotesArr."""
    quotesArr.append({
        "url": url,
        "Name": quote_name,
        "Price": quote_price,
        "Volume": quote_volume,
    })
def trendingBot(url, browser):
    """One scheduled crawl: scrape every trending quote, then dump de-duplicated JSON."""
    browser.get(url)
    for link in getTrendingQuotes(browser):
        getStockDetails(link, browser)
    # requests finished, write json to file
    # REMOVE ANY DUPLICATE url from the list, then write json to file.
    # Later records win: each url keeps only its most recent entry.
    unique_by_url = {}
    for record in quotesArr:
        unique_by_url[record['url']] = record
    deduped = list(unique_by_url.values())
    with open('trendingQuoteData.json', 'w') as outfile:
        json.dump(deduped, outfile)
def Main():
    """Launch a headless Chrome session and run trendingBot on an hourly schedule.

    Blocks until the scheduler is shut down (e.g. Ctrl+C); the browser is
    always cleaned up on exit.
    """
    scheduler = BlockingScheduler()
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # applicable to windows os only
    chrome_options.add_argument('--disable-gpu')
    url = 'https://www.tmxmoney.com/en/index.html'
    browser = webdriver.Chrome(chrome_options=chrome_options)
    browser.get(url)
    os.system('cls')  # clear console; Windows-only command (harmless failure elsewhere)
    print("[+] Success! Bot Starting!")
    # Run once immediately (next_run_time=now), then every hour,
    # reusing the same browser session for each run.
    scheduler.add_job(trendingBot, 'interval', hours=1,
                      next_run_time=datetime.now(), args=[url, browser])
    try:
        # BlockingScheduler.start() blocks here until shutdown; on Ctrl+C it
        # raises KeyboardInterrupt.
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass
    finally:
        # BUG FIX: the original called browser.quit() after start(), which was
        # unreachable on Ctrl+C and leaked the Chrome process. A finally block
        # guarantees cleanup on every exit path.
        browser.quit()


if __name__ == "__main__":
    Main()
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<link rel="stylesheet" href="css/main.css" />
<title>Document</title>
</head>
<body>
<header id="main-header">
<div class="menu-branding"></div>
<nav class="menu">
<ul class="menu-nav">
<li class="nav-item current">
<a href="index.html" class="nav-link">Home</a>
</li>
<li class="nav-item">
<a href="about.html" class="nav-link">About</a>
</li>
<li class="nav-item">
<a href="contact.html" class="nav-link">Contact</a>
</li>
</ul>
</nav>
</header>
<main id="main-quote">
<h1>Hello!</h1>
<ul id="quotes"></ul>
</main>
<!-- footer -->
<footer id="main-footer">
Copyright ©
<a href="https://github.com/pennyfea/Web_Crawler">Designed & Built by 2018</a
>
</footer>
</body>
<script>
var xhttp = new XMLHttpRequest();
xhttp.onreadystatechange = function() {
if (this.readyState == 4 && this.status == 200) {
// Typical action to be performed when the document is ready:
var response = JSON.parse(xhttp.responseText);
var output = " ";
for (var i = 0; i < response.length; i++) {
output += "<li>" + response[i].Name + ": " + response[i].Price;
("</li>");
}
document.getElementById("quotes").innerHTML = output;
}
};
xhttp.open("GET", "trendingQuoteData.json", true);
xhttp.send();
</script>
</html>