如何在网页上连续调用抓取网络的python脚本?Flask是执行此操作的方法吗?

时间:2019-01-01 22:17:34

标签: python html web-scraping

我有一个使用selenium和beautifulsoup的网络抓取python脚本,我使用apscheduler定期安排Python代码以提取信息。我将提取的信息存储在JSON文件中,然后将其传递到html文件并使用AJAX显示。是否可以始终执行python脚本,而无需我手动执行它?例如,我的最终目标是托管网页,我希望网页抓取工具能够不断执行。

我创建了python脚本,该脚本从站点收集信息,并且目前我在网页上显示所有信息。每当我想更新信息时,我都必须手动执行python脚本。

# grabs all the trending quotes for that day
# grabs all the trending quotes for that day
def getTrendingQuotes(browser):
    """Collect the href of every trending-quote link on the current page.

    Waits up to 10 seconds for the '#trendingQuotes a' anchors to appear,
    then returns their 'href' attributes as a list of URL strings.
    """
    def links_present(driver):
        # find_elements returns [] (falsy) until the anchors exist,
        # so WebDriverWait keeps polling until at least one shows up
        return driver.find_elements_by_css_selector('#trendingQuotes a')

    anchors = WebDriverWait(browser, 10).until(links_present)
    return [anchor.get_attribute('href') for anchor in anchors]


def getStockDetails(url, browser):
    """Navigate to a quote page, scrape name/price/volume, and record them.

    Prints the scraped fields for visibility, then hands them to
    convertToJson() which appends the record to the module-level list.
    """
    print(url)
    browser.get(url)

    wrapper = browser.find_element_by_css_selector('div.quote-wrapper')

    def field(class_name):
        # each quote field lives in its own classed child of the wrapper
        return wrapper.find_element_by_class_name(class_name)

    quote_name = field("quote-name").find_element_by_tag_name('h2').text
    quote_price = field("quote-price").text
    quote_volume = field("quote-volume").text

    print("\n")
    print("Quote Name: " + quote_name)
    print("Quote Price: " + quote_price)
    print("Quote Volume: " + quote_volume)
    print("\n")

    convertToJson(quote_name, quote_price, quote_volume, url)


# Accumulates one record per scraped quote; trendingBot() de-duplicates
# these by url before flushing them to trendingQuoteData.json.
quotesArr = []
# Convert to a JSON  file


def convertToJson(quote_name, quote_price, quote_volume, url):
    """Append one quote record (url/Name/Price/Volume) to quotesArr."""
    quotesArr.append({
        "url": url,
        "Name": quote_name,
        "Price": quote_price,
        "Volume": quote_volume,
    })


def trendingBot(url, browser):
    """One scheduled scrape pass: visit the index page, scrape every
    trending quote, and write the de-duplicated results to JSON.

    Bug fix: quotesArr is module-level and this function runs every hour
    via APScheduler; without clearing it first, quotes that stopped
    trending in earlier runs would linger in the output file forever.
    """
    # start each run from a clean slate (see docstring)
    del quotesArr[:]

    browser.get(url)
    for trend_url in getTrendingQuotes(browser):
        getStockDetails(trend_url, browser)

    # REMOVE ANY DUPLICATE url from the list (the dict keeps the last
    # record per url), then write json to file.
    unique_quotes = list({quote['url']: quote for quote in quotesArr}.values())

    with open('trendingQuoteData.json', 'w') as outfile:
        json.dump(unique_quotes, outfile)


def Main():
    """Start a headless Chrome session and scrape trending quotes hourly.

    Bug fix: BlockingScheduler.start() blocks the calling thread until the
    process is interrupted, so in the original browser.quit() sat after
    start() and was never reached -- Chrome/chromedriver leaked on Ctrl-C.
    The quit now lives in a finally block so it always runs.
    """
    scheduler = BlockingScheduler()
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # applicable to windows os only
    chrome_options.add_argument('--disable-gpu')

    url = 'https://www.tmxmoney.com/en/index.html'
    browser = webdriver.Chrome(chrome_options=chrome_options)

    browser.get(url)

    os.system('cls')
    print("[+] Success! Bot Starting!")
    try:
        # next_run_time=now fires the first scrape immediately instead of
        # waiting a full hour; later runs follow the 1-hour interval
        scheduler.add_job(trendingBot, 'interval', hours=1,
                          next_run_time=datetime.now(), args=[url, browser])
        scheduler.start()  # blocks until KeyboardInterrupt / SystemExit
    finally:
        browser.quit()


if __name__ == "__main__":
    Main()

<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <meta http-equiv="X-UA-Compatible" content="ie=edge" />
  <link rel="stylesheet" href="css/main.css" />
  <title>Document</title>
</head>

<body>
  <header id="main-header">
    <div class="menu-branding"></div>
    <nav class="menu">
      <ul class="menu-nav">
        <li class="nav-item current">
          <a href="index.html" class="nav-link">Home</a>
        </li>
        <li class="nav-item">
          <a href="about.html" class="nav-link">About</a>
        </li>
        <li class="nav-item">
          <a href="contact.html" class="nav-link">Contact</a>
        </li>
      </ul>
    </nav>
  </header>

  <main id="main-quote">
    <h1>Hello!</h1>
    <ul id="quotes"></ul>
  </main>

  <!-- footer -->
  <footer id="main-footer">
    Copyright &copy;
    <a href="https://github.com/pennyfea/Web_Crawler">Designed &amp; Built by 2018</a>
  </footer>
  </body>
  <script>
    var xhttp = new XMLHttpRequest();
    xhttp.onreadystatechange = function() {
      if (this.readyState == 4 && this.status == 200) {
        // Typical action to be performed when the document is ready:
        var response = JSON.parse(xhttp.responseText);
        var output = " ";
        for (var i = 0; i < response.length; i++) {
          output += "<li>" + response[i].Name + ": " + response[i].Price;

          ("</li>");
        }
        document.getElementById("quotes").innerHTML = output;
      }
    };
    xhttp.open("GET", "trendingQuoteData.json", true);
    xhttp.send();
  </script>
</html>

0 个答案:

没有答案