使用Java进行实时Web抓取

时间:2018-08-13 17:28:14

标签: java web web-scraping

我已经写过一个演示程序,可以抓取静态网站,但是对于数据不断变化的网站又该怎么办呢? 例如https://www.oanda.com/currency/live-exchange-rates/

我应该如何解决我的问题?

我唯一能想到的方法是每秒发送200个请求,但是这样不会被封禁吗?

浏览器是如何处理此类网站的,我又该如何用Java代码复现这种行为?

3 个答案:

答案 0 :(得分:0)

浏览器每隔一段时间就会向 https://www.oanda.com/lfr/rates_lrrr 发起一次数据请求。与其解析HTML页面,不如尝试找出它们的数据API。

答案 1 :(得分:0)

Selenium Web Driver非常棒。

下载Selenium jar,并将其包含在您的构建路径中:https://www.seleniumhq.org/download/ 从此处下载ChromeDriver:http://chromedriver.chromium.org/downloads

import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;

/**
 * Demo scraper that opens the OANDA live exchange-rates page in a real browser
 * via Selenium WebDriver, then repeatedly runs a JavaScript snippet inside the
 * page to read the currently displayed rates and prints them as a JSON string.
 */
public class ExchangeRates {

    /** Page whose rate table is polled. */
    private static final String RATES_URL = "https://www.oanda.com/currency/live-exchange-rates/";

    /** Number of snapshots printed before the browser is shut down. */
    private static final int POLL_COUNT = 10;

    /** Pause between two snapshots, in milliseconds. */
    private static final long POLL_INTERVAL_MS = 1000L;

    /**
     * JavaScript executed in the page. For every element matching {@code .rate_row}
     * it reads two pair labels ("XXX/YYY") and four value cells, and emits two
     * objects ({@code fromCur}, {@code toCur}, {@code bid}, {@code ask}) per row.
     * The whole list is returned as a JSON string.
     * NOTE(review): relies on the page exposing jQuery as {@code $} and on the
     * current CSS class names; verify against the live page.
     */
    private static final String EXTRACT_RATES_SCRIPT =
            "var rateRows = $('.rate_row'); " +
            "var exchangeRates = []; " +
            "for(var i = 0; i < rateRows.length; i++) { " +
            "   var row = $(rateRows[i]); " +
            "   var lbls = row.find(\".inline.title.left a\"); " +
            "   var vals = row.find(\".inline.value.right\"); " +
            "   var lbl1 = $(lbls[0]).text(); " +
            "   var lbl1Parts = lbl1.split(\"/\"); " +
            "   exchangeRates.push({ " +
            "       fromCur:lbl1Parts[0], " +
            "       toCur:lbl1Parts[1], " +
            "       bid:parseFloat($(vals[1]).text()), " +
            "       ask:parseFloat($(vals[0]).text()) " +
            "   }); " +
            "   var lbl2 = $(lbls[1]).text(); " +
            "   var lbl2Parts = lbl2.split(\"/\"); " +
            "   exchangeRates.push({ " +
            "       fromCur:lbl2Parts[0], " +
            "       toCur:lbl2Parts[1], " +
            "       bid:parseFloat($(vals[3]).text()), " +
            "       ask:parseFloat($(vals[2]).text()) " +
            "   }); " +
            "} " +
            "return JSON.stringify(exchangeRates);";

    /**
     * Starts Chrome, loads the rates page and prints {@value #POLL_COUNT} snapshots,
     * one every {@value #POLL_INTERVAL_MS} ms.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while sleeping between polls
     */
    public static void main(String[] args) throws InterruptedException {
        //Chrome Driver
        System.setProperty("webdriver.chrome.driver", "C:\\chromedriver.exe");
        WebDriver driver = new ChromeDriver();

        //Internet Explorer Driver (alternative to the two lines above)
        //System.setProperty("webdriver.ie.driver", "C:\\MicrosoftWebDriver.exe");
        //WebDriver driver = new InternetExplorerDriver();

        try {
            driver.get(RATES_URL);
            for (int i = 0; i < POLL_COUNT; i++) {
                getExchangeRates(driver);
                Thread.sleep(POLL_INTERVAL_MS);
            }
        } finally {
            // quit() ends the whole WebDriver session; close() would only close the
            // current window. Running it in finally ensures the browser is shut down
            // even when a poll throws or the sleep is interrupted.
            driver.quit();
        }
    }

    /**
     * Runs the extraction script in the currently loaded page and prints the
     * resulting JSON string to standard output.
     *
     * @param driver driver whose current page is the rates page
     */
    private static void getExchangeRates(WebDriver driver) {
        String exchangeRateJSON = runJavascript(driver, EXTRACT_RATES_SCRIPT);
        System.out.println(exchangeRateJSON);
    }

    /**
     * Executes {@code script} in the context of the driver's current page.
     *
     * @param driver a driver that also implements {@link JavascriptExecutor}
     * @param script JavaScript source; it must {@code return} a string (or nothing),
     *               because the result is cast to {@link String}
     * @return the string returned by the script, or {@code null} if it returned nothing
     */
    public static String runJavascript(WebDriver driver, String script) {
        // The <body> element is handed to the script as arguments[0] so callers
        // may reference it; the extraction script above does not use it.
        WebElement body = driver.findElement(By.cssSelector("body"));
        return (String) ((JavascriptExecutor) driver).executeScript(script, body);
    }
}

Selenium Webdriver的基本前提是您可以打开一个网页,然后执行诸如在页面上下文中运行Javascript之类的操作。这样可以防止您被检测为机器人。

javascript返回的JSON如下所示:

[
  {
    "fromCur": "EUR",
    "toCur": "USD",
    "bid": 1.13949,
    "ask": 1.13962
  },
  {
    "fromCur": "USD",
    "toCur": "EUR",
    "bid": 0.87749,
    "ask": 0.87759
  },
  {
    "fromCur": "GBP",
    "toCur": "USD",
    "bid": 1.27602,
    "ask": 1.27619
  },
  {
    "fromCur": "USD",
    "toCur": "GBP",
    "bid": 0.78358,
    "ask": 0.78369
  },
  {
    "fromCur": "USD",
    "toCur": "CAD",
    "bid": 1.31391,
    "ask": 1.31408
  },
  {
    "fromCur": "CAD",
    "toCur": "USD",
    "bid": 0.76099,
    "ask": 0.76109
  },
  {
    "fromCur": "USD",
    "toCur": "CHF",
    "bid": 0.9936,
    "ask": 0.99379
  },
  {
    "fromCur": "CHF",
    "toCur": "USD",
    "bid": 1.00625,
    "ask": 1.00644
  },
  {
    "fromCur": "USD",
    "toCur": "JPY",
    "bid": 110.752,
    "ask": 110.765
  },
  {
    "fromCur": "JPY",
    "toCur": "USD",
    "bid": 0.00903,
    "ask": 0.00903
  },
  {
    "fromCur": "EUR",
    "toCur": "GBP",
    "bid": 0.89291,
    "ask": 0.89308
  },
  {
    "fromCur": "GBP",
    "toCur": "EUR",
    "bid": 1.11972,
    "ask": 1.11993
  },
  {
    "fromCur": "EUR",
    "toCur": "CHF",
    "bid": 1.13236,
    "ask": 1.13252
  },
  {
    "fromCur": "CHF",
    "toCur": "EUR",
    "bid": 0.88299,
    "ask": 0.88311
  },
  {
    "fromCur": "AUD",
    "toCur": "USD",
    "bid": 0.72642,
    "ask": 0.72654
  },
  {
    "fromCur": "USD",
    "toCur": "AUD",
    "bid": 1.37639,
    "ask": 1.37661
  },
  {
    "fromCur": "EUR",
    "toCur": "JPY",
    "bid": 126.207,
    "ask": 126.226
  },
  {
    "fromCur": "JPY",
    "toCur": "EUR",
    "bid": 0.00792,
    "ask": 0.00792
  },
  {
    "fromCur": "GBP",
    "toCur": "JPY",
    "bid": 141.328,
    "ask": 141.355
  },
  {
    "fromCur": "JPY",
    "toCur": "GBP",
    "bid": 0.00707,
    "ask": 0.00708
  },
  {
    "fromCur": "EUR",
    "toCur": "AUD",
    "bid": 1.56854,
    "ask": 1.56877
  },
  {
    "fromCur": "AUD",
    "toCur": "EUR",
    "bid": 0.63744,
    "ask": 0.63754
  },
  {
    "fromCur": "EUR",
    "toCur": "CZK",
    "bid": 25.75355,
    "ask": 25.78107
  },
  {
    "fromCur": "CZK",
    "toCur": "EUR",
    "bid": 0.03879,
    "ask": 0.03883
  },
  {
    "fromCur": "EUR",
    "toCur": "HUF",
    "bid": 323.69,
    "ask": 324.089
  },
  {
    "fromCur": "HUF",
    "toCur": "EUR",
    "bid": 0.00309,
    "ask": 0.00309
  },
  {
    "fromCur": "EUR",
    "toCur": "NZD",
    "bid": 1.73338,
    "ask": 1.73374
  },
  {
    "fromCur": "NZD",
    "toCur": "EUR",
    "bid": 0.57679,
    "ask": 0.57691
  },
  {
    "fromCur": "EUR",
    "toCur": "SEK",
    "bid": 10.39725,
    "ask": 10.39991
  },
  {
    "fromCur": "SEK",
    "toCur": "EUR",
    "bid": 0.09615,
    "ask": 0.09618
  },
  {
    "fromCur": "EUR",
    "toCur": "SGD",
    "bid": 1.56877,
    "ask": 1.5692
  },
  {
    "fromCur": "SGD",
    "toCur": "EUR",
    "bid": 0.63727,
    "ask": 0.63744
  },
  {
    "fromCur": "EUR",
    "toCur": "CAD",
    "bid": 1.49727,
    "ask": 1.49748
  },
  {
    "fromCur": "CAD",
    "toCur": "EUR",
    "bid": 0.66779,
    "ask": 0.66788
  },
  {
    "fromCur": "EUR",
    "toCur": "DKK",
    "bid": 7.45298,
    "ask": 7.45446
  },
  {
    "fromCur": "DKK",
    "toCur": "EUR",
    "bid": 0.13415,
    "ask": 0.13417
  },
  {
    "fromCur": "EUR",
    "toCur": "NOK",
    "bid": 9.53927,
    "ask": 9.54229
  },
  {
    "fromCur": "NOK",
    "toCur": "EUR",
    "bid": 0.1048,
    "ask": 0.10483
  },
  {
    "fromCur": "EUR",
    "toCur": "PLN",
    "bid": 4.31208,
    "ask": 4.31726
  },
  {
    "fromCur": "PLN",
    "toCur": "EUR",
    "bid": 0.23163,
    "ask": 0.23191
  },
  {
    "fromCur": "EUR",
    "toCur": "TRY",
    "bid": 7.95697,
    "ask": 7.97624
  },
  {
    "fromCur": "TRY",
    "toCur": "EUR",
    "bid": 0.12537,
    "ask": 0.12568
  },
  {
    "fromCur": "EUR",
    "toCur": "ZAR",
    "bid": 16.43966,
    "ask": 16.4571
  },
  {
    "fromCur": "ZAR",
    "toCur": "EUR",
    "bid": 0.06076,
    "ask": 0.06083
  },
  {
    "fromCur": "USD",
    "toCur": "CNH",
    "bid": 6.90501,
    "ask": 6.90585
  },
  {
    "fromCur": "CNH",
    "toCur": "USD",
    "bid": 0.1448,
    "ask": 0.14482
  },
  {
    "fromCur": "USD",
    "toCur": "DKK",
    "bid": 6.53998,
    "ask": 6.5415
  },
  {
    "fromCur": "DKK",
    "toCur": "USD",
    "bid": 0.15287,
    "ask": 0.15291
  },
  {
    "fromCur": "USD",
    "toCur": "HUF",
    "bid": 284.084,
    "ask": 284.37
  },
  {
    "fromCur": "HUF",
    "toCur": "USD",
    "bid": 0.00352,
    "ask": 0.00352
  },
  {
    "fromCur": "USD",
    "toCur": "MXN",
    "bid": 19.1822,
    "ask": 19.18824
  },
  {
    "fromCur": "MXN",
    "toCur": "USD",
    "bid": 0.05212,
    "ask": 0.05213
  },
  {
    "fromCur": "USD",
    "toCur": "PLN",
    "bid": 3.78415,
    "ask": 3.78769
  },
  {
    "fromCur": "PLN",
    "toCur": "USD",
    "bid": 0.26401,
    "ask": 0.26426
  },
  {
    "fromCur": "USD",
    "toCur": "SEK",
    "bid": 9.12374,
    "ask": 9.12617
  },
  {
    "fromCur": "SEK",
    "toCur": "USD",
    "bid": 0.10958,
    "ask": 0.1096
  },
  {
    "fromCur": "USD",
    "toCur": "THB",
    "bid": 33.316,
    "ask": 33.434
  },
  {
    "fromCur": "THB",
    "toCur": "USD",
    "bid": 0.02991,
    "ask": 0.03002
  },
  {
    "fromCur": "USD",
    "toCur": "ZAR",
    "bid": 14.42721,
    "ask": 14.44086
  },
  {
    "fromCur": "ZAR",
    "toCur": "USD",
    "bid": 0.06925,
    "ask": 0.06931
  },
  {
    "fromCur": "USD",
    "toCur": "CZK",
    "bid": 22.60223,
    "ask": 22.61947
  },
  {
    "fromCur": "CZK",
    "toCur": "USD",
    "bid": 0.04421,
    "ask": 0.04424
  },
  {
    "fromCur": "USD",
    "toCur": "HKD",
    "bid": 7.84967,
    "ask": 7.85
  },
  {
    "fromCur": "HKD",
    "toCur": "USD",
    "bid": 0.12739,
    "ask": 0.12739
  },
  {
    "fromCur": "USD",
    "toCur": "INR",
    "bid": 69.997,
    "ask": 70.102
  },
  {
    "fromCur": "INR",
    "toCur": "USD",
    "bid": 0.01426,
    "ask": 0.01429
  },
  {
    "fromCur": "USD",
    "toCur": "NOK",
    "bid": 8.3705,
    "ask": 8.37378
  },
  {
    "fromCur": "NOK",
    "toCur": "USD",
    "bid": 0.11942,
    "ask": 0.11947
  },
  {
    "fromCur": "USD",
    "toCur": "SAR",
    "bid": 3.7495,
    "ask": 3.75125
  },
  {
    "fromCur": "SAR",
    "toCur": "USD",
    "bid": 0.26658,
    "ask": 0.2667
  },
  {
    "fromCur": "USD",
    "toCur": "SGD",
    "bid": 1.37673,
    "ask": 1.37695
  },
  {
    "fromCur": "SGD",
    "toCur": "USD",
    "bid": 0.72624,
    "ask": 0.72636
  },
  {
    "fromCur": "USD",
    "toCur": "TRY",
    "bid": 6.97585,
    "ask": 7.00085
  },
  {
    "fromCur": "TRY",
    "toCur": "USD",
    "bid": 0.14284,
    "ask": 0.14335
  },
  {
    "fromCur": "GBP",
    "toCur": "AUD",
    "bid": 1.75644,
    "ask": 1.75679
  },
  {
    "fromCur": "AUD",
    "toCur": "GBP",
    "bid": 0.56922,
    "ask": 0.56933
  },
  {
    "fromCur": "GBP",
    "toCur": "CHF",
    "bid": 1.26798,
    "ask": 1.26827
  },
  {
    "fromCur": "CHF",
    "toCur": "GBP",
    "bid": 0.78848,
    "ask": 0.78866
  },
  {
    "fromCur": "GBP",
    "toCur": "ZAR",
    "bid": 18.4094,
    "ask": 18.42928
  },
  {
    "fromCur": "ZAR",
    "toCur": "GBP",
    "bid": 0.05426,
    "ask": 0.05432
  },
  {
    "fromCur": "GBP",
    "toCur": "SGD",
    "bid": 1.75673,
    "ask": 1.75725
  },
  {
    "fromCur": "SGD",
    "toCur": "GBP",
    "bid": 0.56907,
    "ask": 0.56924
  },
  {
    "fromCur": "AUD",
    "toCur": "JPY",
    "bid": 80.454,
    "ask": 80.47
  },
  {
    "fromCur": "JPY",
    "toCur": "AUD",
    "bid": 0.01243,
    "ask": 0.01243
  },
  {
    "fromCur": "AUD",
    "toCur": "SGD",
    "bid": 1.00008,
    "ask": 1.00041
  },
  {
    "fromCur": "SGD",
    "toCur": "AUD",
    "bid": 0.99959,
    "ask": 0.99992
  },
  {
    "fromCur": "CAD",
    "toCur": "JPY",
    "bid": 84.284,
    "ask": 84.301
  },
  {
    "fromCur": "JPY",
    "toCur": "CAD",
    "bid": 0.01186,
    "ask": 0.01186
  },
  {
    "fromCur": "CHF",
    "toCur": "JPY",
    "bid": 111.446,
    "ask": 111.472
  },
  {
    "fromCur": "JPY",
    "toCur": "CHF",
    "bid": 0.00897,
    "ask": 0.00897
  },
  {
    "fromCur": "NZD",
    "toCur": "CAD",
    "bid": 0.86359,
    "ask": 0.86388
  },
  {
    "fromCur": "CAD",
    "toCur": "NZD",
    "bid": 1.15757,
    "ask": 1.15796
  },
  {
    "fromCur": "NZD",
    "toCur": "USD",
    "bid": 0.65731,
    "ask": 0.65746
  },
  {
    "fromCur": "USD",
    "toCur": "NZD",
    "bid": 1.52101,
    "ask": 1.52135
  },
  {
    "fromCur": "SGD",
    "toCur": "JPY",
    "bid": 80.432,
    "ask": 80.455
  },
  {
    "fromCur": "JPY",
    "toCur": "SGD",
    "bid": 0.01243,
    "ask": 0.01243
  },
  {
    "fromCur": "ZAR",
    "toCur": "JPY",
    "bid": 7.666,
    "ask": 7.681
  },
  {
    "fromCur": "JPY",
    "toCur": "ZAR",
    "bid": 0.13019,
    "ask": 0.13045
  },
  {
    "fromCur": "GBP",
    "toCur": "CAD",
    "bid": 1.67665,
    "ask": 1.67698
  },
  {
    "fromCur": "CAD",
    "toCur": "GBP",
    "bid": 0.59631,
    "ask": 0.59643
  },
  {
    "fromCur": "GBP",
    "toCur": "NZD",
    "bid": 1.94104,
    "ask": 1.94156
  },
  {
    "fromCur": "NZD",
    "toCur": "GBP",
    "bid": 0.51505,
    "ask": 0.51519
  },
  {
    "fromCur": "GBP",
    "toCur": "PLN",
    "bid": 4.82832,
    "ask": 4.83505
  },
  {
    "fromCur": "PLN",
    "toCur": "GBP",
    "bid": 0.20682,
    "ask": 0.20711
  },
  {
    "fromCur": "AUD",
    "toCur": "CAD",
    "bid": 0.95447,
    "ask": 0.95468
  },
  {
    "fromCur": "CAD",
    "toCur": "AUD",
    "bid": 1.04747,
    "ask": 1.0477
  },
  {
    "fromCur": "AUD",
    "toCur": "NZD",
    "bid": 1.10499,
    "ask": 1.10529
  },
  {
    "fromCur": "NZD",
    "toCur": "AUD",
    "bid": 0.90474,
    "ask": 0.90499
  },
  {
    "fromCur": "CAD",
    "toCur": "CHF",
    "bid": 0.75611,
    "ask": 0.75636
  },
  {
    "fromCur": "CHF",
    "toCur": "CAD",
    "bid": 1.32212,
    "ask": 1.32256
  },
  {
    "fromCur": "CAD",
    "toCur": "SGD",
    "bid": 1.0477,
    "ask": 1.04801
  },
  {
    "fromCur": "SGD",
    "toCur": "CAD",
    "bid": 0.95419,
    "ask": 0.95447
  },
  {
    "fromCur": "CHF",
    "toCur": "ZAR",
    "bid": 14.51736,
    "ask": 14.53388
  },
  {
    "fromCur": "ZAR",
    "toCur": "CHF",
    "bid": 0.0688,
    "ask": 0.06888
  },
  {
    "fromCur": "NZD",
    "toCur": "JPY",
    "bid": 72.797,
    "ask": 72.82
  },
  {
    "fromCur": "JPY",
    "toCur": "NZD",
    "bid": 0.01373,
    "ask": 0.01374
  },
  {
    "fromCur": "NZD",
    "toCur": "SGD",
    "bid": 0.90493,
    "ask": 0.90529
  },
  {
    "fromCur": "SGD",
    "toCur": "NZD",
    "bid": 1.10462,
    "ask": 1.10506
  },
  {
    "fromCur": "TRY",
    "toCur": "JPY",
    "bid": 15.823,
    "ask": 15.864
  },
  {
    "fromCur": "JPY",
    "toCur": "TRY",
    "bid": 0.06304,
    "ask": 0.0632
  },
  {
    "fromCur": "XAG",
    "toCur": "JPY",
    "bid": 1663.5,
    "ask": 1665.5
  },
  {
    "fromCur": "JPY",
    "toCur": "XAG",
    "bid": 0.0006,
    "ask": 0.0006
  },
  {
    "fromCur": "XAU",
    "toCur": "JPY",
    "bid": 132176,
    "ask": 132220
  },
  {
    "fromCur": "JPY",
    "toCur": "XAU",
    "bid": 0.00001,
    "ask": 0.00001
  },
  {
    "fromCur": "XAG",
    "toCur": "USD",
    "bid": 15.01958,
    "ask": 15.03569
  },
  {
    "fromCur": "USD",
    "toCur": "XAG",
    "bid": 0.06651,
    "ask": 0.06658
  },
  {
    "fromCur": "XAU",
    "toCur": "USD",
    "bid": 1193.441,
    "ask": 1193.691
  },
  {
    "fromCur": "USD",
    "toCur": "XAU",
    "bid": 0.00084,
    "ask": 0.00084
  }
]

答案 2 :(得分:0)

简单来说,答案是无头浏览器。这类站点大多在页面加载之后,通过套接字/ajax/异步请求来提供新数据。因此,要正确地抓取动态网站,最简单的方法是让程序的行为更像浏览器而不是脚本。可以使用Selenium或PhantomJS,做法有很多。通常,人们会使用Nutch之类的工具来大规模地控制爬取流量。您可能还需要研究一下代理服务器群。