从WebView抓取HTML并将其分配给字符串

时间:2013-01-08 17:59:22

标签: android android-webview web-scraping

基本上,我的主要目标是登录一个网站,从我访问的页面中获取全部数据,再通过解析去掉我不需要的内容。我已经用 WebView 实现了登录部分,这意味着我需要在继续使用同一个 WebView 的前提下获取页面的 HTML 并将其赋给一个字符串,以便进行解析。我查看了这里的一些帖子,找到了一些代码并做了修改:

public class MainActivity extends Activity {

// WebView that loads the pages whose HTML is captured; hidden in onCreate.
private WebView mainWebView;
// Text box in which the captured HTML is displayed.
private EditText htmlBox;
// Most recent HTML handed over by the page through the JavaScript bridge.
// volatile: the bridge callback that writes this field is invoked by the
// WebView on a non-UI thread, while onPageFinished reads it on the UI thread.
private volatile String webViewHtml;
// Guards the "reload the same URL once" workaround in onPageFinished.
// Primitive boolean: a plain flag never needs to be null.
private boolean loadedTwice = true;
// Coarse load state: "incomplete" -> "loading" -> "complete".
private String loadingDone = "incomplete";

/* An instance of this class will be registered as a JavaScript interface */  
class MyJavaScriptInterface   
{  
    @SuppressWarnings("unused")  
    public void showHTML(String html)  
    {
        webViewHtml = html;
    }  
}  

//onCreate
/**
 * Sets up the hidden WebView: enables JavaScript, registers the
 * {@code HTMLOUT} bridge object and installs a client that asks every
 * finished page to report its HTML through that bridge.
 *
 * @param savedInstanceState state bundle passed on to the superclass
 */
@Override
public void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    setContentView(R.layout.activity_main);

    mainWebView = (WebView) findViewById(R.id.mainWebView);
    htmlBox = (EditText) findViewById(R.id.htmlBox);

    // The WebView is only used as a loader; keep it out of sight.
    if (mainWebView.getVisibility() == View.VISIBLE) {
        mainWebView.setVisibility(View.INVISIBLE);
    }

    WebSettings webSetts = mainWebView.getSettings();

    webSetts.setJavaScriptEnabled(true);
    mainWebView.addJavascriptInterface(new MyJavaScriptInterface(), "HTMLOUT");
    mainWebView.setWebViewClient(new WebViewClient() {
        @Override
        public void onPageFinished(WebView view, String url)
        {
            /* Inject JavaScript into the page which just finished loading. */
            mainWebView.loadUrl("javascript:window.HTMLOUT.showHTML('<head>'+document.getElementsByTagName('html')[0].innerHTML+'</head>');");
            if (!loadedTwice) {
                mainWebView.loadUrl(url);
                loadedTwice = true;
                loadingDone = "loading";
            }

            // NOTE: the javascript: URL above is executed asynchronously, so
            // webViewHtml may still hold the HTML of the previously loaded
            // page at this point.
            htmlBox.setText(webViewHtml);
            // Compare string contents, not references.
            if ("loading".equals(loadingDone)) {
                loadingDone = "complete";
            }
        }
    });

}

// WebView that loads the pages whose HTML is captured; hidden in onCreate.
private WebView mainWebView;
// Text box in which the captured HTML is displayed.
private EditText htmlBox;
// Most recent HTML handed over by the page through the JavaScript bridge.
// volatile: written by the bridge callback off the UI thread, read on the UI thread.
private volatile String webViewHtml;
// Guards the "reload the same URL once" workaround in onPageFinished.
private boolean loadedTwice = true;
// Coarse load state: "incomplete" -> "loading" -> "complete".
private String loadingDone = "incomplete";

/**
 * Bridge object registered with the WebView as a JavaScript interface.
 * Script running in the page calls {@code showHTML} to hand the page
 * markup back to the activity.
 */
class MyJavaScriptInterface {
    /**
     * Receives the page HTML from the injected script, stores it and
     * shows it in the text box.
     *
     * @param html markup sent by the page script
     */
    @android.webkit.JavascriptInterface
    @SuppressWarnings("unused")
    public void showHTML(final String html) {
        webViewHtml = html;
        // Bridge methods are invoked off the UI thread; post the view update.
        htmlBox.post(new Runnable() {
            @Override
            public void run() {
                htmlBox.setText(html);
            }
        });
    }
}

/**
 * Sets up the hidden WebView: enables JavaScript, registers the
 * {@code HTMLOUT} bridge object and installs a client that asks every
 * finished page to report its HTML through that bridge.
 *
 * @param savedInstanceState state bundle passed on to the superclass
 */
@Override
public void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    setContentView(R.layout.activity_main);

    mainWebView = (WebView) findViewById(R.id.mainWebView);
    htmlBox = (EditText) findViewById(R.id.htmlBox);

    // The WebView is only used as a loader; keep it out of sight.
    if (mainWebView.getVisibility() == View.VISIBLE) {
        mainWebView.setVisibility(View.INVISIBLE);
    }

    WebSettings webSetts = mainWebView.getSettings();

    webSetts.setJavaScriptEnabled(true);
    mainWebView.addJavascriptInterface(new MyJavaScriptInterface(), "HTMLOUT");
    mainWebView.setWebViewClient(new WebViewClient() {
        @Override
        public void onPageFinished(WebView view, String url) {
            /* Inject JavaScript into the page which just finished loading. */
            mainWebView.loadUrl("javascript:window.HTMLOUT.showHTML('<head>'+document.getElementsByTagName('html')[0].innerHTML+'</head>');");
            if (!loadedTwice) {
                mainWebView.loadUrl(url);
                loadedTwice = true;
                loadingDone = "loading";
            }

            // NOTE: the javascript: URL above runs asynchronously, so
            // webViewHtml may still hold the previous page's HTML here;
            // showHTML refreshes the text box once the new HTML arrives.
            htmlBox.setText(webViewHtml);
            // Compare string contents, not references.
            if ("loading".equals(loadingDone)) {
                loadingDone = "complete";
            }
        }
    });
}

此代码的问题在于:如果我不把页面加载两次,htmlBox 显示的就只是上一次加载的页面的 HTML。我也试过在另一个方法中用 if 判断 loadingDone == "complete" 之后再做处理,但它从未起作用。为什么呢?

0 个答案:

没有答案