使用jsoup和android从网站上抓取链接

时间:2014-02-27 11:02:38

标签: android jsoup screen-scraping

我正在尝试制作一个可以从网站上抓取链接的应用。作为起点,我只想先用 System.out 把这些链接打印出来。但是当我运行程序时,完全看不到任何输出。问题出在哪里?请帮忙。

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import android.app.Activity;

import java.io.*;


/**
 * Prints the anchor elements ({@code <a href=...>}) found in a locally stored HTML file.
 *
 * <p>NOTE(review): this class extends {@code Activity} and nothing visible here calls
 * {@link #main()}. Presumably the Android runtime never invokes a {@code main} method, which
 * would explain seeing no output at all; confirm, and call {@link #scrapeLink()} from a
 * lifecycle callback if that is the case.
 */
public class HTMLLinkExtractor extends Activity {

    /** Entry point kept for compatibility; simply delegates to {@link #scrapeLink()}. */
    public static void main() {
        scrapeLink();
    }

    /**
     * Parses {@code /tmp/input.html} as UTF-8 and writes its links to standard output.
     *
     * <p>Output, in order: every {@code a[href]} element on its own line, then the whole
     * collection, then the first link ({@code null} when the document has none).
     * If the file cannot be read, the stack trace is printed and nothing else happens.
     */
    public static void scrapeLink() {
        File input = new File("/tmp/input.html");

        Document doc;
        try {
            // The base URI is only used to resolve relative links inside the file.
            doc = Jsoup.parse(input, "UTF-8", "http://www.homedepot.com");
        } catch (IOException e) {
            e.printStackTrace();
            // Without a parsed document there is nothing to select from; bail out
            // instead of dereferencing a null document further down.
            return;
        }

        Elements link = doc.select("a[href]");

        // Print one element per iteration. Calling toString() on the collection itself
        // would dump the complete list again on every pass.
        for (Element anchor : link) {
            System.out.println(anchor);
        }

        System.out.println(link);

        // Reuse the already selected collection rather than querying the document twice.
        System.out.println(link.first());
    }
}

2 个答案:

答案 0 :(得分:1)

由于您并没有一个实际包含 HTML 数据的 input.html(baseUri 参数只用于解析该 HTML 中的相对链接,并不会去下载网页),您可以修改代码,直接连接到网站并获取数据:

尝试,

/**
 * Command-line entry point: runs the link scraper once.
 *
 * @param args command-line arguments; not used
 */
public static void main(String... args) {
    scrapeLink();
}

    /**
     * Downloads {@code http://www.homedepot.com} and writes its links to standard output.
     *
     * <p>Output, in order: every {@code a[href]} element on its own line, then the whole
     * collection, then the first link ({@code null} when the page has none).
     * If the page cannot be fetched, the stack trace is printed and nothing else happens.
     */
    public static void scrapeLink() {
        Document doc;
        try {
            doc = Jsoup.connect("http://www.homedepot.com").get();
        } catch (IOException e) {
            e.printStackTrace();
            // No document was fetched; return instead of dereferencing null below.
            return;
        }

        Elements link = doc.select("a[href]");

        // Print one element per iteration. Calling toString() on the collection itself
        // would dump the complete list again on every pass.
        for (Element anchor : link) {
            System.out.println(anchor);
        }

        System.out.println(link);

        // Reuse the already selected collection rather than querying the document twice.
        System.out.println(link.first());
    }

输出,

<a href="http://www.homedepot.ca/?eid=us-language-selection-en&amp;utm_source=us-language-selection-en"><font color="#ED8A3D"><b>English</b></font></a>
<a href="http://www.homedepot.ca/accueil?eid=us-language-selection-fr&amp;utm_source=us-language-selection-fr"><font color="#ED8A3D"><b>Francais</b></font></a>
...

注意,我将主程序作为Java应用程序运行,您可以将其更改为在ADT中运行。

答案 1 :(得分:0)

使用 Android AsyncTask 的答案:

class scrapeLinks extends AsyncTask<Void,Void,Void> {

       @Override
       protected void onPreExecute() {
           super.onPreExecute();
           pDialog = new ProgressDialog(MainActivity.this);
           pDialog.setMessage("getting links");
           pDialog.setIndeterminate(false);
           pDialog.setCancelable(true);
           pDialog.show();
       }


    @Override
    protected Void doInBackground(Void... params) {
         Document doc = null;
            try {
                doc = Jsoup.connect("http://www.homedepot.com").get();
                Elements link = doc.select("a[href]");
                String stringLink = null;

                for (int i = 0; i < link.size(); i++) {

                    stringLink = link.toString();
                    System.out.println(stringLink);
                }

                System.out.println(link);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            Element links = doc.select("a[href]").first();
            System.out.println(links);

            return null;
        }
         }

    protected void onPostExecute(Void...params){
        pDialog.dismiss();
    }

       }