Jsoup解析谷歌搜索结果

时间:2014-12-31 16:08:14

标签: java google-chrome jsoup web-development-server

我想从特定的谷歌搜索中获取所有链接和缩略图。这是我的代码。

package com.esocial.util;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * Fetches a Google results page with Jsoup and prints, for every result entry,
 * the first link and the first image source found inside it.
 */
public class ListLinks {
    /**
     * Downloads the results page and prints one "Link / Image Link" record per result.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        // The search terms must travel in the query string. The previous URL carried them
        // after '#': a fragment is never transmitted to the server, so the request that
        // actually went out contained no search terms at all.
        String url = "https://www.google.co.in/search?q=thermodynamics%20cbse&ie=UTF-8";
        System.out.println("Fetching : "+url+"\n\n");

        Document doc = Jsoup.connect(url).userAgent("Mozilla").get();

        // NOTE(review): these selectors depend on the markup Google serves for this
        // user agent — confirm against the fetched HTML if nothing is printed.
        Elements results = doc.select("div.srg li.g");

        for (Element result : results) {
            // first() yields null when the selector matched nothing in this result.
            Element anchor = result.select("a").first();
            Element image = result.select("img").first();

            if (anchor == null) {
                continue; // nothing to report for an entry without a link
            }

            String imageLink = (image != null) ? image.attr("src") : "(none)";

            System.out.println("\nLink : "+anchor.attr("href")+"\nImage Link : "+imageLink+"\n------------------------------------------\n");
        }
    }

}

但是此代码运行不正常并且不显示结果。我不明白这是什么问题。

1 个答案:

答案 0 :(得分:0)

我也曾为抓取谷歌搜索结果费了不少功夫。

为了获取 HTML 页面,我发送了一个 HTTP GET 请求,然后用 Jsoup 解析返回的内容。

    // HTTP GET request
/**
 * Runs a Google web search for {@code query}, extracts every redirect-style result link
 * ({@code /url?q=...}) from the returned HTML and passes the collected results to
 * {@code googleSearchCallback}.
 *
 * @param query                raw search terms; they are URL-encoded here before being sent
 * @param googleSearchCallback receives the collected results through
 *                             {@code onGetGoogleResultSuccess}
 * @throws Exception if the request cannot be sent or the response cannot be read
 */
private void sendGet(String query, GoogleSearchCallback googleSearchCallback) throws Exception {
    ArrayList<GoogleSearchResult> googleSearchResults = new ArrayList<>();
    // Encode the terms so spaces and reserved characters cannot corrupt the URL. The former
    // "#q=..." suffix is dropped: a URL fragment is never transmitted to the server.
    String url = "https://www.google.co.in/search?q=" + java.net.URLEncoder.encode(query, "UTF-8");

    HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();

    // optional default is GET
    con.setRequestMethod("GET");

    con.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64)");

    int responseCode = con.getResponseCode();
    System.out.println("\nSending 'GET' request to URL : " + url);
    System.out.println("Response Code : " + responseCode);

    StringBuilder response = new StringBuilder();
    // try-with-resources closes the reader even when reading fails part-way.
    // NOTE(review): assumes the body is UTF-8 — confirm against the Content-Type header.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(
            con.getInputStream(), "UTF-8"))) {
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            response.append(inputLine);
        }
    } finally {
        con.disconnect();
    }

    // print result
    System.out.println(response.toString());
    Document document = Jsoup.parse(response.toString());
    for (Element link : document.select("a[href]")) {
        String href = link.attr("href");
        if (!href.startsWith("/url?q=")) {
            continue;
        }

        String target = extractTargetUrl(href);
        Log.i(TAG, "" + target + "");

        URL targetUrl;
        try {
            targetUrl = new URL(target);
        } catch (java.net.MalformedURLException e) {
            // One unparsable link must not discard every other result.
            Log.e(TAG, "Skipping unparsable result link: " + target + " (" + e + ")");
            continue;
        }

        // getFile() is empty for a bare "http://host" URL, so lastIndexOf can return -1.
        String file = targetUrl.getFile();
        int lastSlash = file.lastIndexOf('/');
        String path = (lastSlash >= 0) ? file.substring(0, lastSlash) : "";
        String base = targetUrl.getProtocol() + "://" + targetUrl.getHost() + path;
        Log.i(TAG, "" + base);
        // Pass the absolute target: the relative "/url?q=" href can never match the pattern.
        Log.e(TAG, getDomainName(target));

        GoogleSearchResult googleSearchResult = new GoogleSearchResult();
        googleSearchResult.setResultLink(base);
        googleSearchResult.setResultTitle(link.text());
        googleSearchResults.add(googleSearchResult);
    }
    googleSearchCallback.onGetGoogleResultSuccess(googleSearchResults);
}

/**
 * Extracts the destination from a redirect href of the form {@code /url?q=<target>&...}:
 * strips the prefix, cuts off the parameters that follow the target and percent-decodes it.
 */
private static String extractTargetUrl(String href) throws java.io.UnsupportedEncodingException {
    String target = href.substring("/url?q=".length());
    int firstAmpersand = target.indexOf('&');
    if (firstAmpersand >= 0) {
        target = target.substring(0, firstAmpersand);
    }
    try {
        return java.net.URLDecoder.decode(target, "UTF-8");
    } catch (IllegalArgumentException e) {
        // Malformed percent-escape: fall back to the raw text instead of failing the search.
        return target;
    }
}

 /**
  * Returns the part of {@code url} matched by the domain pattern, lower-cased and trimmed.
  * Note that this is the whole matched text (scheme included), not only the host name.
  *
  * @param url text expected to start with an absolute http(s)/ftp/file URL; may be null
  * @return the lower-cased match, or an empty string when {@code url} is null or does not match
  */
 private String getDomainName(String url) {
     if (url == null) {
         return "";
     }

     // A Matcher is stateful, so keep it local rather than in a field shared across calls.
     Matcher domainMatcher = patternDomainName.matcher(url);
     if (!domainMatcher.find()) {
         return "";
     }
     // Locale.ROOT keeps the lower-casing independent of the device locale
     // (e.g. Turkish maps 'I' to a dotless 'ı').
     return domainMatcher.group(0).toLowerCase(java.util.Locale.ROOT).trim();
 }

private static Pattern patternDomainName;
private Matcher matcher;
private static final String DOMAIN_NAME_PATTERN = "^(http(s)?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";

static {
    patternDomainName = Pattern.compile(DOMAIN_NAME_PATTERN);
}