Java crawler to collect all first-party and third-party cookies

Date: 2019-05-13 16:57:10

Tags: java selenium web-scraping web-crawler jsoup

I want to build a crawler in Java that collects all the cookies set by a website. The crawler should work through a list of websites automatically, including their subpages.

I am using jsoup and Selenium for this.

package com.mycompany.app;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

public class BasicWebCrawler {
    private static Set<String> uniqueURL = new HashSet<String>();
    private static List<String> link_list = new ArrayList<String>();

    private static Set<String> uniqueCookies = new HashSet<String>();

    private static void get_links(String url) {
        Connection connection = null;
        Connection.Response response = null;
        String this_link = null;

        try {
            connection = Jsoup.connect(url);
            response = connection.execute();

            //cookies_http = response.cookies();

            // fetch the document over HTTP
            Document doc = response.parse();

            // get all links in page
            Elements links = doc.select("a[href]");

            if(links.isEmpty()) {
                return;
            }

            for (Element link : links) {
                // abs:href resolves relative links against the page URL,
                // so they can match the contains(url) filter below
                this_link = link.attr("abs:href");

                boolean add = uniqueURL.add(this_link);

                if (add && this_link.contains(url)) {
                    System.out.println("\n" + this_link + "\n" + "title: " + doc.title());

                    link_list.add(this_link);

                    get_links(this_link);
                }
            }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        get_links("https://de.wikipedia.org/wiki/Wikipedia");

        /**
         * This is where Selenium comes into play
         */
        WebDriver driver;

        System.setProperty("webdriver.chrome.driver", "D:\\crawler\\driver\\chromedriver.exe");

        driver = new ChromeDriver();

        // create a file named Cookies.data to store the cookie information
        File file = new File("Cookies.data");

        FileWriter fileWrite = null;
        BufferedWriter Bwrite = null;

        try {
            // Delete old file if exists
            file.delete();
            file.createNewFile();

            fileWrite = new FileWriter(file);
            Bwrite = new BufferedWriter(fileWrite);
            // loop for getting the cookie information
        } catch (Exception ex) {
            ex.printStackTrace();
        }


        for(String link : link_list) {
            System.out.println("Open Link: " + link);

            driver.get(link);

            try {
                // loop for getting the cookie information
                for (Cookie ck : driver.manage().getCookies()) {
                    // Serialize the cookie once and reuse the string for dedup and output
                    String tmp = ck.getName() + ";" + ck.getValue() + ";" + ck.getDomain() + ";" + ck.getPath() + ";" + ck.getExpiry() + ";" + ck.isSecure();

                    if (uniqueCookies.add(tmp)) {
                        Bwrite.write("Link: " + link + "\n" + tmp + "\n\n");
                        Bwrite.newLine();
                    }
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }

        try {
            Bwrite.close();
            fileWrite.close();

            // quit() closes every window and ends the WebDriver session
            driver.quit();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
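One pitfall with crawls like this: `href` attributes are often relative (e.g. `/wiki/Foo`), and a raw comparison against the start URL misses them. jsoup resolves them when you read the attribute with the `abs:` prefix; the plain-JDK equivalent, shown here as a minimal sketch (the helper name is my own), is `java.net.URI.resolve`:

```java
import java.net.URI;

public class LinkResolver {
    // Resolve a possibly-relative href against the page it was found on
    static String absolutize(String pageUrl, String href) {
        return URI.create(pageUrl).resolve(href).toString();
    }

    public static void main(String[] args) {
        String page = "https://de.wikipedia.org/wiki/Wikipedia";
        // Relative link: resolved against the page's scheme and host
        System.out.println(absolutize(page, "/wiki/Hauptseite"));
        // Already-absolute link: returned unchanged
        System.out.println(absolutize(page, "https://example.org/x"));
    }
}
```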

I tested this code on a Wikipedia page and compared the results with a cookie scanner called CookieMetrix.

My code only lists four cookies:

Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
GeoIP;DE:NW:M__nster:51.95:7.54:v4;.wikipedia.org;/;null;true


Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
WMF-Last-Access-Global;13-May-2019;.wikipedia.org;/;Mon Jan 19 02:28:33 CET 1970;true


Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
WMF-Last-Access;13-May-2019;de.wikipedia.org;/;Mon Jan 19 02:28:33 CET 1970;true


Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
mwPhp7Seed;55e;de.wikipedia.org;/;Mon Jan 19 03:09:08 CET 1970;false

But the cookie scanner lists seven. I don't know why my code finds fewer cookies than CookieMetrix. Can you help me?

1 Answer:

Answer 0 (score: 0)

From the JavaDoc of java.util.Set<Cookie> getCookies():

  Get all the cookies for the current domain. This is the equivalent of calling "document.cookie" and parsing the result.

  1. document.cookie does not return HttpOnly cookies, simply because JavaScript is not allowed to access them.

  2. Also note that "CookieMetrix" seems to list cookies from different domains as well.
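Point 1 can be seen with the JDK's own cookie parser: the HttpOnly attribute is present in the raw Set-Cookie header (which a sniffing proxy would see on the wire), even though document.cookie would never expose such a cookie. A minimal sketch using java.net.HttpCookie; the header value is made up for illustration:

```java
import java.net.HttpCookie;
import java.util.List;

public class HttpOnlyDemo {
    public static void main(String[] args) {
        // A raw Set-Cookie header, as a proxy would see it on the wire
        String header = "Set-Cookie: session=abc123; Path=/; Secure; HttpOnly";

        // HttpCookie.parse understands the header form, including the HttpOnly flag
        List<HttpCookie> cookies = HttpCookie.parse(header);
        HttpCookie c = cookies.get(0);

        System.out.println(c.getName() + " httpOnly=" + c.isHttpOnly());
    }
}
```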

Solutions:

  • To get a listing like "CookieMetrix" (covering 1 + 2), you could route the browser through a proxy and sniff the requests.

  • If you want all cookies for the current domain, including HttpOnly ones (covering 1), you can try accessing Chrome's DevTools API directly (it also returns HttpOnly cookies).
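Once you have the cookies from a proxy or the DevTools API, the first-party vs. third-party distinction from the question's title comes down to comparing the cookie's domain with the page's host. A sketch assuming the usual suffix-match rule (a leading dot on the cookie domain means "this domain and its subdomains"); the helper name is my own:

```java
public class CookieParty {
    // A cookie is first-party if the page host equals the cookie domain
    // or is a subdomain of it; cookie domains may carry a leading dot.
    static boolean isFirstParty(String pageHost, String cookieDomain) {
        String d = cookieDomain.startsWith(".") ? cookieDomain.substring(1) : cookieDomain;
        return pageHost.equals(d) || pageHost.endsWith("." + d);
    }

    public static void main(String[] args) {
        // The GeoIP cookie from the question: domain ".wikipedia.org" on de.wikipedia.org
        System.out.println(isFirstParty("de.wikipedia.org", ".wikipedia.org")); // true
        // A tracker's cookie would be third-party
        System.out.println(isFirstParty("de.wikipedia.org", "doubleclick.net")); // false
    }
}
```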