I want to build a crawler in Java that collects all cookies set by a website. The crawler is supposed to work through a list of websites (and, obviously, their subpages) automatically.
I used jSoup and Selenium for my plan.
package com.mycompany.app;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

public class BasicWebCrawler {

    private static Set<String> uniqueURL = new HashSet<String>();
    private static List<String> link_list = new ArrayList<String>();
    private static Set<String> uniqueCookies = new HashSet<String>();

    private static void get_links(String url) {
        Connection connection = null;
        Connection.Response response = null;
        String this_link = null;
        try {
            connection = Jsoup.connect(url);
            response = connection.execute();
            //cookies_http = response.cookies();
            // fetch the document over HTTP
            Document doc = response.parse();

            // get all links on the page
            Elements links = doc.select("a[href]");
            if (links.isEmpty()) {
                return;
            }
            for (Element link : links) {
                this_link = link.attr("href");
                boolean add = uniqueURL.add(this_link);
                System.out.println("\n" + this_link + "\n" + "title: " + doc.title());
                if (add && (this_link.contains(url))) {
                    System.out.println("\n" + this_link + "\n" + "title: " + doc.title());
                    link_list.add(this_link);
                    get_links(this_link);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        get_links("https://de.wikipedia.org/wiki/Wikipedia");

        /**
         * This is where Selenium comes into play
         */
        WebDriver driver;
        System.setProperty("webdriver.chrome.driver", "D:\\crawler\\driver\\chromedriver.exe");
        driver = new ChromeDriver();

        // create a file named Cookies.data to store the cookie information
        File file = new File("Cookies.data");
        FileWriter fileWrite = null;
        BufferedWriter Bwrite = null;
        try {
            // delete the old file if it exists
            file.delete();
            file.createNewFile();
            fileWrite = new FileWriter(file);
            Bwrite = new BufferedWriter(fileWrite);
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        for (String link : link_list) {
            System.out.println("Open Link: " + link);
            driver.get(link);
            try {
                // loop for getting the cookie information
                for (Cookie ck : driver.manage().getCookies()) {
                    String tmp = (ck.getName() + ";" + ck.getValue() + ";" + ck.getDomain() + ";" + ck.getPath() + ";" + ck.getExpiry() + ";" + ck.isSecure());
                    if (uniqueCookies.add(tmp)) {
                        Bwrite.write("Link: " + link + "\n" + tmp + "\n\n");
                        Bwrite.newLine();
                    }
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }

        try {
            Bwrite.close();
            fileWrite.close();
            driver.close();
            driver.quit();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
I tested this code on a Wikipedia page and compared the results with a cookie scanner called CookieMetrix.
My code shows only four cookies:
Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
GeoIP;DE:NW:M__nster:51.95:7.54:v4;.wikipedia.org;/;null;true
Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
WMF-Last-Access-Global;13-May-2019;.wikipedia.org;/;Mon Jan 19 02:28:33 CET 1970;true
Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
WMF-Last-Access;13-May-2019;de.wikipedia.org;/;Mon Jan 19 02:28:33 CET 1970;true
Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
mwPhp7Seed;55e;de.wikipedia.org;/;Mon Jan 19 03:09:08 CET 1970;false
But the cookie scanner shows seven. I don't understand why my code finds fewer cookies than CookieMetrix. Can you help me?
Answer 0 (score: 0)
The JavaDoc of java.util.Set<Cookie> getCookies() says:

"Get all the cookies for the current domain. This is the equivalent of calling 'document.cookie' and parsing the result."
document.cookie will not return HttpOnly cookies, simply because JavaScript does not allow access to them.
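You can check this yourself by comparing what the WebDriver reports with what the page's own JavaScript can see. A minimal sketch using only the standard Selenium API (it assumes driver already points at a loaded page); both views should contain the same cookie names, and HttpOnly cookies appear in neither:

import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;

public class CookieComparison {

    static void compare(WebDriver driver) {
        // Cookies as Selenium reports them via getCookies()
        driver.manage().getCookies()
              .forEach(ck -> System.out.println("WebDriver: " + ck.getName()));

        // Cookies as the page's JavaScript sees them (document.cookie)
        String raw = (String) ((JavascriptExecutor) driver)
                .executeScript("return document.cookie;");
        System.out.println("document.cookie: " + raw);
    }
}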
Also note that CookieMetrix seems to list cookies from different domains, not just the one you opened.
Solutions:
To get a listing like CookieMetrix's, covering both gaps (HttpOnly cookies and cookies from other domains), you could add a proxy behind the browser and sniff the requests.
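One way to do this from Java is BrowserMob Proxy, which sits between the browser and the network and records every response in a HAR archive, so each Set-Cookie header is visible regardless of domain and regardless of HttpOnly. A sketch under the assumption that the net.lightbody.bmp artifact is on your classpath; the URL is just your test page, and the details may need adjusting to your driver version:

import net.lightbody.bmp.BrowserMobProxy;
import net.lightbody.bmp.BrowserMobProxyServer;
import net.lightbody.bmp.client.ClientUtil;
import net.lightbody.bmp.core.har.Har;
import net.lightbody.bmp.proxy.CaptureType;
import org.openqa.selenium.Proxy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.CapabilityType;

public class ProxyCookieSniffer {

    public static void main(String[] args) {
        // Start an embedded proxy on a free port and capture response headers
        BrowserMobProxy proxy = new BrowserMobProxyServer();
        proxy.start(0);
        proxy.enableHarCaptureTypes(CaptureType.RESPONSE_HEADERS);

        // Route Chrome through the proxy
        Proxy seleniumProxy = ClientUtil.createSeleniumProxy(proxy);
        ChromeOptions options = new ChromeOptions();
        options.setCapability(CapabilityType.PROXY, seleniumProxy);
        options.setCapability(CapabilityType.ACCEPT_INSECURE_CERTS, true);
        WebDriver driver = new ChromeDriver(options);

        // Record all traffic for this page load into a HAR archive
        proxy.newHar("cookies");
        driver.get("https://de.wikipedia.org/wiki/Wikipedia");

        // Print every Set-Cookie header from every response, from any domain,
        // HttpOnly or not
        Har har = proxy.getHar();
        har.getLog().getEntries().forEach(entry ->
            entry.getResponse().getHeaders().stream()
                 .filter(h -> h.getName().equalsIgnoreCase("Set-Cookie"))
                 .forEach(h -> System.out.println(
                     entry.getRequest().getUrl() + " -> " + h.getValue())));

        driver.quit();
        proxy.stop();
    }
}

The ACCEPT_INSECURE_CERTS capability is set because the proxy re-signs HTTPS traffic with its own certificate, which the browser would otherwise reject.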
If you only want all cookies of the current domain, including the HttpOnly ones, you can try accessing Chrome's DevTools API directly (it returns HttpOnly cookies as well).
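With Selenium 4, ChromeDriver inherits executeCdpCommand from ChromiumDriver, which lets you call the DevTools command Network.getAllCookies without any extra library. A minimal sketch, assuming Selenium 4 (the result comes back as a raw map, so the field access is untyped):

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.openqa.selenium.chrome.ChromeDriver;

public class DevToolsCookies {

    public static void main(String[] args) {
        ChromeDriver driver = new ChromeDriver();
        driver.get("https://de.wikipedia.org/wiki/Wikipedia");

        // Ask the browser itself via the Chrome DevTools Protocol;
        // this includes the HttpOnly cookies that document.cookie hides
        Map<String, Object> result =
                driver.executeCdpCommand("Network.getAllCookies", new HashMap<>());

        @SuppressWarnings("unchecked")
        List<Map<String, Object>> cookies =
                (List<Map<String, Object>>) result.get("cookies");
        for (Map<String, Object> ck : cookies) {
            System.out.println(ck.get("name") + ";" + ck.get("value") + ";"
                    + ck.get("domain") + ";httpOnly=" + ck.get("httpOnly"));
        }

        driver.quit();
    }
}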