package scraper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches one Indeed search-results page for the query "java" and prints each result row.
 */
public class Scraper {
    /**
     * Downloads the search-results page, selects every {@code div.row.result} element and
     * prints the text of the second and third {@code div} matched within each of them.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if fetching or parsing the page fails
     */
    public static void main(String[] args) throws Exception {
        final Document document = Jsoup.connect("https://www.indeed.com.pk/jobs?q=java&l=")
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .get();
        Elements rows = document.select("div.row.result");
        for (Element row : rows) {
            Elements innerDivs = row.select("div");
            // A row with fewer than three matched divs would make get(1)/get(2) throw
            // IndexOutOfBoundsException; skip it instead of aborting the whole run.
            if (innerDivs.size() < 3) {
                continue;
            }
            String header = innerDivs.get(1).text();
            String content = innerDivs.get(2).text();
            System.out.println("header = " + header + " -> " + content);
        }
    }
}
在这段代码中,我用关键词 Java 搜索职位,但它只抓取了当前页面(即代码中搜索查询链接对应的那一页)。我想抓取与 Java 相关的所有结果页面。
请帮忙
答案 0 :(得分:0)
您需要找到带有 .pagination 类的分页 div,然后依次选择其中的内部链接:第一个内部链接对应第一页,第二个内部链接对应第二页,依此类推。
这是一个如何执行此操作的示例。您需要修改它以加载正确的页面:
Elements pages = document.select("div.pagination a");
for (Element page : pages) {
    // Load the page this link points to. Read the attribute from the current link (page),
    // not from the whole collection; "abs:href" yields an absolute URL, and get() executes
    // the request and returns the parsed Document.
    Document nextPage = Jsoup.connect(page.attr("abs:href")).get();
    // ... process nextPage here ...
}
工作示例:
package scraper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the first Indeed search-results page for the query "java" and then the page
 * referenced by the second pagination link.
 */
public class Scraper {
    /**
     * Fetches the first results page, prints its rows, then follows the pagination link at
     * index 1 (when present) and prints the rows of that page as well.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if fetching or parsing a page fails
     */
    public static void main(String[] args) throws Exception {
        final Document document =
                Jsoup.connect("https://www.indeed.com.pk/jobs?q=java&l=")
                        .userAgent("Mozilla")
                        .cookie("auth", "token")
                        .timeout(3000)
                        .get();
        scrape(document);

        // Move to the next page
        Elements pageLinks = document.select("div.pagination a");
        // With fewer than two pagination links, get(1) would throw
        // IndexOutOfBoundsException; in that case there is nothing further to follow.
        if (pageLinks.size() < 2) {
            return;
        }
        Element page = pageLinks.get(1);
        System.out.println("Page link: " + page.attr("href"));
        Document pageDoc = Jsoup.connect(page.attr("abs:href")).get();
        scrape(pageDoc);
    }

    /**
     * Prints, for every {@code div.row.result} element of the document, the text of the
     * second and third {@code div} matched within it. Rows with fewer than three matched
     * divs are skipped.
     *
     * @param document parsed results page to read rows from
     */
    public static void scrape(Document document) {
        Elements rows = document.select("div.row.result");
        for (Element row : rows) {
            Elements innerDivs = row.select("div");
            // Guard the fixed indices below so one unexpected row does not abort the run.
            if (innerDivs.size() < 3) {
                continue;
            }
            String header = innerDivs.get(1).text();
            String content = innerDivs.get(2).text();
            System.out.println("header = " + header + " -> " + content);
        }
    }
}