所以我得到前12个左右链接所需的结果很奇怪,然后每次都在同一链接(AAA 090)上崩溃。我不确定链接之间有什么区别。
import org.jsoup.*;
import org.jsoup.helper.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import org.w3c.dom.Document;
import java.io.*; // Only needed if scraping a local File.
public class Scraper {
public Scraper() {
org.jsoup.nodes.Document page = null;
org.jsoup.nodes.Document prefix = null;
org.jsoup.nodes.Document course = null;
org.jsoup.nodes.Document cls = null;
try {
page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();
// get the page title
String title = page.title();
System.out.println("title: " + title);
// get all links in page
Elements links = page.select("a[href]");
for (Element link : links) {
if(!link.text().equalsIgnoreCase("HELP")&&!link.text().equalsIgnoreCase("Exit"))
{
prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();
Elements links2 = prefix.select("a[href]");
for (Element link2 : links2) {
if(link2.text().matches("[A-Z]{3}"))
{
course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();
Elements links3 = course.select("a[href]");
for (Element link3 : links3) {
if(link3.text().matches("[A-Z]{3} [0-9]{3}"))
{
cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();
Elements links4 = cls.getAllElements();
//Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");
System.out.println("\nhref = " + link3.attr("href") + "\n" + cls.text() + "\n");
System.out.println("link: " + cls.tagName());
System.out.println("Course Number: " + link3.data().toString());
}
}
}
}
}
}
}catch (IOException ioe) {
ioe.printStackTrace();
}
}
public static void main (String args[]) {
new Scraper();
}
}
org.jsoup.HttpStatusException:提取URL时发生HTTP错误。状态= 400,> URL = https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?> pi_search_type = SB_COURSE&_subj_code = AAA&pi_crse_numb = 090&pi_archive_date =&pi_co> urse_status = A&pi_term_code = 201920,位于> org.jsoup.helper.HttpConnection $ Response.execute(at)。 > org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306)上的> org.jsoup.helper.HttpConnection $ Response.execute(HttpConnection.java:722)在> org.jsoup.helper.HttpConnection.get(HttpConnection .java:295)在Scraper。(Scraper.java:42)在Scraper.main(Scraper.java:64)“
答案 0 :(得分:0)
我认为您遵循的路径是“为什么要复杂一些才能使其简单?”。我只是看一下您要剪贴的网站。该网站的结构和组织良好。因此,无需使用正则表达式来连接字符串,并且无需在构造函数中进行所有抓取。我建议采取以下步骤:
您的代码可能类似于:
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class JsoupTest {
final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"
+ "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";
public static void main(String[] args) {
try {
Document page = Jsoup.connect(homePage).get();
Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");
for(Element prefix : prefixLinks){
Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();
Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");
for(Element coursePrefix : coursePrefixLinks){
Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();
Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");
for(Element course : courseLinks){
Document coursePage = Jsoup.connect(course.absUrl("href")).get();
try{
String courseNr = coursePage.select("tr:contains(Course:)").first().text();
String courseTitle = coursePage.select("tr:contains(Title:)").first().text();
System.out.println(courseNr +"\n" + courseTitle + "\n********************");
}
catch(NullPointerException np){
System.out.println("Broken link, Page not Found");
}
}
}
}
} catch (IOException ex) {
ex.printStackTrace();
}
}
}