大家好,我使用 Java(NetBeans IDE 8.2)和 jsoup 库编写了下面的代码,它可以对网页进行一层深度的抓取,并将抓到的 URL 保存到数据库中。现在我想把爬取深度增加到三层。举个例子:从主页进入"关于我们"页面,再访问"关于我们"页面中的各个 URL,并把这些 URL 也保存到数据库。有人能帮我实现吗?我不知道该怎么做。这是我的代码:
package webcrawler;
import DbConnection.DbConnect;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
*
* @author RUSIRU N PELANGODA
*/
public class WebCrawler {
/**
* @param args the command line arguments
*/
public static void main(String[] args) throws IOException {
Connection conn = null;
Prepared Statement pst = null;
conn = DbConnect.connect();
// TODO code application logic here
System.setProperty("http.proxyhost","127.0.0.1");
System.setProperty("http.proxyport","8081");
`String str=Jsoup.connect("http://localhost/hotel
/index.php").userAgent("Mozilla/5.0 (Windows NT 6.3; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115
Safari/537.36").ignoreContentType(true).get().text();
System.out.println(str);
Document doc = Jsoup.connect("http://localhost/hotel
/index.php").get();
String title = doc.title();
System.out.println("Web Page Title is : "+title);
Elements links = doc.select("a[href]");
for(Element link :links){
System.out.println("\nlink:"+link.attr("href"));
System.out.println("text :"+link.text());
String url1= link.attr("href");
String text1= link.text();
try {
String q = "INSERT INTO web(url,text)
VALUES('"+"http://localhost/hotel/"+url1+"','"+text1+"')";
pst = conn.prepareStatement(q);
pst.execute();
}
catch (SQLException ex) {
Logger.getLogger(WebCrawler.class.getName()).
log(Level.SEVERE, null, ex);
}
FileWriter writer=new FileWriter("Output.html");
writer.write(doc.toString());
link.attr("href").toString();
}
}
}