Java爬虫增加爬行深度

时间:2017-10-03 04:32:51

标签: java mysql jsoup

大家好,我使用 Java(NetBeans IDE 8.2)和 jsoup 库开发了以下代码,它能抓取一层深度的网页并把 URL 保存到数据库中。现在我想把爬取深度增加到三级。举个例子:从主页出发,进入"关于我们"页面,再访问"关于我们"页面上的各个链接,并把这些 URL 也保存到数据库。有人能帮我修改代码吗?我不知道该怎么做。这是我的代码:

package webcrawler;

import DbConnection.DbConnect;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 *
 * @author RUSIRU N PELANGODA
 */
public class WebCrawler {

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws IOException {

        Connection conn = null;

        Prepared Statement pst = null;

        conn = DbConnect.connect();


        // TODO code application logic here
        System.setProperty("http.proxyhost","127.0.0.1");

        System.setProperty("http.proxyport","8081");

        `String str=Jsoup.connect("http://localhost/hotel   
          /index.php").userAgent("Mozilla/5.0 (Windows NT 6.3; Win64; x64)
          AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 
          Safari/537.36").ignoreContentType(true).get().text();

          System.out.println(str);

           Document doc = Jsoup.connect("http://localhost/hotel
           /index.php").get();

        String title = doc.title();

        System.out.println("Web Page Title is : "+title);

        Elements links = doc.select("a[href]");

        for(Element link :links){


            System.out.println("\nlink:"+link.attr("href"));

            System.out.println("text :"+link.text());

            String url1= link.attr("href");

            String text1= link.text();



            try { 

                String q = "INSERT INTO web(url,text)  

                VALUES('"+"http://localhost/hotel/"+url1+"','"+text1+"')";

                pst = conn.prepareStatement(q);

                pst.execute();


            } 

            catch (SQLException ex) {

                    Logger.getLogger(WebCrawler.class.getName()).
                    log(Level.SEVERE, null, ex);
            }


              FileWriter writer=new FileWriter("Output.html");

              writer.write(doc.toString());

              link.attr("href").toString();        
        }         
    }   
}

0 个答案:

没有答案