如何拆分URL获取网页中的所有链接?

时间:2016-07-28 09:00:17

标签: java

我正在执行项目Hyperlink Crawler以检查断开的链接。这是我的代码。 www.utem.edu.my/portal/portal。这样链接就会出现404错误。我认为我的拆分URL代码是错误的。请帮帮我。

public class HInterface extends JFrame {

    // Declaring variables to be used as components in the interface
    private JLabel lblURL;
    private JTextField inputSearch;
    private JButton btnSearch;
    private JEditorPane outputLinks;

    public HInterface() {
        super("Hyperlink Crawler");
        setType(Type.POPUP);
        setResizable(false);
        getContentPane().setBackground(Color.BLACK);
        setTitle("Web Link Crawler For Inspecting Broken Link");
        FlowLayout flowLayout = new FlowLayout();
        flowLayout.setAlignment(FlowLayout.LEFT);
        getContentPane().setLayout(flowLayout);

        // Creates a label for displaying a text
        lblURL = new JLabel("\r\nENTER URL :    ");
        lblURL.setLocation(new Point(13, 9));
        lblURL.setDoubleBuffered(true);
        lblURL.setAlignmentY(Component.BOTTOM_ALIGNMENT);
        lblURL.setAlignmentX(Component.RIGHT_ALIGNMENT);
        lblURL.setVerticalAlignment(SwingConstants.TOP);
        lblURL.setForeground(Color.WHITE);
        lblURL.setFont(new Font("Tw Cen MT Condensed", Font.BOLD, 20));
        getContentPane().add(lblURL);

        // Creates text field for URL input
        inputSearch = new JTextField();
        inputSearch.setText("http://");
        inputSearch.setPreferredSize(new Dimension(400, 32));
        inputSearch.setFont(new Font("SansSerif", Font.BOLD, 17));
        getContentPane().add(inputSearch);

        // Creates a search button
        btnSearch = new JButton("  Search  ");
        btnSearch.setPreferredSize(new Dimension(100, 32));
        btnSearch.setFont(new Font("SansSerif", Font.BOLD, 13));
        getContentPane().add(btnSearch);

        // Adds the results text area to a scroll-able pane
        JScrollPane scrollOutput = new JScrollPane (JScrollPane.VERTICAL_SCROLLBAR_AS_NEEDED, JScrollPane.HORIZONTAL_SCROLLBAR_NEVER);
        scrollOutput.setBackground(Color.GRAY);
        scrollOutput.setPreferredSize(new Dimension(1100, 670));
        getContentPane().add(scrollOutput);

        outputLinks = new JEditorPane();
        outputLinks.addHyperlinkListener(new HyperlinkListener() {
            public void hyperlinkUpdate(HyperlinkEvent e) {
                if (HyperlinkEvent.EventType.ACTIVATED.equals(e.getEventType())) {
                    System.out.println(e.getURL());
                    Desktop desktop = Desktop.getDesktop();
                    try {
                        desktop.browse(e.getURL().toURI());
                    } catch (Exception ex) {
                        ex.printStackTrace();
                    }
                }

            }
        }); 

        outputLinks.setText("RESULT");
        outputLinks.setContentType("text/html");
        outputLinks.setEditable(true);
        scrollOutput.setColumnHeaderView(outputLinks);
        outputLinks.setEditable(false);

        // Add event handler to search button click
        HandleEvents theEventHandler = new HandleEvents();
        inputSearch.addActionListener(theEventHandler);
        btnSearch.addActionListener(theEventHandler);
    }

    private class HandleEvents implements ActionListener {

        public void actionPerformed(ActionEvent event) { // Called when elements are triggered
            // Preparing output variable string
            String strOutput = "No results were found!";

            if (event.getSource() == btnSearch || event.getSource() == inputSearch) {
                if (!inputSearch.getText().equals(""))
                    strOutput = crawlURL(inputSearch.getText());
                else
                    strOutput = "Please enter URL to crawl it's hyperlinks";
            }

            // Prints out the results
            outputLinks.setText(strOutput);
        }

        public String pullURL(String strUrl) {
            String resutls = "";
            URLConnection connection = null;
            try {
              connection =  new URL(strUrl).openConnection();
              @SuppressWarnings("resource")
            Scanner scanner = new Scanner(connection.getInputStream());
              scanner.useDelimiter("\\Z");
              if(scanner.hasNext())
                  resutls = scanner.next();
            } catch ( Exception ex ) {
                ex.printStackTrace();
            }

            return resutls;
        }


        public String crawlURL(String strUrl) {
            String results = ""; // For return
            String protocol = "http://";

            // Assigns the input to the inURL variable and checks to add http
            String inURL = strUrl;
            if (!inURL.toLowerCase().contains("http://".toLowerCase()) && 
                    !inURL.toLowerCase().contains("https://".toLowerCase())) {
                inURL = protocol + inURL;
            }

            // Pulls URL contents from the web
            String contectURL = pullURL(inURL);
            if (contectURL == "") { // If it fails, then try with https
                protocol = "https://";
                inURL = protocol + inURL.split("http://")[1];
                contectURL = pullURL(inURL);
            }

            // Declares some variables to be used inside loop
            String aTagAttr = "";
            String href = "";
            String msg = "";

            // Finds A tag and stores its href value into output var
            String bodyTag = contectURL.split("<body")[1]; // Find 1st <body>
            String[] aTags = bodyTag.split(">"); // Splits on every tag

            //To show link different from one another
            int index = 0;

            for (String s: aTags) {
                // Process only if it is A tag and contains href
                if (s.toLowerCase().contains("<a") && s.toLowerCase().contains("href")) {

                    aTagAttr = s.split("href")[1]; // Split on href

                    // Split on space if it contains it
                    if (aTagAttr.toLowerCase().contains("\\s"))
                        aTagAttr = aTagAttr.split("\\s")[2];

                    // Splits on the link and deals with " or ' quotes
                    href = aTagAttr.split(
                        ((aTagAttr.toLowerCase().contains("\""))? "\"" : "\'")
                    )[1];

                    if (!results.toLowerCase().contains(href)) 
                        //results += "~~~ " + href + "\r\n";

                    /*
                     * Last touches to URl before display
                     *      Adds http(s):// if not exist
                     *      Adds base url if not exist
                     */

                        if(results.toLowerCase().indexOf("about") != -1) {
                               //Contains 'about'
                            }
                    if (!href.toLowerCase().contains("http://") &&
                            !href.toLowerCase().contains("https://")) {

                        // http:// + baseURL + href
                        if (!href.toLowerCase().contains(inURL.split("://")[1]))
                            href = protocol + inURL.split("://")[1] + href;
                        else
                            href = protocol + href;
                    }

                    System.out.println(href);//debug

                    try {
                        msg = URLheker(href);
                    } catch (Exception e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }

                    // Store the link in output var
                    if (!results.toLowerCase().contains(href)){


                        results += "<a href=\"";
                        results += href;
                        results += "\">";
                        results +=  "Link" + (index + 1)+ " : "+ href  ;
                        results += "</a>";
                        results += "                                                                                        :  ";
                        results += msg;
                        results += "<br>";
                        index++;
                    }

                }

            }

            System.out.println(results);
            return results;

        }
    }


public String URLheker(String href) throws Exception {

    String msg = "";
    int code = 0;
    URL url = new URL(href);
    URLConnection connection = url.openConnection();
        if(connection instanceof HttpURLConnection) {
            HttpURLConnection httpconn=(HttpURLConnection)connection;
            code = httpconn.getResponseCode();
            msg = httpconn.getResponseMessage();
                if(code == HttpURLConnection.HTTP_OK )
                    System.out.println("Return normal response :"+msg);
                else
                    System.out.println(code);
            }

        msg = msg+" [" + Integer.toString(code) + "]";

return msg; 

    }

}

1 个答案:

答案 0 :(得分:0)

我不确定它是否可以解决您的问题,但您可以在从连接获取输入流之前检查响应代码:

public String pullURL(String strUrl) {
    String resutls = "";
    HttpURLConnection connection = null;
    try {
        connection = (HttpURLConnection)new URL(strUrl).openConnection();
        //connection.getResponseCode() <- CHECK YOUR RESPONSE CODE
        @SuppressWarnings("resource") Scanner scanner = new Scanner(connection.getInputStream());
        scanner.useDelimiter("\\Z");
        if (scanner.hasNext())
            resutls = scanner.next();
    } catch (Exception ex) {
        ex.printStackTrace();
    }

    return resutls;
}