我正在执行项目Hyperlink Crawler
以检查断开的链接。这是我的代码。 www.utem.edu.my/portal/portal。这样链接就会出现404错误。我认为我的拆分URL代码是错误的。请帮帮我。
public class HInterface extends JFrame {
// Declaring variables to be used as components in the interface
private JLabel lblURL;
private JTextField inputSearch;
private JButton btnSearch;
private JEditorPane outputLinks;
public HInterface() {
super("Hyperlink Crawler");
setType(Type.POPUP);
setResizable(false);
getContentPane().setBackground(Color.BLACK);
setTitle("Web Link Crawler For Inspecting Broken Link");
FlowLayout flowLayout = new FlowLayout();
flowLayout.setAlignment(FlowLayout.LEFT);
getContentPane().setLayout(flowLayout);
// Creates a label for displaying a text
lblURL = new JLabel("\r\nENTER URL : ");
lblURL.setLocation(new Point(13, 9));
lblURL.setDoubleBuffered(true);
lblURL.setAlignmentY(Component.BOTTOM_ALIGNMENT);
lblURL.setAlignmentX(Component.RIGHT_ALIGNMENT);
lblURL.setVerticalAlignment(SwingConstants.TOP);
lblURL.setForeground(Color.WHITE);
lblURL.setFont(new Font("Tw Cen MT Condensed", Font.BOLD, 20));
getContentPane().add(lblURL);
// Creates text field for URL input
inputSearch = new JTextField();
inputSearch.setText("http://");
inputSearch.setPreferredSize(new Dimension(400, 32));
inputSearch.setFont(new Font("SansSerif", Font.BOLD, 17));
getContentPane().add(inputSearch);
// Creates a search button
btnSearch = new JButton(" Search ");
btnSearch.setPreferredSize(new Dimension(100, 32));
btnSearch.setFont(new Font("SansSerif", Font.BOLD, 13));
getContentPane().add(btnSearch);
// Adds the results text area to a scroll-able pane
JScrollPane scrollOutput = new JScrollPane (JScrollPane.VERTICAL_SCROLLBAR_AS_NEEDED, JScrollPane.HORIZONTAL_SCROLLBAR_NEVER);
scrollOutput.setBackground(Color.GRAY);
scrollOutput.setPreferredSize(new Dimension(1100, 670));
getContentPane().add(scrollOutput);
outputLinks = new JEditorPane();
outputLinks.addHyperlinkListener(new HyperlinkListener() {
public void hyperlinkUpdate(HyperlinkEvent e) {
if (HyperlinkEvent.EventType.ACTIVATED.equals(e.getEventType())) {
System.out.println(e.getURL());
Desktop desktop = Desktop.getDesktop();
try {
desktop.browse(e.getURL().toURI());
} catch (Exception ex) {
ex.printStackTrace();
}
}
}
});
outputLinks.setText("RESULT");
outputLinks.setContentType("text/html");
outputLinks.setEditable(true);
scrollOutput.setColumnHeaderView(outputLinks);
outputLinks.setEditable(false);
// Add event handler to search button click
HandleEvents theEventHandler = new HandleEvents();
inputSearch.addActionListener(theEventHandler);
btnSearch.addActionListener(theEventHandler);
}
private class HandleEvents implements ActionListener {
public void actionPerformed(ActionEvent event) { // Called when elements are triggered
// Preparing output variable string
String strOutput = "No results were found!";
if (event.getSource() == btnSearch || event.getSource() == inputSearch) {
if (!inputSearch.getText().equals(""))
strOutput = crawlURL(inputSearch.getText());
else
strOutput = "Please enter URL to crawl it's hyperlinks";
}
// Prints out the results
outputLinks.setText(strOutput);
}
public String pullURL(String strUrl) {
String resutls = "";
URLConnection connection = null;
try {
connection = new URL(strUrl).openConnection();
@SuppressWarnings("resource")
Scanner scanner = new Scanner(connection.getInputStream());
scanner.useDelimiter("\\Z");
if(scanner.hasNext())
resutls = scanner.next();
} catch ( Exception ex ) {
ex.printStackTrace();
}
return resutls;
}
public String crawlURL(String strUrl) {
String results = ""; // For return
String protocol = "http://";
// Assigns the input to the inURL variable and checks to add http
String inURL = strUrl;
if (!inURL.toLowerCase().contains("http://".toLowerCase()) &&
!inURL.toLowerCase().contains("https://".toLowerCase())) {
inURL = protocol + inURL;
}
// Pulls URL contents from the web
String contectURL = pullURL(inURL);
if (contectURL == "") { // If it fails, then try with https
protocol = "https://";
inURL = protocol + inURL.split("http://")[1];
contectURL = pullURL(inURL);
}
// Declares some variables to be used inside loop
String aTagAttr = "";
String href = "";
String msg = "";
// Finds A tag and stores its href value into output var
String bodyTag = contectURL.split("<body")[1]; // Find 1st <body>
String[] aTags = bodyTag.split(">"); // Splits on every tag
//To show link different from one another
int index = 0;
for (String s: aTags) {
// Process only if it is A tag and contains href
if (s.toLowerCase().contains("<a") && s.toLowerCase().contains("href")) {
aTagAttr = s.split("href")[1]; // Split on href
// Split on space if it contains it
if (aTagAttr.toLowerCase().contains("\\s"))
aTagAttr = aTagAttr.split("\\s")[2];
// Splits on the link and deals with " or ' quotes
href = aTagAttr.split(
((aTagAttr.toLowerCase().contains("\""))? "\"" : "\'")
)[1];
if (!results.toLowerCase().contains(href))
//results += "~~~ " + href + "\r\n";
/*
* Last touches to URl before display
* Adds http(s):// if not exist
* Adds base url if not exist
*/
if(results.toLowerCase().indexOf("about") != -1) {
//Contains 'about'
}
if (!href.toLowerCase().contains("http://") &&
!href.toLowerCase().contains("https://")) {
// http:// + baseURL + href
if (!href.toLowerCase().contains(inURL.split("://")[1]))
href = protocol + inURL.split("://")[1] + href;
else
href = protocol + href;
}
System.out.println(href);//debug
try {
msg = URLheker(href);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// Store the link in output var
if (!results.toLowerCase().contains(href)){
results += "<a href=\"";
results += href;
results += "\">";
results += "Link" + (index + 1)+ " : "+ href ;
results += "</a>";
results += " : ";
results += msg;
results += "<br>";
index++;
}
}
}
System.out.println(results);
return results;
}
}
public String URLheker(String href) throws Exception {
String msg = "";
int code = 0;
URL url = new URL(href);
URLConnection connection = url.openConnection();
if(connection instanceof HttpURLConnection) {
HttpURLConnection httpconn=(HttpURLConnection)connection;
code = httpconn.getResponseCode();
msg = httpconn.getResponseMessage();
if(code == HttpURLConnection.HTTP_OK )
System.out.println("Return normal response :"+msg);
else
System.out.println(code);
}
msg = msg+" [" + Integer.toString(code) + "]";
return msg;
}
}
答案 0 :(得分:0)
我不确定它是否可以解决您的问题,但您可以在从连接获取输入流之前检查响应代码:
public String pullURL(String strUrl) {
String resutls = "";
HttpURLConnection connection = null;
try {
connection = (HttpURLConnection)new URL(strUrl).openConnection();
//connection.getResponseCode() <- CHECK YOUR RESPONSE CODE
@SuppressWarnings("resource") Scanner scanner = new Scanner(connection.getInputStream());
scanner.useDelimiter("\\Z");
if (scanner.hasNext())
resutls = scanner.next();
} catch (Exception ex) {
ex.printStackTrace();
}
return resutls;
}