基本源代码来自here
import java.applet.Applet;
import java.awt.*;
import java.awt.List;
import java.awt.event.*;
import java.util.*;
import java.net.*;
import java.io.*;
public class WebCrawler extends Applet implements ActionListener, Runnable
{
private static final long serialVersionUID = 1L;
public static final String SEARCH = "Search";
public static final String STOP = "Stop";
public static final String DISALLOW = "Disallow:";
public static final int SEARCH_LIMIT = 50;
Panel panelMain;
List listMatches;
Label labelStatus;
// URLs to be searched
Vector<String> vectorToSearch;
// URLs already searched
Vector<String> vectorSearched;
// URLs which match
Vector<String> vectorMatches;
Thread searchThread;
TextField textURL;
Choice choiceType;
public void init()
{
// set up the main UI panel
panelMain = new Panel();
panelMain.setLayout(new BorderLayout(5, 5));
// text entry components
Panel panelEntry = new Panel();
panelEntry.setLayout(new BorderLayout(5, 5));
Panel panelURL = new Panel();
panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
Label labelURL = new Label("Starting URL: ", Label.RIGHT);
panelURL.add(labelURL);
textURL = new TextField("", 40);
panelURL.add(textURL);
panelEntry.add("North", panelURL);
Panel panelType = new Panel();
panelType.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
Label labelType = new Label("Content type: ", Label.RIGHT);
panelType.add(labelType);
choiceType = new Choice();
choiceType.addItem("text/html");
choiceType.addItem("audio/basic");
choiceType.addItem("audio/au");
choiceType.addItem("audio/aiff");
choiceType.addItem("audio/wav");
choiceType.addItem("video/mpeg");
choiceType.addItem("video/x-avi");
panelType.add(choiceType);
panelEntry.add("South", panelType);
panelMain.add("North", panelEntry);
// list of result URLs
Panel panelListButtons = new Panel();
panelListButtons.setLayout(new BorderLayout(5, 5));
Panel panelList = new Panel();
panelList.setLayout(new BorderLayout(5, 5));
Label labelResults = new Label("Search results");
panelList.add("North", labelResults);
Panel panelListCurrent = new Panel();
panelListCurrent.setLayout(new BorderLayout(5, 5));
listMatches = new List(10);
panelListCurrent.add("North", listMatches);
labelStatus = new Label("");
panelListCurrent.add("South", labelStatus);
panelList.add("South", panelListCurrent);
panelListButtons.add("North", panelList);
// control buttons
Panel panelButtons = new Panel();
Button buttonSearch = new Button(SEARCH);
buttonSearch.addActionListener(this);
panelButtons.add(buttonSearch);
Button buttonStop = new Button(STOP);
buttonStop.addActionListener(this);
panelButtons.add(buttonStop);
panelListButtons.add("South", panelButtons);
panelMain.add("South", panelListButtons);
add(panelMain);
setVisible(true);
repaint();
// initialize search data structures
vectorToSearch = new Vector<String>();
vectorSearched = new Vector<String>();
vectorMatches = new Vector<String>();
// set default for URL access
URLConnection.setDefaultAllowUserInteraction(false);
}
public void start()
{}
public void stop()
{
if (searchThread != null)
{
setStatus("stopping...");
searchThread = null;
}
}
public void destroy()
{}
boolean robotSafe(URL url)
{
String strHost = url.getHost();
// form URL of the robots.txt file
String strRobot = "http://" + strHost + "/robots.txt";
URL urlRobot;
try
{
urlRobot = new URL(strRobot);
}
catch (MalformedURLException e)
{
// something weird is happening, so don't trust it
return false;
}
String strCommands;
try
{
InputStream urlRobotStream = urlRobot.openStream();
// read in entire file
byte b[] = new byte[1000];
int numRead = urlRobotStream.read(b);
strCommands = new String(b, 0, numRead);
while (numRead != -1)
{
if (Thread.currentThread() != searchThread) break;
numRead = urlRobotStream.read(b);
if (numRead != -1)
{
String newCommands = new String(b, 0, numRead);
strCommands += newCommands;
}
}
urlRobotStream.close();
}
catch (IOException e)
{
// if there is no robots.txt file, it is OK to search
return true;
}
// assume that this robots.txt refers to us and
// search for "Disallow:" commands.
String strURL = url.getFile();
int index = 0;
while ((index = strCommands.indexOf(DISALLOW, index)) != -1)
{
index += DISALLOW.length();
String strPath = strCommands.substring(index);
StringTokenizer st = new StringTokenizer(strPath);
if (!st.hasMoreTokens()) break;
String strBadPath = st.nextToken();
// if the URL starts with a disallowed path, it is not safe
if (strURL.indexOf(strBadPath) == 0) return false;
}
return true;
}
public void paint(Graphics g)
{
// Draw a Rectangle around the applet's display area.
g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);
panelMain.paint(g);
panelMain.paintComponents(g);
// update(g);
// panelMain.update(g);
}
public void run()
{
String strURL = textURL.getText();
String strTargetType = choiceType.getSelectedItem();
int numberSearched = 0;
int numberFound = 0;
if (strURL.length() == 0)
{
setStatus("ERROR: must enter a starting URL");
return;
}
// initialize search data structures
vectorToSearch.removeAllElements();
vectorSearched.removeAllElements();
vectorMatches.removeAllElements();
listMatches.removeAll();
vectorToSearch.addElement(strURL);
while ((vectorToSearch.size() > 0) && (Thread.currentThread() == searchThread))
{
// get the first element from the to be searched list
strURL = (String) vectorToSearch.elementAt(0);
setStatus("searching " + strURL);
URL url;
try
{
url = new URL(strURL);
}
catch (MalformedURLException e)
{
setStatus("ERROR: invalid URL " + strURL);
break;
}
// mark the URL as searched (we want this one way or the other)
vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL);
// can only search http: protocol URLs
if (url.getProtocol().compareTo("http") != 0) break;
// test to make sure it is before searching
if (!robotSafe(url)) break;
try
{
// try opening the URL
URLConnection urlConnection = url.openConnection();
urlConnection.setAllowUserInteraction(false);
InputStream urlStream = url.openStream();
String type = URLConnection.guessContentTypeFromStream(urlStream);
if (type == null) break;
if (type.compareTo("text/html") != 0) break;
// search the input stream for links
// first, read in the entire URL
byte b[] = new byte[1000];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1)
{
if (Thread.currentThread() != searchThread) break;
numRead = urlStream.read(b);
if (numRead != -1)
{
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close();
if (Thread.currentThread() != searchThread) break;
String lowerCaseContent = content.toLowerCase();
int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1) break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1) break;
if (Thread.currentThread() != searchThread) break;
index++;
String remaining = content.substring(index);
StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();
URL urlLink;
try
{
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
}
catch (MalformedURLException e)
{
setStatus("ERROR: bad URL " + strLink);
continue;
}
// only look at http links
if (urlLink.getProtocol().compareTo("http") != 0) break;
if (Thread.currentThread() != searchThread) break;
try
{
// try opening the URL
URLConnection urlLinkConnection = urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType =
URLConnection.guessContentTypeFromStream(linkStream);
linkStream.close();
// if another page, add to the end of search list
if (strType == null) break;
if (strType.compareTo("text/html") == 0)
{
// check to see if this URL has already been
// searched or is going to be searched
if ((!vectorSearched.contains(strLink)) &&
(!vectorToSearch.contains(strLink)))
{
// test to make sure it is robot-safe!
if (robotSafe(urlLink)) vectorToSearch
.addElement(strLink);
}
}
// if the proper type, add it to the results list
// unless we have already seen it
if (strType.compareTo(strTargetType) == 0)
{
if (vectorMatches.contains(strLink) == false)
{
listMatches.add(strLink);
vectorMatches.addElement(strLink);
numberFound++;
if (numberFound >= SEARCH_LIMIT) break;
}
}
}
catch (IOException e)
{
setStatus("ERROR: couldn't open URL " + strLink);
continue;
}
}
}
catch (IOException e)
{
setStatus("ERROR: couldn't open URL " + strURL);
break;
}
numberSearched++;
if (numberSearched >= SEARCH_LIMIT) break;
}
if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT) setStatus("reached search limit of " +
SEARCH_LIMIT);
else setStatus("done");
searchThread = null;
// searchThread.stop();
}
void setStatus(String status)
{
labelStatus.setText(status);
}
public void actionPerformed(ActionEvent event)
{
String command = event.getActionCommand();
if (command.compareTo(SEARCH) == 0)
{
setStatus("searching...");
// launch a thread to do the search
if (searchThread == null)
{
searchThread = new Thread(this);
}
searchThread.start();
}
else if (command.compareTo(STOP) == 0)
{
stop();
}
}
public static void main(String argv[])
{
Frame f = new Frame("WebFrame");
WebCrawler applet = new WebCrawler();
f.add("Center", applet);
/*
* Behind a firewall set your proxy and port here!
*/
// Properties props= new Properties(System.getProperties());
// props.put("http.proxySet", "true");
// props.put("http.proxyHost", "webcache-cup");
// props.put("http.proxyPort", "8080");
//
// Properties newprops = new Properties(props);
// System.setProperties(newprops);
/**/
applet.init();
applet.start();
f.pack();
// f.show();
}
}
答案 0 :(得分:2)
可能是因为这句话:
String type = URLConnection.guessContentTypeFromStream(urlStream);
对于我尝试过的大部分网站,都返回了null
。这种方法很严重。即使是Java语言之父和最初编写此方法的人James Gosling也在early API docs承认这一点:
guessContentTypeFromStream(InputStream)
// This disgusting hack is used to check for files have some type that can be determined by inspection.
这个措辞已在最新的API文档中删除,但该方法仍然是一个黑客。以下是代码片段:
static public String guessContentTypeFromStream(InputStream is)
{
is.mark(12);
int c1 = is.read();
int c2 = is.read();
int c3 = is.read();
int c4 = is.read();
int c5 = is.read();
int c6 = is.read();
int c7 = is.read();
int c8 = is.read();
int c9 = is.read();
int c10 = is.read();
int c11 = is.read();
is.reset();
if (c1 == '<') {
if (c2 == '!'
|| ((c2 == 'h' && (c3 == 't' && c4 == 'm' && c5 == 'l' ||
c3 == 'e' && c4 == 'a' && c5 == 'd') ||
(c2 == 'b' && c3 == 'o' && c4 == 'd' && c5 == 'y'))) ||
((c2 == 'H' && (c3 == 'T' && c4 == 'M' && c5 == 'L' ||
c3 == 'E' && c4 == 'A' && c5 == 'D') ||
(c2 == 'B' && c3 == 'O' && c4 == 'D' && c5 == 'Y')))) {
return "text/html";
}
}
它在做什么?读取前11个字节并查看是否显示<!
,<html
,<head
,<body
,<HTML
,<HEAD
或{{1} }。
严重。这是Java 6中的送货代码。这甚至可能符合DailyWTF CodeSOD的条件。 *
所以无论如何,网页不再这样做了,即使他们在1998年撰写该文章时也是如此。说到这一点,根据今天的标准,文章中的代码也非常糟糕。我甚至都试图挽救它。我会在那里看some of the much better open-source web-crawlers。