我正在努力将下面的爬虫类中的StringBuffer值传递给主类:
import java.util.Scanner;
import java.util.ArrayList;
import java.util.*;
import java.io.*;
import java.net.URL;
public class Crawler
{
public int threads = 0;
public int results = 0;
public String output; //this is the string I will pass the buffer value to
public void crawler(String startingURL)
{
ArrayList<String> listOfPendingURLs = new ArrayList<String>();
ArrayList<String> listOfTraversedURLs = new ArrayList<String>();
listOfPendingURLs.add(startingURL); //Add the starting URL to a list named listOfPendingURLs
while (!listOfPendingURLs.isEmpty() && //while listOfPendingURLs is not empty
listOfTraversedURLs.size() <= 100)
{
String urlString = listOfPendingURLs.remove(0); //Remove a URL from listOfPendingURLs
if (!listOfTraversedURLs.contains(urlString)) //if this URL is not in listOfTraversedURLs
{
listOfTraversedURLs.add(urlString); //Add it to listOfTraversedURLs
System.out.println("Craw " + urlString); //Display this URL -- change this to display on the panel
try
{
URL oURL = new URL(urlString);
BufferedReader in = new BufferedReader(
new InputStreamReader(oURL.openStream()));
StringBuffer strbuf = new StringBuffer();
String lines;
while ((lines = in.readLine()) != null)
strbuf.append(lines); //I want to pass html source code to the main from here
output = strbuf.toString(); //convert to string
strbuf.delete(0,strbuf.length());//empty the buffer
results++;//GUI statistics variable - for future use
in.close();
}
catch (Exception e)
{
e.printStackTrace();
}
for (String s: getSubURLs(urlString)) { //Read the page from this URL and for each URL contained in the page
if (!listOfTraversedURLs.contains(s))
listOfPendingURLs.add(s); //Add it to listOfPendingURLs if it is not is listOfTraversedURLs
} //Exit the while loop when the size of S is equal to 100
}
}
}
public static ArrayList<String> getSubURLs(String urlString) {
ArrayList<String> list = new ArrayList<String>();
try {
java.net.URL url = new java.net.URL(urlString);
Scanner input = new Scanner(url.openStream());
int current = 0;
while (input.hasNext())
{
String line = input.nextLine();
current = line.indexOf("http:", current);
while (current > 0)
{
int endIndex = line.indexOf("\"", current);
if (endIndex > 0) { // Ensure that a correct URL is found
list.add(line.substring(current, endIndex));
current = line.indexOf("http:", endIndex);
}
else
current = -1;
}
}
}
catch (Exception ex)
{
System.out.println("Error: " + ex.getMessage());
}
return list;
}
}
我试图传递缓冲区的主要功能如下所示:
public class mainApp
{
public static void main( String args[] )
{
int width = 800;
int height = 600;
String title = "Kangaroo";
int threads = 1;
int spiders = 20;
int results = 10325;
String status = "completed";
MyFrame frame = new MyFrame(width, height, title, threads, spiders, results, status);
Crawler crawler = new Crawler();
TextAnalyser textAnalyser = new TextAnalyser();
while(width > 1)
{
if(frame.startSearch == true)//strt crawling
{
//crawler.crawler(frame.url);
//System.out.println(crawler.source);
crawler.crawler(frame.url);//start getting html source code
textAnalyser.analyse(crawler.output);//send the source code to text analyser
}
}
}//end main method
}//end class
由于某种原因,它不会将缓冲区读为null也不会显示源代码...任何想法?我一直在尝试很多变化,但似乎没有任何变化。
答案 0 :(得分:0)
调整您的crawler方法以在调用时返回一个字符串,如:
public String crawler(String startingURL) {
String result;
listOfPendingURLs = new ArrayList<String>();
listOfTraversedURLs = new ArrayList<String>();
result = "";
listOfPendingURLs.add(startingURL);
while (!listOfPendingURLs.isEmpty()
&& listOfTraversedURLs.size() <= 100) {
String urlString = listOfPendingURLs.remove(0);
if (!listOfTraversedURLs.contains(urlString)) {
listOfTraversedURLs.add(urlString);
// TODO display on panel instead
System.out.println("Craw " + urlString);
try {
URL oURL = new URL(urlString);
BufferedReader in = new BufferedReader(
new InputStreamReader(oURL.openStream()));
StringBuffer strbuf = new StringBuffer();
String lines;
while ((lines = in.readLine()) != null) {
strbuf.append(lines);
}
result = result + strbuf.toString(); // convert to string
strbuf.delete(0, strbuf.length());// empty the buffer
results++;// GUI statistics variable - for future use
in.close();
}
catch (Exception e) {
e.printStackTrace();
}
for (String s : getSubURLs(urlString)) { // Read the page from
// this URL and for
// each URL
// contained in the
// page
if (!listOfTraversedURLs.contains(s))
listOfPendingURLs.add(s); // Add it to listOfPendingURLs
// if it is not is
// listOfTraversedURLs
}
}
}
return result;
}