爬取网页时出错:字符串索引超出范围

时间:2014-03-11 12:23:36

标签: java http web-crawler search-engine text-analysis

我的程序在抓取完前 2 个 URL 之后一直报错:Exception in thread "AWT-EventQueue-0" java.lang.StringIndexOutOfBoundsException: String index out of range: 0。前几个 URL 的抓取结果符合我的预期,我使用另一个类中的方法从这些页面中获取文本。问题也可能出在另一个类里,但我不确定。请查看我的代码,看看发生了什么。

package WebCrawler;

import java.util.Scanner;
import java.util.ArrayList;

import static TextAnalyser.Textanalyser.analyse;

public class Crawler {

    public static void main(String[] args) {
        //   java.util.Scanner input = new java.util.Scanner(System.in);
        //  System.out.print("Enter a URL: ");
        //  String url = input.nextLine();
        crawler("http://www.port.ac.uk/"); // Traverse the Web from the a starting url 
    }

    public static void crawler(String startingURL) {
        ArrayList<String> listOfPendingURLs = new ArrayList<String>();
        ArrayList<String> listOfTraversedURLs = new ArrayList<String>();

        listOfPendingURLs.add(startingURL);
        while (!listOfPendingURLs.isEmpty() && listOfTraversedURLs.size() <= 100) {
            String urlString = listOfPendingURLs.remove(0);

            if (!listOfTraversedURLs.contains(urlString)) {
                listOfTraversedURLs.add(urlString);
                String text = urlString;
                text = ReadTextfromURL.gettext(text);
                text = analyse(text);
                System.out.println("text : " + text);
                System.out.println("Craw " + urlString);

                for (String s: getSubURLs(urlString)) {
                    if (!listOfTraversedURLs.contains(s)) {
                        listOfPendingURLs.add(s);
                    }
                }
            }
        }
    }

    public static ArrayList<String> getSubURLs(String urlString) {
        ArrayList <String> list = new ArrayList<String>();

        try {
            java.net.URL url = new java.net.URL(urlString);
            Scanner input = new Scanner(url.openStream());
            int current = 0;
            while (input.hasNext()) {
                String line = input.nextLine();
                current = line.indexOf("http:", current);
                while (current > 0) {
                    int endIndex = line.indexOf("\"", current);
                    if (endIndex > 0) { // Ensure that a correct URL is found 
                        list.add(line.substring(current, endIndex));
                        current = line.indexOf("http:", endIndex);
                    } else {
                        current = -1;
                    }
                }
            }
        } catch (Exception ex) {
            System.out.println("Error: " + ex.getMessage());
        }

        return list;
    }
}

0 个答案:

没有答案