Below is the reducer class of my web crawler in Hadoop. It uses a Jsoup HTTP connection, and the import of org.jsoup.Connection raises an error. PS - I have already added the jsoup jar to Hadoop, and all the other imports work fine.
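For context, here is a minimal driver sketch of one common way to ship a third-party jar such as jsoup with the job so the task JVMs can load it at runtime. This is only an illustration: the driver class name, the HDFS path to the jsoup jar, and the commented-out mapper are placeholders, not my exact setup.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CrawlerDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word crawler");
        job.setJarByClass(rcrawler.class);
        // job.setMapperClass(UrlWordMapper.class); // hypothetical mapper emitting <url, word> pairs
        job.setReducerClass(rcrawler.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // The jsoup jar was copied to HDFS beforehand; adding it here ships it to
        // every task JVM so org.jsoup.* can be resolved inside the reducer.
        job.addFileToClassPath(new Path("/user/hadoop/lib/jsoup-1.8.3.jar"));
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}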
Here is the reducer code:
import java.io.IOException;
/*import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
//import lakshay.webcrawler.com.SpiderLeg;
*/
//import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/*import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
*/
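/*
* Reducer that, for each (url, word) pair, fetches the page at the url with Jsoup
* and emits a message saying whether the word was found, together with 1 or 0.
*/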
public class rcrawler
extends Reducer<Text, Text, Text, Text> {
//private static final int MAX_PAGES_TO_SEARCH = 5;
//private Set<String> pagesVisited = new HashSet<String>();
//private List<String> pagesToVisit = new LinkedList<String>();
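// Browser-style User-Agent header sent with every Jsoup request.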
private static final String USER_AGENT =
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
// private List<String> links = new LinkedList<String>();
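// Parsed HTML document from the most recent successful crawl().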
private Document htmlDocument;
/*
* This method crawls the given url, searches the page for the word, and returns
* a message saying whether the word was found there.
*/
public String search(String url, String searchWord){
String CurrentUrl = url;
rcrawler leg = new rcrawler();
/*if(this.pagesToVisit.isEmpty()){
CurrentUrl = url;
this.pagesVisited.add(url);
}
else
{
CurrentUrl = this.nextUrl();
}
*/
leg.crawl(CurrentUrl);
boolean success = leg.searchForWord(searchWord);
if(success){
//System.out.println(String.format("**Success** Word %s found at %s", searchWord, CurrentUrl));
return String.format("Success, Word %s found", searchWord);
}
return String.format("Failed, Word %s not found", searchWord);
//this.pagesToVisit.addAll(leg.getLinks());
//System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");
}
/*
* This method returns the next url for the search method above to process
* (commented out along with the visited/to-visit lists).
*/
/* private String nextUrl()
{
String nextUrl;
do
{
nextUrl = this.pagesToVisit.remove(0);
} while(this.pagesVisited.contains(nextUrl));
this.pagesVisited.add(nextUrl);
return nextUrl;
}
*/
/*
* This method makes the HTTP request for the given url and (in the commented-out
* part) reports how many links were found on that particular page.
*/
public boolean crawl(String url)
{
try
{
Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
Document htmlDocument = connection.get();
this.htmlDocument = htmlDocument;
// Check the content type before the early return below so that this test is actually reachable.
if(!connection.response().contentType().contains("text/html"))
{
//System.out.println("**Failure** Retrieved something other than HTML");
return false;
}
if(connection.response().statusCode() == 200) // 200 is the HTTP OK status code
// indicating that everything is great.
{
//System.out.println("\n**Visiting** Received web page at " + url);
return true;
}
/* Elements linksOnPage = htmlDocument.select("a[href]");
System.out.println("Found (" + linksOnPage.size() + ") links");
for(Element link : linksOnPage)
{
this.links.add(link.absUrl("href"));
}
*/
return true;
}
catch(IOException ioe)
{
// We were not successful in our HTTP request
return false;
}
}
/*
* This method looks for the word in the body of the whole page and returns the result to the search method above.
*/
public boolean searchForWord(String searchWord)
{
// Defensive coding. This method should only be used after a successful crawl.
if(this.htmlDocument == null)
{
//System.out.println("ERROR! Call crawl() before performing analysis on the document");
return false;
}
//System.out.println("Searching for the word " + searchWord + "...");
String bodyText = this.htmlDocument.body().text();
return bodyText.toLowerCase().contains(searchWord.toLowerCase());
}
/*
* This function is to return all the links that the url consists of
*/
/*public List<String> getLinks()
{
return this.links;
}
*/
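/*
* For each (url, word) pair, crawl the url, search the page for the word,
* and emit the result message together with 1 (found) or 0 (not found).
*/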
@Override
public void reduce(Text Key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for(Text value : values ) {
rcrawler spider = new rcrawler();
String Key1 = Key.toString();
String value1 = value.toString();
String result = spider.search(Key1, value1);
Text result1 = new Text(result);
int r;
// Comparing Strings with == checks references, not contents; test the result message directly instead.
if(result.startsWith("Success"))
r = 1;
else
r = 0;
String str = Integer.toString(r);
Text r1 = new Text(str);
context.write(result1, r1);
}
//spider.search("http://w...content-available-to-author-only...t.in/", "MIME");
//context.write(searchWord, CurrentUrl);
}
}