MapReduce Hadoop 2.4.1 Reducer not running

Posted: 2015-10-11 21:48:08

Tags: java hadoop mapreduce

For some reason, my Reducer does not seem to be running.

My driver:

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


public class PageRank {

    public static void main(String[] args) throws Exception {
        PageRank pageRanking = new PageRank();

        //In and Out dirs in HDFS

        pageRanking.runXmlParsing(args[0], args[1]);
        System.out.println("finished");

    }

    public void runXmlParsing(String inputPath, String outputPath) throws IOException {
        Configuration conf = new Configuration();
        conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
        conf.set(XmlInputFormat.END_TAG_KEY, "</page>");

        Job job1 = Job.getInstance(conf);
        job1.setJarByClass(PageRank.class);

        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);

        // Our class to parse links from content.
        job1.setMapperClass(WikiPageXMLMapper.class);
        job1.setReducerClass(WikiLinksReducer.class);

        job1.setInputFormatClass(XmlInputFormat.class);
        job1.setOutputFormatClass(TextOutputFormat.class);

        // Remove output if it already exists (note: getLocal() deletes on the
        // local filesystem, not HDFS, so an existing HDFS output dir stays put)
        FileSystem.getLocal(conf).delete(new Path(outputPath), true);

        FileInputFormat.setInputPaths(job1, new Path(inputPath));
        FileOutputFormat.setOutputPath(job1, new Path(outputPath));  

        System.out.println("BEFORE RUN");

        try {
            job1.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }    
    }

    public void deleteDir(File dir) {
        File[] files = dir.listFiles();

        for (File myFile: files) {
            if (myFile.isDirectory()) {  
                deleteDir(myFile);
            } 
            myFile.delete();

        }
    }
}

My Mapper is:

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;


public class WikiPageXMLMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    public void map(LongWritable key, Text value, Context output) throws IOException {

        String[] titleAndText = parseTitleAndText(value.toString());

        String pageString = titleAndText[0];
        Text page = new Text(pageString.replace(' ', '_'));

        String[] parts = titleAndText[1].split("\\[\\[");

        String pages = "!@#$ ";
        for (int i = 1; i < parts.length; i++) {
            int lastIndexBrackets = parts[i].lastIndexOf("]]");
            // This checks and skips the first part of the outer link
            if (lastIndexBrackets == -1)
                continue;

            String insideLinkPlusExtra = parts[i].substring(0, lastIndexBrackets);
            int multipleClosingBrackets = insideLinkPlusExtra.indexOf("]]");

            String otherPage = insideLinkPlusExtra;

            if (multipleClosingBrackets != -1) {
                otherPage = insideLinkPlusExtra.substring(0, multipleClosingBrackets);
            }

            otherPage = otherPage.split("\\|")[0];
            otherPage = checkForDuplicates(otherPage, pages);
            otherPage = (otherPage.indexOf(":") == -1) ? otherPage : "";
            otherPage = (otherPage.indexOf("#") == -1) ? otherPage : "";
            otherPage = checkForSubpageLinks(otherPage);
            otherPage = checkForRedLink(otherPage);

            if (otherPage == "")
                continue;

            Text oP = new Text(otherPage.replace(' ', '_'));
            pages += oP + " ";

            // taking each outlink and making it its own key (ingraph)
            try {
                output.write(new Text(oP), new Text(page));
            } catch (InterruptedException e) {
                e.printStackTrace();
            }                   
        }

        // Designate this page as not a redlink
        try {
            output.write(new Text(page), new Text("!@#$"));
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return; 
    }
 }

My Reducer is:

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;


public class WikiLinksReducer extends Reducer<Text, Text, Text, Text> {

    public void reduce(Text key, Iterator<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException {

        System.out.println("REDUCER");
        String links = "";        
        boolean isNotRedLink = false;

        System.out.println("Starting reduce");

        // Brett concern (and zach's): if n pages link to a redlink
        // we will iterate n times and it could be wasteful
        while(values.hasNext()){
            String v = values.next().toString();

            // Check first outlink is not #, if so, it is a redlink
            if (v.equals("!@#$")) {
                isNotRedLink = true;
                continue;

            } else {
                links += v;
                continue;
            }
        }

        // If the key is not a redlink, send it to the output
        if (isNotRedLink) {

            try {
                output.write(key, new Text(links));
                output.write(key, new Text("TESTING!"));
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            System.out.println(links);


        } else {

            System.out.println(output);
            try {
                output.write(key, new Text("BLEG"));
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }

            System.out.println(key + " IS A RED LINK");
            return;
        }
     }
}

The System.out.println(...) calls in my Reducer never print to the console, and the program's output (the files it leaves on my hard drive) contains only the Mapper's results.

1 Answer:

Answer 0 (score: 0)

I feel silly. I had Iterator instead of Iterable in the line public void reduce(Text key, Iterable<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException, and changing it solved my problem. With Iterator the method no longer matches the signature of Reducer.reduce(), so it never overrides it; Hadoop then runs the inherited identity reduce, which just copies the map output straight through. That is why the output contained only the Mapper's results and the println calls in the reducer never ran.
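For reference, here is a minimal sketch of what the corrected reducer could look like, keeping the !@#$ red-link marker convention and the class name from the question. Adding @Override (as the mapper's map method above already does) would have turned the Iterator mistake into a compile error instead of a silent fallback to the identity reducer:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WikiLinksReducer extends Reducer<Text, Text, Text, Text> {

    // @Override compiles only because this now matches Reducer.reduce();
    // with Iterator<Text> it would be rejected at compile time.
    @Override
    public void reduce(Text key, Iterable<Text> values, Context output)
            throws IOException, InterruptedException {

        StringBuilder links = new StringBuilder();
        boolean isNotRedLink = false;

        for (Text value : values) {
            String v = value.toString();
            if (v.equals("!@#$")) {
                // Marker the mapper emits once per real page: the key is not a red link.
                isNotRedLink = true;
            } else {
                // v is a page that links to this key.
                links.append(v).append(" ");
            }
        }

        // Only pages that actually exist (i.e. are not red links) are written out.
        if (isNotRedLink) {
            output.write(key, new Text(links.toString().trim()));
        }
    }
}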