Cleaning data with a MapReduce program

Date: 2015-03-18 17:23:10

Tags: hadoop mapreduce hdfs hadoop-streaming hadoop2

I have 30 lines of data. I am trying to clean the data with a MapReduce program. The data does get cleaned, but the output shows only one line instead of 30. I suspect the record reader is not reading the file line by line here. Could you check my code and let me know where the problem is? I am new to Hadoop.

Data:

 1  Vlan154.DEL-ISP-COR-SWH-002.mantraonline.com (61.95.250.140)  0.460 ms  0.374 ms  0.351 ms
 2  202.56.223.213 (202.56.223.213)  39.718 ms  39.511 ms  39.559 ms
 3  202.56.223.17 (202.56.223.17)  39.714 ms  39.724 ms  39.628 ms
 4  125.21.167.153 (125.21.167.153)  41.114 ms  40.001 ms  39.457 ms
 5  203.208.190.65 (203.208.190.65)  120.340 ms  71.384 ms  71.346 ms
 6  ge-0-1-0-0.sngtp-dr1.ix.singtel.com (203.208.149.158)  71.493 ms ge-0-1-2-0.sngtp-dr1.ix.singtel.com (203.208.149.210)  71.183 ms ge-0-1-0-0.sngtp-dr1.ix.singtel.com (203.208.149.158)  71.739 ms
 7  ge-0-0-0-0.sngtp-ar3.ix.singtel.com (203.208.182.2)  80.917 ms ge-2-0-0-0.sngtp-ar3.ix.singtel.com (203.208.183.20)  71.550 ms ge-1-0-0-0.sngtp-ar3.ix.singtel.com (203.208.182.6)  71.534 ms
 8  203.208.151.26 (203.208.151.26)  141.716 ms 203.208.145.190 (203.208.145.190)  134.740 ms 203.208.151.26 (203.208.151.26)  142.453 ms
 9  219.158.3.225 (219.158.3.225)  138.774 ms  157.205 ms  157.123 ms
10  219.158.4.69 (219.158.4.69)  156.865 ms  157.044 ms  156.845 ms
11  202.96.12.62 (202.96.12.62)  157.109 ms  160.294 ms  159.805 ms
12  61.148.3.58 (61.148.3.58)  159.521 ms  178.088 ms  160.004 ms
     MPLS Label=33 CoS=5 TTL=1 S=0
13  202.106.48.18 (202.106.48.18)  199.730 ms  181.263 ms  181.300 ms
14  * * *
15  * * *
16  * * *
17  * * *
18  * * *
19  * * *
20  * * *
21  * * *
22  * * *
23  * * *

MapReduce program:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class TraceRouteDataCleaning {

/**
 * @param args
 * @throws IOException 
 * @throws InterruptedException 
 * @throws ClassNotFoundException 
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    Configuration conf = new Configuration();
    String userArgs[] = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (userArgs.length < 2) {
        System.out.println("Usage: hadoop jar jarfilename mainclass input output");
        System.exit(1);
    }       
    Job job = new Job(conf, "cleaning trace route data");
    job.setJarByClass(TraceRouteDataCleaning.class);        
    job.setMapperClass(TraceRouteMapper.class);
    job.setReducerClass(TraceRouteReducer.class);       
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(userArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(userArgs[1]));     
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}   
public static class TraceRouteMapper extends Mapper<LongWritable, Text, Text, Text>{        
    StringBuilder emitValue = null;
    StringBuilder emitKey = null;
    Text kword = new Text();
    Text vword = new Text();

    public void map(LongWritable key, Text value, Context context) throws InterruptedException, IOException
     {
         // String[] cleanData;
         String lines = value.toString();   
         //deleting ms in RTT time data  
         lines = lines.replace(" ms", "");               
         String[] data = lines.split(" ");          
         emitValue = new StringBuilder(1024);
         emitKey = new StringBuilder(1024);

            if (data.length == 6) {                     
                emitKey.append(data[0]);
                emitValue.append(data[1]).append("\t").append(data[2]).append("\t").append(data[3]).append("\t").append(data[4]).append("\t").append(data[5]);
                kword.set(emitKey.toString());
                vword.set(emitValue.toString());                            
                context.write(kword, vword);                    
            }               
     }              
}   

public static class TraceRouteReducer extends Reducer<Text, Text, Text, Text>{
    Text vword = new Text();
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{

        context.write(key,vword);           
    }
}

}

1 Answer:

Answer 0 (score: 0)

First, your reducer class should look like one of the versions below, depending on your requirement. If each key emits only a single Text value, go with the first reducer; otherwise go with the second one.

public static class TraceRouteReducer extends Reducer<Text, Text, Text, Text> {
    Text vword = new Text();

    // Note: the reduce method must take Iterable<Text> to override
    // Reducer.reduce(); with a plain Text parameter it is never called,
    // and the framework silently runs the identity reducer instead.
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Each key carries a single value here, so emit just the first one.
        vword.set(values.iterator().next().toString());
        context.write(key, vword);
    }
}

---------- or ----------

public static class TraceRouteReducer extends Reducer<Text, Text, Text, Text> {
    Text vword = new Text();

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Emit one output record per value so no lines are lost.
        for (Text value : values) {
            vword.set(value.toString());
            context.write(key, vword);
        }
    }
}
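
In this sample the hop number makes every key unique, so each key reaches the reducer with exactly one value and both versions produce the same output; the second version is the safer default because it also survives keys that collect multiple values.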


Second, in your mapper you are splitting on a single space. That does not work here, because the fields are separated by runs of multiple spaces, so most lines never produce exactly 6 tokens and the data.length == 6 check filters them out. Split on the "\\s+" regular expression instead:

   String[] data = lines.split("\\s+");
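
To see the difference concretely, here is a minimal standalone sketch (plain Java, no Hadoop required; the class name SplitCheck is just illustrative) that applies the same cleaning steps to one of the sample lines. One caveat beyond the original answer: the sample rows begin with leading whitespace, and even split("\\s+") returns an empty first token for such a string, so a trim() is also needed before the mapper's data.length == 6 check can pass.

public class SplitCheck {
    public static void main(String[] args) {
        String line = " 2  202.56.223.213 (202.56.223.213)  39.718 ms  39.511 ms  39.559 ms";

        // Same cleaning as the mapper, plus trim() for the leading spaces.
        line = line.replace(" ms", "").trim();

        String[] bySpace = line.split(" ");    // 10 tokens: empty strings wherever two spaces meet
        String[] byRegex = line.split("\\s+"); // 6 tokens: hop number, hostname, IP, three RTTs

        System.out.println("split(\" \")   -> " + bySpace.length + " tokens");
        System.out.println("split(\"\\\\s+\") -> " + byRegex.length + " tokens");
    }
}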