Question

我正在练习MapReduce，我有一个Amazon .tsv文件，其中包含具有产品评分的“评论”列表。 1个产品有很多评论，并且每个评论都有一个评分。评论还包含其他数据，例如user_id，product_name，review_title等。我想在此文件上使用MapReduce生成3列的输出：产品ID，评论总数和产品的平均评分。

我用于测试的文件链接：LINK（这是 sample_us.tsv ）

https://gofile.io/?c=wLsv0y

到目前为止，我已经写了以下内容，但出现了几个错误。请让我知道您是否看到任何修复程序或可以实现相同目标的更好逻辑。我一直在使用Hadoop btw。

映射器：

package stubs;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ReviewMapper extends Mapper<LongWritable, Text, Text, IntWritable>
{

  @Override
  public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException
  {

      int productIndex = 3; //index for productID
      int ratingIndex = 7; //index for ratingID

      String input = value.toString();
      String [] line = input.split("\\t");

      String productID = line[productIndex];
      String ratingVal = line[ratingIndex];


      if((productID.length() > 0) && (ratingVal.length() == 1))
      {
         int starRating = Integer.valueOf(ratingVal);
         context.write(new Text(productID), new IntWritable(starRating));
      }
  }
}

然后是我的减速器：

package stubs;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ReviewReducer extends Reducer<Text, IntWritable, Text, Text> {

  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException
  {
      int reviewCount = 0;
      int combineRating = 0;
      for(IntWritable value : values)
      {
          reviewCount++;
          combineRating += value.get();
      }

      int avgRating = (combineRating/reviewCount);
      String reviews = Integer.toString(reviewCount);
      String ratings = Integer.toString(avgRating);
      String result = reviews+ "\t" +ratings;

      context.write(key,  new Text(result));
  }
}

最后一个驱动程序：

package stubs;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class AvgRatingReviews {

  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
    System.out.printf("Usage: AvgWordLength <input dir> <output dir>\n");
    System.exit(-1);
    }

    Job job = new Job();
    job.setJarByClass(AvgRatingReviews.class);  
    job.setJobName("Review Results");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job,  new Path(args[1]));

    job.setMapperClass(ReviewMapper.class);
    job.setReducerClass(ReviewReducer.class);

    job.setOutputKeyClass(Text.class);;
    job.setOutputValueClass(Text.class);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
  }
}

MapReduce：计算产品的平均评分和评论总数

0 个答案: