I am new to the Hadoop ecosystem. I managed to import two different datasets from an RDBMS into HDFS. Now I need to aggregate the datasets and export/move the result to HBase. What is the best way to do this?
I have a third dataset that is already in HBase.
Thanks for your help.
Answer 0 (score: 0)
You can do this easily with MapReduce code. Regarding your question:
Typically, your Reducer task will write out the results, either to the file system or to HBase.
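For the aggregation step itself, since your two datasets are sitting in HDFS, a common pattern is a reduce-side join: feed both datasets into one job with MultipleInputs, let a per-dataset mapper tag each record with its source, and combine matching keys in the reducer. Here is a minimal sketch; the paths, mapper names, and the assumption that the join key is the first CSV column are all hypothetical placeholders:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class TwoDatasetJoin {

    // Hypothetical mapper for the first dataset: emits (joinKey, tagged record)
    static class Dataset1Mapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable offset, Text line, Context context)
                throws IOException, InterruptedException {
            String[] fields = line.toString().split(",");  // assumes CSV records
            context.write(new Text(fields[0]), new Text("D1:" + line));
        }
    }

    // Hypothetical mapper for the second dataset: same key type, different tag
    static class Dataset2Mapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable offset, Text line, Context context)
                throws IOException, InterruptedException {
            String[] fields = line.toString().split(",");
            context.write(new Text(fields[0]), new Text("D2:" + line));
        }
    }

    static void configureInputs(Job job) {
        // Each dataset gets its own mapper; the HDFS paths are hypothetical
        MultipleInputs.addInputPath(job, new Path("/data/dataset1"),
                TextInputFormat.class, Dataset1Mapper.class);
        MultipleInputs.addInputPath(job, new Path("/data/dataset2"),
                TextInputFormat.class, Dataset2Mapper.class);
    }
}

The reducer then sees all tagged records that share a key and can aggregate them however you need; the D1:/D2: prefixes tell it which dataset each value came from.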
Here is a worked sample (it scans an HBase table and aggregates page views per hour with MapReduce):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.Calendar;

public class MapReduceExample {

    static class MyMapper extends TableMapper<LongWritable, LongWritable> {
        private final LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(ImmutableBytesWritable rowkey, Result columns, Context context)
                throws IOException, InterruptedException {
            // Get the timestamp from the row key
            // (getTimestampFromRowKey is a helper from the linked article's setup code)
            long timestamp = ExampleSetup.getTimestampFromRowKey(rowkey.get());

            // Get the hour of day
            Calendar calendar = Calendar.getInstance();
            calendar.setTimeInMillis(timestamp);
            int hourOfDay = calendar.get(Calendar.HOUR_OF_DAY);

            // Output the current hour of day and a count of 1
            context.write(new LongWritable(hourOfDay), ONE);
        }
    }

    static class MyReducer extends Reducer<LongWritable, LongWritable, LongWritable, LongWritable> {
        @Override
        protected void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Add up all of the page views for this hour
            long sum = 0;
            for (LongWritable count : values) {
                sum += count.get();
            }

            // Write out the current hour and the sum
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) {
        try {
            // Set up the Hadoop and HBase configuration
            Configuration conf = HBaseConfiguration.create();
            Job job = Job.getInstance(conf, "PageViewCounts");
            job.setJarByClass(MapReduceExample.class);

            // Create a scan over the source table
            Scan scan = new Scan();

            // Configure the map phase to read from HBase
            TableMapReduceUtil.initTableMapperJob(
                    "PageViews",         // The name of the table
                    scan,                // The scan to execute against the table
                    MyMapper.class,      // The Mapper class
                    LongWritable.class,  // The Mapper output key class
                    LongWritable.class,  // The Mapper output value class
                    job);                // The Hadoop job

            // Configure the reduce phase
            job.setReducerClass(MyReducer.class);
            job.setCombinerClass(MyReducer.class);

            // Set up the output - we'll write to the file system: HOUR_OF_DAY PAGE_VIEW_COUNT
            job.setOutputKeyClass(LongWritable.class);
            job.setOutputValueClass(LongWritable.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            // We'll run just one reduce task, but we could run multiple
            job.setNumReduceTasks(1);

            // Write the results to a file in the output directory
            FileOutputFormat.setOutputPath(job, new Path("output"));

            // Execute the job
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
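Since your goal is to land the aggregated result in HBase rather than in an HDFS file, you can swap the reduce side for a TableReducer and wire the output up with TableMapReduceUtil.initTableReducerJob. A minimal sketch, assuming a pre-created target table named "PageViewCounts" with a column family "cf" (both names are hypothetical; Put.addColumn is the HBase 1.x API, older releases use Put.add):

// Extra imports needed at the top of MapReduceExample:
//   import org.apache.hadoop.hbase.mapreduce.TableReducer;
//   import org.apache.hadoop.hbase.util.Bytes;
// (Put is already covered by the org.apache.hadoop.hbase.client.* import)

// Replaces MyReducer: emits one Put per hour into the target table
static class MyHBaseReducer
        extends TableReducer<LongWritable, LongWritable, ImmutableBytesWritable> {
    @Override
    protected void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable count : values) {
            sum += count.get();
        }
        // Row key = hour of day; a single cell holds the total for that hour
        Put put = new Put(Bytes.toBytes(key.get()));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("count"), Bytes.toBytes(sum));
        context.write(new ImmutableBytesWritable(put.getRow()), put);
    }
}

// In main(), replace the reducer/output configuration with:
TableMapReduceUtil.initTableReducerJob(
        "PageViewCounts",      // hypothetical target table; create it in HBase first
        MyHBaseReducer.class,  // the TableReducer above
        job);
job.setCombinerClass(MyReducer.class); // the original reducer still works as a combiner

initTableReducerJob sets TableOutputFormat and the output key/value classes for you, so the FileOutputFormat and setOutput*Class calls from the sample above are no longer needed.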
More information:
http://hbase.apache.org/book.html#mapreduce.example
http://www.informit.com/articles/article.aspx?p=2262143&seqNum=2