I am new to the Hadoop ecosystem. I managed to import two different datasets from an RDBMS into HDFS. Now I need to aggregate the datasets and export/move the result to HBase. What is the best way to do this?
I have a third dataset that is already in HBase.
Thanks for your help.
Answer 0 (score: 0)
You can do this easily with MapReduce code. Regarding your question:
Typically, your Reducer task will write out the results, either to the file system or to HBase.
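For the aggregation step itself, since your two datasets are sitting in HDFS, a common pattern is a reduce-side join: feed both datasets into one job with MultipleInputs, let a per-dataset mapper tag each record with its source, and combine matching keys in the reducer. Here is a minimal sketch; the paths, mapper names, and the assumption that the join key is the first CSV column are all hypothetical placeholders:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class TwoDatasetJoin {

    // Hypothetical mapper for the first dataset: emits (joinKey, tagged record)
    static class Dataset1Mapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable offset, Text line, Context context)
                throws IOException, InterruptedException {
            String[] fields = line.toString().split(",");  // assumes CSV records
            context.write(new Text(fields[0]), new Text("D1:" + line));
        }
    }

    // Hypothetical mapper for the second dataset: same key type, different tag
    static class Dataset2Mapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable offset, Text line, Context context)
                throws IOException, InterruptedException {
            String[] fields = line.toString().split(",");
            context.write(new Text(fields[0]), new Text("D2:" + line));
        }
    }

    static void configureInputs(Job job) {
        // Each dataset gets its own mapper; the HDFS paths are hypothetical
        MultipleInputs.addInputPath(job, new Path("/data/dataset1"),
                TextInputFormat.class, Dataset1Mapper.class);
        MultipleInputs.addInputPath(job, new Path("/data/dataset2"),
                TextInputFormat.class, Dataset2Mapper.class);
    }
}

The reducer then sees all tagged records that share a key and can aggregate them however you need; the D1:/D2: prefixes tell it which dataset each value came from.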
Here is a worked sample (it scans an HBase table and aggregates page views per hour with MapReduce):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.Calendar;

public class MapReduceExample {

    static class MyMapper extends TableMapper<LongWritable, LongWritable> {
        private final LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(ImmutableBytesWritable rowkey, Result columns, Context context)
                throws IOException, InterruptedException {
            // Get the timestamp from the row key
            // (getTimestampFromRowKey is a helper from the linked article's setup code)
            long timestamp = ExampleSetup.getTimestampFromRowKey(rowkey.get());

            // Get the hour of day
            Calendar calendar = Calendar.getInstance();
            calendar.setTimeInMillis(timestamp);
            int hourOfDay = calendar.get(Calendar.HOUR_OF_DAY);

            // Output the current hour of day and a count of 1
            context.write(new LongWritable(hourOfDay), ONE);
        }
    }

    static class MyReducer extends Reducer<LongWritable, LongWritable, LongWritable, LongWritable> {
        @Override
        protected void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Add up all of the page views for this hour
            long sum = 0;
            for (LongWritable count : values) {
                sum += count.get();
            }

            // Write out the current hour and the sum
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) {
        try {
            // Set up the Hadoop and HBase configuration
            Configuration conf = HBaseConfiguration.create();
            Job job = Job.getInstance(conf, "PageViewCounts");
            job.setJarByClass(MapReduceExample.class);

            // Create a scan over the source table
            Scan scan = new Scan();

            // Configure the map phase to read from HBase
            TableMapReduceUtil.initTableMapperJob(
                    "PageViews",         // The name of the table
                    scan,                // The scan to execute against the table
                    MyMapper.class,      // The Mapper class
                    LongWritable.class,  // The Mapper output key class
                    LongWritable.class,  // The Mapper output value class
                    job);                // The Hadoop job

            // Configure the reduce phase
            job.setReducerClass(MyReducer.class);
            job.setCombinerClass(MyReducer.class);

            // Set up the output - we'll write to the file system: HOUR_OF_DAY PAGE_VIEW_COUNT
            job.setOutputKeyClass(LongWritable.class);
            job.setOutputValueClass(LongWritable.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            // We'll run just one reduce task, but we could run multiple
            job.setNumReduceTasks(1);

            // Write the results to a file in the output directory
            FileOutputFormat.setOutputPath(job, new Path("output"));

            // Execute the job
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
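Since your goal is to land the aggregated result in HBase rather than in an HDFS file, you can swap the reduce side for a TableReducer and wire the output up with TableMapReduceUtil.initTableReducerJob. A minimal sketch, assuming a pre-created target table named "PageViewCounts" with a column family "cf" (both names are hypothetical; Put.addColumn is the HBase 1.x API, older releases use Put.add):

// Extra imports needed at the top of MapReduceExample:
//   import org.apache.hadoop.hbase.mapreduce.TableReducer;
//   import org.apache.hadoop.hbase.util.Bytes;
// (Put is already covered by the org.apache.hadoop.hbase.client.* import)

// Replaces MyReducer: emits one Put per hour into the target table
static class MyHBaseReducer
        extends TableReducer<LongWritable, LongWritable, ImmutableBytesWritable> {
    @Override
    protected void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable count : values) {
            sum += count.get();
        }
        // Row key = hour of day; a single cell holds the total for that hour
        Put put = new Put(Bytes.toBytes(key.get()));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("count"), Bytes.toBytes(sum));
        context.write(new ImmutableBytesWritable(put.getRow()), put);
    }
}

// In main(), replace the reducer/output configuration with:
TableMapReduceUtil.initTableReducerJob(
        "PageViewCounts",      // hypothetical target table; create it in HBase first
        MyHBaseReducer.class,  // the TableReducer above
        job);
job.setCombinerClass(MyReducer.class); // the original reducer still works as a combiner

initTableReducerJob sets TableOutputFormat and the output key/value classes for you, so the FileOutputFormat and setOutput*Class calls from the sample above are no longer needed.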
More information:
http://hbase.apache.org/book.html#mapreduce.example
http://www.informit.com/articles/article.aspx?p=2262143&seqNum=2