HBase map-only row deletion

Date: 2014-01-22 18:41:06

Tags: hadoop mapreduce hbase

Writing my first HBase MapReduce job, I'm having trouble deleting rows from HBase (trying to run it as a map-only job). The job succeeds and is able to scan the HBase table, and I can read the correct rowkeys from HBase in the mapper (verified via sysout). However, the call to Delete del = new Delete(row.get()) doesn't actually seem to do anything.

Here is the code I'm trying to run:

HBaseDelete.java

public class HBaseDelete { 
  public static void main(String[] args) throws Exception {

    Configuration config = HBaseConfiguration.create();
    Job job = new Job(config, "log_table");
    job.setJarByClass(HBaseDeleteMapper.class);     

    Scan scan = new Scan();
    scan.setCaching(500);        
    scan.setCacheBlocks(false);

    TableMapReduceUtil.initTableMapperJob("log_table", scan, HBaseDeleteMapper.class, null, null, job);

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean b = job.waitForCompletion(true);
    if (!b) {
        throw new IOException("error with job!");
    }

  }
}

HBaseDeleteMapper.java

public class HBaseDeleteMapper extends TableMapper<ImmutableBytesWritable, Delete>{
  @Override
  public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
    Delete delete = new Delete(row.get());
    context.write(row, delete);
  }
}

Am I missing something to "commit" the delete?

2 answers:

Answer 0 (score: 6):

You are writing to the context, not to the table. Your mapper should look something like this:

public class HBaseDeleteMapper extends TableMapper<ImmutableBytesWritable, NullWritable>{

    private HTable myTable;

    protected void setup(Context context) throws IOException, InterruptedException {
        /* HTable instance for deletes */
        myTable = new HTable(HBaseConfiguration.create(), "myTable".getBytes());
    }

    public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
        myTable.delete(new Delete(row.get())); /* Delete the row from the table */
        //context.write(row, NullWritable.get()); /* Optionally emit the deleted rowkeys if you need them for something (skip it if you don't) */
    }

    protected void cleanup(Context context) throws IOException, InterruptedException { 
        myTable.close(); /* Close table */
    }

}

Note that deletes do not use the write buffer, so this code will issue one RPC operation per delete, which is not good for this kind of job. To address that, you can build your own List<Delete> to batch them:

public class HBaseDeleteMapper extends TableMapper<NullWritable, NullWritable>{

    private HTable myTable;
    private List<Delete> deleteList = new ArrayList<Delete>();
    final private int buffer = 10000; /* Buffer size, tune it as desired */

    protected void setup(Context context) throws IOException, InterruptedException {
        /* HTable instance for deletes */
        myTable = new HTable(HBaseConfiguration.create(), "myTable".getBytes());
    }

    public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
        deleteList.add(new Delete(row.get())); /* Add delete to the batch */
        if (deleteList.size()==buffer) {
            myTable.delete(deleteList); /* Submit batch */
            deleteList.clear(); /* Clear batch */
        }
    }

    protected void cleanup(Context context) throws IOException, InterruptedException {
        if (deleteList.size()>0) {
            myTable.delete(deleteList); /* Submit remaining batch */
        }
        myTable.close(); /* Close table */
    }

}
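For comparison, the question's approach of emitting Delete objects with context.write can also work, but only if the job's output is wired to TableOutputFormat instead of NullOutputFormat, so the emitted mutations are actually applied to a table. A minimal, untested driver sketch reusing the question's table name and mapper (the class name HBaseDeleteDriver is just a placeholder):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;

public class HBaseDeleteDriver {
  public static void main(String[] args) throws Exception {
    Configuration config = HBaseConfiguration.create();
    Job job = new Job(config, "log_table_delete");
    job.setJarByClass(HBaseDeleteMapper.class);

    Scan scan = new Scan();
    scan.setCaching(500);
    scan.setCacheBlocks(false);

    /* Mapper emits (ImmutableBytesWritable, Delete), exactly as in the question */
    TableMapReduceUtil.initTableMapperJob("log_table", scan, HBaseDeleteMapper.class,
        ImmutableBytesWritable.class, Delete.class, job);

    /* Sets the output format to TableOutputFormat targeting "log_table"; with zero
       reduce tasks the Deletes written by the mapper go straight to the table */
    TableMapReduceUtil.initTableReducerJob("log_table", null, job);
    job.setNumReduceTasks(0);

    if (!job.waitForCompletion(true)) {
      throw new IOException("error with job!");
    }
  }
}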

Answer 1 (score: 1):

Full code

The code below scans the HBase table, finds rowkeys containing a common substring, and deletes them whenever the list size exceeds 1000 (to make sure the list doesn't run out of memory/heap space). The code also writes these rowkeys to your HDFS.

Driver

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.oclc.wcsync.hadoop.mapper.HbaseBulkDeleteMapper;
import org.oclc.wcsync.hadoop.util.JobName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Properties;

/** doc. */
public class HbaseBulkDelete extends Configured implements Tool{

    /** doc. */
    private static final Logger LOG = LoggerFactory.getLogger(HbaseBulkDelete.class);


    /**
     * doc.
     * @param args ...
     * @throws Exception ...
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(HBaseConfiguration.create(), new HbaseBulkDelete(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] strings) throws Exception {


        JobName jobName = JobName.HBASE_DELETE;
        LOG.info ("Got into class driver");
        Configuration conf = HBaseConfiguration.create ();
        String env = "prod";
        Properties hadoopProps = new Properties();
        hadoopProps.load(HbaseBulkDelete.class.getResourceAsStream("/hadoop.config." + env + ".properties"));
        conf.set("jobName", jobName.name());
        conf.set ("hbase.master.catalog.timeout","600000");
        conf.set ("hbase.client.scanner.timeout.period","600000");
        conf.set ("hbase.rpc.timeout","6000000");
        conf.set ("mapred.task.timeout","6000000");
        conf.set("mapreduce.map.memory.mb","4096");
        Job job = new Job(conf);
        job.setJobName(jobName.format("HbaseBulkDelete"));
        job.setJarByClass(HbaseBulkDelete.class);
        Scan s = new Scan ();
        s.addFamily(Bytes.toBytes("data"));
        s.setStartRow (Bytes.toBytes ("Your_Substring"));

        TableMapReduceUtil.initTableMapperJob ("Ingest", s, HbaseBulkDeleteMapper.class, TextOutputFormat.class,
                TextOutputFormat.class, job);

        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/user/neethu/HbaseBulkDelete"));


        return job.waitForCompletion(true) ? 0 : -1;
    }
}

MAPPER

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class HbaseBulkDeleteMapper extends TableMapper<Text, Text> {
    private static final Logger LOG = LoggerFactory.getLogger(HbaseBulkDeleteMapper.class);

    /* Deletes are collected here and flushed once the batch exceeds BATCH_SIZE */
    private static final int BATCH_SIZE = 1000;
    private final List<Delete> listOfBatchDelete = new ArrayList<Delete>();
    private HTable table;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        table = new HTable(conf, "Ingest");
    }

    @Override
    public void map(ImmutableBytesWritable row, Result values, Context context)
            throws IOException, InterruptedException {
        String key = Bytes.toString(values.getRow());
        try {
            if (key.contains("Your_substring")) {
                LOG.info("RowKey:" + key);
                listOfBatchDelete.add(new Delete(Bytes.toBytes(key)));
                /* Also record the deleted rowkeys in HDFS via the job's TextOutputFormat */
                context.write(new Text("RowKey"), new Text(key));
            }
        } catch (Exception e) {
            LOG.error("error ---" + e);
        }

        /* Flush the batch so the list does not exhaust memory/heap space */
        if (listOfBatchDelete.size() > BATCH_SIZE) {
            table.delete(listOfBatchDelete);
            LOG.info("Deleted records!");
            listOfBatchDelete.clear();
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        /* Submit any remaining deletes and close the table */
        if (!listOfBatchDelete.isEmpty()) {
            table.delete(listOfBatchDelete);
        }
        table.close();
    }
}