Question

我正在使用 hadoop-1.2.1，并尝试通过 ToolRunner 运行一个简单的 HBase RowCount 作业。但是，无论我怎么尝试，Hadoop 都找不到 Mapper 类。jar 文件已被正确复制到 HDFS 中，但我始终弄不清楚到底哪里出了错。请帮帮忙！

以下是代码：

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;


import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * Driver that uploads a jar to the configured file system, registers it under
 * the "tmpjars" configuration key, and then launches a map-only HBase job
 * through {@link ToolRunner}.
 */
public class HBaseRowCountToolRunnerTest extends Configured implements Tool {

    // Name and locations (local source, remote destination) of the jar to ship.
    public static final String JAR_NAME = "myJar.jar";
    public static final String LOCAL_JAR = <path_to_jar> + JAR_NAME;
    public static final String REMOTE_JAR = "/tmp/"+JAR_NAME;

    /**
     * Entry point: prepares the configuration, copies the jar to the remote
     * file system, and exits with the status returned by the tool.
     *
     * @param args command-line arguments handed on to {@link ToolRunner}
     * @throws Exception if the copy or the job run fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();

        // All connection configs set here -- omitted to post the code

        conf.set("tmpjars", REMOTE_JAR);

        FileSystem hdfs = FileSystem.get(conf);

        Path localJar = new Path(LOCAL_JAR);
        Path remoteJar = new Path(REMOTE_JAR);

        System.out.println("pathString = " + localJar + " \n");

        // Upload the jar, and mark the remote copy for removal once we are done.
        hdfs.copyFromLocalFile(localJar, remoteJar);
        hdfs.deleteOnExit(remoteJar);

        // Run the job and propagate its status as the process exit code.
        int exitCode = ToolRunner.run(conf, new HBaseRowCountToolRunnerTest(), args);
        System.exit(exitCode);
    }

    /**
     * Builds the row-count job against the fixed table and waits for it.
     *
     * @param args unused
     * @return 0 when the job completes successfully, 1 otherwise
     * @throws Exception if job construction or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        Job rowCountJob = new RowCountJob(getConf(), "testJob", "myLittleHBaseTable");

        boolean succeeded = rowCountJob.waitForCompletion(true);
        if (succeeded) {
            return 0;
        }
        return 1;
    }

    /**
     * A {@link Job} that configures itself in its constructor: table mapper
     * input, {@link NullOutputFormat} as output format, and zero reduce tasks.
     */
    public static class RowCountJob extends Job {

        /**
         * @param conf      configuration the job is created from
         * @param jobName   suffix appended (after "_") to this class's canonical name
         * @param tableName HBase table handed to the table mapper setup
         * @throws IOException if the job or the table mapper setup fails
         */
        RowCountJob(Configuration conf, String jobName, String tableName) throws IOException {
            super(conf, RowCountJob.class.getCanonicalName() + "_" + jobName);

            setJarByClass(getClass());

            Scan tableScan = newScan();

            setOutputFormatClass(NullOutputFormat.class);

            TableMapReduceUtil.initTableMapperJob(
                    tableName,
                    tableScan,
                    RowCounterMapper.class,
                    ImmutableBytesWritable.class,
                    Result.class,
                    this);

            // Map-only: no reduce tasks are requested.
            setNumReduceTasks(0);
        }

        /** Creates the scan used by the job: block caching off, FirstKeyOnlyFilter applied. */
        private static Scan newScan() {
            Scan tableScan = new Scan();
            tableScan.setCacheBlocks(false);
            tableScan.setFilter(new FirstKeyOnlyFilter());
            return tableScan;
        }

    } // end of RowCountJob

    /**
     * Mapper that does the counting. It declares
     * {@code TableMapper<KEYOUT, VALUEOUT>} output types but writes no
     * records; its only effect is on the {@link Counters#ROWS} counter.
     */
    public static class RowCounterMapper extends TableMapper<ImmutableBytesWritable, Result> {

        /** Counter group holding the row tally. */
        public enum Counters { ROWS }

        /**
         * Adds 2 to the {@code ROWS} counter for every invocation, so the
         * final counter value is twice the number of map calls.
         *
         * @param row     the current table row key (not inspected)
         * @param values  the columns of the row (not inspected)
         * @param context the current task context, used to reach the counter
         * @throws IOException declared for the overridden contract
         * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN,
         *   org.apache.hadoop.mapreduce.Mapper.Context)
         */
        @Override
        public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
            context.getCounter(Counters.ROWS).increment(2);
        }

    } // end of RowCounterMapper

} // end of HBaseRowCountToolRunnerTest

Answer 1

好的——我找到了这个问题的解决方法，想在这里分享给其他遇到类似问题的人……

最后，我放弃了 tmpjars 配置选项，改为直接在代码中把 jar 文件复制到 HDFS，并将其加入 DistributedCache 的类路径。代码如下：

// Resolve both ends of the copy once, then upload the jar.
FileSystem dfs = FileSystem.get(conf);
Path localJar = new Path(LOCAL_JAR);
Path remoteJar = new Path(REMOTE_JAR);
dfs.copyFromLocalFile(localJar, remoteJar);

// Mark the remote copy for removal when we're done.
dfs.deleteOnExit(remoteJar);

// Add the uploaded jar to the classpath via the distributed cache.
DistributedCache.addFileToClassPath(remoteJar, conf, dfs);

这也许并没有解决 tmpjars 本身的问题，但这种做法确实有效。

Answer 2

我今天遇到了同样的问题。最后，我发现这是因为我忘了在驱动程序类中加入下面这条语句……

// Per this answer, leaving this call out of the driver class caused the same "map class not found" failure.
job.setJarByClass(HBaseTestDriver.class);

Hadoop 找不到 Mapper 类

2 个答案: