
时间:2013-09-12 07:14:41

标签: hadoop mapreduce cassandra word-count

我在 Eclipse 中编写了 Cassandra 的 wordcount 程序,并在 Hadoop 上运行。运行时它显示输出记录数(output records)为 0,并且 Cassandra 中的输出表是空的。为什么会这样?


  public class WordcountCass extends Configured implements Tool

    public static class TokenizerMapper extends Mapper<ByteBuffer, SortedMap<ByteBuffer, IColumn>, Text, IntWritable>
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private ByteBuffer sourceColumn;

        String punctuationsToStrip[] = { "\"", "'", ",", ";", "!", ":", "\\?", "\\.", "\\(", "\\-", "\\[", "\\)", "\\]" };

        protected void setup(Mapper.Context context) throws IOException, InterruptedException {
            sourceColumn = ByteBufferUtil.bytes(context.getConfiguration().get("columnname"));

        public void map(ByteBuffer key, SortedMap<ByteBuffer, IColumn> columns, Context context) throws IOException, InterruptedException
            // Our slice predicate contains only one column. We fetch it here
            IColumn column = columns.get(sourceColumn);
            if (column == null)
            String value = ByteBufferUtil.string(column.value());

            value = value.toLowerCase();
            for (String pattern : punctuationsToStrip) {
              value = value.replaceAll(pattern, "");

            StringTokenizer itr = new StringTokenizer(value);
            while (itr.hasMoreTokens()) {
                context.write(word, one);

    public static class ReducerToCassandra extends Reducer<Text, IntWritable, ByteBuffer, List<Mutation>>
        private ByteBuffer outputKey;

        protected void setup(Reducer.Context context) throws IOException, InterruptedException
            // The row key is the name of the column from which we read the text
            outputKey = ByteBufferUtil.bytes(context.getConfiguration().get("columnname"));

        public void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            context.write(outputKey, Collections.singletonList(getMutation(word, sum)));

        // See Cassandra API (http://wiki.apache.org/cassandra/API)
        private static Mutation getMutation(Text word, int sum)
            Column c = new Column();
            c.setName(Arrays.copyOf(word.getBytes(), word.getLength()));

            Mutation m = new Mutation();
            m.setColumn_or_supercolumn(new ColumnOrSuperColumn());
            return m;

    public int run(String[] args) throws Exception
        String columnName = "name";
        getConf().set("columnname", columnName);

        //Configuration conf = new Configuration();
        Job job = new Job(getConf(), "wordcount");

        // Tell the Mapper to expect Cassandra columns as input

        // Tell the "Shuffle/Sort" phase of M/R what type of Key/Value to expect from the mapper


        // Set the keyspace and column family for the output of this job
        ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "wordcount", "outputword");
        // Set the keyspace and column family for the input of this job
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), "wordcount", "inputword");
        ConfigHelper.setRangeBatchSize(job.getConfiguration(), 500);

        ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInputInitialAddress(job.getConfiguration(), "");
        ConfigHelper.setInputPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
        ConfigHelper.setOutputPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
        ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "");
       // ConfigHelper.getOutputPartitioner(job.getConfiguration());
        // Set the predicate that determines what columns will be selected from each row

        SlicePredicate predicate = new SlicePredicate().setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        // The "get_slice" (see Cassandra's API) operation will be applied on each row of the ColumnFamily.
        // Each row will be handled by one Map job.
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        return job.isSuccessful() ? 0:1;

    public static void main(String[] args) throws Exception
        // Let ToolRunner handle generic command-line options
        ToolRunner.run(new Configuration(), new WordcountCass(), args);


我在 Cassandra 中的表如下。inputword(输入表):

 id | name      | locate
  5 |     abhar | zanjan
 10 | mahneshan | zanjan
  1 |    zanjan | zanjan
  8 |     abbar | zanjan
  2 |    zanjan | zanjan
  4 |     abhar | zanjan
  7 |     abbar | zanjan
  6 |   gheidar | zanjan
  9 |     abbar | zanjan
  3 |    zanjan | zanjan


outputword(输出表,作业运行后仍为空):

 id | name      | number


