Dataset<Row> sqlDF = spark.sql(sql);
JavaRDD<Row> rowJavaRDD = sqlDF.javaRDD();
JavaPairRDD<ImmutableBytesWritable, TreeSet<KeyValue>> pairRDD = rowJavaRDD.mapToPair(
        row -> convertToKVs(row, fieldArray) // inside the method: TreeSet<KeyValue> kvSet = new TreeSet<>(KeyValue.COMPARATOR);
);
pairRDD.repartitionAndSortWithinPartitions(new HashPartitioner(1));
JavaPairRDD<ImmutableBytesWritable, KeyValue> cellRDD = pairRDD.flatMapToPair(row -> {
    List<Tuple2<ImmutableBytesWritable, KeyValue>> kvs = new ArrayList<>();
    TreeSet<KeyValue> kvSet = row._2;
    for (KeyValue keyValue : kvSet) {
        kvs.add(new Tuple2<>(row._1, keyValue));
    }
    return kvs.iterator();
});
// ... job conf ...
HFileOutputFormat2.configureIncrementalLoad(job, table);
cellRDD.saveAsNewAPIHadoopFile(hFilePath, ImmutableBytesWritable.class, KeyValue.class,
        HFileOutputFormat2.class, job.getConfiguration());
When I run this code with spark-submit --master yarn, it throws the exception

Added a key not lexically larger than previous. key=...

I know this exception is caused by the table columns not being lexically sorted, but why is it still raised even though I sort the columns with new TreeSet<>(KeyValue.COMPARATOR)?
Could anyone tell me how to fix it? Thanks in advance.
EDIT: The cause was that pairRDD contained entries with the same rowkey. In the end I accomplished the bulk load into the other HBase cluster via distcp.
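Since the root cause was duplicate rowkeys in pairRDD, here is a minimal sketch of one way to collapse them before the flatMapToPair step, so each ImmutableBytesWritable key appears only once and the output stays in rowkey order. The dedupedRDD name and the reduceByKey/sortByKey steps are my own illustration, not the original code:

// Hypothetical sketch: merge all cells that share a rowkey into one sorted set,
// then sort by rowkey so HFileOutputFormat2 sees keys in total order.
JavaPairRDD<ImmutableBytesWritable, TreeSet<KeyValue>> dedupedRDD = pairRDD
        .reduceByKey((left, right) -> {
            // merge the two sorted cell sets that belong to the same rowkey
            TreeSet<KeyValue> merged = new TreeSet<>(KeyValue.COMPARATOR);
            merged.addAll(left);
            merged.addAll(right);
            return merged;
        })
        // ImmutableBytesWritable is comparable, so the rows can be globally sorted by rowkey
        .sortByKey(true);
// the existing flatMapToPair(...) and saveAsNewAPIHadoopFile(...) then run on dedupedRDD

Note that merging with KeyValue.COMPARATOR also drops cells that are identical in row/family/qualifier/timestamp, which is usually what you want for a bulk load but is worth checking for your data.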