我不明白为什么 mapToPair 操作耗时这么长。下面附上我的 Spark 作业代码。会不会是 shuffle 写入受到了带宽限制?我的 2 节点集群节点间连接带宽为 100MB/秒。
CODE:
SparkSession sparkSession = SparkSession.builder()。appName(“POC”)。getOrCreate();
Dataset<Row> dataset = sparkSession.read().text(args[0]);
Dataset<String> dataset2 = dataset.map(new MapFunction<Row, String>() {
@Override
public String call(Row r) throws Exception {
// TODO Auto-generated method stub
return mapper(r.getString(0), fillProps());
}
}, Encoders.STRING());
Dataset<String> dataset3 = dataset2.filter(new FilterFunction<String>() {
public boolean call(String arg0) throws Exception {
return arg0 != null;
};
});
// Convert to the RDD API to get access to partition-level operations.
JavaRDD<String> modifiedLines = dataset3.javaRDD(); // dataset2.filter(dataset2.col("value").isNotNull()).javaRDD();

// NOTE(review): this loop launches one full Spark job per partition, and each
// job re-reads and re-maps the ENTIRE input just to keep a single partition's
// data (the non-matching partitions return empty iterators but are still
// computed). With P partitions the input is scanned P times — this, not
// shuffle bandwidth, is the most likely reason mapToPair appears slow.
// The closing brace of this for-loop is beyond this excerpt.
List<Partition> partitions = modifiedLines.partitions();
for (Partition partition : partitions) {
@SuppressWarnings("serial")
// Keep only the elements of the current partition; all other partitions
// contribute an empty iterator.
JavaRDD<String> c = modifiedLines
.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
@Override
public Iterator<String> call(Integer arg0, Iterator<String> arg1) throws Exception {
if (partition.index() == arg0) {
return arg1;
} else {
return new ArrayList<String>().iterator();
}
}
}, false);
System.out.println("******* partition.index(): " + partition.index());
// Key each surviving line by its target ES index name.
// NOTE(review): exceptions from getESIndexName are swallowed here and mapped
// to a ("", "") tuple — failures are silently merged under the empty-string
// key instead of failing the job; consider rethrowing or logging via a
// proper logger rather than printStackTrace/stdout on the executors.
JavaPairRDD<String, String> javaPairRDD = c.mapToPair(new PairFunction<String, String, String>() {
@Override
public Tuple2<String, String> call(String arg0) throws Exception {
try {
if (org.apache.commons.lang.StringUtils.isEmpty(arg0)) {
return new Tuple2<String, String>("", "");
}
Tuple2<String, String> t = new Tuple2<String, String>(getESIndexName(arg0), arg0);
return t;
} catch (Exception e) {
e.printStackTrace();
System.out.println("******* exception in getESIndexName");
}
return new Tuple2<String, String>("", "");
}
});
// NOTE(review): groupByKey shuffles all values, and collectAsMap then pulls
// the entire grouped result into driver memory — this both forces the shuffle
// write being measured and risks driver OOM for large partitions. If the goal
// is per-index processing, prefer operating on the RDD (e.g. foreachPartition)
// instead of collecting to the driver inside a per-partition loop.
java.util.Map<String, Iterable<String>> map1 = javaPairRDD.groupByKey().collectAsMap();