Spark shuffle 写入在 mapToPair 阶段耗时过长

时间:2018-06-08 16:16:34

标签: apache-spark apache-spark-sql

我很困惑为什么 mapToPair 操作花了这么长时间。附件是我的 Spark 作业截图。瓶颈是 shuffle 写入的带宽吗?我的 2 节点集群之间的网络连接是 100MB/秒。

enter image description here

CODE:

SparkSession sparkSession = SparkSession.builder()。appName(“POC”)。getOrCreate();

Dataset<Row> dataset = sparkSession.read().text(args[0]);
Dataset<String> dataset2 = dataset.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row r) throws Exception {
        // TODO Auto-generated method stub
        return mapper(r.getString(0), fillProps());
    }
}, Encoders.STRING());

Dataset<String> dataset3 = dataset2.filter(new FilterFunction<String>() {
    public boolean call(String arg0) throws Exception {

        return arg0 != null;

    };
});

JavaRDD<String> modifiedLines = dataset3.javaRDD(); // dataset2.filter(dataset2.col("value").isNotNull()).javaRDD();
List<Partition> partitions = modifiedLines.partitions();
// Processes the RDD one partition at a time: each iteration builds an RDD that
// exposes only the current partition's records, keys them by index name, groups
// them by key and collects the grouped result.
// The closing brace of this loop is not part of the visible snippet.
// NOTE(review): modifiedLines is not cached in the visible code, so each iteration
// presumably re-runs the upstream read/map/filter and launches a separate job with
// its own shuffle -- a likely contributor to the long mapToPair stage; confirm in the Spark UI.
for (Partition partition : partitions) {

    @SuppressWarnings("serial")
    JavaRDD<String> c = modifiedLines
            .mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

                @Override
                public Iterator<String> call(Integer arg0, Iterator<String> arg1) throws Exception {
                    // arg0 is the index of the partition being processed, arg1 its records.
                    // Pass the records through only for the partition selected by this loop
                    // iteration; every other partition contributes an empty iterator.
                    if (partition.index() == arg0) {
                        return arg1;
                    } else {
                        return new ArrayList<String>().iterator();
                    }

                }

            }, false);

    System.out.println("******* partition.index(): " + partition.index());

    // Pair each line with the key returned by getESIndexName(line); the line itself is the value.
    JavaPairRDD<String, String> javaPairRDD = c.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String arg0) throws Exception {
            // Empty or null lines, and lines for which getESIndexName throws,
            // are all mapped to the same ("", "") pair.

            try {
                if (org.apache.commons.lang.StringUtils.isEmpty(arg0)) {
                    return new Tuple2<String, String>("", "");
                }
                Tuple2<String, String> t = new Tuple2<String, String>(getESIndexName(arg0), arg0);
                return t;
            } catch (Exception e) {
                // The failure is only printed; processing continues with the fallback pair below.
                e.printStackTrace();
                System.out.println("******* exception in getESIndexName");
            }
            return new Tuple2<String, String>("", "");
        }
    });

    // Group all lines sharing a key and collect the grouped result into a local java.util.Map.
    // NOTE(review): per the Spark API, groupByKey shuffles every value and collectAsMap
    // materializes the whole result on the driver -- confirm this fits in driver memory.
    java.util.Map<String, Iterable<String>> map1 = javaPairRDD.groupByKey().collectAsMap(); 

0 个答案:

没有答案