Spark parallelization using wholeTextFiles

Time: 2017-03-28 05:07:57

Tags: apache-spark apache-spark-sql spark-dataframe

I am trying to process files with the wholeTextFiles API. I have a folder containing many .gz files and want to read them all with wholeTextFiles.

I have 4 executors, each with 1 core and 2GB of RAM.

Only 2 of the executors are actually processing the job, and processing is very slow. The other two executors sit idle.

How can I spread the work across the other 2 executors to increase parallelism?

package com.sss.ss.ss.WholeText;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;


public class WholeText { 

    public static class mySchema implements Serializable {
              private String CFIELD1       ;
              private String CFIELD2       ;

              public String getCFIELD1()
                {
                    return CFIELD1;
                }
            public void setCFIELD1(String cFIELD1)
                {
                    CFIELD1 = cFIELD1;
                }
            public String getCFIELD2() 
                {
                    return CFIELD2;
                }
            public void setCFIELD2(String cFIELD2)
                {
                    CFIELD2 = cFIELD2;
                }

      }


 public static void main(String[] args) throws InterruptedException { 

      SparkConf sparkConf = new SparkConf().setAppName("My app")
               .setMaster("mymaster..")
                .set("spark.driver.allowMultipleContexts", "true");

        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(15));
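      // Note: wholeTextFiles returns one (path, fileContent) record per file, and the
      // second argument (args[3] here) is only a *minimum* partition suggestion, so the
      // actual partition count can stay small when there are only a few files.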


      JavaPairRDD<String, String> wholeTextFiles = jssc.sparkContext().wholeTextFiles(args[0],Integer.parseInt(args[3]));


      Integer ll = wholeTextFiles.getNumPartitions();

      System.out.println("Number of partitions: " + ll);

      JavaRDD<String> stringRDD = wholeTextFiles.
             map( 
                      new Function<Tuple2<String, String>, String>() {
                        private static final long serialVersionUID = -551872585218963131L;

                        public String call(Tuple2<String, String> v1) throws Exception
                            {
                                return v1._2;
                            }
                      }
            ).
             flatMap
              (new FlatMapFunction<String, String>() 
                    {

                        public Iterator<String> call(String t) throws Exception
                            {

                                return Arrays.asList(t.split("\\r?\\n")).iterator();

                            }
                    }).
             filter(new Function<String,  Boolean>() {

                            private static final long serialVersionUID = 1L;

                            public Boolean call(String t) throws Exception {

                            int colons = 0;

                            String s = t;

                            if(s == null || s.trim().length() < 1) {
                              return false;
                            }

                            for(int i = 0; i < s.length(); i++) {
                                if(s.charAt(i) == ';') colons++;
                            }
                            System.out.println("colons="+colons);

                            if ((colons <=3)){
                                return false;
                            } 
                            return true;
                          }

             });



     JavaRDD<mySchema> schemaRDD =  stringRDD.map(new Function<String, mySchema>()
            {

                private static final long serialVersionUID = 1L;

                public mySchema call(String line) throws Exception
                    {               
                        String[] parts = line.split(";",-1);
                        mySchema mySchema = new mySchema();

                        mySchema.setCFIELD1       (parts[0]);
                        mySchema.setCFIELD2       (parts[1]);


                        return mySchema;

                      }
            });
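     // createDataFrame(JavaRDD, Class) below infers the DataFrame schema from the
     // JavaBean's getters/setters (CFIELD1, CFIELD2 above).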


     SQLContext hc = new HiveContext(jssc.sparkContext());


      Dataset<Row> df = hc.createDataFrame(schemaRDD, mySchema.class);

      df.createOrReplaceTempView("myView");

      hc.sql("INSERT INTO -----
                "from myView"); 


     hc.sql("INSERT INTO .......
            "from myView"); 


}


}
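
For reference, a minimal sketch (not the asker's code) of one way to spread the per-file records across all executors: repartition right after wholeTextFiles so the downstream flatMap/filter/map stages run on more than 2 tasks. The master, input path, and partition count below are hypothetical:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class RepartitionSketch {
    public static void main(String[] args) {
        // hypothetical local master with 4 cores
        JavaSparkContext sc = new JavaSparkContext("local[4]", "RepartitionSketch");

        // wholeTextFiles yields one (path, content) pair per file; with only a few
        // .gz files, this can leave some executors with no partitions to work on.
        JavaPairRDD<String, String> files = sc.wholeTextFiles("/data/input/*.gz"); // hypothetical path

        // Redistribute the records across at least as many partitions as there are
        // cores, so every executor gets work in the stages that follow.
        JavaPairRDD<String, String> spread = files.repartition(4);

        System.out.println("Partitions after repartition: " + spread.getNumPartitions());
        sc.close();
    }
}

Whether this helps depends on where the time goes: repartition shuffles the full file contents once, but afterwards the line-splitting, filtering, and mapping can use all 4 executors instead of 2.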

0 Answers:

There are no answers yet.