I am trying to process files with the wholeTextFiles API. I have a lot of .gz files in a folder and want to read them all with wholeTextFiles.
I have 4 executors, each with 1 core and 2 GB of RAM.
Only 2 of the executors are actually processing the job, and processing is very slow. The other 2 executors sit idle.
How can I spread the work across the other 2 executors to increase parallelism?
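
To make the setup concrete, this is roughly the resource configuration I mean. The property keys below are the standard Spark ones; whether they end up being set in code or passed as spark-submit flags should not matter, and the class name is only for illustration.

import org.apache.spark.SparkConf;

public class ResourceConfigSketch {
    public static void main(String[] args) {
        // Assumed equivalent of the resources described above.
        SparkConf conf = new SparkConf()
                .setAppName("My app")
                .set("spark.executor.instances", "4")   // 4 executors
                .set("spark.executor.cores", "1")       // 1 core per executor
                .set("spark.executor.memory", "2g");    // 2 GB per executor
        System.out.println(conf.toDebugString());
    }
}

The full job is below: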
package com.sss.ss.ss.WholeText;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
public class WholeText {

    // Simple serializable bean used as the row schema for createDataFrame.
    public static class mySchema implements Serializable {
        private String CFIELD1;
        private String CFIELD2;

        public String getCFIELD1() {
            return CFIELD1;
        }

        public void setCFIELD1(String cFIELD1) {
            CFIELD1 = cFIELD1;
        }

        public String getCFIELD2() {
            return CFIELD2;
        }

        public void setCFIELD2(String cFIELD2) {
            CFIELD2 = cFIELD2;
        }
    }
    public static void main(String[] args) throws InterruptedException {
        SparkConf sparkConf = new SparkConf().setAppName("My app")
                .setMaster("mymaster..")
                .set("spark.driver.allowMultipleContexts", "true");

        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(15));

        // args[0] = input directory, args[3] = minPartitions hint for wholeTextFiles
        JavaPairRDD<String, String> wholeTextFiles =
                jssc.sparkContext().wholeTextFiles(args[0], Integer.parseInt(args[3]));

        Integer ll = wholeTextFiles.getNumPartitions();
        System.out.println("Number of Partitions" + ll);
        // Keep only the file contents (drop the path key), split each file into
        // lines, and keep lines that contain more than 3 ';' separators.
        JavaRDD<String> stringRDD = wholeTextFiles
                .map(new Function<Tuple2<String, String>, String>() {
                    private static final long serialVersionUID = -551872585218963131L;

                    public String call(Tuple2<String, String> v1) throws Exception {
                        return v1._2;
                    }
                })
                .flatMap(new FlatMapFunction<String, String>() {
                    private static final long serialVersionUID = 1L;

                    public Iterator<String> call(String t) throws Exception {
                        return Arrays.asList(t.split("\\r?\\n")).iterator();
                    }
                })
                .filter(new Function<String, Boolean>() {
                    private static final long serialVersionUID = 1L;

                    public Boolean call(String t) throws Exception {
                        if (t == null || t.trim().length() < 1) {
                            return false;
                        }
                        int colons = 0;
                        for (int i = 0; i < t.length(); i++) {
                            if (t.charAt(i) == ';') colons++;
                        }
                        System.out.println("colons=" + colons);
                        return colons > 3;
                    }
                });
        // Parse each remaining line on ';' into the bean.
        JavaRDD<mySchema> schemaRDD = stringRDD.map(new Function<String, mySchema>() {
            private static final long serialVersionUID = 1L;

            public mySchema call(String line) throws Exception {
                String[] parts = line.split(";", -1);
                mySchema record = new mySchema();
                record.setCFIELD1(parts[0]);
                record.setCFIELD2(parts[1]);
                return record;
            }
        });
        SQLContext hc = new HiveContext(jssc.sparkContext());
        Dataset<Row> df = hc.createDataFrame(schemaRDD, mySchema.class);
        df.createOrReplaceTempView("myView");

        // The actual INSERT statements are trimmed here.
        hc.sql("INSERT INTO ----- " +
                "from myView");
        hc.sql("INSERT INTO ....... " +
                "from myView");
    }
}
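
Would forcing more partitions right after the read, along the lines of the sketch below, be the right way to keep all 4 executors busy, or is there a better option? The value 8 is only illustrative, and the snippet reuses jssc and args from the code above.

        // Hypothetical change, not in the job above: spread the file contents
        // over more partitions immediately after reading them.
        JavaPairRDD<String, String> wholeTextFiles =
                jssc.sparkContext()
                    .wholeTextFiles(args[0], Integer.parseInt(args[3]))
                    .repartition(8);   // illustrative value, not tuned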