I am reading a file with only 4 records from S3, yet the UI shows about 36k tasks used to compute the dataset, and memory for my 3-node cluster shows as roughly half full (60 GB in total). Does this look reasonable for a case like this?
ctx.textFile(filename,1).mapPartitions(....)
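For reference, a minimal sketch of how the partition (and therefore task) count of this read can be checked directly; it assumes the same ctx and filename as above, and note that the second argument to textFile is only a minimum partition count, so the actual number can be higher:

// Sketch: inspect how many partitions the read really produced.
JavaRDD<String> lines = ctx.textFile(filename, 1);
System.out.println("partitions = " + lines.partitions().size());

// For a 4-record file, collapsing to one partition keeps the job at one task.
JavaRDD<String> single = lines.coalesce(1);
System.out.println("after coalesce = " + single.partitions().size());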
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName(Constants.APP_NAME)
            .set("spark.io.compression.codec", "lzf")
            .set("spark.speculation", "true")
            .set("spark.cleaner.ttl", "10000")
            .set("spark.executor.extraJavaOptions", "-XX:+UseG1GC");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<Object1> fileDataJavaRDD = batchDao.fileRDD(ctx, filePath);
    fileDataJavaRDD.count();
}
public static JavaRDD<Object1> fileRDD(JavaSparkContext ctx, String filePath) {
    return ctx.textFile(filePath, 1).mapPartitions(new FlatMapFunction<Iterator<String>, Object1>() {
        @Override
        public Iterable<Object1> call(Iterator<String> stringIterator) throws Exception {
            List<Object1> list = new ArrayList<Object1>();
            while (stringIterator.hasNext()) {
                String line = stringIterator.next();
                String[] parts = line.split("\u0001", -1);
                // create a new Object1 per record so list entries do not share one mutable instance
                Object1 currentDayFile = new Object1();
                currentDayFile.setId(parts[1]);
                // ... few setters
                list.add(currentDayFile);
            }
            return list;
        }
    });
}
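As a side note, since each partition here only collects parsed lines into a list, a per-line map would be a simpler equivalent. A minimal sketch under the same assumptions (the method name fileRDDPerLine is hypothetical, the elided setters and Object1 are unchanged, and it uses org.apache.spark.api.java.function.Function):

public static JavaRDD<Object1> fileRDDPerLine(JavaSparkContext ctx, String filePath) {
    // Sketch: one Object1 per input line, so records never share a mutable instance.
    return ctx.textFile(filePath, 1).map(new Function<String, Object1>() {
        @Override
        public Object1 call(String line) throws Exception {
            String[] parts = line.split("\u0001", -1);
            Object1 record = new Object1();
            record.setId(parts[1]);
            // ... few setters, same as the original
            return record;
        }
    });
}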
public class Object1 implements Externalizable {
    private static final long serialVersionUID = 1L;
    private static final String DELIMITER = "\u0001";
    public String id1;
    public String id2;
    // ... few getters and setters

    @Override
    public void writeExternal(ObjectOutput stream) throws IOException {
        stream.writeObject(this.id1);
    }

    @Override
    public void readExternal(ObjectInput stream) throws IOException, ClassNotFoundException {
        this.id1 = (String) stream.readObject();
    }
}
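One more note on the Externalizable part: if every field has to survive serialization, writeExternal and readExternal would need to cover each field in the same order. A minimal sketch for the two public fields shown above (any of the elided fields would need the same treatment):

@Override
public void writeExternal(ObjectOutput stream) throws IOException {
    // write every field that must survive serialization, in a fixed order
    stream.writeObject(this.id1);
    stream.writeObject(this.id2);
}

@Override
public void readExternal(ObjectInput stream) throws IOException, ClassNotFoundException {
    // read fields back in exactly the order they were written
    this.id1 = (String) stream.readObject();
    this.id2 = (String) stream.readObject();
}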