In the Spark UI my job finishes correctly, but my application never returns a result, and there is no further Java processing after this job.
The application controls the data types of semi-structured files: it loads a CSV file together with the expected data type of each column, then applies those types to the columns.
My test case has 5 million rows; with 1, 2 and 3 million rows it works fine.
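The fields sc, sessionMetaData, indexes, sep and cols used in the method below are instance fields that are not shown here. As a rough sketch of the shape the per-column metadata and the date-format holder would need for the code to compile (class names and accessors here are my assumptions, not the actual classes):

import java.io.Serializable;
import java.util.Map;

// Hypothetical sketch of the per-column metadata read from the mapping CSV -- an assumption, not the real class.
public class ColumnDefinition implements Serializable {
    private String name;        // column name
    private String type;        // "Date", "Decimal", "Integer", ...
    private int index;          // position of the column in the line
    private boolean mandatory;  // an empty value is an error when true
    private String mapTabName;  // optional mapping table for user-defined types
    private String mapColName;
    private String coe;
    private String decision;    // what to do when a control fails
    // getters/setters omitted for brevity
}

// Hypothetical sketch of the date-pattern holder built from the 1000-line sample.
public class ColumnDateFormat implements Serializable {
    private String name;
    private int index;
    private Map<String, Long> patternsCount; // candidate pattern -> number of occurrences
    private String pattern;                  // winning pattern, set after the reduce
    // getters/setters omitted for brevity
}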
public ErrorAccumulationResult controlStructure(Structure struct, String MappingTablesDirectory, String mode, Long packageId) throws IOException, AnalysisException {
    System.out.println("==============================| START CONTROL PROCESS |==============================");
    JavaRDD<String> data = sc.textFile(sessionMetaData.getInputPath(), 32);
    // Keep only the lines whose (1-based) number is listed in 'indexes'
    JavaPairRDD<String, Long> filtredsDataKV = data.zipWithIndex().filter(t -> indexes.contains(t._2 + 1));
    // Sample the first 1000 kept lines to detect the date pattern of each Date column
    JavaRDD<Tuple2<String, Long>> partOfDataRdd = sc.parallelize(filtredsDataKV.take(1000));
    List<ColumnDateFormat> patternList = partOfDataRdd.map(iterator -> {
        List<ColumnDateFormat> columnDateFormats = new ArrayList<>();
        String row = iterator._1;
        if (!row.equals("")) {
            String[] arr = row.trim().split(Pattern.quote(sep), -1);
            for (int i = 0; i < cols.size(); i++) {
                String type = cols.get(i).getType();
                String name = cols.get(i).getName();
                int index = cols.get(i).getIndex();
                switch (type) {
                    case "Date":
                        Map<String, Long> count = Common.countPossibleDatePatterns(arr[i]);
                        ColumnDateFormat columnDateFormat = new ColumnDateFormat();
                        columnDateFormat.setName(name);
                        columnDateFormat.setIndex(index);
                        columnDateFormat.setPatternsCount(count);
                        columnDateFormats.add(columnDateFormat);
                        break;
                }
            }
        }
        return columnDateFormats;
    }).reduce((colDate1, colDate2) -> {
        // Merge the per-line pattern counts, column by column
        for (int i = 0; i < colDate1.size(); i++) {
            Map<String, Long> count1 = colDate1.get(i).getPatternsCount();
            Map<String, Long> count2 = colDate2.get(i).getPatternsCount();
            Map<String, Long> count = new HashMap<>();
            count1.forEach((pattern, value) -> {
                count.put(pattern, value + count2.get(pattern));
            });
            colDate1.get(i).setPatternsCount(count);
        }
        return colDate1;
    });
    // Keep the most frequent pattern for each Date column
    patternList.forEach(p -> {
        Map<String, Long> count = p.getPatternsCount();
        String pattern = count.entrySet().stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
                .findFirst().get().getKey();
        p.setPattern(pattern);
    });
    ErrorsAccumulator acc = new ErrorsAccumulator();
    ErrorReportingAccumulator accReporting = new ErrorReportingAccumulator();
    sc.sc().register(acc);
    sc.sc().register(accReporting);
    // Paths initialization....
    // Second pass over the whole file: cast every column to its declared type and accumulate errors
    JavaRDD<org.apache.spark.sql.Row> castColumnsToType = filtredsDataKV.mapPartitions(tuple2Iterator -> {
        List<org.apache.spark.sql.Row> results = new ArrayList<>();
        while (tuple2Iterator.hasNext()) {
            String[] rowAfterConverting = new String[cols.size()];
            Tuple2<String, Long> iterator = tuple2Iterator.next();
            String row = iterator._1;
            Long index = iterator._2 + 1;
            if (!row.equals("")) {
                String[] arr = row.trim().split(Pattern.quote(sep), -1);
                try {
                    for (int i = 0; i < cols.size(); i++) {
                        boolean mandatory = cols.get(i).isMandatory();
                        String type = cols.get(i).getType();
                        String mapTabName = cols.get(i).getMapTabName();
                        String mapColName = cols.get(i).getMapColName();
                        String coe = cols.get(i).getCoe();
                        String decision = cols.get(i).getDecision();
                        String name = cols.get(i).getName();
                        switch (type) {
                            case "Date":
                                if (arr[i].equals("") && mandatory) {
                                    acc.add(new Error("Missing value", index, name, arr[i], decision));
                                    accReporting.add(struct.getName() + ";" + name + ";" + "Missing value");
                                    rowAfterConverting[i] = "";
                                } else {
                                    String pattern = "dd/MM/yyyy";
                                    for (ColumnDateFormat columnDateFormat : patternList) {
                                        if (columnDateFormat.getIndex() != i) {
                                            continue;
                                        }
                                        pattern = columnDateFormat.getPattern();
                                    }
                                    String colConvertedToDate = Common.convertDate(arr[i], pattern);
                                    if (colConvertedToDate.equals("null")) {
                                        acc.add(new Error("Data type mismatch", index, name, arr[i], decision));
                                        accReporting.add(struct.getName() + ";" + name + ";" + "Data type mismatch");
                                    }
                                    rowAfterConverting[i] = colConvertedToDate;
                                }
                                break;
                            case "Decimal":
                                // same handling for Decimal
                                break;
                            case "Integer":
                                // same handling for Integer
                                break;
                            default:
                                break;
                        }
                        if (mapTabName != null && mapColName != null && coe != null) {
                            // Other controls for user-defined types
                        }
                    }
                } catch (ArrayIndexOutOfBoundsException e) {
                    System.out.println("Element doesn't exist! error " + e.getMessage());
                }
            } else {
                logger.warn("Blank row");
            }
            results.add(RowFactory.create(rowAfterConverting));
        }
        return results.iterator();
    });
    SparkUtils.csvToParquet(outputFilePathPARQUET, struct, castColumnsToType);
    // Accumulator manipulation (elided; computes 'duration' and 'errorReporting' used below)
    System.out.println("==============================| DETECTION PROCESS TOOK " + (duration / 1000000000) + " |==============================");
    return new ErrorAccumulationResult(acc.value(), errorReporting);
}
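SparkUtils.csvToParquet is not shown above; it is the only Spark action in this method, so it is what actually triggers the two passes over the data. A minimal sketch of what such a helper could look like, assuming everything is written as strings and that Structure exposes its column names (both assumptions, not the actual implementation):

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Hypothetical sketch of the helper invoked above -- not the actual SparkUtils class.
public class SparkUtilsSketch {
    public static void csvToParquet(String outputPath, Structure struct, JavaRDD<Row> rows) {
        SparkSession spark = SparkSession.builder().getOrCreate();
        List<StructField> fields = new ArrayList<>();
        for (String colName : struct.getColumnNames()) {   // assumed accessor on Structure
            fields.add(DataTypes.createStructField(colName, DataTypes.StringType, true));
        }
        StructType schema = DataTypes.createStructType(fields);
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        // parquet(...) is the action: the whole lineage built above executes here
        df.write().mode(SaveMode.Overwrite).parquet(outputPath);
    }
}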
Here is the execution flow of this job:
And the logs:
2019-09-09 21:16:07,442 284413221 INFO org.apache.spark.ContextCleaner - Cleaned accumulator 16754
2019-09-09 21:16:07,442 284413221 INFO org.apache.spark.ContextCleaner - Cleaned accumulator 16765
2019-09-09 21:16:07,442 284413221 INFO org.apache.spark.ContextCleaner - Cleaned accumulator 16768
2019-09-09 21:16:07,442 284413221 INFO org.apache.spark.ContextCleaner - Cleaned accumulator 16771
2019-09-09 21:16:07,442 284413221 INFO org.apache.spark.ContextCleaner - Cleaned accumulator 16753
2019-09-09 21:16:07,442 284413221 INFO org.apache.spark.ContextCleaner - Cleaned accumulator 16759
2019-09-09 21:16:07,444 284413223 INFO o.a.spark.storage.BlockManagerInfo - Removed broadcast_791_piece4
Spark version: org.apache.spark.SparkContext - Running Spark version 2.3.1