Spark writes data to HDFS and produces output such as:

/data/location/bath_num/shangshai/20191219/00/38400/38/20191219112100_part-00000.gz

There is a _SUCCESS file in the /data/location/bath_num directory.
When I restart the Spark application, it deletes the old data directory /data/location/bath_num/shangshai/20191219 and keeps only the /data/location/bath_num/shangshai directory.
The data written before the restart needs to be kept, but it gets deleted, even though I never wrote any code that deletes it. What is happening?
The code that saves the data to HDFS is as follows:
String outputRootPath = "/data/location/bath_num";
// savePairRdd: the key is "shangshai/20191219/00/38400/38/20191219112100"
// and the value is the data to save
savePairRdd.saveAsHadoopFile(outputRootPath, String.class, String.class,
        MultiOutputByDiffFileNameUtil.class, GzipCodec.class);
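For reference, a guard like the following could be placed just before the saveAsHadoopFile call to rule out accidental overwrites; it is only an illustrative sketch, and sparkContext here stands for the job's JavaSparkContext, which does not appear in the snippet above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative sketch (sparkContext is assumed to be the job's
// JavaSparkContext): refuse to start the save when the output root
// already contains data, so a restart cannot touch a previous run.
Configuration conf = sparkContext.hadoopConfiguration();
Path root = new Path(outputRootPath);
FileSystem fs = root.getFileSystem(conf);
if (fs.exists(root) && fs.listStatus(root).length > 0) {
    throw new IllegalStateException(
            "Output root " + root + " already contains data; refusing to write");
}

If such a guard fired on restart, it would at least confirm that the old directory was still present when the job started, narrowing the deletion down to the write path itself.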
The Java class MultiOutputByDiffFileNameUtil:
import java.io.IOException;
import java.io.Serializable;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
import org.apache.hadoop.mapreduce.security.TokenCache;

public class MultiOutputByDiffFileNameUtil extends MultipleTextOutputFormat implements Serializable {

    private static final long serialVersionUID = 4402532550211870125L;

    // Drop the key from the output records; only the value is written.
    @Override
    protected Object generateActualKey(Object key, Object value) {
        return null;
    }

    @Override
    protected Object generateActualValue(Object key, Object value) {
        return value;
    }

    // Use the key as a relative path under the output root, e.g.
    // "shangshai/20191219/00/38400/38/20191219112100_part-00000".
    @Override
    protected String generateFileNameForKeyValue(Object key, Object value, String name) {
        return key + "_" + name;
    }

    // Copied from FileOutputFormat.checkOutputSpecs, but without the check
    // that rejects a pre-existing output directory (hence the commented-out
    // super call), so a restarted job can write into the same root.
    @Override
    public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
        // super.checkOutputSpecs(ignored, job);
        Path outDir = getOutputPath(job);
        if (outDir == null && job.getNumReduceTasks() != 0) {
            throw new InvalidJobConfException("Output directory not set in JobConf.");
        }
        if (outDir != null) {
            FileSystem fs = outDir.getFileSystem(job);
            outDir = fs.makeQualified(outDir);
            setOutputPath(job, outDir);
            TokenCache.obtainTokensForNamenodes(job.getCredentials(),
                    new Path[]{outDir}, job);
        }
    }
}
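For comparison, the stock FileOutputFormat.checkOutputSpecs (the super call commented out above) performs the same setup plus an existence check on the output directory. Below is a sketch of that hardened variant, using org.apache.hadoop.mapred.FileAlreadyExistsException; re-adding the check would make a restarted job fail fast instead of writing into the previous run's directory:

import org.apache.hadoop.mapred.FileAlreadyExistsException;

// Sketch: same setup as the override above, plus the existence check
// that the stock FileOutputFormat performs.
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
    Path outDir = getOutputPath(job);
    if (outDir == null && job.getNumReduceTasks() != 0) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }
    if (outDir != null) {
        FileSystem fs = outDir.getFileSystem(job);
        outDir = fs.makeQualified(outDir);
        setOutputPath(job, outDir);
        TokenCache.obtainTokensForNamenodes(job.getCredentials(),
                new Path[]{outDir}, job);
        // The check the override above skips: reject a pre-existing directory.
        if (fs.exists(outDir)) {
            throw new FileAlreadyExistsException(
                    "Output directory " + outDir + " already exists");
        }
    }
}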