I'll briefly describe the Dataflow job to give more context:
Transform 1:
return input
    .apply(Create.of(fileOrPattern))
    .apply(ParDo.of(new ResolveFilePatternFn()).named("ResolveFilePattern"))
    .apply("GroupByResolvedPaths", GroupByKey.create())
    .apply("ValuesResolvedPaths", Values.create())
    .apply("FlattenResolvedPaths", Flatten.iterables())
    .apply(ParDo.of(new UnzipAndLoadFn()).named("UnzipAndLoad"));
Transform 2:
input
    .apply(ParDo.of(new ProcessContent()).named("ProcessContent"))
    .apply(ParDo.of(new StoreData()).named("StoreData"));
return PDone.in(input.getPipeline());
The GroupByKey + Values + Flatten combination was added to break fusion optimization. As it turned out, the job actually runs faster with this logic in place.
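As an aside, this fusion-breaking trio generalizes into a small reusable transform. The sketch below only illustrates the idiom used above, assuming the Dataflow 1.x SDK; the BreakFusion name and the per-bundle counter key are my own:

// Sketch of a generic fusion-breaking transform (Dataflow SDK 1.x style).
static class BreakFusion<T> extends PTransform<PCollection<T>, PCollection<T>> {
  @Override
  public PCollection<T> apply(final PCollection<T> input) {
    return input
        // pair every element with a fresh key so GroupByKey spreads them out
        .apply(ParDo.of(new DoFn<T, KV<Integer, T>>() {
          private int key;
          @Override
          public void processElement(final ProcessContext context) {
            context.output(KV.of(key++, context.element()));
          }
        }))
        // the shuffle behind GroupByKey is the actual fusion barrier
        .apply(GroupByKey.<Integer, T>create())
        // unwrap the grouped values back into individual elements
        .apply(Values.<Iterable<T>>create())
        .apply(Flatten.<T>iterables());
  }
}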
The job is started with --maxNumWorkers=5. Everything works fine until the worker pool gets downscaled:
Autoscaling: Resizing worker pool from 5 to 3.
At that point the job simply stalls: it is "Running", but nothing happens and no progress is made at all. The workers keep logging:
Refusing to split at AAAAAoAAAQ: unstarted GroupingShuffleReader in shuffle range [ShufflePosition(base64:AAAAAg), ShufflePosition(base64:AAAAAw))
Note: when I start the same job with --maxNumWorkers=1, everything works as expected.
Any thoughts on the pipeline construction, and on what could be causing this behavior?
Update: UnzipAndLoad PTransform code
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.Flatten;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.dataflow.sdk.transforms.Values;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PBegin;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

public final class UnzipAndLoad extends PTransform<PBegin, PCollection<KV<String, String>>> {

  private static final Logger log = LoggerFactory.getLogger(UnzipAndLoad.class);

  private final String fileOrPattern;

  private UnzipAndLoad(final String fileOrPattern) {
    super("UnzipAndLoad");
    Preconditions.checkNotNull(fileOrPattern, "file pattern must not be null");
    Preconditions.checkArgument(StringUtils.isNotBlank(fileOrPattern), "file pattern must not be empty");
    this.fileOrPattern = fileOrPattern;
  }

  public static UnzipAndLoad from(final String fileOrPattern) {
    return new UnzipAndLoad(fileOrPattern);
  }

  @Override
  public void validate(final PBegin input) {
    Preconditions.checkNotNull(fileOrPattern, "file pattern must not be null");
    Preconditions.checkArgument(StringUtils.isNotBlank(fileOrPattern), "file pattern must not be empty");
  }

  @Override
  public PCollection<KV<String, String>> apply(final PBegin input) {
    /*
     * This composite transform involves the following steps:
     * 1. Create input from fileOrPattern String.
     *
     * 2. Resolve fileOrPattern string to valid existing paths using dataflow's IOChannelUtils.
     *
     * 3. Read resolved path(s), unzip them, read files from archive(s), load XML files content
     *    into String and produce pairs of path + content.
     */
    return input
        .apply(Create.of(fileOrPattern))
        .apply(ParDo.of(new ResolveFilePatternFn()).named("ResolveFilePattern"))
        .apply("GroupByResolvedPaths", GroupByKey.create())
        .apply("ValuesResolvedPaths", Values.create())
        .apply("FlattenResolvedPaths", Flatten.iterables())
        .apply(ParDo.of(new UnzipAndLoadFn()).named("UnzipAndLoadPerSource"));
  }

  @Override
  public void populateDisplayData(final DisplayData.Builder builder) {
    super.populateDisplayData(builder);
    builder.addIfNotNull(DisplayData.item("fileOrPattern", fileOrPattern).withLabel("File Pattern"));
  }

  @VisibleForTesting
  static class ResolveFilePatternFn extends DoFn<String, KV<Integer, String>> {
    @Override
    public void processElement(final ProcessContext context) throws Exception {
      final String fileOrPattern = context.element();
      Preconditions.checkNotNull(fileOrPattern, "file pattern must not be null");
      Preconditions.checkState(StringUtils.isNotBlank(fileOrPattern), "file pattern must not be empty");
      // using dataflow i/o utils to handle globs, eg
      // /user/xxx/yyy/*.txt -> can resolve to 2 files:
      // - /user/xxx/yyy/first.txt
      // - /user/xxx/yyy/second.txt
      final AtomicInteger key = new AtomicInteger(0);
      IOChannelUtils.getFactory(fileOrPattern).match(fileOrPattern).forEach(file -> {
        context.output(KV.of(key.getAndIncrement(), file));
      });
    }
  }

  @VisibleForTesting
  static class UnzipAndLoadFn extends DoFn<String, KV<String, String>> {

    private static final Pattern PATH_PATTERN =
        Pattern.compile("(.+)/(.+)[.]xml", Pattern.CASE_INSENSITIVE);

    @VisibleForTesting
    final Aggregator<Long, Long> matchedFiles = createAggregator("Matched Files", new Sum.SumLongFn());

    @VisibleForTesting
    final Aggregator<Long, Long> unmatchedFiles = createAggregator("Unmatched Files", new Sum.SumLongFn());

    @Override
    public void processElement(final ProcessContext context) throws Exception {
      final String inputPath = context.element();
      Preconditions.checkNotNull(inputPath, "input path must not be null");
      Preconditions.checkState(StringUtils.isNotBlank(inputPath), "input path must not be empty");
      // let dataflow utils decide if this is the file path or gcs path
      try (final ReadableByteChannel channel = IOChannelUtils.getFactory(inputPath).open(inputPath);
           final ZipInputStream stream = new ZipInputStream(Channels.newInputStream(channel))) {
        ZipEntry entry;
        while ((entry = stream.getNextEntry()) != null) {
          if (!entry.isDirectory()) {
            final String path = entry.getName();
            final Matcher matcher = PATH_PATTERN.matcher(path);
            if (matcher.find()) {
              final String content = IOUtils.toString(new BOMInputStream(stream), StandardCharsets.UTF_8);
              matchedFiles.addValue(1L);
              context.output(KV.of(path, content));
            } else {
              unmatchedFiles.addValue(1L);
              log.debug("Ignoring unmatched file: '{}'.", path);
            }
          }
        }
      }
    }
  }
}
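For completeness, the two transforms are wired up roughly like this. This is only a sketch: the main method shape, the file pattern, and the ProcessContent/StoreData DoFns are assumptions based on the snippets above:

// Hypothetical driver; UnzipAndLoad is the composite transform above, while
// the file pattern, ProcessContent, and StoreData are placeholders.
public static void main(final String[] args) {
  final DataflowPipelineOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DataflowPipelineOptions.class);
  options.setRunner(DataflowPipelineRunner.class);
  options.setMaxNumWorkers(5); // equivalent to passing --maxNumWorkers=5

  final Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply(UnzipAndLoad.from("gs://some-bucket/archives/*.zip")) // transform 1
      .apply(ParDo.of(new ProcessContent()).named("ProcessContent")) // transform 2
      .apply(ParDo.of(new StoreData()).named("StoreData"));
  pipeline.run();
}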