Apache Beam GroupByKey.create() 转换的输出为 PCollection&lt;KV&lt;K, Iterable&lt;V&gt;&gt;&gt;。
当我尝试多次迭代此Iterable时,使用SparkRunner ,我得到一个例外:
Caused by: java.lang.IllegalStateException: ValueIterator can't be iterated more than once,otherwise there could be data lost
at org.apache.beam.runners.spark.translation.GroupNonMergingWindowsFunctions$GroupByKeyIterator$ValueIterator.iterator(GroupNonMergingWindowsFunctions.java:163)
at java.lang.Iterable.spliterator(Iterable.java:101)
查看 ValueIterator 的内部代码表明,ValueIterator 禁止为此 Iterable 创建多个迭代器。
为什么不能从此Iterable创建多个迭代器?哪些数据可能会丢失?
代码示例:
import com.google.common.base.MoreObjects;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.*;
import org.apache.beam.sdk.values.KV;
import java.io.Serializable;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
/**
 * Demonstrates computing per-key max and min over a GroupByKey result.
 *
 * <p>Fix for the original SparkRunner failure: the {@code Iterable} that
 * SparkRunner's GroupByKey emits is backed by a one-shot {@code ValueIterator}
 * and throws {@code IllegalStateException} if iterated more than once. We
 * therefore materialize the grouped values into an in-memory {@link List}
 * exactly once (in {@code calculateMax}) and pass that list downstream, so
 * {@code calculateMin} can safely iterate it again.
 */
public class Main implements Serializable {

    public static void main(String[] args) {
        new Main().runPipeline(args);
    }

    /** Builds and runs the pipeline: parse → group by trader → max → min → write. */
    private void runPipeline(String[] args) {
        PipelineOptions options =
                PipelineOptionsFactory.fromArgs(args).withValidation().create();
        Pipeline pipeline = Pipeline.create(options);
        pipeline
                .apply(Create.of(
                        "trader1,10.0",
                        "trader1,20.0",
                        "trader1,5.0",
                        "trader2,7.0",
                        "trader2,30.0",
                        "trader2,2.0",
                        "trader3,10.0"))
                .apply(ParDo.of(extractKey()))
                .apply(GroupByKey.<String, Trade>create())
                .apply(ParDo.of(calculateMax()))
                .apply(ParDo.of(calculateMin()))
                .apply(ToString.elements())
                .apply(TextIO.write().to("output.txt"));
        pipeline.run();
    }

    /**
     * Computes the per-key maximum transaction amount.
     *
     * <p>Materializes the grouped values into a {@link List} first, because the
     * runner-provided {@code Iterable} may only be traversable once. The
     * materialized list — not the original iterable — is what gets wrapped and
     * emitted, so downstream transforms can iterate it repeatedly.
     */
    private static DoFn<KV<String, Iterable<Trade>>, KV<String, IterableAndCalculationWrapper>> calculateMax() {
        return new DoFn<KV<String, Iterable<Trade>>, KV<String, IterableAndCalculationWrapper>>() {
            @ProcessElement
            public void processElement(@Element KV<String, Iterable<Trade>> element, ProcessContext context) {
                String key = element.getKey();
                // Single pass over the one-shot iterable; everything after this
                // works on the re-iterable in-memory copy.
                List<Trade> trades = StreamSupport.stream(element.getValue().spliterator(), false)
                        .collect(Collectors.toList());
                // GroupByKey never emits an empty group, so getAsDouble() is safe here.
                Double max = trades.stream().mapToDouble(Trade::getTransactionAmount).max().getAsDouble();
                Map<String, Double> calculated = new HashMap<>();
                calculated.put("max", max);
                context.output(KV.of(key, new IterableAndCalculationWrapper(trades, calculated)));
            }
        };
    }

    /**
     * Computes the per-key minimum transaction amount and adds it to the wrapper's
     * map. Safe to iterate here because {@code calculateMax} already replaced the
     * one-shot iterable with an in-memory list.
     */
    private static DoFn<KV<String, IterableAndCalculationWrapper>, KV<String, IterableAndCalculationWrapper>> calculateMin() {
        return new DoFn<KV<String, IterableAndCalculationWrapper>, KV<String, IterableAndCalculationWrapper>>() {
            @ProcessElement
            public void processElement(@Element KV<String, IterableAndCalculationWrapper> element, ProcessContext context) {
                String key = element.getKey();
                IterableAndCalculationWrapper wrapper = element.getValue();
                Iterable<Trade> iterable = wrapper.getIterable();
                Double min = StreamSupport.stream(iterable.spliterator(), false)
                        .mapToDouble(Trade::getTransactionAmount).min().getAsDouble();
                wrapper.getMap().put("min", min);
                context.output(KV.of(key, wrapper));
            }
        };
    }

    /** Parses a "traderId,amount" CSV line into a KV keyed by trader id. */
    public static DoFn<String, KV<String, Trade>> extractKey() {
        return new DoFn<String, KV<String, Trade>>() {
            @ProcessElement
            public void processElement(@Element String element, ProcessContext context) {
                String[] row = element.split(",");
                Trade trade = new Trade(row[0], Double.valueOf(row[1]));
                context.output(KV.of(trade.traderId, trade));
            }
        };
    }

    /** Pairs the grouped trades with a mutable map of computed aggregates. */
    private static class IterableAndCalculationWrapper implements Serializable {
        private Iterable<Trade> iterable;
        private Map<String, Double> map;

        public IterableAndCalculationWrapper(Iterable<Trade> iterable, Map<String, Double> map) {
            this.iterable = iterable;
            this.map = map;
        }

        public Iterable<Trade> getIterable() {
            return iterable;
        }

        public void setIterable(Iterable<Trade> iterable) {
            this.iterable = iterable;
        }

        public Map<String, Double> getMap() {
            return map;
        }

        public void setMap(Map<String, Double> map) {
            this.map = map;
        }

        @Override
        public String toString() {
            return MoreObjects.toStringHelper(this)
                    .add("iterable", iterable)
                    .add("map", map)
                    .toString();
        }
    }

    /** A single trade: trader id plus transaction amount. */
    private static class Trade implements Serializable {
        private String traderId;
        private Double transactionAmount;

        public Trade(String traderId, Double transactionAmount) {
            this.traderId = traderId;
            this.transactionAmount = transactionAmount;
        }

        public String getTraderId() {
            return traderId;
        }

        public void setTraderId(String traderId) {
            this.traderId = traderId;
        }

        public Double getTransactionAmount() {
            return transactionAmount;
        }

        public void setTransactionAmount(Double transactionAmount) {
            this.transactionAmount = transactionAmount;
        }

        @Override
        public String toString() {
            return MoreObjects.toStringHelper(this)
                    .add("traderId", traderId)
                    .add("transactionAmount", transactionAmount)
                    .toString();
        }
    }
}
答案 0(得分:2)
您可以将管道的中间结果存储在PCollections中。
// Materialize the grouped intermediate result in a named PCollection, then
// branch the pipeline: calculateMax and calculateMin each get their own copy
// of the grouped values, so each ParDo iterates the Iterable only once.
PCollection<KV<String, Iterable<Trade>>> tmpCollection =
pipeline.apply(Create.of(
"trader1,10.0",
"trader1,20.0",
"trader1,5.0",
"trader2,7.0",
"trader2,30.0",
"trader2,2.0",
"trader3,10.0"))
.apply(ParDo.of(extractKey()))
.apply(GroupByKey.<String, Trade>create());
// Two independent branches from the same grouped collection.
PCollection<KV<String, IterableAndCalculationWrapper>> collectionMax = tmpCollection.apply(ParDo.of(calculateMax()));
PCollection<KV<String, IterableAndCalculationWrapper>> collectionMin = tmpCollection.apply(ParDo.of(calculateMin()));
这样,calculateMax 和 calculateMin 将各自只迭代该 Iterable 一次。之后,您需要分别处理这两个集合,直到将它们合并为止。