I can't manage to upload entities to Cloud Datastore via the Apache Beam Java SDK (2.1.0). Here is my code:
import com.google.cloud.datastore.DatastoreOptions
import com.google.cloud.datastore.Entity
import com.opencsv.CSVParser
import org.apache.beam.runners.dataflow.DataflowRunner
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions
import org.apache.beam.sdk.Pipeline
import org.apache.beam.sdk.io.TextIO
import org.apache.beam.sdk.io.gcp.datastore.DatastoreIO
import org.apache.beam.sdk.options.PipelineOptionsFactory
import org.apache.beam.sdk.transforms.DoFn
import org.apache.beam.sdk.transforms.MapElements
import org.apache.beam.sdk.transforms.ParDo
import org.apache.beam.sdk.transforms.SimpleFunction
import java.io.Serializable
object PipelineClass {

    class FoodGroup(var id: String? = null,
                    var group: String? = null) : Serializable

    class CreateGroupsFn : SimpleFunction<String, FoodGroup>() {
        override fun apply(line: String?): FoodGroup {
            val group = FoodGroup()
            val parser = CSVParser()
            val parts = parser.parseLine(line)

            group.id = parts[0].trim()
            group.group = parts[1].trim()

            return group
        }
    }

    class CreateEntitiesFn : DoFn<FoodGroup, Entity>() {
        @ProcessElement
        fun processElement(c: ProcessContext) {
            val datastore = DatastoreOptions.getDefaultInstance().service
            val keyFactory = datastore.newKeyFactory()
                    .setKind("FoodGroup")
                    .setNamespace("nutrients")
            val key = datastore.allocateId(keyFactory.newKey())

            val entity = Entity.newBuilder(key)
                    .set("id", c.element().id)
                    .set("group", c.element().group)
                    .build()

            c.output(entity)
        }
    }

    @JvmStatic fun main(args: Array<String>) {
        val options = PipelineOptionsFactory.`as`(DataflowPipelineOptions::class.java)
        options.runner = DataflowRunner::class.java
        options.project = "simplesample"
        options.jobName = "fgUpload"

        val pipeline = Pipeline.create(options)

        pipeline.apply(TextIO.read().from("gs://bucket/foodgroup.csv"))
                .apply(MapElements.via(CreateGroupsFn()))
                .apply(ParDo.of(CreateEntitiesFn()))
                // error occurs below...
                .apply(DatastoreIO.v1().write()
                        .withProjectId(options.project))

        pipeline.run()
    }
}
Here is the error I get:
PipelineClass.kt: (75, 24): Type mismatch: inferred type is DatastoreV1.Write! but PTransform<in PCollection<Entity!>!, PDone!>! was expected
I've tried SimpleFunction, DoFn, and PTransform (both composite and non-composite) to do the String-to-Entity conversion, but with no success.
What am I doing wrong?
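Note: judging by the error message, the mismatch seems to come from the Entity class itself. DatastoreIO.v1().write() consumes the protobuf com.google.datastore.v1.Entity, while CreateEntitiesFn above emits com.google.cloud.datastore.Entity from the google-cloud-java client. Below is a minimal, untested sketch of the DoFn rebuilt against the protobuf class while staying on Beam 2.1.0; CreateProtoEntitiesFn is just an illustrative name, and it reuses the FoodGroup class from above:

import com.google.datastore.v1.Entity
import com.google.datastore.v1.client.DatastoreHelper.makeKey
import com.google.datastore.v1.client.DatastoreHelper.makeValue
import org.apache.beam.sdk.transforms.DoFn
import java.util.UUID

// Emits the protobuf Entity that DatastoreIO.v1().write() expects,
// instead of the google-cloud-java client Entity.
class CreateProtoEntitiesFn : DoFn<FoodGroup, Entity>() {
    @ProcessElement
    fun processElement(c: ProcessContext) {
        // Client-generated UUID key name; no allocateId() call to the
        // Datastore service per element.
        val keyBuilder = makeKey("FoodGroup", UUID.randomUUID().toString())
        keyBuilder.partitionIdBuilder.namespaceId = "nutrients"

        val entity = Entity.newBuilder()
                .setKey(keyBuilder.build())
                .putProperties("id", makeValue(c.element().id).build())
                .putProperties("group", makeValue(c.element().group).build())
                .build()

        c.output(entity)
    }
}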
EDIT: I finally managed to get my entities into Datastore. After seeing this example I decided to use Dataflow 1.9.1 and ditch Beam (2.1.0). Here is my code:
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.io.datastore.DatastoreIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.datastore.v1.Entity;
import com.google.datastore.v1.Key;
import com.opencsv.CSVParser;
import javax.annotation.Nullable;
import java.util.UUID;
import static com.google.datastore.v1.client.DatastoreHelper.makeKey;
import static com.google.datastore.v1.client.DatastoreHelper.makeValue;
public class PipelineClass {

    static class CreateEntitiesFn extends DoFn<String, Entity> {
        private final String namespace;
        private final String kind;
        private final Key ancestorKey;

        CreateEntitiesFn(String namespace, String kind) {
            this.namespace = namespace;
            this.kind = kind;
            ancestorKey = makeAncestorKey(namespace, kind);
        }

        Entity makeEntity(String id, String group) {
            Entity.Builder entityBuilder = Entity.newBuilder();
            Key.Builder keyBuilder = makeKey(ancestorKey, kind,
                    UUID.randomUUID().toString());
            if (namespace != null) {
                keyBuilder.getPartitionIdBuilder().setNamespaceId(namespace);
            }
            entityBuilder.setKey(keyBuilder.build());
            entityBuilder.getMutableProperties().put("id", makeValue(id).build());
            entityBuilder.getMutableProperties().put("group", makeValue(group).build());
            return entityBuilder.build();
        }

        @Override
        public void processElement(ProcessContext c) throws Exception {
            CSVParser parser = new CSVParser();
            String[] parts = parser.parseLine(c.element());
            String id = parts[0];
            String group = parts[1];
            c.output(makeEntity(id, group));
        }
    }

    static Key makeAncestorKey(@Nullable String namespace, String kind) {
        Key.Builder keyBuilder = makeKey(kind, "root");
        if (namespace != null) {
            keyBuilder.getPartitionIdBuilder().setNamespaceId(namespace);
        }
        return keyBuilder.build();
    }

    public interface Options extends PipelineOptions {
        @Description("Path of the file to read from and store to Cloud Datastore")
        @Default.String("gs://bucket/foodgroup.csv")
        String getInput();
        void setInput(String value);

        @Description("Dataset ID to read from Cloud Datastore")
        @Default.String("simplesample")
        String getProject();
        void setProject(String value);

        @Description("Cloud Datastore Entity Kind")
        @Default.String("FoodGroup")
        String getKind();
        void setKind(String value);

        @Description("Dataset namespace")
        @Default.String("nutrients")
        String getNamespace();
        void setNamespace(@Nullable String value);

        @Description("Number of output shards")
        @Default.Integer(0)
        int getNumShards();
        void setNumShards(int value);
    }

    public static void main(String args[]) {
        PipelineOptionsFactory.register(Options.class);
        Options options = PipelineOptionsFactory.fromArgs(args).as(Options.class);
        Pipeline p = Pipeline.create(options);

        p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
                .apply(ParDo.named("CreateEntities").of(
                        new CreateEntitiesFn(options.getNamespace(), options.getKind())))
                .apply(DatastoreIO.v1().write().withProjectId(options.getProject()));

        p.run();
    }
}
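Two notes on this working version: makeAncestorKey() parents every entity under a single synthetic (kind, "root") ancestor, so everything lands in one entity group, which can throttle sustained write throughput; the ancestor can be dropped if the entities don't need transactional grouping. The UUID key names are generated client-side, which avoids an allocateId() round trip per element. On Dataflow 1.9.1 the pipeline can be launched with the usual flags, e.g. --project, --stagingLocation and --runner=BlockingDataflowPipelineRunner.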