Apache Beam 2.1.0: unable to upload to Datastore following the example below

Date: 2017-09-18 14:27:21

Tags: google-app-engine google-cloud-datastore google-cloud-dataflow apache-beam

I can't upload entities to Cloud Datastore through the Apache Beam Java SDK (2.1.0). Here is my code:

import com.google.cloud.datastore.DatastoreOptions
import com.google.cloud.datastore.Entity
import com.opencsv.CSVParser
import org.apache.beam.runners.dataflow.DataflowRunner
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions
import org.apache.beam.sdk.Pipeline
import org.apache.beam.sdk.io.TextIO
import org.apache.beam.sdk.io.gcp.datastore.DatastoreIO
import org.apache.beam.sdk.options.PipelineOptionsFactory
import org.apache.beam.sdk.transforms.DoFn
import org.apache.beam.sdk.transforms.MapElements
import org.apache.beam.sdk.transforms.ParDo
import org.apache.beam.sdk.transforms.SimpleFunction
import java.io.Serializable


object PipelineClass {

class FoodGroup(var id: String? = null,
                var group: String? = null) : Serializable

class CreateGroupsFn : SimpleFunction<String, FoodGroup>() {
    override fun apply(line: String?): FoodGroup {
        val group = FoodGroup()
        val parser = CSVParser()
        val parts = parser.parseLine(line)
        group.id = parts[0].trim()
        group.group = parts[1].trim()

        return group
    }
}

class CreateEntitiesFn : DoFn<FoodGroup, Entity>() {

    @ProcessElement
    fun processElement(c: ProcessContext) {

        val datastore = DatastoreOptions.getDefaultInstance().service

        val keyFactory = datastore.newKeyFactory()
                .setKind("FoodGroup")
                .setNamespace("nutrients")

        val key = datastore.allocateId(keyFactory.newKey())

        val entity = Entity.newBuilder(key)
                .set("id", c.element().id)
                .set("group", c.element().group)
                .build()

        c.output(entity)
    }
}

@JvmStatic fun main(args: Array<String>) {

    val options = PipelineOptionsFactory.`as`(DataflowPipelineOptions::class.java)
    options.runner = DataflowRunner::class.java
    options.project = "simplesample"
    options.jobName = "fgUpload"

    val pipeline = Pipeline.create(options)

    pipeline.apply(TextIO.read().from("gs://bucket/foodgroup.csv"))
            .apply(MapElements.via(CreateGroupsFn()))
            .apply(ParDo.of(CreateEntitiesFn()))
            //error occurs below...
            .apply(DatastoreIO.v1().write()
                    .withProjectId(options.project))

    pipeline.run()
}

}

Here is the error I get:

PipelineClass.kt: (75, 24): Type mismatch: inferred type is DatastoreV1.Write! but PTransform<in PCollection<Entity!>!, PDone!>! was expected

I've tried SimpleFunction, DoFn, and PTransform (both composite and non-composite) for the String-to-Entity conversion, without success.

What am I doing wrong?
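(For context: the two Entity types involved in the error are different classes. DatastoreIO.v1().write() in Beam 2.1.0 is a PTransform over PCollection&lt;com.google.datastore.v1.Entity&gt;, the protobuf model, while the pipeline above emits com.google.cloud.datastore.Entity from the google-cloud client library, so the final apply() cannot type-check. Below is a minimal, untested sketch of the DoFn rewritten against the protobuf class, reusing the FoodGroup class above; CreateProtoEntitiesFn and the UUID-based key name are illustrative stand-ins, the latter replacing datastore.allocateId().)

import com.google.datastore.v1.Entity
import com.google.datastore.v1.client.DatastoreHelper.makeKey
import com.google.datastore.v1.client.DatastoreHelper.makeValue
import org.apache.beam.sdk.transforms.DoFn
import java.util.UUID

// Hypothetical variant of CreateEntitiesFn that emits the protobuf-based
// com.google.datastore.v1.Entity expected by DatastoreIO.v1().write().
class CreateProtoEntitiesFn : DoFn<FoodGroup, Entity>() {

    @ProcessElement
    fun processElement(c: ProcessContext) {
        // Key of kind "FoodGroup" in the "nutrients" namespace; the random
        // UUID key name stands in for the allocateId() call used above.
        val keyBuilder = makeKey("FoodGroup", UUID.randomUUID().toString())
        keyBuilder.partitionIdBuilder.setNamespaceId("nutrients")

        val entity = Entity.newBuilder()
                .setKey(keyBuilder)
                .putProperties("id", makeValue(c.element().id).build())
                .putProperties("group", makeValue(c.element().group).build())
                .build()

        c.output(entity)
    }
}

With the output type changed this way, .apply(DatastoreIO.v1().write().withProjectId(options.project)) should type-check.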

EDIT: I finally managed to get my entities into Datastore. After seeing this example, I decided to use Dataflow 1.9.1 and ditch Beam (2.1.0). Here is my code:

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.io.datastore.DatastoreIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.datastore.v1.Entity;
import com.google.datastore.v1.Key;
import com.opencsv.CSVParser;

import javax.annotation.Nullable;
import java.util.UUID;

import static com.google.datastore.v1.client.DatastoreHelper.makeKey;
import static com.google.datastore.v1.client.DatastoreHelper.makeValue;


public class PipelineClass {


static class CreateEntitiesFn extends DoFn<String, Entity> {
    private final String namespace;
    private final String kind;
    private final Key ancestorKey;

    CreateEntitiesFn(String namespace, String kind) {
        this.namespace = namespace;
        this.kind = kind;

        ancestorKey = makeAncestorKey(namespace, kind);
    }

    Entity makeEntity(String id, String group) {
        Entity.Builder entityBuilder = Entity.newBuilder();
        Key.Builder keyBuilder = makeKey(ancestorKey, kind,
                UUID.randomUUID().toString());

        if (namespace != null) {
            keyBuilder.getPartitionIdBuilder().setNamespaceId(namespace);
        }

        entityBuilder.setKey(keyBuilder.build());
        entityBuilder.getMutableProperties().put("id",
                makeValue(id).build());
        entityBuilder.getMutableProperties().put("group",
                makeValue(group).build());

        return entityBuilder.build();
    }

    @Override
    public void processElement(ProcessContext c) throws Exception {
        CSVParser parser = new CSVParser();
        String[] parts = parser.parseLine(c.element());
        String id = parts[0];
        String group = parts[1];

        c.output(makeEntity(id, group));
    }
}

static Key makeAncestorKey(@Nullable String namespace, String kind) {
    Key.Builder keyBuilder = makeKey(kind, "root");
    if (namespace != null) {
        keyBuilder.getPartitionIdBuilder().setNamespaceId(namespace);
    }
    return keyBuilder.build();
}


public interface Options extends PipelineOptions {
    @Description("Path of the file to read from and store to Cloud Datastore")
    @Default.String("gs://bucket/foodgroup.csv")
    String getInput();

    void setInput(String value);

    @Description("Dataset ID to read from Cloud Datastore")
    @Default.String("simplesample")
    String getProject();

    void setProject(String value);

    @Description("Cloud Datastore Entity Kind")
    @Default.String("FoodGroup")
    String getKind();

    void setKind(String value);

    @Description("Dataset namespace")
    @Default.String("nutrients")
    String getNamespace();

    void setNamespace(@Nullable String value);

    @Description("Number of output shards")
    @Default.Integer(0)
    int getNumShards();

    void setNumShards(int value);
}


public static void main(String args[]) {

    PipelineOptionsFactory.register(Options.class);
    Options options = PipelineOptionsFactory.fromArgs(args).as(Options.class);

    Pipeline p = Pipeline.create(options);
    p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
            .apply(ParDo.named("CreateEntities")
                    .of(new CreateEntitiesFn(options.getNamespace(), options.getKind())))
            .apply(DatastoreIO.v1().write().withProjectId(options.getProject()));

    p.run();
}
}
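A side note on the working version: makeAncestorKey gives every FoodGroup entity the same synthetic parent key of kind FoodGroup and name "root", which places them all in a single Datastore entity group, and the UUID-based key names take the place of the allocateId() call from the Beam attempt. The unused numShards option appears to be carried over from the linked example.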

0 Answers:

No answers