我的要求是从Google Cloud Storage中的csv文件读取并将其加载到Google Datastore。下面是代码段。
import com.google.datastore.v1.Entity;
import com.google.datastore.v1.Key;
import com.opencsv.CSVParser;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.datastore.DatastoreIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import javax.annotation.Nullable;
import java.util.UUID;
import static com.google.datastore.v1.client.DatastoreHelper.makeKey;
import static
com.google.datastore.v1.client.DatastoreHelper.makeValue;
public class PipelineClass {
static class CreateEntitiesFn extends DoFn<String, Entity> {
/**
*
*/
private static final Logger LOG = LoggerFactory.getLogger(PipelineClass.class);
private static final long serialVersionUID = 1L;
private final String namespace;
private final String kind;
private final Key ancestorKey;
CreateEntitiesFn(String namespace, String kind) {
this.namespace = namespace;
this.kind = kind;
ancestorKey = makeAncestorKey(namespace, kind);
}
Entity makeEntity(String id, String group) {
Entity.Builder entityBuilder = Entity.newBuilder();
Key.Builder keyBuilder = makeKey(ancestorKey, kind,
UUID.randomUUID().toString());
if (namespace != null) {
keyBuilder.getPartitionIdBuilder().setNamespaceId(namespace);
}
entityBuilder.setKey(keyBuilder.build());
entityBuilder.getMutableProperties().put("id",
makeValue(id).build());
entityBuilder.getMutableProperties().put("group",
makeValue(group).build());
return entityBuilder.build();
}
public void processElement(ProcessContext c) throws Exception {
CSVParser parser = new CSVParser();
String[] parts = parser.parseLine(c.element());
String id = parts[0];
String group = parts[1];
c.output(makeEntity(id, group));
}
}
static Key makeAncestorKey(@Nullable String namespace, String kind) {
Key.Builder keyBuilder = makeKey(kind, "root");
if (namespace != null) {
keyBuilder.getPartitionIdBuilder().setNamespaceId(namespace);
}
return keyBuilder.build();
}
public interface Options extends PipelineOptions {
@Description("Path of the file to read from and store to Cloud Datastore")
@Default.String("gs://myproject-263315.appspot.com/source/food.csv")
String getInput();
void setInput(String value);
@Description("Dataset ID to read from Cloud Datastore")
@Default.String("myproject-263315")
String getProject();
void setProject(String value);
@Description("Cloud Datastore Entity Kind")
@Default.String("FoodGroup")
String getKind();
void setKind(String value);
@Description("Dataset namespace")
@Default.String("nutrients")
String getNamespace();
void setNamespace(@Nullable String value);
@Description("Number of output shards")
@Default.Integer(0)
int getNumShards();
void setNumShards(int value);
}
public static void main(String args[]) {
// PipelineOptionsFactory.register(Options.class);
Options options =
PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
Pipeline p = Pipeline.create(options);
p.apply(TextIO.read().from(options.getInput()))
.apply(ParDo.of(new
CreateEntitiesFn(options.getNamespace(), options.getKind())))
.apply(DatastoreIO.v1().write().withProjectId(options.getProject()));
p.run();
}
}
我正在使用ecplise开发代码。下面是我的POM.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~ Copyright (C) 2017 Google Inc.
~
~ Licensed under the Apache License, Version 2.0 (the "License"); you may not
~ use this file except in compliance with the License. You may obtain a copy of
~ the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
~ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
~ License for the specific language governing permissions and limitations under
~ the License.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>my.dataflow1</groupId>
<artifactId>my.artifact1</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<bigquery.version>v2-rev374-1.23.0</bigquery.version>
<google-clients.version>1.23.0</google-clients.version>
<guava.version>20.0</guava.version>
<hamcrest.version>1.3</hamcrest.version>
<joda.version>2.4</joda.version>
<junit.version>4.12</junit.version>
<maven-compiler-plugin.version>3.7.0</maven-compiler-plugin.version>
<exec-maven-plugin.version>1.6.0</exec-maven-plugin.version>
<maven-jar-plugin.version>3.0.2</maven-jar-plugin.version>
<maven-shade-plugin.version>3.1.0</maven-shade-plugin.version>
<mockito.version>1.9.5</mockito.version>
<pubsub.version>v1-rev382-1.23.0</pubsub.version>
<slf4j.version>1.7.25</slf4j.version>
<surefire-plugin.version>2.20.1</surefire-plugin.version>
</properties>
<repositories>
<repository>
<id>ossrh.snapshots</id>
<name>Sonatype OSS Repository Hosting</name>
<url>https://oss.sonatype.org/content/repositories/snapshots/</url>
<releases>
<enabled>false</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>${maven-compiler-plugin.version}</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>${surefire-plugin.version}</version>
<configuration>
<parallel>all</parallel>
<threadCount>4</threadCount>
<redirectTestOutputToFile>true</redirectTestOutputToFile>
</configuration>
<dependencies>
<dependency>
<groupId>org.apache.maven.surefire</groupId>
<artifactId>surefire-junit47</artifactId>
<version>${surefire-plugin.version}</version>
</dependency>
</dependencies>
</plugin>
<!-- Ensure that the Maven jar plugin runs before the Maven
shade plugin by listing the plugin higher within the file. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>${maven-jar-plugin.version}</version>
</plugin>
<!--
Configures `mvn package` to produce a bundled jar ("fat jar") for runners
that require this for job submission to a cluster.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>${maven-shade-plugin.version}</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<finalName>${project.artifactId}-bundled-${project.version}</finalName>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/LICENSE</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>${exec-maven-plugin.version}</version>
<configuration>
<cleanupDaemonThreads>false</cleanupDaemonThreads>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
<dependencies>
<!-- Adds a dependency on a specific version of the Dataflow SDK. -->
<dependency>
<groupId>com.google.cloud.dataflow</groupId>
<artifactId>google-cloud-dataflow-java-sdk-all</artifactId>
<version>2.5.0</version>
</dependency>
<!-- Dependencies below this line are specific dependencies needed by the examples code. -->
<dependency>
<groupId>com.google.api-client</groupId>
<artifactId>google-api-client</artifactId>
<version>${google-clients.version}</version>
<exclusions>
<!-- Exclude an old version of guava that is being pulled
in by a transitive dependency of google-api-client -->
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava-jdk5</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.google.apis</groupId>
<artifactId>google-api-services-bigquery</artifactId>
<version>${bigquery.version}</version>
<exclusions>
<!-- Exclude an old version of guava that is being pulled
in by a transitive dependency of google-api-client -->
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava-jdk5</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.google.http-client</groupId>
<artifactId>google-http-client</artifactId>
<version>${google-clients.version}</version>
<exclusions>
<!-- Exclude an old version of guava that is being pulled
in by a transitive dependency of google-api-client -->
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava-jdk5</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>3.7</version>
</dependency>
<dependency>
<groupId>com.google.apis</groupId>
<artifactId>google-api-services-pubsub</artifactId>
<version>${pubsub.version}</version>
<exclusions>
<!-- Exclude an old version of guava that is being pulled
in by a transitive dependency of google-api-client -->
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava-jdk5</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>${joda.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<!-- Add slf4j API frontend binding with JUL backend -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-jdk14</artifactId>
<version>${slf4j.version}</version>
<!-- When loaded at runtime this will wire up slf4j to the JUL backend -->
<scope>runtime</scope>
</dependency>
<!-- Hamcrest and JUnit are required dependencies of PAssert,
which is used in the main code of DebuggingWordCount example. -->
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
<version>${hamcrest.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>${mockito.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
使用以下命令通过maven执行代码时,
mvn compile exec:java -e -Dexec.mainClass=com.dataflow1.PipelineClass -Dexec.args="--project=myproject-263315 --runner=DataflowRunner" -Pdataflow-runner
我遇到了提示错误,
Exception in thread "main" java.lang.IllegalArgumentException: Property [project] is marked with contradictory annotations. Found [[Default.String(value=myproject-263315) on my.dataflow1.PipelineClass$Options#getProject()], [Default.InstanceFactory(value=class org.apache.beam.sdk.extensions.gcp.options.GcpOptions$DefaultProjectFactory) on org.apache.beam.runners.dataflow.options.DataflowPipelineOptions#getProject()], [Default.InstanceFactory(value=class org.apache.beam.sdk.extensions.gcp.options.GcpOptions$DefaultProjectFactory) on org.apache.beam.sdk.extensions.gcp.options.GcpOptions#getProject()]].
有人可以帮助我更正此问题吗?坚持下去,并为此长期坚持。非常感谢您的帮助。
谢谢。
答案 0 :(得分:0)
选项“ Project”在PiplineOptions中是保留的。 “ JobName”是另一个。