我正在尝试下面的示例,在 Dataflow 中从 GCP Pub/Sub(发布/订阅)读取数据。
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import avro.shaded.com.google.common.collect.Lists;
import com.google.auth.oauth2.GoogleCredentials;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.transforms.windowing.SlidingWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.joda.time.Duration;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
/**
 * Streaming pipeline that reads text messages from a Pub/Sub topic, sums the number of words
 * seen in each sliding window, and appends one row per window to a BigQuery table.
 */
public class StreamDemoConsumer {

    /** Pipeline options: the Pub/Sub topic to read from and the BigQuery table to write to. */
    public static interface MyOptions extends DataflowPipelineOptions {
        @Description("Output BigQuery table <project_id>:<dataset_id>.<table_id>")
        @Default.String("coexon-seoul-dev:ledger_data_set.ledger_data2")
        String getOutput();

        void setOutput(String s);

        @Description("Input topic")
        @Default.String("projects/coexon-seoul-dev/topics/trading")
        String getInput();

        void setInput(String s);
    }

    /**
     * Counts the whitespace-separated words in a message.
     *
     * <p>A plain {@code line.split(" ").length} reports one word for an empty message and counts
     * an extra "word" for a leading space or for each repeated space; trimming first and
     * splitting on runs of whitespace avoids both.
     *
     * @param line the message text; may be {@code null} or blank
     * @return the number of words, {@code 0} for a {@code null} or blank message
     */
    static int countWords(String line) {
        if (line == null) {
            return 0;
        }
        String trimmed = line.trim();
        if (trimmed.isEmpty()) {
            return 0;
        }
        return trimmed.split("\\s+").length;
    }

    /**
     * Builds and runs the streaming pipeline.
     *
     * @param args command-line arguments parsed into {@link MyOptions}
     * @throws IOException declared for callers; kept so the signature stays unchanged
     */
    @SuppressWarnings("serial")
    public static void main(String[] args) throws IOException {
        MyOptions options =
                PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class);
        options.setStreaming(true);
        Pipeline p = Pipeline.create(options);

        String topic = options.getInput();
        String output = options.getOutput();

        // Build the table schema for the output table.
        List<TableFieldSchema> fields = new ArrayList<>();
        fields.add(new TableFieldSchema().setName("timestamp").setType("TIMESTAMP"));
        fields.add(new TableFieldSchema().setName("num_words").setType("INTEGER"));
        TableSchema schema = new TableSchema().setFields(fields);

        p
                .apply("GetMessages", PubsubIO.readStrings().fromTopic(topic))
                // Two-minute windows that start every 30 seconds.
                .apply("window",
                        Window.into(SlidingWindows
                                .of(Duration.standardMinutes(2))
                                .every(Duration.standardSeconds(30))))
                .apply("WordsPerLine", ParDo.of(new DoFn<String, Integer>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                        c.output(countWords(c.element()));
                    }
                }))
                // The input is windowed, so the sum is taken per window without a default value.
                .apply("WordsInTimeWindow", Sum.integersGlobally().withoutDefaults())
                .apply("ToBQRow", ParDo.of(new DoFn<Integer, TableRow>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                        TableRow row = new TableRow();
                        // Wall-clock time at which the row is built, not the window's own time.
                        row.set("timestamp", Instant.now().toString());
                        row.set("num_words", c.element());
                        c.output(row);
                    }
                }))
                .apply(BigQueryIO.writeTableRows().to(output)
                        .withSchema(schema)
                        .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
                        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

        p.run();
    }
}
我使用以下命令运行此代码。
sh run_oncloud4.sh coexon-seoul-dev ledgerbucket
然后代码运行良好
run_oncloud4.sh如下
#!/bin/bash
# Compiles the project with Maven and launches StreamDemoConsumer with the
# DataflowRunner.
#
# Arguments:
#   $1  GCP project id; also used to build the output table and input topic
#   $2  Cloud Storage bucket name used for the staging and temp locations
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 project-name bucket-name" >&2
  echo "Example: $0 cloud-training-demos cloud-training-demos" >&2
  # Report the usage error to the caller instead of exiting with status 0.
  exit 1
fi

PROJECT=$1
BUCKET=$2
MAIN=com.google.cloud.training.dataanalyst.javahelp.StreamDemoConsumer

echo "project=$PROJECT bucket=$BUCKET main=$MAIN"

export PATH=/usr/lib/jvm/java-8-openjdk-amd64/bin/:$PATH

# Pipeline arguments, joined with single spaces into one -Dexec.args value.
EXEC_ARGS="--project=$PROJECT"
EXEC_ARGS="$EXEC_ARGS --stagingLocation=gs://$BUCKET/staging/"
EXEC_ARGS="$EXEC_ARGS --tempLocation=gs://$BUCKET/staging/"
EXEC_ARGS="$EXEC_ARGS --output=$PROJECT:demos.streamdemo"
EXEC_ARGS="$EXEC_ARGS --input=projects/$PROJECT/topics/streamdemo"
EXEC_ARGS="$EXEC_ARGS --runner=DataflowRunner"

mvn compile -e exec:java \
  "-Dexec.mainClass=$MAIN" \
  "-Dexec.args=$EXEC_ARGS"
但是,当我像下面这样在本地运行上面的代码时:
sh run_locally.sh com.google.cloud.training.dataanalyst.javahelp.StreamDemoConsumer
就会出现“无法获取应用程序默认凭据”(Unable to get application default credentials)的错误消息。
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Exception in thread "main" java.lang.RuntimeException: Unable to get application default credentials. Please see https://developers.google.com/accounts/docs/application-default-credentials for details on how to specify credentials. This version of the SDK is dependent on the gcloud core component version 2015.02.05 or newer to be able to get credentials from the currently authorized user via gcloud auth.
    at org.apache.beam.sdk.extensions.gcp.auth.NullCredentialInitializer.throwNullCredentialException(NullCredentialInitializer.java:60)
    at org.apache.beam.sdk.extensions.gcp.auth.NullCredentialInitializer$NullCredentialHttpUnsuccessfulResponseHandler.handleResponse(NullCredentialInitializer.java:53)
    at com.google.cloud.hadoop.util.ChainingHttpRequestInitializer$3.handleResponse(ChainingHttpRequestInitializer.java:111)
    at com.google.api.client.http.HttpRequest.execute(HttpRequest.java:1015)
    at com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:419)
    at com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:352)
    at com.google.api.client.googleapis.services.AbstractGoogleClientRequest.execute(AbstractGoogleClientRequest.java:469)
    at org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl.executeWithRetries(BigQueryServicesImpl.java:854)
    at org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl$DatasetServiceImpl.getDataset(BigQueryServicesImpl.java:554)
    at org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.verifyDatasetPresence(BigQueryHelpers.java:196)
    at org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO$Write.validate(BigQueryIO.java:1486)
    at org.apache.beam.sdk.Pipeline$ValidateVisitor.enterCompositeTransform(Pipeline.java:640)
    at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:656)
    at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:660)
    at org.apache.beam.sdk.runners.TransformHierarchy$Node.access$600(TransformHierarchy.java:311)
    at org.apache.beam.sdk.runners.TransformHierarchy.visit(TransformHierarchy.java:245)
    at org.apache.beam.sdk.Pipeline.traverseTopologically(Pipeline.java:458)
    at org.apache.beam.sdk.Pipeline.validate(Pipeline.java:575)
    at org.apache.beam.sdk.Pipeline.run(Pipeline.java:310)
    at org.apache.beam.sdk.Pipeline.run(Pipeline.java:297)
    at com.google.cloud.training.dataanalyst.javahelp.StreamDemoConsumer.main(StreamDemoConsumer.java:115)
Process finished with exit code 1
run_locally.sh
#!/bin/bash
# Compiles the project with Maven and runs one main class through exec:java.
# No pipeline arguments are passed, so the class's own option defaults apply.
#
# Arguments:
#   $1  main class: either a bare class name (the javahelp package is
#       prepended) or a fully-qualified class name (used as given)
if [ "$#" -ne 1 ]; then
  echo "Usage: ./run_locally.sh mainclass-basename" >&2
  echo "Example: ./run_locally.sh Grep" >&2
  # Report the usage error to the caller instead of exiting with status 0.
  exit 1
fi

PACKAGE=com.google.cloud.training.dataanalyst.javahelp

# Prepending the package to an already-qualified name would produce a class
# that does not exist, so only bare names get the prefix.
case "$1" in
  *.*) MAIN=$1 ;;
  *)   MAIN=$PACKAGE.$1 ;;
esac

export PATH=/usr/lib/jvm/java-8-openjdk-amd64/bin/:$PATH

mvn compile -e exec:java "-Dexec.mainClass=$MAIN"
我已经设置了凭据
echo ${GOOGLE_APPLICATION_CREDENTIALS}
/Users/mattheu/coexon-seoul-dev-898d91a66539.json
但发生授权错误。
我该如何解决这个问题?
答案 0 :(得分:0)
我经历了类似的事情,以下步骤对我有用:
在笔记本电脑上安装Google Cloud SDK。此处的说明:https://cloud.google.com/sdk/install
关闭命令行,然后重新打开。
运行gcloud init
并按照说明进行操作,包括将SDK绑定到您的GCP帐户和项目。
按照说明手动设置服务帐户(https://cloud.google.com/docs/authentication/production#obtaining_and_providing_service_account_credentials_manually)。您只需要按照“手动获取和提供服务帐户凭据”下的说明进行操作。基本上,您会将带有服务帐户信息的文件保存到计算机中,该文件将用于您要执行的工作。
在您的 shell 配置文件中(在 macOS Catalina 及更高版本上为 ~/.zshenv),添加这一行:export GOOGLE_APPLICATION_CREDENTIALS="/path/to/the/file/you/saved/in/step/4"
关闭并重新打开 shell(终端),之后应该就一切正常了。
我不确定是否需要设置SDK(第1-3步),但是无论如何还是要设置好。