How do I launch a Cloud Dataflow job from a Google Cloud Function? I would like to use Google Cloud Functions as a mechanism for enabling cross-service composition.
Answer 0 (score: 9)
Below is a very basic example based on the WordCount sample. Note that you will need to include a copy of the Java binary in your Cloud Function deployment, since it is not present in the default environment. Likewise, you will need to package your deploy jar alongside the Cloud Function.
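For reference, one possible layout of the deployed function directory might look like this (jre1.8.0_73/ and MY_JAR.jar are the placeholder names used in the code below):

    .
    ├── index.js        (the Cloud Function source shown below)
    ├── jre1.8.0_73/    (bundled Java runtime, provides bin/java)
    └── MY_JAR.jar      (the packaged WordCount pipeline jar)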
module.exports = {
  wordcount: function (context, data) {
    const spawn = require('child_process').spawn;
    // Spawn the bundled JRE to run the WordCount pipeline jar.
    const child = spawn(
      'jre1.8.0_73/bin/java',
      ['-cp',
       'MY_JAR.jar',
       'com.google.cloud.dataflow.examples.WordCount',
       '--jobName=fromACloudFunction',
       '--project=MY_PROJECT',
       '--runner=BlockingDataflowPipelineRunner',
       '--stagingLocation=gs://STAGING_LOCATION',
       '--inputFile=gs://dataflow-samples/shakespeare/*',
       '--output=gs://OUTPUT_LOCATION'
      ],
      { cwd: __dirname });
    child.stdout.on('data', function (data) {
      console.log('stdout: ' + data);
    });
    child.stderr.on('data', function (data) {
      console.log('error: ' + data);
    });
    child.on('close', function (code) {
      console.log('closing code: ' + code);
      // Signal completion only after the blocking runner exits, so the
      // function is not torn down while the job is still being launched.
      context.success();
    });
  }
};
You could further enhance this example by using the non-blocking runner and having the function return the job ID, so that you can poll for job completion separately; a rough sketch of that polling follows. This pattern should also be valid for other SDKs, as long as their dependencies can be packaged into the Cloud Function.
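As a minimal sketch of that polling idea (not part of the original answer), you could query the job's state with the googleapis client; projectId and jobId here are placeholders you would substitute:

    const { google } = require('googleapis');

    // Fetch a Dataflow job's current state once; call repeatedly
    // (e.g. on a schedule) until a terminal state is reached.
    async function getJobState(projectId, jobId) {
      const auth = new google.auth.GoogleAuth({
        scopes: ['https://www.googleapis.com/auth/cloud-platform']
      });
      const dataflow = google.dataflow({ version: 'v1b3', auth });
      const res = await dataflow.projects.jobs.get({
        projectId: projectId,
        jobId: jobId
        // regional jobs may also require a `location` parameter
      });
      return res.data.currentState; // e.g. JOB_STATE_RUNNING, JOB_STATE_DONE
    }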
Answer 1 (score: 1)
The best way to launch it is via a Cloud Function, but be careful: if you trigger the Cloud Function from Google Cloud Storage, a Dataflow job will be launched for every uploaded file (a simple guard against this is sketched after the code).
const { google } = require('googleapis');

const templatePath = "gs://template_dir/df_template";
const project = "<project_id>";
const tempLoc = "gs://tempLocation/";
// Scope required to call the Dataflow API.
const authScope = ['https://www.googleapis.com/auth/cloud-platform'];

exports.PMKafka = (data, context, callback) => {
    const file = data;

    console.log(`Event ${context.eventId}`);
    console.log(`Event Type: ${context.eventType}`);
    console.log(`Bucket Name: ${file.bucket}`);
    console.log(`File Name: ${file.name}`);
    console.log(`Metageneration: ${file.metageneration}`);
    console.log(`Created: ${file.timeCreated}`);
    console.log(`Updated: ${file.updated}`);
    console.log(`Uploaded File Name - gs://${file.bucket}/${file.name}`);

    google.auth.getApplicationDefault(function (err, authClient, projectId) {
        if (err) {
            throw err;
        }

        if (authClient.createScopedRequired && authClient.createScopedRequired()) {
            authClient = authClient.createScoped(authScope);
        }

        const dataflow = google.dataflow({ version: 'v1b3', auth: authClient });

        // Runtime parameters passed to the Dataflow template.
        var inputDict = {
            inputFile: `gs://${file.bucket}/${file.name}`
            // <other_runtime_parameters>
        };

        var env = {
            tempLocation: tempLoc
        };

        var resource_opts = {
            parameters: inputDict,
            environment: env,
            // Job names must be unique; strip the ':' and '.' characters
            // from the timestamp, which Dataflow does not allow.
            // config.jobNamePrefix is assumed to be defined elsewhere.
            jobName: config.jobNamePrefix + "-" + new Date().toISOString().toLowerCase().replace(/[:.]/g, "-")
        };

        var opts = {
            gcsPath: templatePath,
            projectId: project,
            resource: resource_opts
        };

        console.log(`Dataflow Run Time Options - ${JSON.stringify(opts)}`);

        dataflow.projects.templates.launch(opts, function (err, response) {
            if (err) {
                console.error("problem running dataflow template, error was: ", err);
                slack.publishMessage(null, null, false, err); // external Slack helper, defined elsewhere
                return;
            }
            console.log("Dataflow template response: ", response);
            var jobid = response["data"]["job"]["id"];
            console.log("Dataflow Job ID: ", jobid);
        });

        callback();
    });
};
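To avoid launching a job for every object that lands in the bucket (the caveat above), one option is to filter early in the function. This is a hypothetical guard, not part of the original answer; the .csv suffix is an assumption you would replace with your own rule:

    exports.PMKafka = (data, context, callback) => {
        const file = data;
        // Hypothetical guard: only launch Dataflow for relevant files.
        if (!file.name.endsWith('.csv')) {
            console.log(`Skipping gs://${file.bucket}/${file.name}: not a pipeline input`);
            callback();
            return;
        }
        // ... launch the template as above ...
    };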