Trigger a Cloud Function whenever a new file is uploaded to a Cloud Storage bucket. The function should invoke a Dataproc job written in PySpark to read the file and load it into BigQuery.
I would like to know how to invoke a Dataproc job from a Cloud Function. Please advise.
Answer 0 (score: 1)
I was able to create a simple Cloud Function that triggers a Dataproc job on a GCS file-creation event. In this example the file in GCS contains a Pig query to execute, but you can follow the Dataproc API documentation to create a PySpark version (a sketch of the changed job payload follows the function code below).
index.js:
// Background Cloud Function, triggered by a GCS object-change event.
exports.submitJob = (event, callback) => {
  const google = require('googleapis');
  const projectId = 'my-project';
  const clusterName = 'my-cluster';
  const file = event.data;

  // Ignore events that carry no object name.
  if (!file.name) {
    console.log('Skipped processing file!');
    callback();
    return;
  }

  google.auth.getApplicationDefault(function (err, authClient, projectId) {
    if (err) {
      callback(err);
      return;
    }

    // The uploaded object itself holds the Pig query to run.
    const queryFileUri = 'gs://' + file.bucket + '/' + file.name;
    console.log('Using queryFileUri: ', queryFileUri);

    if (authClient.createScopedRequired && authClient.createScopedRequired()) {
      authClient = authClient.createScoped([
        'https://www.googleapis.com/auth/cloud-platform',
        'https://www.googleapis.com/auth/userinfo.email'
      ]);
    }

    // Submit the job to the existing Dataproc cluster.
    const dataproc = google.dataproc({ version: 'v1beta2', auth: authClient });
    dataproc.projects.regions.jobs.submit({
      projectId: projectId,
      region: 'global',
      resource: {
        job: {
          placement: { clusterName: clusterName },
          pigJob: {
            queryFileUri: queryFileUri
          }
        }
      }
    }, function (err, response) {
      if (err) {
        console.error('Error submitting job: ', err);
        callback(err);
        return;
      }
      console.log('Dataproc response: ', response);
      // Signal completion only after the submit call returns.
      callback();
    });
  });
};
Be sure to set Function to execute to submitJob.
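For the PySpark version mentioned above, only the job payload inside the submit call changes: replace pigJob with pysparkJob and point mainPythonFileUri at a PySpark script you have already staged in GCS. A minimal sketch, assuming a hypothetical script path (gs://my-bucket/jobs/load_to_bq.py is a placeholder, not something this answer creates for you):

      resource: {
        job: {
          placement: { clusterName: clusterName },
          pysparkJob: {
            // Placeholder path; stage your own PySpark script here.
            mainPythonFileUri: 'gs://my-bucket/jobs/load_to_bq.py',
            // Hand the newly uploaded object to the script as an argument.
            args: ['gs://' + file.bucket + '/' + file.name]
          }
        }
      }

The PySpark script is then responsible for reading that file and writing to BigQuery, for example via the BigQuery connector for Spark.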
package.json:
{
  "name": "sample-cloud-storage",
  "version": "0.0.1",
  "dependencies": { "googleapis": "^21.3.0" }
}
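If you deploy from the command line instead of the console, the trigger bucket and entry point can be set with flags along these lines (the bucket name is a placeholder, and the exact flag set varies between gcloud releases):

gcloud functions deploy submitJob --trigger-bucket my-bucket --entry-point submitJob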
The following blog post gave me a lot of ideas on how to get started: https://cloud.google.com/blog/big-data/2016/04/scheduling-dataflow-pipelines-using-app-engine-cron-service-or-cloud-functions