云功能触发DataPrep数据流作业

时间:2018-05-07 18:38:02

标签: google-cloud-functions google-cloud-dataflow

我试图执行一个小管道:

  1. 文件放入GCS存储桶 > 2. 文件放入GCS存储桶时,由云函数触发数据流作业(这一步不工作) > 3. 写入BigQuery表(这一步工作正常)
  2. 我已经通过Dataprep创建了一个Dataflow作业,因为它有很好的UI,可以在写入BigQuery表之前完成我的所有转换(写入BigQuery工作正常),并且当文件上传到GCS存储桶时,Cloud Function会被触发。但是,Cloud Function不会触发Dataflow作业(我在Dataprep中创建的那个)。

    下面是我的云函数示例代码。如果有人能指出数据流作业没有被触发的原因,我将不胜感激。

    
    
    /**
     * Background Cloud Function handler for a Cloud Storage bucket event.
     *
     * Logs the name of the object carried in `event.data`, immediately invokes
     * `callback`, and then assigns a second handler to
     * `exports.CF_GCStoDataFlow_v2`.
     *
     * NOTE(review): the inner `CF_GCStoDataFlow_v2` function is only *defined*
     * (assigned to `exports`) each time `processFile` runs; nothing in this
     * block ever calls it, so the Dataflow request below is never issued from
     * `processFile` itself.
     *
     * @param {!Object} event The Cloud Functions event.
     * @param {!Function} callback The callback function.
     */
    exports.processFile = (event, callback) => {
      console.log('Processing file: ' + event.data.name);
      // NOTE(review): completion is signalled here, before any of the code below runs.
      callback();
    
      const google = require('googleapis');
    
     // Nested handler: this assignment happens inside processFile's body rather
     // than at module top level.
     exports.CF_GCStoDataFlow_v2 = function(event, callback) {
      const file = event.data;
      // Only act when the event reports the object as existing and it has a name.
      if (file.resourceState === 'exists' && file.name) {
        google.auth.getApplicationDefault(function (err, authClient, projectId) {
          if (err) {
            // NOTE(review): thrown from inside an async callback, so the
            // function's `callback` is never invoked on this path.
            throw err;
          }
    
          // Request explicit OAuth scopes only when the client reports it needs them.
          if (authClient.createScopedRequired && authClient.createScopedRequired()) {
            authClient = authClient.createScoped([
              'https://www.googleapis.com/auth/cloud-platform',
              'https://www.googleapis.com/auth/userinfo.email'
            ]);
          }
    
          const dataflow = google.dataflow({ version: 'v1b3', auth: authClient });
    
          // Call projects.templates.create for the project resolved by the
          // default credentials, passing the template path and job parameters.
          dataflow.projects.templates.create({
            projectId: projectId,
            resource: {
              parameters: {
                // NOTE(review): inputFile and outputFile are the same object path.
                inputFile: `gs://${file.bucket}/${file.name}`,
                outputFile: `gs://${file.bucket}/${file.name}`
              },
              jobName: 'cloud-dataprep-csvtobq-v2-281345',
              gcsPath: 'gs://mygcstest-pipeline-staging/temp/'
            }
          }, function(err, response) {
            if (err) {
              console.error("problem running dataflow template, error was: ", err);
            }
            // The response is logged and callback is invoked whether or not
            // an error occurred; the error itself is not passed to callback.
            console.log("Dataflow template response: ", response);
            callback();
          });
    
        });
      }
     };
    };
    
    
    

    DataProc job

3 个答案:

答案 0 :(得分:2)

这个代码片段可能有所帮助。它使用了Dataflow API的另一个方法(launch),对我有效。请注意,你需要指定模板的URL,并检查元数据文件(可以在通过Dataprep界面执行的模板所在的同一目录中找到)是否包含正确的参数。

// Launch a Dataflow job from an existing template via projects.templates.launch.
// `dataflow`, `projectId`, `location`, `jobTemplateUrl`, `file` and `destination`
// are expected to be defined by the surrounding Cloud Function code.
dataflow.projects.templates.launch({
  projectId: projectId,
  location: location,
  gcsPath: jobTemplateUrl,
  resource: {
    parameters: {
      // Each value is a JSON object serialized as a string.
      inputLocations: `{"location1" :"gs://${file.bucket}/${file.name}"}`,
      // Fixed: the original literal ended with a stray `"}` that made the JSON invalid.
      outputLocations: `{"location1" : "gs://${destination.bucket}/${destination.name}"}`,
    },
    environment: {
      tempLocation: `gs://${destination.bucket}/${destination.tempFolder}`,
      zone: "us-central1-f",
    },
    jobName: 'my-job-name',
  },
}, function (err, response) {
  // Fixed: the original call was never closed; a result handler is supplied so
  // failures are visible in the function logs.
  if (err) {
    console.error("problem running dataflow template, error was: ", err);
    return;
  }
  console.log("Dataflow template response: ", response);
});

答案 1 :(得分:1)

你有没有提交Dataproc工作?它开始运行了吗? 以下文档可以给出一些开始的想法!

https://cloud.google.com/dataproc/docs/concepts/jobs/life-of-a-job

答案 2 :(得分:1)

看起来你将CF_GCStoDataFlow_v2放在processFile里面,所以代码的Dataflow部分没有执行。

您的功能应如下所示:

/**
 * Triggered from a message on a Cloud Storage bucket.
 *
 * @param {!Object} event The Cloud Functions event.
 * @param {!Function} The callback function.
 */
exports.CF_GCStoDataFlow_v2 = (event, callback) => {

  const google = require('googleapis');

  if (file.resourceState === 'exists' && file.name) {
    google.auth.getApplicationDefault(function (err, authClient, projectId) {
      if (err) {
        throw err;
      }

      if (authClient.createScopedRequired && authClient.createScopedRequired()) {
        authClient = authClient.createScoped([
          'https://www.googleapis.com/auth/cloud-platform',
          'https://www.googleapis.com/auth/userinfo.email'
        ]);
      }

      const dataflow = google.dataflow({ version: 'v1b3', auth: authClient });

      dataflow.projects.templates.create({
        projectId: projectId,
        resource: {
          parameters: {
            inputFile: `gs://${file.bucket}/${file.name}`,
            outputFile: `gs://${file.bucket}/${file.name}`
          },
          jobName: '<JOB_NAME>',
          gcsPath: '<BUCKET_NAME>'
        }
      }, function(err, response) {
        if (err) {
          console.error("problem running dataflow template, error was: ", err);
        }
        console.log("Dataflow template response: ", response);
        callback();
      });

    });
  }

  callback();
};

确保将“要执行的功能”下的值更改为CF_GCStoDataFlow_v2