我的管道具有以下简单的JSON输入
{"mac": "KC:FC:48:AE:F6:94", "status": 8, "datetime": "2015-07-13T21:15:02Z"}
基本上,输出应转到包含3列(mac,状态和日期时间)及其相应值的BigQuery表中
我的管道如下所示:
# -*- coding: utf-8 -*-
import os, json, logging, argparse, datetime, apache_beam as beam
from google.cloud import error_reporting
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
GOOGLE_PUBSUB_CHANNEL = 'projects/project-name/topics/topic-name'
GOOGLE_BIGQUERY_TABLE = 'bq-table'
GOOGLE_DATASET_ID = 'bq-dataset'
GOOGLE_PROJECT_ID = 'project-name'
class GoogleBigQuery():
client_error = error_reporting.Client()
@staticmethod
def get_schema_table(schema):
bigquery_schema = []
for key in range(len(schema)):
bigquery_schema.append('{}:{}'.format(schema[key].get('bigquery_field_name'), schema[key].get('bigquery_field_type')))
return ','.join(bigquery_schema)
fields_contract = (
{ 'bigquery_field_name': 'datetime', 'bigquery_field_type': 'STRING' },
{ 'bigquery_field_name': 'mac', 'bigquery_field_type': 'STRING' },
{ 'bigquery_field_name': 'status', 'bigquery_field_type': 'INTEGER' }
)
def parse_pubsub(line):
record = json.loads(line)
logging.info(record)
return record
class FilterStatus1(beam.DoFn):
def status_filter_1(self, data):
for r in data:
print(r)
logging.info(r)
if r["status"] == 1:
print(r)
logging.info(r)
yield r
def run(argv=None):
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_parameters = [
'--runner', 'DirectRunner'
, '--staging_location', 'gs://bucket/staging'
, '--temp_location', 'gs://bucket/temp'
, '--autoscaling_algorithm', 'THROUGHPUT_BASED' #'NONE' to disable autoscaling
, '--num_workers', '1'
, '--max_num_workers', '2'
, '--disk_size_gb', '30'
, '--worker_machine_type', 'n1-standard-1'
]
pipeline_options = PipelineOptions(pipeline_parameters)
pipeline_options.view_as(StandardOptions).streaming = True
pipeline_options.view_as(GoogleCloudOptions).job_name = os.path.basename(__file__).split('.')[0].replace('_', '-')
pipeline_options.view_as(GoogleCloudOptions).project = GOOGLE_PROJECT_ID
with beam.Pipeline(options=pipeline_options, argv=pipeline_parameters) as p:
# Read the pubsub topic into a PCollection.
lines = (
p
| 'ReadPubSubMessage' >> beam.io.ReadFromPubSub(GOOGLE_PUBSUB_CHANNEL).with_output_types(bytes)
| 'Decode UTF-8' >> beam.Map(lambda x: x.decode('utf-8'))
| 'ParsePubSub' >> beam.Map(parse_pubsub)
)
(
lines | 'Filter Status 1' >> beam.ParDo(FilterStatus1())
| 'WriteToBigQueryStatus1' >> beam.io.WriteToBigQuery(
GOOGLE_BIGQUERY_TABLE
, project=GOOGLE_PROJECT_ID
, dataset=GOOGLE_DATASET_ID
, schema=GoogleBigQuery.get_schema_table(fields_contract)
, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
#, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
)
)
logging.info('Pipeline finished')
result = p.run()
result.wait_until_finish()
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
我遇到以下错误:
RuntimeError: NotImplementedError [while running 'Filter Status 1']
我的目标是过滤状态列,当值为1时将其流式传输到BQ。
预先感谢您的帮助。
答案 0 :(得分:1)
您可以尝试使用FlatMap
进行过滤。
首先,定义一种过滤方法:
def FilterStatus1(row):
if row["status"] == 1:
yield row
然后您可以像这样申请
:lines = lines | beam.FlatMap(FilterStatus1) | 'WriteToBigQueryStatus1' ...
此外,请尝试将您的代码分成多个块或明确分配的步骤。单行发生的巨大转换,映射和过滤通常会将您的代码变成黑盒。
希望有帮助。谢谢。
答案 1 :(得分:0)
我这样修改了代码
class FilterStatus1(beam.DoFn):
def process(self, data):
if data["status"] == 1:
result = [{"datetime":data["datetime"], "mac":data["mac"], "status":data["status"]}]
logging.info(result)
return result