BigQuery-use_avro_logical_types在python脚本中不起作用

时间:2019-12-04 16:28:43

标签: python google-bigquery avro

这个疑问与问题“BigQuery use_avro_logical_types ignored in Python script”类似:我已经更新了所使用的 Google 库,但问题依然存在,因此想弄清楚我这种情况的原因。下面附上导入 avro 文件的脚本部分。注意:此问题出现在多个向 BigQuery 导入 avro 文件的管道中。

编辑:所提到的更改模式类型的解决方案不起作用,它给出了另一个错误。

BQ中的架构:

enter image description here

BQ中的数据:

enter image description here

import csv
import base64
import json
import io
import avro.schema
import avro.io
from avro.datafile import DataFileReader, DataFileWriter
import math
import os
import gcloud
from gcloud import storage
from google.cloud import bigquery
from oauth2client.client import GoogleCredentials
from datetime import datetime, timedelta, date
import numpy as np

# Work out the path used for the credentials: the directory of this script
# when it is run from a file, otherwise a hard-coded key file.
try:
    script_path = os.path.dirname(os.path.abspath(__file__)) + "/"
except NameError:
    # __file__ is undefined when the code is not executed from a file
    # (e.g. pasted into an interactive session).
    script_path = "C:\\Users\\me\\key.json"

# BigQuery credentials and settings.
# NOTE(review): in the try-branch above script_path is a directory, whereas
# the fallback is a key file; GOOGLE_APPLICATION_CREDENTIALS presumably has
# to point at the key file itself -- confirm.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = script_path

# Take "yesterday" once so both formatted strings always describe the same
# day, even when the script runs across midnight.
_yesterday = datetime.now() - timedelta(days=1)
folder = _yesterday.strftime('%Y-%m-%d')
data_folder = _yesterday.strftime('%Y%m%d')
# Despite its name this is a wildcard source URI, not a bare bucket name.
bucket_name = 'gs://bucket/*.csv'
dataset = 'dataset'
tabela = 'table_ids'

# Local Avro output file, the Avro schema file, and the object name used
# when the output file is uploaded.
new_file = 'C:\\Users\\me\\register_' + data_folder + '.avro'
file_schema = 'C:\\Users\\me\\schema.avsc'
new_filename = 'register_' + data_folder + '.avro'

bq1 = bigquery.Client()
# Delete the IDs left over from the previous run.
query1 = """DELETE FROM dataset.table_ids WHERE ID IS NOT NULL"""
query_job1 = bq1.query(query1)
# query() only starts the job; wait for it so the delete has completed
# before any later statement loads new rows into the same table.
query_job1.result()

def insert_bigquery(target_uri, dataset_id, table_id):
    """Load semicolon-delimited CSV data from a URI into a BigQuery table.

    The load job declares a single REQUIRED STRING column named ``id`` and
    the function waits for the job before returning.

    Args:
        target_uri: Source URI handed to ``load_table_from_uri``.
        dataset_id: ID of the dataset that holds the destination table.
        table_id: ID of the destination table.
    """
    client = bigquery.Client()
    destination = client.dataset(dataset_id).table(table_id)

    # CSV load settings: one mandatory string column, ';' as separator.
    csv_config = bigquery.LoadJobConfig()
    csv_config.source_format = bigquery.SourceFormat.CSV
    csv_config.field_delimiter = ";"
    csv_config.schema = [
        bigquery.SchemaField('id', 'STRING', mode='REQUIRED'),
    ]

    job = client.load_table_from_uri(
        target_uri,
        destination,
        job_config=csv_config,
    )
    print('Starting job {}'.format(job.job_id))
    job.result()  # wait for the load job to complete
    print('Job finished.')

# Load the CSV files matched by the source URI into the IDs table.
insert_bigquery(target_uri=bucket_name, dataset_id=dataset, table_id=tabela)

def get_data_from_bigquery():
    """Query BigQuery for every ``id`` stored in ``dataset.table_ids``.

    Returns:
        list: The rows of the query result, materialized into a list.
    """
    client = bigquery.Client()
    # Fetch the IDs
    sql = """SELECT id FROM dataset.table_ids"""
    return list(client.query(sql).result())

# Fetch the base64-encoded records and re-write them as one Avro container
# file at new_file.
a = get_data_from_bigquery()
length = len(a)
line_count = 0

# The schema is needed both to decode each record and to write the output
# file (per Apache Avro 1.8.2). Parse it once and close the file handle.
with open(file_schema, "rb") as schema_file:
    schema = avro.schema.Parse(schema_file.read())

writer = DataFileWriter(open(new_file, "wb"), avro.io.DatumWriter(), schema)
try:
    # One reader is enough: the schema does not change between records.
    rec_reader = avro.io.DatumReader(schema)
    for record_row in a:
        # The first column holds the base64-encoded record.
        payload = base64.b64decode(str(record_row[0]))
        # Skip the first 5 bytes before the Avro body.
        # NOTE(review): presumably a wire-format header (magic byte plus
        # 4-byte schema id) -- confirm against the producer.
        payload = payload[5:]
        decoder = avro.io.BinaryDecoder(io.BytesIO(payload))
        out = rec_reader.read(decoder)
        writer.append(out)
finally:
    # Always flush and close the output file, even if a record fails to decode.
    writer.close()

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket under ``insert/``.

    Args:
        bucket_name: Name of the bucket passed to ``get_bucket``.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name; it is prefixed with ``insert/``
            to form the name of the uploaded blob.
    """
    target_blob = (
        storage.Client()
        .get_bucket(bucket_name)
        .blob("insert/" + destination_blob_name)
    )
    target_blob.upload_from_filename(source_file_name)
    # The message reports the bare destination name, without the "insert/"
    # prefix that the blob was actually created with.
    summary = 'File {} uploaded to {}'.format(source_file_name, destination_blob_name)
    print(summary)

# Upload the generated Avro file to the bucket.
upload_blob(
    bucket_name='bucket',
    source_file_name=new_file,
    destination_blob_name=new_filename,
)

def insert_bigquery_avro(target_uri, dataset_id, table_id):
    """Append Avro data from a URI to a time-partitioned BigQuery table.

    The load job is configured with schema autodetection, append write
    disposition, ``use_avro_logical_types`` enabled and a default-constructed
    ``TimePartitioning``. The function waits for the job before returning.

    Args:
        target_uri: Source URI handed to ``load_table_from_uri``.
        dataset_id: ID of the dataset that holds the destination table.
        table_id: ID of the destination table.
    """
    client = bigquery.Client()
    destination = client.dataset(dataset_id).table(table_id)

    avro_config = bigquery.LoadJobConfig()
    avro_config.source_format = bigquery.SourceFormat.AVRO
    avro_config.autodetect = True
    # Request conversion of Avro logical types instead of their raw types.
    avro_config.use_avro_logical_types = True
    avro_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    avro_config.time_partitioning = bigquery.table.TimePartitioning()

    job = client.load_table_from_uri(
        target_uri,
        destination,
        job_config=avro_config,
    )
    print('Starting job {}'.format(job.job_id))
    job.result()  # wait for the load job to complete
    print('Job finished.')

Avro模式:

"fields": [
    {
      "name": "id",
      "type": {
        "type": "string",
        "avro.java.string": "String"
      },
      "doc": "the payment id"
    },
    {
      "name": "merchant",
      "type": {
        "type": "string",
        "avro.java.string": "String"
      },
      "doc": "the merchant who owns the payment"
    },
    {
      "name": "date",
      "type": {
        "type": "long",
        "logicalType": "timestamp-millis"
      },
      "doc": "the date where the transaction happend"
    },
    {
      "name": "amount",
      "type": {
        "type": "record",
        "name": "amount",
        "fields": [
          {
            "name": "amount",
            "type": [
              "null",
              {
                "type": "bytes",
                "logicalType": "decimal",
                "precision": 5,
                "scale": 5
              }
            ],
            "doc": "the original currency amount",
            "default": null
          },
          {
            "name": "foreignAmount",
            "type": [
              "null",
              {
                "type": "bytes",
                "logicalType": "decimal",
                "precision": 5,
                "scale": 5
              }
            ],
            "doc": "the foreign amount for the payment",
            "default": null
          },
          {
            "name": "code",
            "type": {
              "type": "string",
              "avro.java.string": "String"
            },
            "doc": "the destination currency code"
          }
        ],
        "default": null
      }
    },
    {
      "name": "exchange_rate",
      "type": {
        "type": "record",
        "name": "code",
        "fields": [
          {
            "name": "currency_code",
            "type": [
              "null",
              {
                "type": "string",
                "avro.java.string": "String"
              }
            ],
            "doc": "the exchange rate currency code",
            "default": null
          },
          {
            "name": "rate",
            "type": [
              "null",
              {
                "type": "bytes",
                "logicalType": "decimal",
                "precision": 5,
                "scale": 5
              }
            ],
            "default": null
          },
          {
            "name": "online_rate",
            "type": [
              "null",
              {
                "type": "bytes",
                "logicalType": "decimal",
                "precision": 5,
                "scale": 5
              }
            ],
            "default": null
          }
        ]
      },
      "doc": "The transaction exchange rate"
    },
    {
      "name": "consumer",
      "type": {
        "type": "record",
        "name": "Consumer",
        "fields": [
          {
            "name": "name",
            "type": [
              "null",
              {
                "type": "string",
                "avro.java.string": "String"
              }
            ],
            "doc": "the consumer's name",
            "default": null
          },
          {
            "name": "email",
            "type": [
              "null",
              {
                "type": "string",
                "avro.java.string": "String"
              }
            ],
            "doc": "the consumer's email address",
            "default": null
          },
          {
            "name": "external_id",
            "type": [
              "null",
              {
                "type": "string",
                "avro.java.string": "String"
              }
            ],
            "doc": "the consumer's external id when needed",
            "default": null
          },
          {
            "name": "national_id",
            "type": {
              "type": "string",
              "avro.java.string": "String"
            },
            "doc": "the national id"
          },
          {
            "name": "phone",
            "type": {
              "type": "string",
              "avro.java.string": "String"
            },
            "doc": "the consumer's phone number",
            "default": ""
          }
        ]
      }
    },
    {
      "name": "soft_descriptor",
      "type": [
        "null",
        {
          "type": "string",
          "avro.java.string": "String"
        }
      ],
      "doc": "the description as it will be shown at the customer's invoice",
      "default": null
    },
    {
      "name": "merchant_contract",
      "type": {
        "type": "enum",
        "name": "merchant_contract_type",
        "symbols": [
          "PAY",
          "BANK"
        ]
      },
      "default": "PAY"
    },
    {
      "name": "type",
      "type": {
        "type": "enum",
        "name": "payment_type",
        "symbols": [
          "INITIAL",
          "CREDIT_CARD",
          "DEBIT_CARD",
          "ONLINE_DEBIT",
          "BANK_SLIP",
          "DIGITAL_WALLET",
          "ELECTRONIC_BANK_TRANSFER"
        ]
      },
      "default": "INITIAL"
    },
    {
      "name": "card",
      "type": {
        "type": "record",
        "name": "card",
        "fields": [
          {
            "name": "type",
            "type": [
              "null",
              {
                "type": "enum",
                "name": "card_type",
                "symbols": [
                  "CARD",
                  "TOKEN"
                ]
              }
            ],
            "default": null
          },
          {
            "name": "mask_number",
            "type": [
              "null",
              {
                "type": "string",
                "avro.java.string": "String"
              }
            ],
            "default": null
          },
          {
            "name": "card_holder",
            "type": [
              "null",
              {
                "type": "string",
                "avro.java.string": "String"
              }
            ],
            "default": null
          },
          {
            "name": "brand",
            "type": [
              "null",
              {
                "type": "string",
                "avro.java.string": "String"
              }
            ],
            "default": null
          }
        ]
      }
    },
    {
      "name": "confirm",
      "type": "boolean",
      "doc": "indicates whether is self confirmed",
      "default": false
    },
    {
      "name": "installments",
      "type": "int",
      "doc": "Number of installments for the payment",
      "default": 1
    },
    {
      "name": "due_date",
      "type": [
        "null",
        {
          "type": "int",
          "logicalType": "date"
        }
      ],
      "default": null
    },
    {
      "name": "correlation_id",
      "type": [
        "null",
        {
          "type": "string",
          "avro.java.string": "String"
        }
      ],
      "doc": "the external customer correlationid",
      "default": null
    },
    {
      "name": "billing",
      "type": {
        "type": "record",
        "name": "Billing",
        "fields": [
          {
            "name": "national_id",
            "type": [
              "null",
              {
                "type": "string",
                "avro.java.string": "String"
              }
            ],
            "doc": "billing info",
            "default": null
          },
          {
            "name": "name",
            "type": [
              "null",
              {
                "type": "string",
                "avro.java.string": "String"
              }
            ],
            "doc": "the consumer address name",
            "default": null
          }
        ]
      }
    },
    {
      "name": "status",
      "type": {
        "type": "enum",
        "name": "payment_status",
        "symbols": [
          "INITIAL",
          "CONSUMER",
          "AUTHORIZED",
          "WAITING_CONFIRMATION",
          "CANCELED",
          "WAITING_CLEARING",
          "CLEARED",
          "TRANSFERENCE",
          "DECLINED_BY_ISSUER",
          "DECLINED_BY_BUSINESS_RULES",
          "CONFIRMED",
          "WAITING_CANCELING",
          "WAITING_CONSUMER",
          "TRANSFER_REQUESTED"
        ],
        "default": "INITIAL"
      }
    },
    {
      "name": "metadata",
      "type": {
        "type": "map",
        "values": {
          "type": "string",
          "avro.java.string": "String"
        },
        "avro.java.string": "String"
      }
    },
    {
      "name": "events",
      "type": {
        "type": "array",
        "items": {
          "type": "record",
          "name": "event",
          "fields": [
            {
              "name": "id",
              "type": {
                "type": "string",
                "avro.java.string": "String"
              },
              "default": "0"
            },
            {
              "name": "type",
              "type": {
                "type": "enum",
                "name": "event_type",
                "symbols": [
                  "AUTHORIZATION",
                  "AUTHENTICATION",
                  "CONFIRMATION",
                  "CANCELATION",
                  "CHECKOUT_CREATION",
                  "SETTLEMENT",
                  "TRANSFER_VALIDATION",
                  "TRANSFER_SCHEDULE",
                  "TRANSFERRED"
                ]
              }
            },
            {
              "name": "gateway",
              "type": [
                "null",
                {
                  "type": "string",
                  "avro.java.string": "String"
                }
              ],
              "default": null
            },
            {
              "name": "breadcrumb_id",
              "type": [
                "null",
                {
                  "type": "string",
                  "avro.java.string": "String"
                }
              ],
              "default": null
            },
            {
              "name": "request_time",
              "type": {
                "type": "long",
                "logicalType": "timestamp-millis"
              },
              "doc": "the moment where this request was received by the platform"
            },
            {
              "name": "response_time",
              "type": {
                "type": "long",
                "logicalType": "timestamp-millis"
              },
              "doc": "the moment where this request was returned by the platform"
            },
            {
              "name": "status",
              "type": {
                "type": "enum",
                "name": "event_status",
                "symbols": [
                  "SUCCESS",
                  "DENIED",
                  "ERROR",
                  "TIMEOUT",
                  "PENDING"
                ]
              }
            },
            {
              "name": "actor",
              "type": {
                "type": "enum",
                "name": "actor",
                "symbols": [
                  "AQ",
                  "GTW",
                  "CONCIL"
                ],
                "default": "GTW"
              },
              "default": "GTW"
            },
            {
              "name": "amount",
              "type": [
                "null",
                {
                  "type": "bytes",
                  "logicalType": "decimal",
                  "precision": 5,
                  "scale": 5
                }
              ],
              "doc": "the original currency amount",
              "default": null
            },
            {
              "name": "foreign_amount",
              "type": [
                "null",
                {
                  "type": "bytes",
                  "logicalType": "decimal",
                  "precision": 5,
                  "scale": 5
                }
              ],
              "doc": "the foreign amount for the payment",
              "default": null
            },
            {
              "name": "error",
              "type": {
                "type": "record",
                "name": "Error",
                "fields": [
                  {
                    "name": "code",
                    "type": [
                      "null",
                      {
                        "type": "string",
                        "avro.java.string": "String"
                      }
                    ],
                    "default": null
                  },
                  {
                    "name": "message",
                    "type": [
                      "null",
                      {
                        "type": "string",
                        "avro.java.string": "String"
                      }
                    ],
                    "default": null
                  }
                ]
              }
            },
            {
              "name": "message",
              "type": [
                "null",
                {
                  "type": "string",
                  "avro.java.string": "String"
                }
              ],
              "default": null
            },
            {
              "name": "fee_amount",
              "type": [
                "null",
                {
                  "type": "bytes",
                  "logicalType": "decimal",
                  "precision": 5,
                  "scale": 5
                }
              ],
              "doc": "the fee amount",
              "default": null
            },
            {
              "name": "net_amount",
              "type": [
                "null",
                {
                  "type": "bytes",
                  "logicalType": "decimal",
                  "precision": 5,
                  "scale": 5
                }
              ],
              "doc": "the net amount",
              "default": null
            },
            {
              "name": "metadata",
              "type": {
                "type": "map",
                "values": {
                  "type": "string",
                  "avro.java.string": "String"
                },
                "avro.java.string": "String"
              }
            },
            {
              "name": "internal_metadata",
              "type": {
                "type": "map",
                "values": {
                  "type": "string",
                  "avro.java.string": "String"
                },
                "avro.java.string": "String"
              }
            }
          ]
        }
      }
    },
    {
      "name": "bank_account",
      "type": [
        "null",
        {
          "type": "record",
          "name": "bank_account",
          "fields": [
            {
              "name": "name",
              "type": {
                "type": "string",
                "avro.java.string": "String"
              },
              "doc": "the bank name",
              "default": ""
            },
            {
              "name": "code",
              "type": {
                "type": "string",
                "avro.java.string": "String"
              },
              "doc": "the bank code",
              "default": ""
            },
            {
              "name": "agency",
              "type": {
                "type": "string",
                "avro.java.string": "String"
              },
              "doc": "the bank agency",
              "default": ""
            },
            {
              "name": "account",
              "type": {
                "type": "string",
                "avro.java.string": "String"
              },
              "doc": "the bank account",
              "default": ""
            },
            {
              "name": "document_number",
              "type": {
                "type": "string",
                "avro.java.string": "String"
              },
              "doc": "the bank document number (CNPJ)",
              "default": ""
            }
          ]
        }
      ],
      "doc": "The bank account values",
      "default": null
    }
  ]

1 个答案:

答案 0 :(得分:0)

请尝试在 avro 模式中,把所有时间戳列的 "type" 设置为如下类型:

scale_*