Creating a new table in BigQuery with Python and setting an expiration date

Asked: 2017-07-06 04:26:34

Tags: python google-bigquery create-table

Here is my code. It pulls the realtime database from Firebase, formats it as JSON, uploads it to Cloud Storage, and then loads it into BigQuery.

#standardsql
import json
import boto
import gcs_oauth2_boto_plugin
import os
import shutil
import StringIO
import tempfile
import time
import argparse
import uuid

from firebase import firebase
from google.cloud import storage
from google.cloud.storage import blob
from google.cloud import bigquery

firebase = firebase.FirebaseApplication('https://dataworks-356fa.firebaseio.com/')
result = firebase.get('/connection_info', None)
id_keys = map(str, result.keys())

with open("firetobq.json", "w") as outfile:
  for id in id_keys:
    json.dump(result[id], outfile, indent=None)
    outfile.write("\n")

client = storage.Client(project='dataworks-356fa')
bucket = client.get_bucket('dataworks-356fa-backups')
blob = bucket.blob('firetobq.json')
with open('firetobq.json', 'rb') as f:
  blob.upload_from_file(f)

dataset = 'dataworks-356fa'
source = 'gs://dataworks-356fa-backups/firetobq.json'


def load_data_from_gcs(dataset, test12, source):
    bigquery_client = bigquery.Client(dataset)
    dataset = bigquery_client.dataset('FirebaseArchive')
    table = dataset.table('test12')
    job_name = str(uuid.uuid4())
    job1.create_disposition = 'WRITE_TRUNCATE'
    job1.begin()

    job= bigquery_client.load_table_from_storage(
        job_name, table, "gs://dataworks-356fa-backups/firetobq.json")
    job.source_format = 'NEWLINE_DELIMITED_JSON'

    job.begin()
    wait_for_job(job)

def wait_for_job(job):
    while True:
        job.reload()
        if job.state == 'DONE':
            if job.error_result:
                raise RuntimeError(job.errors)
            return
        time.sleep(1)

load_data_from_gcs(dataset, 'test12', source)

How can I change this so that, instead of importing the data into table test12, it creates a new table and makes that table expire after 1 week? (I'm fairly sure the command for setting the expiration has to be in seconds; 1 week = 604800 seconds.) I know how to set the expiration date through the command line, but I'd rather have it done automatically here.

Here is the error I received after adding job1.

Traceback (most recent call last):
  File "firebasetobq2.py", line 63, in <module>
    load_data_from_gcs(dataset, 'test12', source)
  File "firebasetobq2.py", line 44, in load_data_from_gcs
    job1.create_disposition = 'WRITE_TRUNCATE'
NameError: global name 'job1' is not defined

1 Answer:

Answer 0: (score: 2)

If you want to set an expiration time for the table, this trick might do it:

from datetime import datetime, timedelta
from google.cloud.bigquery.schema import SchemaField

def load_data_from_gcs(dataset,
                       table_name,
                       table_schema,
                       source,
                       source_format,
                       expiration_time):
    bigquery_client = bigquery.Client()
    dataset = bigquery_client.dataset(dataset)
    table = dataset.table(table_name)
    table.schema = table_schema
    # Setting `expires` before creating the table makes it expire at that time.
    table.expires = expiration_time
    # Create the table only if it does not already exist.
    if not table.created:
        table.create()

    job_name = str(uuid.uuid4())
    job = bigquery_client.load_table_from_storage(
        job_name, table, source)
    job.source_format = source_format

    job.begin()
    wait_for_job(job)

dataset = 'FirebaseArchive'
table_name = 'test12'
gcs_source = 'gs://dataworks-356fa-backups/firetobq.json'
source_format = 'NEWLINE_DELIMITED_JSON'
table_schema = [SchemaField(field1), SchemaField(field2), (...)]
expiration_time = datetime.now() + timedelta(seconds=604800)

load_data_from_gcs(dataset,
                   table_name,
                   table_schema,
                   gcs_source,
                   source_format,
                   expiration_time)

Note that the only difference is the line of code where it sets:

table.expires = expiration_time

Its value must be of type datetime (defined here as expiration_time = datetime.now() + timedelta(seconds=604800)).
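
Just to connect this with the one-week figure from the question, the same value can be written with timedelta(weeks=1), which is exactly the 604800 seconds mentioned there:

from datetime import datetime, timedelta

one_week = timedelta(weeks=1)
assert one_week.total_seconds() == 604800  # 7 days * 24 hours * 3600 seconds
expiration_time = datetime.now() + one_week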

I'm not sure whether schema auto-detection can be used with the Python API, but you can still send this information using SchemaFields. For instance, if your table has two fields, user_id and job_id, both INTEGERs, then the schema would be:

table_schema = [SchemaField('user_id', field_type='INT64'),
                SchemaField('job_id', field_type='INT64')]

You can find more about how schemas work in BigQuery here.

[EDIT]:

Just saw your other question; if you want to truncate the table and then write data to it, you can do:

job.write_disposition = 'WRITE_TRUNCATE'
job.begin()

inside the load_data_from_gcs function. This will automatically delete the table and create a new one with the data from the storage file. You won't have to define a schema for a table whose schema was already defined, so this might be the easier solution for you.
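
Putting the pieces together, here is a rough end-to-end sketch of what the loading function could look like with both the one-week expiration and the truncate-on-write behaviour, using the same pre-0.28 google-cloud-bigquery calls as the answer above. The function name load_and_expire is made up for this example, the project/dataset/table/bucket names are the ones from the question, and the two SchemaFields are just the illustrative user_id/job_id fields from earlier, not the real Firebase schema:

import time
import uuid
from datetime import datetime, timedelta

from google.cloud import bigquery
from google.cloud.bigquery.schema import SchemaField


def load_and_expire(dataset_name, table_name, table_schema, source,
                    expiration_seconds=604800):
    bigquery_client = bigquery.Client(project='dataworks-356fa')
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)
    table.schema = table_schema

    # Expire the table one week (604800 seconds) after now.
    table.expires = datetime.now() + timedelta(seconds=expiration_seconds)
    if not table.created:
        table.create()

    # Load the newline-delimited JSON file from GCS, replacing any existing rows.
    job = bigquery_client.load_table_from_storage(
        str(uuid.uuid4()), table, source)
    job.source_format = 'NEWLINE_DELIMITED_JSON'
    job.write_disposition = 'WRITE_TRUNCATE'
    job.begin()

    # Poll until the load job finishes, surfacing any errors.
    while True:
        job.reload()
        if job.state == 'DONE':
            if job.error_result:
                raise RuntimeError(job.errors)
            return
        time.sleep(1)


# Example fields only -- replace with the real schema of the Firebase export.
table_schema = [SchemaField('user_id', field_type='INT64'),
                SchemaField('job_id', field_type='INT64')]

load_and_expire('FirebaseArchive', 'test12', table_schema,
                'gs://dataworks-356fa-backups/firetobq.json')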