我在Google Cloud Storage中存放了gzip压缩的csv文件,并使用Python根据命名约定自动检测架构(schema),在Google BigQuery中创建新表。如何对正在创建的表进行分区?数据中已经有一个“日期”列,我想用它作为分区依据。
# importing libraries
from google.cloud import bigquery
# Names of the files to ingest on the first load, in config-file order.
first_load_list = []

# Tracker file, opened in append mode so entries from earlier runs are kept.
tracker_file = open("tracker_file", "a")

# Read "key = value" settings from the config file.
#
# The setting name is looked up in the key only (the text before the first
# "="), so a value that happens to contain e.g. "dataset" can no longer be
# mistaken for a different setting.  Everything after the first "=" is the
# value, so values that themselves contain "=" are kept whole.
with open("ingestion.config", "r") as config_file:
    for line in config_file:
        setting_name, separator, setting_value = line.partition("=")
        if not separator:
            # Not a "key = value" line (blank line, comment, ...): skip it.
            continue
        setting_name = setting_name.strip()
        setting_value = setting_value.strip()

        # The order of the checks is significant: the first match wins.
        if "project_id" in setting_name:
            project_id = setting_value
        elif "dataset" in setting_name:
            dataset = setting_value
        elif "gcs_location" in setting_name:
            gcs_location = setting_value
        elif "bq1_target_table" in setting_name:
            bq1_target_table = setting_value
        elif "bq2_target_table" in setting_name:
            bq2_target_table = setting_value
        elif "bq1_first_load_filename" in setting_name:
            bq1_first_load_filename = setting_value
            first_load_list.append(bq1_first_load_filename)
        elif "bq2_first_load_filename" in setting_name:
            bq2_first_load_filename = setting_value
            first_load_list.append(bq2_first_load_filename)
        elif "gcs_bucket" in setting_name:
            gcs_bucket = setting_value

# Read the temporary bucket listing, one entry per line.  Entries are stored
# exactly as read, i.e. including their trailing newline.
with open("bucket_list.temp", "r") as bucket_list_file:
    bucket_list = list(bucket_list_file)
# Load-job settings shared by every file: CSV input with one header row to
# skip, and the schema is auto-detected.
# NOTE(review): no time partitioning is configured on this job config.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.autodetect = True

# BigQuery client bound to the configured project, plus a reference to the
# dataset that receives the tables.
client = bigquery.Client(project_id)
dataset_id = dataset
dataset_ref = client.dataset(dataset_id)
# Load each first-load file into the table selected by its naming convention:
# names containing "BQ2_2" go to bq2_target_table, names containing "BQ1_2"
# go to bq1_target_table, and anything else is skipped.
#
# The try/finally guarantees the tracker file is closed (and flushed) even
# when a load fails part-way through the list.
try:
    for filename in first_load_list:
        if "BQ2_2" in filename:
            target_table = bq2_target_table
        elif "BQ1_2" in filename:
            target_table = bq1_target_table
        else:
            continue

        table_ref = dataset_ref.table(target_table)
        uri = gcs_location + filename
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Processing file = " + uri)

        load_job = client.load_table_from_uri(
            uri.strip(),
            table_ref,
            job_config=job_config)  # API request

        # Explicit checks instead of assert, which is stripped under -O.
        if load_job.job_type != 'load':
            raise RuntimeError(
                "Unexpected job type for " + uri + ": "
                + str(load_job.job_type))
        load_job.result()  # Waits for table load to complete.
        if load_job.state != 'DONE':
            raise RuntimeError(
                "Load job for " + uri + " ended in state "
                + str(load_job.state))
        # Confirm the target table can be fetched after the load.
        client.get_table(table_ref)

        # Record the file only after it has loaded successfully.
        tracker_file.write(filename + "\n")
        print(filename.strip() + " processing complete\n")
finally:
    tracker_file.close()
这是我为首次加载运行的代码。首次加载创建好这些表之后,我只想向这些表中追加数据。我看了https://cloud.google.com/bigquery/docs/creating-partitioned-tables,但不知道如何在Python中实现。
有人可以帮我指出正确的方向吗?
答案 0 :(得分:1)
您可以使用job_config._properties['load']['timePartitioning'] = {"type":"DAY", 'field':'your_field'}
在加载时创建分区表。我刚刚用测试数据对它进行了测试,它按预期工作。
请注意,目前通过API进行分区仅支持'DAY'。