I want to pull the new rows out of my SQL Server table. The way I found to get the difference is the script below, and it runs perfectly against a MySQL table. But when I swap in the pymssql library to connect to this new database and apply the same diff-and-extract, I hit the error below.
I'd appreciate help understanding why I can't get the script to work against the table on SQL Server!
import os
import pandas as pd
import numpy as np
import mysql.connector as sql
from datetime import datetime, timedelta
from airflow.contrib.operators.mssql_to_gcs import MsSqlToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
import pyarrow
import airflow
from gcloud import storage
from google.cloud import bigquery
from airflow import DAG
import pyodbc
import pymssql
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
def update_table():
    query_bq = """SELECT * FROM dataset.table_test"""
    query_sql = """select id,
        col2,
        col3,
        col4,
        col5,
        col6,
        col7,
        col8,
        col9,
        replace(replace(replace(col10,';','|'),'\n',''),'"','') as col10,
        replace(replace(replace(col11,';','|'),'\n',''),'"','') as col11,
        col12,
        col13,
        replace(replace(replace(col14,';','|'),'\n',''),'"','') as col14,
        replace(replace(replace(col15,';','|'),'\n',''),'"','') as col15,
        replace(replace(replace(col16,';','|'),'\n',''),'"','') as col16,
        replace(replace(replace(col17,';','|'),'\n',''),'"','') as col17,
        replace(replace(replace(col18,';','|'),'\n',''),'"','') as col18,
        col19,
        replace(replace(replace(col20,';','|'),'\n',''),'"','') as col20,
        replace(replace(replace(col21,';','|'),'\n',''),'"','') as col21,
        col22,
        col23,
        col24,
        col25,
        col26,
        replace(replace(replace(col27,';','|'),'\n',''),'"','') as col27,
        col28,
        col29
        from operacoes_b2w"""
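    # the nested replace() calls strip ';', newlines and double quotes out of
    # the free-text columns so they cannot break the delimited CSV export below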
    bucket_name = 'bucket_name'
    schema_path_gcs = 'path/subpath/'
    schema_name_gcs = 'table_test.json'
    table_path_gcs = 'dir/table_test/'
    table_name_gcs = 'table_test' + '_' + datetime.now().strftime("%Y%m%d_%H%M%S") + '.csv'
    dataset_bq = 'dataset'
    table_bq = 'table_test'
    date_columns = ['col3', 'col13', 'col22']
    client = bigquery.Client()
    query_job = client.query(query_bq)
    df_bq = query_job.to_dataframe()
    ids_bq = df_bq.id.tolist()
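    # ids already present in BigQuery; any id missing from this list is treated as new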
    # SQL Server credentials and settings (pymssql; the MySQL version used mysql.connector here)
    db = pymssql.connect(server='ip_address', user='username', password='***', database='bdd', port='1433')
    df_mssql = pd.read_sql(query_sql, db, parse_dates=date_columns)
    ids_mssql = df_mssql.iloc[:, 0].tolist()
    ids_diff = np.setdiff1d(ids_mssql, ids_bq)
    df_diff1 = df_mssql.loc[df_mssql.id.isin(ids_diff), :]
    df_diff = df_diff1.replace({np.nan: None})
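    # df_diff now holds the rows present in SQL Server but not yet in BigQuery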
    if df_diff.shape[0] > 0:
        df_diff.to_csv(table_name_gcs)
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        #blob_schema = bucket.blob(schema_path_gcs+schema_name_gcs)
        #blob_schema.download_to_filename(schema_name_gcs)
        #schema_fields = client.schema_from_json(schema_name_gcs)
        #os.remove(schema_name_gcs)
        blob_table = bucket.blob(table_path_gcs+table_name_gcs)
        blob_table.upload_from_filename(table_name_gcs)
        os.remove(table_name_gcs)
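        # the CSV copy in GCS is only an archive; the BigQuery load below goes
        # straight from the dataframe, not from this file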
        job_config = bigquery.LoadJobConfig()
        job_config.write_disposition = 'WRITE_APPEND'
        job_config.schema = [
            bigquery.SchemaField('id', 'INTEGER', mode='REQUIRED'),
            bigquery.SchemaField('col2', 'INTEGER', mode='NULLABLE'),
            bigquery.SchemaField('col3', 'DATE', mode='REQUIRED'),
            bigquery.SchemaField('col4', 'FLOAT', mode='REQUIRED'),
            bigquery.SchemaField('col5', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('col6', 'FLOAT', mode='REQUIRED'),
            bigquery.SchemaField('col7', 'FLOAT', mode='REQUIRED'),
            bigquery.SchemaField('col8', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('col9', 'INTEGER', mode='NULLABLE'),
            bigquery.SchemaField('col10', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('col11', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('col12', 'INTEGER', mode='REQUIRED'),
            bigquery.SchemaField('col13', 'DATE', mode='NULLABLE'),
            bigquery.SchemaField('col14', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('col15', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('col16', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('col17', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('col18', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('col19', 'INTEGER', mode='NULLABLE'),
            bigquery.SchemaField('col20', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('col21', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('col22', 'DATE', mode='REQUIRED'),
            bigquery.SchemaField('col23', 'INTEGER', mode='REQUIRED'),
            bigquery.SchemaField('col24', 'INTEGER', mode='NULLABLE'),
            bigquery.SchemaField('col25', 'INTEGER', mode='NULLABLE'),
            bigquery.SchemaField('col26', 'INTEGER', mode='NULLABLE'),
            bigquery.SchemaField('col27', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('col28', 'INTEGER', mode='NULLABLE'),
            bigquery.SchemaField('col29', 'INTEGER', mode='NULLABLE')
        ]
        # Create and run the load job
        dataset_ref = client.dataset(dataset_bq)
        table_ref = dataset_ref.table(table_bq)
        job = client.load_table_from_dataframe(df_diff.reset_index(drop=True), table_ref,
                                               location="southamerica-east1", job_config=job_config)
        job.result()
        print(str(len(ids_diff)) + ' row(s) added.')
default_args = {
    'owner': 'bexs-data',
    'start_date': airflow.utils.dates.days_ago(0),
    'depends_on_past': False,
    # email alerting disabled
    #'email': ['airflow@apache.org'],
    'email_on_failure': False,
    'email_on_retry': False,
    # if the task fails, retry once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}
with DAG('dag_test', default_args=default_args, description='Python DAG',
         schedule_interval='25 9 * * *', catchup=False) as dag:
    python_task = PythonOperator(task_id='run_dag', python_callable=update_table)
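The same pattern works end to end when the source is MySQL via mysql.connector; only the pymssql version fails. A minimal standalone check of how pymssql types the columns would look like this (same placeholder connection settings as in the DAG above; the top 10 is only to keep it quick):

import pandas as pd
import pymssql

# connect with the same placeholder credentials used in the DAG above
db = pymssql.connect(server='ip_address', user='username', password='***',
                     database='bdd', port='1433')
# fetch a small sample and inspect the pandas dtypes it infers
df_sample = pd.read_sql("select top 10 * from operacoes_b2w", db)
print(df_sample.dtypes)  # numeric schema columns reported as 'object' are coming back as strings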
Error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/airflow/models/taskinstance.py", line 1094, in handle_failure
    task.on_failure_callback(context)
  File "/airflow/dags/git/dag_test.py", line 144, in
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table
Traceback (most recent call last):
  File "/usr/local/bin/airflow", line 32, in <module>
    args.func(args)
  File "/usr/local/lib/python3.7/site-packages/airflow/utils/cli.py", line 74, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 522, in run
    _run(args, dag, ti)
  File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 440, in _run
    pool=args.pool,
  File "/usr/local/lib/python3.7/site-packages/airflow/utils/db.py", line 74, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/airflow/models/taskinstance.py", line 926, in _run_raw_task
    result = task_copy.execute(context=context)
  File "/usr/local/lib/python3.7/site-packages/airflow/operators/python_operator.py", line 113, in execute
    return_value = self.execute_callable()
  File "/usr/local/lib/python3.7/site-packages/airflow/operators/python_operator.py", line 118, in execute_callable
    return self.python_callable(*self.op_args, **self.op_kwargs)
  File "/airflow/dags/git/operacoes_operacoes_b2w.py", line 132, in update_table
    job = client.load_table_from_dataframe(df_diff.reset_index(drop=True), table_ref, location="southamerica-east1", job_config=job_config)
  File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/client.py", line 1566, in load_table_from_dataframe
    parquet_compression=parquet_compression,
  File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 368, in dataframe_to_parquet
    arrow_table = dataframe_to_arrow(dataframe, bq_schema)
  File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 335, in dataframe_to_arrow
    bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
  File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 187, in bq_to_arrow_array
    return pyarrow.array(series, type=arrow_type)
  File "pyarrow/array.pxi", line 191, in pyarrow.lib.array
  File "pyarrow/array.pxi", line 78, in pyarrow.lib._ndarray_to_array
  File "pyarrow/error.pxi", line 95, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: an integer is required (got type str)
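From the last frames, pyarrow seems to be rejecting a column that the schema declares as INTEGER but that arrives from pymssql as Python str, which would also explain why the mysql.connector version works. If that is the cause, I assume coercing the numeric columns before load_table_from_dataframe would fix it; an untested sketch, with the column lists copied from the schema above:

# coerce the columns the BigQuery schema declares as INTEGER / FLOAT
int_cols = ['id', 'col2', 'col9', 'col12', 'col19', 'col23',
            'col24', 'col25', 'col26', 'col28', 'col29']
float_cols = ['col4', 'col5', 'col6', 'col7', 'col8']
for c in int_cols:
    df_diff[c] = pd.to_numeric(df_diff[c], errors='coerce').astype('Int64')  # nullable integers
for c in float_cols:
    df_diff[c] = pd.to_numeric(df_diff[c], errors='coerce')

Is that the right direction, or is something else about pymssql breaking the load?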