pyarrow.lib.ArrowTypeError: an integer is required (got type str)

Date: 2019-09-11 16:46:06

Tags: python pandas airflow pyarrow

I want to extract the new rows from my SQL Server table. The way I found to get the difference is the script below. For a MySQL table it works perfectly, but when I plugged in the pymssql library to connect to this new database and ran the same diff extraction, I ran into the error shown below (the full traceback is at the end of the post).

I would like help understanding why the script fails when the table is on SQL Server!
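In outline, the approach is: pull the ids already in BigQuery, pull the current rows from the source table, and append only the rows whose id is not yet in BigQuery. A stripped-down sketch with toy data (same pattern the full script below uses):

import numpy as np
import pandas as pd

ids_bq = [1, 2, 3]                                  # ids already loaded into BigQuery
df_src = pd.DataFrame({'id': [1, 2, 3, 4, 5]})      # rows currently in the source table
new_ids = np.setdiff1d(df_src.id.tolist(), ids_bq)  # -> array([4, 5])
df_new = df_src[df_src.id.isin(new_ids)]            # only these rows get appended

The full script: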

import os
import pandas as pd
import numpy as np
import mysql.connector as sql
from datetime import datetime, timedelta
from airflow.contrib.operators.mssql_to_gcs import MsSqlToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
import pyarrow
import airflow
from gcloud import storage
from google.cloud import bigquery
from airflow import DAG
import pyodbc
import pymssql
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

def update_table():
    query_bq = """SELECT * FROM dataset.table_test"""
    query_sql = """select id, \
                    col2, \
                    col3, \
                    col4, \
                    col5, \
                    col6, \
                    col7, \
                    col8, \
                    col9, \
                    replace(replace(replace(col10,';','|'),'\n',''),'"','') as col10, \
                    replace(replace(replace(col11,';','|'),'\n',''),'"','') as col11, \
                    col12, \
                    col13, \
                    replace(replace(replace(col14,';','|'),'\n',''),'"','') as col14, \
                    replace(replace(replace(col15,';','|'),'\n',''),'"','') as col15, \
                    replace(replace(replace(col16,';','|'),'\n',''),'"','') as col16, \
                    replace(replace(replace(col17,';','|'),'\n',''),'"','') as col17, \
                    replace(replace(replace(col18,';','|'),'\n',''),'"','') as col18, \
                    col19, \
                    replace(replace(replace(col20,';','|'),'\n',''),'"','') as col20, \
                    replace(replace(replace(col21,';','|'),'\n',''),'"','') as col21, \
                    col22, \
                    col23, \
                    col24, \
                    col25, \
                    col26, \
                    replace(replace(replace(col27,';','|'),'\n',''),'"','') as col27, \
                    col28, \
                    col29 \
                    from operacoes_b2w"""
    bucket_name = 'bucket_name'
    schema_path_gcs = 'path/subpath/'
    schema_name_gcs = 'table_test.json'
    table_path_gcs = 'dir/table_test/'
    table_name_gcs = 'table_test' + '_' + datetime.now().strftime("%Y%m%d_%H%M%S") + '.csv'
    dataset_bq = 'dataset'
    table_bq = 'table_test'
    date_columns = ['col3','col13','col22']

    client = bigquery.Client()
    query_job = client.query(query_bq)
    df_bq = query_job.to_dataframe()
    ids_bq = df_bq.id.tolist()

    # SQL Server credentials and settings (connecting with pymssql)
    db = pymssql.connect(server='ip_address', user='username', password='***', database='bdd', port='1433')

    df_mysql = pd.read_sql(query_sql, db, parse_dates=date_columns)
    ids_mysql = df_mysql.iloc[:,0].tolist()


    ids_diff = np.setdiff1d(ids_mysql, ids_bq)
    df_diff1 = df_mysql.loc[df_mysql.id.isin(ids_diff), :]
    df_diff = df_diff1.replace({np.nan: None})
    if df_diff.shape[0] > 0:
        df_diff.to_csv(table_name_gcs)


        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        #blob_schema = bucket.blob(schema_path_gcs+schema_name_gcs)
        #blob_schema.download_to_filename(schema_name_gcs)
        #schema_fields = client.schema_from_json(schema_name_gcs)
        #os.remove(schema_name_gcs)


        blob_table = bucket.blob(table_path_gcs+table_name_gcs)
        blob_table.upload_from_filename(table_name_gcs)
        os.remove(table_name_gcs)


        job_config = bigquery.LoadJobConfig()
        job_config.write_disposition = 'WRITE_APPEND'
        job_config.schema = [
            bigquery.SchemaField('id','INTEGER',mode='REQUIRED'),
            bigquery.SchemaField('col2','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col3','DATE',mode='REQUIRED'),
            bigquery.SchemaField('col4','FLOAT',mode='REQUIRED'),
            bigquery.SchemaField('col5','FLOAT',mode='NULLABLE'),
            bigquery.SchemaField('col6','FLOAT',mode='REQUIRED'),
            bigquery.SchemaField('col7','FLOAT',mode='REQUIRED'),
            bigquery.SchemaField('col8','FLOAT',mode='NULLABLE'),
            bigquery.SchemaField('col9','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col10','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col11','STRING',mode='REQUIRED'),
            bigquery.SchemaField('col12','INTEGER',mode='REQUIRED'),
            bigquery.SchemaField('col13','DATE',mode='NULLABLE'),
            bigquery.SchemaField('col14','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col15','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col16','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col17','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col18','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col19','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col20','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col21','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col22','DATE',mode='REQUIRED'),
            bigquery.SchemaField('col23','INTEGER',mode='REQUIRED'),
            bigquery.SchemaField('col24','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col25','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col26','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col27','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col28','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col29','INTEGER',mode='NULLABLE')
            ]

        # Create the load job and wait for it to finish
        dataset_ref = client.dataset(dataset_bq)
        table_ref = dataset_ref.table(table_bq)
        job = client.load_table_from_dataframe(df_diff.reset_index(drop=True), table_ref, location="southamerica-east1", job_config=job_config)
        job.result()

    print(str(len(ids_diff)) + ' row(s) added.')


default_args = {
    'owner': 'bexs-data',
    'start_date': airflow.utils.dates.days_ago(0),
    'depends_on_past': False,
    # Email notifications disabled
    #'email': ['airflow@apache.org'],
    'email_on_failure': False,
    'email_on_retry': False,
    'catchup': False,
    # If the task fails, retry after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('dag_test' , default_args=default_args, description='Python DAG', schedule_interval='25 9 * * *') as dag:
    python_task = PythonOperator(task_id='run_dag', python_callable=update_table, dag=dag)
    python_task

Error:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/airflow/models/taskinstance.py", line 1094, in handle_failure
    task.on_failure_callback(context)
  File "/airflow/dags/git/dag_test.py", line 144, in 
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table Traceback (most recent call last):
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/bin/airflow", line 32, in <module>
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     args.func(args)
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/utils/cli.py", line 74, in wrapper
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return f(*args, **kwargs)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 522, in run
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     _run(args, dag, ti)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 440, in _run
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     pool=args.pool,
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/utils/db.py", line 74, in wrapper
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return func(*args, **kwargs)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/models/taskinstance.py", line 926, in _run_raw_task
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     result = task_copy.execute(context=context)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/operators/python_operator.py", line 113, in execute
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return_value = self.execute_callable()
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/operators/python_operator.py", line 118, in execute_callable
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return self.python_callable(*self.op_args, **self.op_kwargs)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/airflow/dags/git/operacoes_operacoes_b2w.py", line 132, in update_table
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     job = client.load_table_from_dataframe(df_diff.reset_index(drop=True), table_ref, location="southamerica-east1", job_config=job_config)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/client.py", line 1566, in load_table_from_dataframe
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     parquet_compression=parquet_compression,
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 368, in dataframe_to_parquet
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     arrow_table = dataframe_to_arrow(dataframe, bq_schema)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 335, in dataframe_to_arrow
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 187, in bq_to_arrow_array
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return pyarrow.array(series, type=arrow_type)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "pyarrow/array.pxi", line 191, in pyarrow.lib.array
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "pyarrow/array.pxi", line 78, in pyarrow.lib._ndarray_to_array
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "pyarrow/error.pxi", line 95, in pyarrow.lib.check_status
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table pyarrow.lib.ArrowTypeError: an integer is required (got type str)
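
The traceback shows the failure happening inside load_table_from_dataframe: when BigQuery converts the DataFrame to parquet, pyarrow is asked to build an integer column but receives Python strings. A minimal check, offered only as a sketch and reusing the df_diff and job_config names from the script above, to flag which columns declared as INTEGER or FLOAT actually hold object dtype in the DataFrame:

# Sketch: list schema fields declared numeric whose pandas column is object dtype
# (e.g. strings returned by the driver, or the NaN -> None replacement above).
for field in job_config.schema:
    if field.field_type in ('INTEGER', 'FLOAT') and df_diff[field.name].dtype == object:
        print(field.name, 'is declared', field.field_type,
              'but pandas holds it as', df_diff[field.name].dtype)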

0 Answers:

There are no answers yet.