我正在尝试使用Azure数据工厂将14gb文件从FTP复制到我的Azure Data Lake存储中。当我执行管道时,它开始复制文件,并在半小时内复制了将近13.9 gb。
即使在管道运行了8个小时之后,也不会复制剩余数据,最终由于提供了有关该文件不可用的消息而失败。文件不可用的原因是源团队删除了下一个文件。
将数据集成单元（DIU, Data Integration Units）增加到250
{
"name": "job_fa",
"properties": {
"activities": [
{
"name": "set_parameters_adh_or_sch",
"description": "validate and set the parameter values based on the runtype sch or adh",
"type": "Lookup",
"dependsOn": [
{
"activity": "br_bs_loggin",
"dependencyConditions": [
"Succeeded"
]
}
],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"userProperties": [
{
"name": "CheckLookup1",
"value": "1"
}
],
"typeProperties": {
"source": {
"type": "SqlSource",
"sqlReaderStoredProcedureName": "[dbo].[usp_FeedParameters_main]",
"storedProcedureParameters": {
"FeedName_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_FeedName",
"type": "Expression"
}
},
"RunType_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_RunType",
"type": "Expression"
}
},
"SrcEnddate_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_SrcEndDate",
"type": "Expression"
}
},
"SrcStartdate_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_SrcStartDate",
"type": "Expression"
}
},
"TgtDate_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_TargetDate",
"type": "Expression"
}
},
"SrcHour_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_SrcHour",
"type": "Expression"
}
},
"TgtHour_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_TgtHour",
"type": "Expression"
}
}
}
},
"dataset": {
"referenceName": "AzureSql_cdpconfiguser",
"type": "DatasetReference"
},
"firstRowOnly": true
}
},
{
"name": "br_bs_loggin",
"description": "insert into the batch run and update the batch scheduler to started in case of sch run",
"type": "Lookup",
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"source": {
"type": "SqlSource",
"sqlReaderStoredProcedureName": "[dbo].[usp_BatchRun]",
"storedProcedureParameters": {
"FeedName_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_FeedName",
"type": "Expression"
}
},
"RunType_in": {
"type": "String",
"value": {
"value": "@pipeline().parameters.p_RunType",
"type": "Expression"
}
},
"Status_in": {
"type": "String",
"value": "Started"
}
}
},
"dataset": {
"referenceName": "AzureSql_cdpconfiguser",
"type": "DatasetReference"
},
"firstRowOnly": true
}
},
{
"name": "Check if file exists in target",
"type": "GetMetadata",
"dependsOn": [
{
"activity": "Copy Data WT to ADLS",
"dependencyConditions": [
"Succeeded"
]
}
],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"dataset": {
"referenceName": "AzureDataLakeStoreFile_wt_tgt_path_and_name",
"type": "DatasetReference",
"parameters": {
"TgtFilePath": "@activity('set_parameters_adh_or_sch').output.firstrow.TgtFilePath_wt_dt_out",
"TgtFileName": {
"value": "@activity('set_parameters_adh_or_sch').output.firstrow.TgtFileName_wt_dt_out",
"type": "Expression"
}
}
},
"fieldList": [
"exists",
"size"
]
}
},
{
"name": "Copy Data WT to ADLS",
"type": "Copy",
"dependsOn": [
{
"activity": "set_parameters_adh_or_sch",
"dependencyConditions": [
"Succeeded"
]
}
],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"userProperties": [
{
"name": "Source",
"value": "@{activity('set_parameters_adh_or_sch').output.firstrow.SrcFilePath_wo_dt_out}/@{activity('set_parameters_adh_or_sch').output.firstrow.SrcFileName_wt_dt_out}"
},
{
"name": "Destination",
"value": "@{activity('set_parameters_adh_or_sch').output.firstrow.TgtFilePath_wt_dt_out}/@{activity('set_parameters_adh_or_sch').output.firstrow.TgtFileName_wt_dt_out}"
}
],
"typeProperties": {
"source": {
"type": "FileSystemSource",
"recursive": true
},
"sink": {
"type": "AzureDataLakeStoreSink"
},
"enableStaging": false,
"dataIntegrationUnits": 250
},
"inputs": [
{
"referenceName": "FTP_SRC_FA",
"type": "DatasetReference",
"parameters": {
"SrcFileName": "@activity('set_parameters_adh_or_sch').output.firstrow.SrcFileName_wt_dt_out",
"SrcFilePath": "@activity('set_parameters_adh_or_sch').output.firstrow.SrcFilePath_wo_dt_out"
}
}
],
"outputs": [
{
"referenceName": "AzureDataLakeStoreFile_wt_tgt_path_and_name",
"type": "DatasetReference",
"parameters": {
"TgtFilePath": "@activity('set_parameters_adh_or_sch').output.firstrow.TgtFilePath_wt_dt_out",
"TgtFileName": {
"value": "@activity('set_parameters_adh_or_sch').output.firstrow.TgtFileName_wt_dt_out",
"type": "Expression"
}
}
}
]
},
{
"name": "br_bs_update_failed",
"type": "SqlServerStoredProcedure",
"dependsOn": [
{
"activity": "Copy Data WT to ADLS",
"dependencyConditions": [
"Failed"
]
}
],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"storedProcedureName": "[dbo].[usp_BatchRunUpdate]",
"storedProcedureParameters": {
"BatchId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.Batchid_out",
"type": "Expression"
},
"type": "String"
},
"FeedID": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.FeedId_out",
"type": "Expression"
},
"type": "Int32"
},
"FeedRunId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.BatchRunId_out",
"type": "Expression"
},
"type": "Int32"
},
"Status": {
"value": "Failed",
"type": "String"
}
}
},
"linkedServiceName": {
"referenceName": "AzureSqlDatabase1_cdp_dev_sql_db_appconfig",
"type": "LinkedServiceReference"
}
},
{
"name": "If Condition1",
"type": "IfCondition",
"dependsOn": [
{
"activity": "Check if file exists in target",
"dependencyConditions": [
"Succeeded"
]
}
],
"typeProperties": {
"expression": {
"value": "@equals(activity('Check if file exists in target').output.exists,true)",
"type": "Expression"
},
"ifFalseActivities": [
{
"name": "Stored Procedure_failed",
"type": "SqlServerStoredProcedure",
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"storedProcedureName": "[dbo].[usp_BatchRunUpdate]",
"storedProcedureParameters": {
"BatchId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.Batchid_out",
"type": "Expression"
},
"type": "String"
},
"FeedID": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.FeedId_out",
"type": "Expression"
},
"type": "Int32"
},
"FeedRunId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.BatchRunId_out",
"type": "Expression"
},
"type": "Int32"
},
"Status": {
"value": "Failed",
"type": "String"
}
}
},
"linkedServiceName": {
"referenceName": "AzureSqlDatabase1_cdp_dev_sql_db_appconfig",
"type": "LinkedServiceReference"
}
}
],
"ifTrueActivities": [
{
"name": "Stored Procedure1",
"type": "SqlServerStoredProcedure",
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"typeProperties": {
"storedProcedureName": "[dbo].[usp_BatchRunUpdate]",
"storedProcedureParameters": {
"BatchId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.Batchid_out",
"type": "Expression"
},
"type": "String"
},
"FeedID": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.FeedId_out",
"type": "Expression"
},
"type": "Int32"
},
"FeedRunId": {
"value": {
"value": "@activity('br_bs_loggin').output.firstrow.BatchRunId_out",
"type": "Expression"
},
"type": "Int32"
},
"Status": {
"value": "Succeeded",
"type": "String"
}
}
},
"linkedServiceName": {
"referenceName": "AzureSqlDatabase1_cdp_dev_sql_db_appconfig",
"type": "LinkedServiceReference"
}
}
]
}
}
],
"parameters": {
"p_FeedName": {
"type": "String",
"defaultValue": "fa_cpsmyid_vdumcap1"
},
"p_BatchType": {
"type": "String",
"defaultValue": "RAW"
},
"p_RunType": {
"type": "String",
"defaultValue": "sch"
},
"p_SrcStartDate": {
"type": "String"
},
"p_SrcEndDate": {
"type": "String"
},
"p_TargetDate": {
"type": "String"
},
"p_SrcHour": {
"type": "String"
},
"p_TgtHour": {
"type": "String"
}
},
"variables": {
"v_StartDate": {
"type": "String"
},
"v_EndDate": {
"type": "String"
}
},
"folder": {
"name": "Batch_load"
}
},
"type": "Microsoft.DataFactory/factories/pipelines"
}
答案 0（得分：0）
我认为,根据您的描述,所有关注点都在于提高传输性能。
首先，参考关于Data integration units（DIU）的说明：DIU
仅适用于Azure Integration Runtime，而不适用于Self-hosted Integration Runtime。您的源数据来自FTP，因此我认为即使您已将DIU设置为最大值，传输性能也不会受DIU设置的影响。
（当然，这是官方文档所描述的，您仍然可以向ADF团队求证）
那么也许您可以从此document中获得一些线索来改善复制性能。
例如:
1.尝试使用parallelCopies
属性来指定复制活动所使用的并行度。但它也受到一些限制（参见statements）。
2。尝试将接收器数据集设置为Azure SQL Data Warehouse
,因为它似乎比ADL具有更好的性能。
3。尝试压缩源数据集中的文件以减小文件大小。
4.考虑使用Azure云服务（例如Azure Blob存储）作为源数据集。据我所知，Azure服务之间的复制活动的性能通常更好。