优化SQL Server中的数据加载

时间:2016-07-21 06:29:10

标签: sql sql-server sql-server-2014 sql-optimization

我一直在努力将大约1800万条记录从Oracle上传到SQL Server。问题是加载任务需要几天才能完成,即使数据不是那么多。我已经标记了相应于各自完成执行时间的部分代码。

declare @v_sql VARCHAR(MAX)
declare @column_name VARCHAR(MAX) 
declare @column_name2 VARCHAR(MAX) =''
declare @column_name3 VARCHAR(MAX) =''
declare @tablename VARCHAR(MAX) ='TITLE_BLOCK_PARTS' --input parameter
declare @v_sql2 VARCHAR(MAX)=''
declare @temp_table VARCHAR(500) = '##'+@tablename+'_T'
declare @temp_table2    VARCHAR(500)
declare @v_entityid int = 14 --input parameter
declare @vcount INT = 1
declare @v_total_records bigint
DECLARE @CountResults TABLE (CountReturned bigint)
declare @v_total_ei bigint
declare @v_max_ei bigint
declare @v_total_no_col int
declare @col_to_exclude VARCHAR(MAX) = ',ID,CLASS,'
declare @stg_table vARCHAR(500)  = '##'+@tablename+'_TEMP'
declare @v_success int = -1
declare @v_tablename VARCHAR(MAX) = @tablename
declare @itemnumber varchar(500) ='';
declare @rev varchar(500) ='';
declare @v_stg_sql  VARCHAR(MAX) = ''
declare @v_temp_table vARCHAR(500)  = '##'+@tablename+'_T_ORI'
declare @v_sql3 VARCHAR(MAX) = ''
declare @v_countrownum bigint = 1
declare @v_rownum bigint 
DECLARE @v_rownumtab TABLE (ROWCountReturned bigint)

BEGIN

--SET @col_to_exclude =  ''''+REPLACE(@col_to_exclude,',',''',''') +'''' 

--Start: Following code takes maximum 5 to 7 minutes to complete
IF EXISTS (SELECT * FROM sys.tables WHERE name = @v_tablename)
Begin 
set @v_sql = ' DROP TABLE ' + @v_tablename
exec (@v_sql)
END

set @v_sql = 'SELECT *
into ' + @tablename +'
  FROM [ORCLTST]..[AGILE].'+@v_tablename;

exec (@v_sql)

--End: Following code takes maximum 5 to 7 minutes to complete

--Start: Following code takes maximum 4 minutes to complete
IF OBJECT_ID('tempdb..'+@stg_table) IS NOT NULL 
exec ('drop table ' + @stg_table)

SET @v_sql = 'SELECT * 
INTO '+@stg_table+
'
FROM ' + @tablename;

EXEC (@v_sql);

set @v_sql = ''

set @v_total_no_col = (SELECT COUNT(COLUMN_NAME)
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @tablename
AND COLUMN_NAME
NOT IN 
(
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @tablename
AND 
charindex(',' + CAST(COLUMN_NAME as nvarchar(20)) + ',', @col_to_exclude) > 0
)
)


DECLARE get_columns CURSOR FOR  
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @tablename
AND COLUMN_NAME
NOT IN 
(
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @tablename
AND 
charindex(',' + CAST(COLUMN_NAME as nvarchar(20)) + ',', @col_to_exclude) > 0
)

--SELECT COLUMN_NAME
--FROM INFORMATION_SCHEMA.COLUMNS
--WHERE TABLE_NAME = @tablename
--AND COLUMN_NAME NOT IN (@col_to_exclude)
--@tablename
--print @sql3

OPEN get_columns  
FETCH NEXT FROM get_columns INTO @column_name

WHILE @@FETCH_STATUS = 0          
 BEGIN  
--PRINT @column_name        

SET @column_name2 = replace(REPLACE(REPLACE(LTRIM(RTRIM(REPLACE(@column_name,'"',''))),')',''),'(',''),' ','')

--replace(@column_name2
--print 'ALTER TABLE ' + @sql5+ ' DROP constraint ' + @sql3
set @v_sql2+='CAST('+'['+@column_name+']'+' AS VARCHAR(8000))'+ 'AS '+ @column_name2 + ',
'
+
'
'


SET @v_sql3 += 'CAST('+'ISNULL(['+@column_name+'],'''')'+' AS VARCHAR(8000))'+ 'AS '+ @column_name2 + ',
'
+
'
'

SET @column_name3 += @column_name2 + ',
'+

'  
'

FETCH NEXT FROM get_columns INTO  @column_name        

      end  
     CLOSE get_columns          
 DEALLOCATE get_columns 
--Print @v_sql2   
set @v_sql2 = left(@v_sql2, len(@v_sql2)-5)

set @v_sql3 = left(@v_sql3, len(@v_sql3)-5)

SET @column_name3 = left(@column_name3, len(@column_name3)-7)

IF OBJECT_ID('tempdb..'+@temp_table) IS NOT NULL 
exec ('drop table ' + @temp_table)

IF OBJECT_ID('tempdb..'+@temp_table+'2') IS NOT NULL 
exec ('drop table ' + @temp_table+'2')

IF OBJECT_ID('tempdb..'+@v_temp_table) IS NOT NULL 
exec ('drop table ' + @v_temp_table)

IF OBJECT_ID('tempdb..'+@v_temp_table+'2') IS NOT NULL 
exec ('drop table ' + @v_temp_table+'2')

SET @v_sql = 'Select
    ROW_NUMBER() 
        OVER (ORDER BY number, REV) AS Row, 
        '
            +
    @v_sql2
    +
    '
    '
    +   


 '   INTO ' + @v_temp_table + ' 
 from 

 '

 +
 @tablename

PRINT @v_sql
EXEC (@V_SQL)



SET @v_sql = 'Select
    ROW_NUMBER() 
        OVER (ORDER BY number, REV) AS Row, 
        '
            +
    @v_sql3
    +
    '
    '
    +   


 '   INTO ' + @v_temp_table+'2' + ' 
 from 

 '

 +
 @v_temp_table

PRINT @v_sql
EXEC (@V_SQL)

SET @v_sql ='
Select  ROW
 ,ColumnName
       , ColumnValue
       INTO ' + @temp_table + ' 
FROM 
(
    Select
    ROW,
    '
    +
    @v_sql3
    +
    '
    '
    +
'

'
+   

 '   from 

 ' 
 +
 @v_temp_table+'2'
 +
 '
 '
 +
 '
 )t  
   UNPIVOT (ColumnValue for ColumnName 
           IN (
           '
           +
           @column_name3
           +
           '

           '
           +

            '   
               )
            )UP
            ';


PRINT @v_sql
--END;
--/*
EXEC (@V_SQL)

set @v_sql = ''

/*set @v_sql = 'with q1 as  (
SELECT t2.Id, t2.displayorder,t2.DisplayName ,T2.[Group],t1.* 
'
 +
 ' 
FROM '+@temp_table+ ' t1
INNER JOIN PLMColumn t2
ON t1.ColumnName = t2.name
where t2.EntityId = ' + cast(@v_entityid as varchar(100))
+
')
,q2 as(
    select      Id, ColumnName, displayorder,[Group], count(*) as c_nnum
    from q1

group by Id,ColumnName,displayorder,[Group])

select t1.id, t1.displayorder, t1.ColumnName, t2.columnvalue
into   '+ @temp_table+'2'+
'
from q2 t1
INNER JOIN '+@temp_table+ ' t2
ON t1.ColumnName = t2.ColumnName
WHERE T1.[Group] <> ''PAGE THREE''

' */

 --

set @temp_table2 = replace(@temp_table+'2', '##','');

IF EXISTS (SELECT * FROM sys.tables WHERE name = @temp_table2)
Begin 
set @v_sql = ' DROP TABLE ' + @temp_table2
exec (@v_sql)
END

set @v_Sql = 'SELECT t1.Row,t2.Id, t2.displayorder,t2.DisplayName ,T2.[Group],t1.ColumnName, t1.ColumnValue
 into   '+ @temp_table2 +
' FROM '+@temp_table+ ' t1
INNER JOIN PLMColumn t2
ON t1.ColumnName = t2.name
where t2.EntityId = ' + cast(@v_entityid as varchar(100)) + '
and T2.[Group] <> ''PAGE THREE'''

print @v_sql
exec(@v_sql)


set @v_sql = ' CREATE INDEX idx_'+@temp_table2+'_row
ON '+ @temp_table2 + ' (row)';

print @v_sql
exec(@v_sql)
--end
--/*
--set @v_sql = 'select COUNT(*) from ' + @tablename
SELECT @v_sql = N'SELECT COUNT(*) FROM' + QUOTENAME(@tablename);

INSERT @CountResults
EXEC(@v_sql)


set @v_total_records = (SELECT CountReturned FROM @CountResults)
print '@v_total_records '+ cast(@v_total_records as varchar(5000)) 

set @v_max_ei = (SELECT IDENT_CURRENT ('PLMENTITYINSTANCE'))
set @v_max_ei = @v_max_ei + 1
--(select ISNULL(MAX (id),0) from PLMEntityInstance)

print '@v_max_ei '+ cast(@v_max_ei as varchar(5000)) 

set @v_total_ei = ISNULL(@v_total_records,0) + @v_max_ei
print '@v_total_ei '+ cast(@v_total_ei as varchar(5000)) 
--END;


IF EXISTS (SELECT * FROM sys.tables WHERE name = 'ITEM_STG')
Begin 
set @v_sql = ' DROP TABLE ITEM_STG'
exec (@v_sql)
END
set @v_sql = 'CREATE TABLE [dbo].[ITEM_STG](
    [INSTANCE_ID] [bigint] NOT NULL,
    [ITEM_NUMBER] [varchar](max) NOT NULL,
    [REV] [varchar](max) NOT NULL)'

exec (@v_sql)

set @v_sql = ''
--END
--/*
WHILE (@vcount <= @v_total_records)
BEGIN

INSERT INTO PLMEntityInstance (EntityId)
VALUES (@v_entityid)


SET @vcount = @vcount + 1
--print 
END 


set @v_sql = ''

set @v_sql = N'with c AS
(select DISTINCT ROW from ' + QUOTENAME(@temp_table2) +')

select count(*) from c

'

INSERT @v_rownumtab
EXEC(@v_sql)


set @v_rownum = (select * from @v_rownumtab)

print @v_rownum

--End: Following code takes maximum 4 minutes to complete

--end
--/*


--Start: Following code takes days to complete
while (@v_max_ei <= @v_total_ei AND @v_countrownum <=@v_rownum)
BEGIN

set @v_sql = ''
--top('+CAST(@v_total_no_col AS varchar(100))+') 
set @v_sql = '

with q2
as(

select 

* from '
+ @temp_table2
 +
 '
 where row = '+ CAST(@v_countrownum AS varchar(5000)) + '

 )
INSERT INTO PLMColumnValue(EntityInstanceId, ColumnID, Value)

select '+ CAST(@v_max_ei AS varchar(5000)) + ' as Entityinstanceid, t.id as column_id, t.columnvalue from q2 t
order by DisplayOrder
'

print @v_countrownum
print @v_sql
exec (@v_sql)


set @itemnumber = (select Value from PLMColumnValue where columnid=(select id from PLMColumn where EntityId = @v_entityid and Name = 'number') and EntityInstanceId =  @v_max_ei)
set @rev = (select Value from PLMColumnValue where columnid=(select id from PLMColumn where EntityId = @v_entityid and Name = 'rev') and EntityInstanceId =  @v_max_ei)

print @itemnumber
print @rev

SET @v_stg_sql = ''
set @v_stg_sql = '
INSERT INTO ITEM_STG (INSTANCE_ID,ITEM_NUMBER,REV)
select ' +  CAST(@v_max_ei AS varchar(5000)) + ' , ' + ''''+@itemnumber+'''' + ' , '  +''''+ @rev+'''' + ''

print @v_stg_sql
exec (@v_stg_sql)


set @itemnumber = '';
set @rev = '';

SET @v_stg_sql = ''
set @v_sql = ''

SET @v_max_ei = @v_max_ei + 1;
set @v_countrownum = @v_countrownum + 1;
END

--End: Following code takes days to complete

END;

正在使用的主要表是PlmEntityInstance,PlmColumnValue和ITEM_STG。 PlmColumnValue表是加载所有迁移数据的地方,此表具有EAV模型,此处代码需要很长时间才能加载数据。我该如何优化此流程?在所有必需的修改之后和加载之前在表上创建所有数据的索引(其名称存储在变量@ temp_table2中)确实有很大帮助,但仍然不够好。我昨天在当地时间2100点启动了加载过程并且已经过了15个小时,它在1800万条记录中只加载了149088条记录。请帮助!

如果没有提供足够的信息,请告知我们。

请参阅下面的等待统计数据

WaitType    Wait_S  Resource_S  Signal_S    WaitCount   Percentage  AvgWait_S   AvgRes_S    AvgSig_S
CXPACKET    657100.57   636945.69   20154.88    338749067   59.49   0.0019  0.0019  0.0001
PREEMPTIVE_OLEDBOPS 268405.70   268405.70   0.00    24041   24.30   11.1645 11.1645 0.0000
OLEDB   91213.76    91213.76    0.00    34867   8.26    2.6160  2.6160  0.0000
ASYNC_NETWORK_IO    33316.35    32352.43    963.93  4060699 3.02    0.0082  0.0080  0.0002

示例数据放在以下链接上:https://www.dropbox.com/home/Public/Query/

上述链接中提供的csv中的数据有8531条记录。

您可以使用'TITLE_BLOCK_PARTS'

初始化参数@ temp_table2

1 个答案:

答案 0 :(得分:1)

使用tally table将所有WHILE循环替换为基于集INSERT的集合。例如,替换

declare @vcount INT = 1
WHILE (@vcount <= @v_total_records)
BEGIN

    INSERT INTO PLMEntityInstance (EntityId)
    VALUES (@v_entityid)

    SET @vcount = @vcount + 1

END 

INSERT INTO PLMEntityInstance (EntityId)
SELECT @v_entityid 
FROM Tally 
WHERE N <= @v_total_records

可以通过多种方式创建Tally,例如,2000万行计数:

WITH E1(N) AS (
                SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
            ),                          
E2(N) AS (SELECT 1 FROM E1 a, E1 b), 
E4(N) AS (SELECT 1 FROM E2 a, E2 b) --10E+4 or 10,000 rows max
SELECT TOP 20000000
   IDENTITY(INT,1,1) AS N
INTO dbo.Tally
FROM E4 a, E4 b; --100,000,000 rows max

ALTER TABLE dbo.Tally
    ADD CONSTRAINT PK_Tally_N 
        PRIMARY KEY CLUSTERED (N) WITH FILLFACTOR = 100;

修改
关于重构您的上一个WHILE基于设置的代码。我不确定我完全理解整个剧本的作用,但是评论时间太长了。首先看一下最后一个循环的原始代码:

@v_countrownum = 1
...
@v_max_ei = ...
while (@v_max_ei <= @v_total_ei AND @v_countrownum <=@v_rownum)
BEGIN
...
SET @v_max_ei = @v_max_ei + 1;
set @v_countrownum = @v_countrownum + 1;
END

让我们将它重构为迭代器查询:

...
@v_max_ei = ...
WITH iterator AS (
    SELECT v_max_ei = @v_max_ei + N - 1
        ,v_countrownum = N
    FROM tally
    WHERE @v_max_ei + N - 1 <= @v_total_ei AND N <= @v_rownum
)
SELECT * 
FROM iterator

接下来,您的循环在每次迭代时无条件地将行添加到PLMColumnValueITEM_STG,因此我们必须使用LEFT JOIN迭代器和源数据表并处理NULL。我看到你的代码默认缺少数据''。 此外,我还需要一些测试数据,这些数据是使用数据集中PLMColumn字段的子集生成的。

-- test data generator
DROP TABLE title_block3;
GO
SELECT row = N, id, DisplayOrder
    ,columnvalue = c.NAME + '_val_0' + cast(N AS VARCHAR(3))
INTO title_block3
FROM PLMColumn c
JOIN tally ON tally.N <= 3
WHERE c.NAME IN ('number','rev' ) OR c.NAME LIKE 'text%'

SELECT * FROM  title_block3;

所以最后用脚本替换你的最后一个WHILE。我使用任意变量值而不是脚本的第一部分计算。 @v_rownum = 3根据测试数据。

declare @temp_table2 varchar(100);
declare @v_sql nvarchar(max);
declare @v_max_ei int;
declare @v_entityid int;
declare @v_total_ei int;
declare @v_rownum int;

set @temp_table2 = 'title_block3';  -- see generator

set @v_max_ei = 120000;
set @v_total_ei = 200000;
set @v_rownum = 3;
set @v_entityid = 14;

--Start: Following code takes days to complete

set @v_sql = 
'WITH iterator AS (
    SELECT v_max_ei = '+ CAST(@v_max_ei AS varchar(5000)) + ' + N - 1
        ,v_countrownum = N
    FROM tally
    WHERE '+ CAST(@v_max_ei AS varchar(5000)) + ' + N - 1 <= '+ CAST(@v_total_ei AS varchar(5000)) + 
    ' AND N <= '+ CAST(@v_rownum AS varchar(5000)) + '
)
INSERT INTO PLMColumnValue(EntityInstanceId, ColumnID, Value)
SELECT i.v_max_ei, isnull(t.id,''''), isnull(t.columnvalue,'''') 
FROM iterator i
LEFT JOIN ' + @temp_table2 + ' t ON t.row = i.v_countrownum
ORDER BY i.v_countrownum, t.DisplayOrder ';

print @v_sql;
exec (@v_sql);

WITH iterator AS (
    SELECT v_max_ei = @v_max_ei + N - 1
        ,v_countrownum = N
    FROM tally
    WHERE @v_max_ei + N - 1 <= @v_total_ei AND N <= @v_rownum
)
INSERT INTO ITEM_STG (INSTANCE_ID, ITEM_NUMBER, REV)
SELECT i.v_max_ei
    -- unpivot 
    , isnull(max(CASE c.Name WHEN 'number' THEN Value END),'')
    , isnull(max(CASE c.Name WHEN 'rev' THEN Value END),'')
FROM iterator i
LEFT JOIN PLMColumnValue cv ON cv.EntityInstanceId = i.v_max_ei
LEFT JOIN PLMColumn c ON cv.columnid = c.id 
    AND c.EntityId = @v_entityid 
    AND c.Name IN ( 'number' , 'rev')
GROUP BY i.v_max_ei;

-- check it

SELECT * FROM ITEM_STG;
SELECT * FROM PLMColumnValue;