我有一个未规范化的数据库:
disciplinabd.movies:
-- Denormalized source table. Every column is nullable text, including the
-- movieid / actorid identifiers, which are stored as VARCHAR rather than INT.
CREATE TABLE dbo.movies (
    movieid      VARCHAR(20)   NULL,
    title        VARCHAR(400)  NULL,
    mvyear       VARCHAR(100)  NULL,
    actorid      VARCHAR(20)   NULL,
    actorname    VARCHAR(250)  NULL,
    sex          CHAR(1)       NULL,
    as_character VARCHAR(1500) NULL,
    languages    VARCHAR(1500) NULL,
    genres       VARCHAR(100)  NULL
);
我自己的数据库是 labbd11,我要把 disciplinabd 中的数据规范化后导入其中。因此我尝试执行以下查询:
-- Populate labbd11..movie_actor from the denormalized disciplinabd..movies table.
-- movieid / actorid are VARCHAR in the source. Appending '.0e0' before ISNUMERIC
-- is intended to accept only plain integer strings, so anything else becomes
-- NULL instead of making the CAST fail.
-- NOTE(review): an all-digit value too large for INT would still pass the
-- ISNUMERIC check and fail in the CAST - confirm the data cannot contain one.
INSERT INTO labbd11..movie_actor (idMovie, idActor, idCharacter)
SELECT
    CASE
        WHEN ISNUMERIC(movies.movieid + '.0e0') <> 1 THEN NULL
        ELSE CAST(movies.movieid AS INT)
    END,
    CASE
        WHEN ISNUMERIC(movies.actorid + '.0e0') <> 1 THEN NULL
        ELSE CAST(movies.actorid AS INT)
    END,
    -- Surrogate key of the character, looked up by its text. actor_character is
    -- unqualified, so it resolves in the database the statement is run from.
    (SELECT id FROM actor_character WHERE character = movies.as_character)
FROM disciplinabd..movies
它可以正常执行,但我需要处理的数据量非常大,例如 disciplinabd.movies 中有 1400 万行。
我的问题是:有没有类似下面这样的写法?
insert (1, 1000) ...
这样我只需把取值改成 insert (1001, 2000) ...
之类,然后继续执行。
我的意思是:有没有办法把数据一点一点地插入我的数据库?
这样,即使连接断开,也不必回滚全部操作。
昨天这个插入查询运行了 16 个小时,然后连接断了,我之前的工作全部丢失了。
更新:
-- Normalized movie table: one row per movie, keyed by id.
CREATE TABLE movie (
    id    INT          PRIMARY KEY,
    title VARCHAR(400) NOT NULL,
    year  INT
);
-- Normalized actor table: one row per actor, keyed by id.
CREATE TABLE actor (
    id   INT          PRIMARY KEY,
    name VARCHAR(250) NOT NULL,
    sex  CHAR(1)      NOT NULL
);
-- Lookup table of character names; id is generated by the IDENTITY property.
CREATE TABLE actor_character (
    id        INT           IDENTITY PRIMARY KEY,
    character VARCHAR(1000)
);
-- Junction table linking a movie, an actor and the character played.
-- All three columns form the primary key and each one references its parent
-- table with cascading deletes and updates.
CREATE TABLE movie_actor (
    idMovie     INT,
    idActor     INT,
    idCharacter INT,

    CONSTRAINT fk_movie_actor_1
        FOREIGN KEY (idMovie) REFERENCES movie (id)
        ON DELETE CASCADE
        ON UPDATE CASCADE,

    CONSTRAINT fk_movie_actor_2
        FOREIGN KEY (idActor) REFERENCES actor (id)
        ON DELETE CASCADE
        ON UPDATE CASCADE,

    CONSTRAINT fk_movie_actor_3
        FOREIGN KEY (idCharacter) REFERENCES actor_character (id)
        ON DELETE CASCADE
        ON UPDATE CASCADE,

    CONSTRAINT pk_movie_actor
        PRIMARY KEY (idMovie, idActor, idCharacter)
);
答案 0 :(得分:1)
您没有说明所使用的 RDBMS(知道这一点有助于我们更准确地回答您的问题)。不过针对您的第二个问题,您可以通过限制 SELECT 查询的范围来控制每次插入的数据量。例如:
-- Insert one slice of disciplinabd..movies (movie ids 1000 through 1999) into
-- labbd11..movie_actor, so the load can be run range by range.
INSERT INTO labbd11..movie_actor (idMovie, idActor, idCharacter)
SELECT
    CASE
        WHEN ISNUMERIC(movies.movieid + '.0e0') <> 1 THEN NULL
        ELSE CAST(movies.movieid AS INT)
    END,
    CASE
        WHEN ISNUMERIC(movies.actorid + '.0e0') <> 1 THEN NULL
        ELSE CAST(movies.actorid AS INT)
    END,
    -- Surrogate key of the character, looked up by its text.
    (SELECT id FROM actor_character WHERE character = movies.as_character)
FROM disciplinabd..movies
-- movieid is VARCHAR, so comparing it directly with integer literals would
-- implicitly convert every row to INT and fail on non-numeric values.
-- Filtering on the same guarded CASE keeps the conversion safe; rows whose
-- movieid is not an integer yield NULL here and fall outside every range.
-- For integers, BETWEEN 1000 AND 1999 is the same as >= 1000 AND < 2000.
WHERE CASE
          WHEN ISNUMERIC(movies.movieid + '.0e0') = 1
              THEN CAST(movies.movieid AS INT)
      END BETWEEN 1000 AND 1999
如果没有连续的 ID 范围,也可以自行生成一个,但具体做法取决于您使用的数据库。
至于最初关于如何提高性能的问题,我会先把子查询改写为 JOIN,并确保 actor_character 上有合适的索引。例如:
-- Same ranged load as a join: the character id comes from a LEFT JOIN on
-- actor_character instead of a correlated scalar subquery per row.
INSERT INTO labbd11..movie_actor (idMovie, idActor, idCharacter)
SELECT
    CASE
        WHEN ISNUMERIC(movies.movieid + '.0e0') <> 1 THEN NULL
        ELSE CAST(movies.movieid AS INT)
    END,
    CASE
        WHEN ISNUMERIC(movies.actorid + '.0e0') <> 1 THEN NULL
        ELSE CAST(movies.actorid AS INT)
    END,
    ac.id
FROM disciplinabd..movies AS movies
-- actor_character is the lookup table referenced by movie_actor's foreign key,
-- so it is read from the target database labbd11.
-- NOTE(review): LEFT JOIN yields a NULL id for characters with no match, and
-- idCharacter is part of movie_actor's primary key - confirm every
-- as_character value exists in actor_character before loading.
LEFT JOIN labbd11..actor_character AS ac
    ON movies.as_character = ac.character
-- movieid is VARCHAR; filter on the guarded integer value so non-numeric ids
-- cannot trigger a conversion error. Equivalent to >= 1000 AND < 2000.
WHERE CASE
          WHEN ISNUMERIC(movies.movieid + '.0e0') = 1
              THEN CAST(movies.movieid AS INT)
      END BETWEEN 1000 AND 1999
同样,如果您能明确说明使用的是哪种数据库,我们可以给出更有针对性的答案。按照上面这种写法,我预计 1400 万行在服务器级硬件上执行不会超过几分钟。
答案 1 :(得分:0)
插入 1400 万行花 16 个小时似乎太久了。我不知道您的硬件情况,所以只回答眼前的问题。对于 1400 万行数据,如果每 1000 行就打开一次连接会慢得多,所以我建议使用一个可随数据量变化的批次大小。
如果可以,我还建议为movieid添加索引。
-- Nonclustered index named IX_movies on the movieid column of movies.
CREATE NONCLUSTERED INDEX IX_movies
    ON movies (movieid)
您可以使用while循环来完成您要找的内容。
-- Load labbd11..movie_actor from disciplinabd..movies in about 20 id ranges.
-- Each INSERT is a separate statement, so when no explicit transaction is open
-- a dropped connection should only lose the range currently being inserted.
-- BIGINT bounds keep @topRange + @rangeSize from overflowing near the INT maximum.
DECLARE @loopMax BIGINT, @bottomRange BIGINT, @topRange BIGINT, @rangeSize BIGINT

-- movieid is VARCHAR: MIN/MAX on the raw column would compare strings
-- lexicographically, so aggregate over the guarded integer value instead.
SELECT
    @bottomRange = MIN(ids.movie_id),
    @loopMax     = MAX(ids.movie_id)
FROM (
    SELECT CASE
               WHEN ISNUMERIC(movieid + '.0e0') = 1 THEN CAST(movieid AS INT)
           END AS movie_id
    FROM disciplinabd..movies
) AS ids

-- "+ 1" keeps the step at least 1 (no endless loop on a small id span) and
-- makes 20 steps cover the whole [min, max] span.
SET @rangeSize = (@loopMax - @bottomRange) / 20 + 1
SET @topRange  = @bottomRange + @rangeSize

-- Loop while the lower bound has not passed the maximum, so the final range
-- that contains @loopMax is processed too. With an empty source both bounds
-- are NULL and the loop body never runs.
WHILE @bottomRange <= @loopMax
BEGIN
    INSERT INTO labbd11..movie_actor (idMovie, idActor, idCharacter)
    SELECT
        CASE
            WHEN ISNUMERIC(movies.movieid + '.0e0') <> 1 THEN NULL
            ELSE CAST(movies.movieid AS INT)
        END,
        CASE
            WHEN ISNUMERIC(movies.actorid + '.0e0') <> 1 THEN NULL
            ELSE CAST(movies.actorid AS INT)
        END,
        ac.id
    FROM disciplinabd..movies AS movies
    LEFT JOIN labbd11..actor_character AS ac
        ON movies.as_character = ac.character
    -- Half-open range [@bottomRange, @topRange) on the guarded integer id;
    -- rows whose movieid is not an integer yield NULL and match no range.
    WHERE CASE
              WHEN ISNUMERIC(movies.movieid + '.0e0') = 1
                  THEN CAST(movies.movieid AS INT)
          END BETWEEN @bottomRange AND @topRange - 1

    SET @bottomRange = @topRange
    SET @topRange    = @topRange + @rangeSize
END