我有两个具有以下记录的表:
表1:100行
表2:5000万行
示例:
表1:tb100
-- Table 1 of the example: a single nullable name column.
CREATE TABLE tb100 (
    name VARCHAR(50)
);
-- Sample rows only; the full data set holds about 100 names.
INSERT INTO tb100 (name)
VALUES ('Mak John'), ('Will Smith'), ('Luke W');
表2:tb50mil
-- Table 2 of the example: a single nullable name column.
CREATE TABLE tb50mil (
    name VARCHAR(50)
);
-- Sample rows only; the full data set holds about 50 million names.
-- tb50mil is the table this example creates and later queries.
INSERT INTO tb50mil (name)
VALUES ('John A Mak'), ('K Smith Will'), ('James Henry');

-- Nonclustered index on the only column of tb50mil.
CREATE NONCLUSTERED INDEX nci_tb50mil_name ON tb50mil (name);
要求:我想匹配两个表之间的名称——只要一个表中某个名称的任意一个单词(WORD,例如 John、Smith、Will)出现在另一个表的名称中,即视为匹配。例如,John 出现在 John A Mak 中。
我的尝试:先用 XML 拆分表 tb100 的 name 列,然后使用带 collation 的 CHARINDEX 进行匹配。
-- Returns every distinct tb50mil.name that contains, as a whole
-- space-delimited word, at least one word taken from tb100.name.
-- The comparison is binary (case- and accent-sensitive).
;WITH splitdata AS
(
    -- Tokenise tb100.name on single spaces by rewriting it as
    -- <X>word</X><X>word</X> and shredding the XML into one row per word.
    SELECT O.splitname
    FROM
    (
        -- FOR XML PATH('') entitizes &, < and > so that names containing
        -- those characters do not make the CAST to XML fail.
        SELECT CAST('<X>'
                    + REPLACE((SELECT t.name AS [*] FOR XML PATH('')), ' ', '</X><X>')
                    + '</X>' AS XML) AS xmlfilter
        FROM tb100 AS t
    ) AS F1
    CROSS APPLY
    (
        SELECT fdata.d.value('.', 'varchar(50)') AS splitname
        FROM F1.xmlfilter.nodes('X') AS fdata(d)
    ) AS O
    -- Consecutive, leading or trailing spaces produce empty tokens; drop them.
    WHERE O.splitname <> ''
)
SELECT DISTINCT t2.name AS [Aadhar Names]
FROM tb50mil AS t2
-- EXISTS stops at the first matching word per row, so a name that matches
-- several words is not fanned out and then collapsed again.
WHERE EXISTS
(
    SELECT 1
    FROM splitdata AS S
    -- Padding both sides with a space makes this a whole-word test:
    -- 'Will' matches 'K Smith Will' but not 'William Smith'.
    WHERE CHARINDEX((' ' + S.splitname + ' ') COLLATE Latin1_General_BIN,
                    (' ' + t2.name + ' ') COLLATE Latin1_General_BIN) > 0
);
执行时间:00:01:34
受影响的行数:(受影响的2251429行)
执行计划:
答案 0 :(得分:0)
如果您需要处理名称中的各个单词,那么从概念上讲,把整个名字只存成一个字符串的表结构可能并不合适。另外,现在再去拆分名称也比较麻烦,因为中间名的出现没有固定规律;再加上字符串处理本来就不是 SQL 的强项。因此,我会把您的表扩展成类似下面的结构:
-- Extend tb100 with an identity key and one nullable column per name part.
ALTER TABLE tb100
ADD
    nameID      INT IDENTITY(1,1) NOT NULL,
    first_name  VARCHAR(50) NULL,
    middle_name VARCHAR(50) NULL,
    last_name   VARCHAR(50) NULL;
-- Sample rows only; the full data set holds about 100 names.
-- The column list is required: tb100 now has more columns than just name.
INSERT INTO tb100 (name)
VALUES ('Mak John'), ('Will Smith'), ('Luke W');

-- Split name into first / middle / last parts in a single set-based pass.
--   one word       -> first_name only
--   two words      -> first_name + last_name
--   three or more  -> first word, everything in between, last word
-- Absent parts are stored as NULL (not '') so that equality comparisons
-- between two absent parts never evaluate to true.
UPDATE t
SET
    first_name  = CASE
                      WHEN p.first_sp = 0 THEN c.n
                      ELSE LEFT(c.n, p.first_sp - 1)
                  END,
    middle_name = CASE
                      WHEN p.first_sp > 0 AND p.last_sp > p.first_sp
                          THEN NULLIF(LTRIM(RTRIM(
                                   SUBSTRING(c.n, p.first_sp + 1, p.last_sp - p.first_sp - 1)
                               )), '')
                      ELSE NULL
                  END,
    last_name   = CASE
                      WHEN p.first_sp > 0
                          THEN SUBSTRING(c.n, p.last_sp + 1, LEN(c.n) - p.last_sp)
                      ELSE NULL
                  END
FROM tb100 AS t
-- Trim first so that leading/trailing spaces cannot be mistaken for separators
-- (LEN ignores trailing spaces, which would otherwise skew last_sp).
CROSS APPLY (SELECT LTRIM(RTRIM(t.name)) AS n) AS c
CROSS APPLY
(
    SELECT
        CHARINDEX(' ', c.n)                         AS first_sp,  -- 0 when the name is a single word
        LEN(c.n) - CHARINDEX(' ', REVERSE(c.n)) + 1 AS last_sp    -- position of the last space
) AS p
-- Skips NULL and blank names; their part columns stay NULL.
WHERE c.n <> '';
我现在在外面,没有机会测试这段代码,希望它能正常工作。如果您能在插入数据时直接填充这些列、而无需事后整体修改,那就更好了。然后,对另一张表执行相同的操作:
-- Extend tb50mil with an identity key and one nullable column per name part.
ALTER TABLE tb50mil
ADD
    nameID      INT IDENTITY(1,1) NOT NULL,
    first_name  VARCHAR(50) NULL,
    middle_name VARCHAR(50) NULL,
    last_name   VARCHAR(50) NULL;
-- Sample rows only; the full data set holds about 50 million names.
-- The column list is required: tb50mil now has more columns than just name.
INSERT INTO tb50mil (name)
VALUES ('John A Mak'), ('K Smith Will'), ('James Henry');

-- Split name into first / middle / last parts in a single set-based pass.
--   one word       -> first_name only
--   two words      -> first_name + last_name
--   three or more  -> first word, everything in between, last word
-- Absent parts are stored as NULL (not '') so that equality comparisons
-- between two absent parts never evaluate to true.
UPDATE t
SET
    first_name  = CASE
                      WHEN p.first_sp = 0 THEN c.n
                      ELSE LEFT(c.n, p.first_sp - 1)
                  END,
    middle_name = CASE
                      WHEN p.first_sp > 0 AND p.last_sp > p.first_sp
                          THEN NULLIF(LTRIM(RTRIM(
                                   SUBSTRING(c.n, p.first_sp + 1, p.last_sp - p.first_sp - 1)
                               )), '')
                      ELSE NULL
                  END,
    last_name   = CASE
                      WHEN p.first_sp > 0
                          THEN SUBSTRING(c.n, p.last_sp + 1, LEN(c.n) - p.last_sp)
                      ELSE NULL
                  END
FROM tb50mil AS t
-- Trim first so that leading/trailing spaces cannot be mistaken for separators
-- (LEN ignores trailing spaces, which would otherwise skew last_sp).
CROSS APPLY (SELECT LTRIM(RTRIM(t.name)) AS n) AS c
CROSS APPLY
(
    SELECT
        CHARINDEX(' ', c.n)                         AS first_sp,  -- 0 when the name is a single word
        LEN(c.n) - CHARINDEX(' ', REVERSE(c.n)) + 1 AS last_sp    -- position of the last space
) AS p
-- Skips NULL and blank names; their part columns stay NULL.
WHERE c.n <> '';
从这里开始,这实际上是一个简单的连接:
-- Pairs each tb100 row with every tb50mil row that shares at least one name
-- part, regardless of position (so 'Mak John' matches 'John A Mak').
-- NULLIF(..., '') turns an empty part on the tb100 side into NULL, and a NULL
-- on either side of IN never compares equal, so absent parts cannot match.
SELECT
    hun.nameID AS tb100_name_id,
    hun.name   AS tb100_name,
    mil.nameID AS tb50mil_name_id,
    mil.name   AS tb50mil_name
FROM tb100 AS hun
INNER JOIN tb50mil AS mil
    ON NULLIF(hun.first_name,  '') IN (mil.first_name, mil.middle_name, mil.last_name)
    OR NULLIF(hun.middle_name, '') IN (mil.first_name, mil.middle_name, mil.last_name)
    OR NULLIF(hun.last_name,   '') IN (mil.first_name, mil.middle_name, mil.last_name);
希望这会有所帮助!