有没有一种简单的方法可以使用T-SQL或VBA确定列/字段中最常出现的单词?
我正在为两个给定的记录集开发模糊匹配系统,并希望生成一个匹配的字符串,其中删除最常出现的单词。由于数据来自客户关系管理数据库,因此将删除“limited”,“ltd”,“plc”和“CORPORATION”等术语。
答案 0 :(得分:4)
适用于 SQL Server 2005 及更高版本。
先编写一个用于拆分字符串的函数:
-- f_split: splits @a on @delimiter and returns the pieces as rows.
--   @a          string to split
--   @delimiter  separator text (up to 20 characters, may be a single space)
-- Returns a one-column table (substr) with one row per non-empty piece;
-- empty pieces produced by adjacent delimiters are skipped.
-- A NULL or empty @a, or a NULL or empty @delimiter, yields no rows.
-- Pieces longer than 200 characters do not fit the declared result column.
create function f_split
(
@a varchar(max),
@delimiter varchar(20)
)
RETURNS @t TABLE(substr varchar(200))
as
begin
    -- Delimiter length in characters. len() alone ignores trailing blanks
    -- (len(' ') = 0), so measure with a sentinel character appended.
    declare @dlen int
    set @dlen = len(@delimiter + '.') - 1

    -- Append a closing delimiter so the last piece is terminated like the rest.
    set @a = @a + @delimiter

    -- f1 = start of the current piece, f2 = position of the delimiter ending it.
    ;with a as
    (
        select cast(1 as bigint) f1, charindex(@delimiter, @a) f2
        where len(@a) > 0
        union all
        -- The next piece starts right after the whole delimiter, and the
        -- search for the next delimiter resumes from that same position.
        select cast(f2 + @dlen as bigint), charindex(@delimiter, @a, f2 + @dlen)
        from a
        where f2 > 0
    )
    insert @t (substr)
    select substring(@a, f1, f2 - f1) from a
    where f1 < f2
    -- One recursion level per piece: lift the default limit of 100 levels.
    -- The recursion still ends, because f2 strictly increases until
    -- charindex finds no further delimiter and returns 0.
    option (maxrecursion 0)
    return
end
go
查询:
--testdata
-- Company names whose words are to be counted.
declare @table table(name varchar(50))
insert @table (name) values('bla bla bla ltd')
insert @table (name) values('bla plc ltd')
insert @table (name) values('more text CORPORATION')
-- Words to fold into one common replacement before counting.
declare @matchlist table(name varchar(50), replacement varchar(50))
insert @matchlist (name, replacement) values('ltd', 'limited')
insert @matchlist (name, replacement) values('plc', 'limited')
insert @matchlist (name, replacement) values('CORPORATION', 'limited')
--query
-- Split every name on spaces, map listed words to their replacement,
-- then count occurrences of each resulting word.
select
    coalesce(m.replacement, a.substr) as name,
    count(*) as [count]
from @table as p
cross apply dbo.f_split(p.name, ' ') as a
left join @matchlist as m
    on a.substr = m.name
group by coalesce(m.replacement, a.substr)
-- Most frequent first; ties are broken by the word so the order is repeatable.
order by count(*) desc, coalesce(m.replacement, a.substr)
结果:
name count
---- -----
bla 4
limited 4
more 1
text 1
答案 1 :(得分:0)
-- Sample source table: each row holds one free-text sentence whose words
-- are to be extracted.
CREATE TABLE sometable
( id      integer      NOT NULL PRIMARY KEY IDENTITY
, mYWords varchar(max) NOT NULL  -- varchar(max) instead of the deprecated text type
);
INSERT INTO sometable (mYWords)
VALUES ('a word that appears maximum number of times in a column');
INSERT INTO sometable (mYWords)
VALUES ('Is it possible to get words from text columns in a sql server database');
INSERT INTO sometable (mYWords)
VALUES ('This could solve my problem if reffered column contain only single word');
INSERT INTO sometable (mYWords)
VALUES ('that''s going to require that you split out every word in the column individually');
INSERT INTO sometable (mYWords)
VALUES ('the query will definitely not be easy to write');
INSERT INTO sometable (mYWords)
VALUES ('Please read the sticky at the top of the board');
INSERT INTO sometable (mYWords)
VALUES ('The physical order of data in a database has no meaning');
GO
-- Target table: one row per distinct word, with a generated id and the
-- time the row was added.
CREATE TABLE WordList
( Word   varchar(256)
, WordId int IDENTITY(1,1)
, Add_Dt datetime DEFAULT (GETDATE())
)
GO
-- Each word may be stored only once.
CREATE UNIQUE INDEX UnqueWords_PK
    ON WordList (Word)
GO
-- isp_INS_WORD_LIST: reads every mYWords value from sometable, splits it on
-- single spaces and inserts each word not yet present into WordList.
-- Takes no parameters and produces no result set.
-- Returns 0. The original kept a word counter but reset it after every
-- fetch, so it always returned 0; that value is kept so existing callers
-- see no change.
CREATE PROC isp_INS_WORD_LIST
AS
BEGIN
    SET NOCOUNT ON

    DECLARE @str   varchar(max)   -- text of the current row
          , @word  varchar(256)   -- current word (same width as WordList.Word)
          , @start int            -- position where the current word begins
          , @x     int            -- position of the next space, 0 when none is left
          , @tail  int            -- length of the text after the last space

    -- The cast lets the fetch work whether mYWords is text or varchar(max).
    DECLARE Cur1 CURSOR LOCAL FOR
        SELECT CAST(mYWords AS varchar(max)) FROM sometable

    OPEN Cur1
    FETCH NEXT FROM Cur1 INTO @str
    WHILE @@FETCH_STATUS = 0
    BEGIN
        -- Every row starts scanning from its own first character.
        SET @start = 1
        SET @x = CHARINDEX(' ', @str, @start)

        -- Words that are followed by a space.
        WHILE @x > 0
        BEGIN
            -- @x = @start means two adjacent spaces: nothing to store.
            IF @x > @start
            BEGIN
                SET @word = SUBSTRING(@str, @start, @x - @start)
                IF NOT EXISTS (SELECT * FROM WordList WHERE Word = @word)
                    INSERT INTO WordList (Word) SELECT @word
            END
            SET @start = @x + 1
            SET @x = CHARINDEX(' ', @str, @start)
        END

        -- Last word of the row, or the only word when the text has no space.
        -- LEN ignores trailing blanks, so @tail is <= 0 when the text ends
        -- with a space; skipping that case avoids a negative SUBSTRING length.
        SET @tail = LEN(@str) - @start + 1
        IF @tail > 0
        BEGIN
            SET @word = SUBSTRING(@str, @start, @tail)
            IF NOT EXISTS (SELECT * FROM WordList WHERE Word = @word)
                INSERT INTO WordList (Word) SELECT @word
        END

        FETCH NEXT FROM Cur1 INTO @str
    END
    CLOSE Cur1
    DEALLOCATE Cur1

    SET NOCOUNT OFF
    RETURN 0
END
GO
-- Fill WordList from sometable.
EXEC isp_INS_WORD_LIST
GO
-- Show the collected words alphabetically.
SELECT Word, WordId, Add_Dt
FROM WordList
ORDER BY Word
GO
-- Remove the demo objects again.
DROP PROC isp_INS_WORD_LIST
DROP TABLE WordList, sometable
GO