SQL - 确定列中最常出现的单词

时间:2011-12-01 09:37:08

标签: sql-server tsql vba frequency-analysis

有没有一种简单的方法可以使用T-SQL或VBA确定列/字段中最常出现的单词?

我正在为两个给定的记录集开发模糊匹配系统,并希望生成一个匹配的字符串,其中删除最常出现的单词。由于数据来自客户关系管理数据库,因此将删除“limited”,“ltd”,“plc”和“CORPORATION”等术语。

2 个答案:

答案 0 :(得分:4)

适用于 SQL Server 2005 及更高版本。

用于拆分字符串的函数:

-- f_split: splits @a on every occurrence of @delimiter and returns the
-- non-empty pieces, one row per piece, in column substr.
--   @a          string to split; NULL or empty input yields no rows
--   @delimiter  separator text (up to 20 characters); may be, or end with,
--               a space; NULL or empty delimiter yields no rows
-- Returns: @t(substr varchar(200)).
create function f_split
(
  @a varchar(max),
  @delimiter varchar(20)
)
RETURNS @t TABLE(substr varchar(200))
as
begin
  -- len() ignores trailing spaces (len(' ') = 0), so the delimiter is
  -- measured with a sentinel character appended.
  declare @dlen bigint
  set @dlen = len(@delimiter + 'x') - 1

  -- terminate the last token with a delimiter so every token is followed by one
  set @a = @a + @delimiter

  -- f1 = start position of a token, f2 = position of the delimiter ending it
  ;with a as
  (
    select cast(1 as bigint) f1,
           cast(charindex(@delimiter, @a) as bigint) f2
    where len(@a) > 0
    union all
    -- the next token starts right after the delimiter that was just found
    select cast(f2 + @dlen as bigint),
           cast(charindex(@delimiter, @a, f2 + @dlen) as bigint)
    from a
    where f2 > 0
  )
  insert @t
  select substring(@a, f1, f2 - f1) from a
  where f1 < f2            -- drops empty tokens and the final row (f2 = 0)
  -- one recursion level is used per token; lift the default limit of 100.
  -- The recursion always ends because f2 strictly increases until it is 0.
  option (maxrecursion 0)
  return
end
go

查询:

--testdata: company names whose words are to be counted
declare @table table(name varchar(50))

insert @table (name) values('bla bla bla ltd')
insert @table (name) values('bla plc ltd')
insert @table (name) values('more text CORPORATION')

-- words listed here are counted under their replacement instead
declare @matchlist table(name varchar(50), replacement varchar(50))
insert @matchlist (name, replacement) values('ltd', 'limited')
insert @matchlist (name, replacement) values('plc', 'limited')
insert @matchlist (name, replacement) values('CORPORATION', 'limited')

--query: split every name on spaces, map listed words to their replacement,
--and count occurrences per resulting word, most frequent first
select
  coalesce(m.replacement, a.substr) as name,
  count(*) as count
from @table p
cross apply
(
  select substr
  from dbo.f_split(p.name, ' ')
) a
left join @matchlist m
  on a.substr = m.name
group by coalesce(m.replacement, a.substr)
-- sort by the expressions rather than by column position; the second key
-- makes the order of equally frequent words deterministic
order by count(*) desc, coalesce(m.replacement, a.substr)

结果:

name  count
----  -----
bla       4
limited   4
more      1
text      1

答案 1 :(得分:0)

希望这对你有用。

   -- Sample table: one free-text sentence per row, used as the word source.
   create table sometable
    ( id integer not null primary key identity
    -- varchar(max) instead of the deprecated text type, so ordinary string
    -- functions and comparisons work on the column
    , mYWords varchar(max) not null
    );
    insert into sometable (mYWords) 
    values ('a word that appears maximum number of times in a column');
    insert into sometable (mYWords) 
    values ('Is it possible to get words from text columns in a sql server database');
    insert into sometable (mYWords) 
    values ('This could solve my problem if reffered column contain only single word');
    insert into sometable (mYWords) 
    values ('that''s going to require that you split out every word in the column individually');
    insert into sometable (mYWords) 
    values ('the query will definitely not be easy to write');
    insert into sometable (mYWords) 
    values ('Please read the sticky at the top of the board');
    insert into sometable (mYWords) 
    values ('The physical order of data in a database has no meaning');

GO

-- WordList: target table for the extracted words.
--   Word    the word text
--   WordId  identity value, starting at 1 and increasing by 1
--   Add_Dt  defaults to the current date and time when the row is inserted
CREATE TABLE WordList
(
    Word   varchar(256),
    WordId int IDENTITY(1,1),
    Add_Dt datetime DEFAULT (GETDATE())
)
GO

-- Each word may be stored only once.
CREATE UNIQUE INDEX UnqueWords_PK
    ON WordList (Word)
GO

-- isp_INS_WORD_LIST: reads every row of sometable.mYWords, splits the text
-- on single spaces and inserts each word that is not yet present into
-- WordList(Word).
-- Takes no parameters. Returns 0 (the original's per-row counter was reset
-- after every fetch, so its return value was always 0 as well).
CREATE PROC isp_INS_WORD_LIST
AS
BEGIN
    SET NOCOUNT ON

    DECLARE @str   varchar(max)   -- text of the current row
          , @word  varchar(256)   -- current word; same width as WordList.Word
          , @start bigint         -- position where the current word begins
          , @x     bigint         -- position of the next space, 0 when none is left

    -- LOCAL keeps the cursor scoped to this procedure, so it is released
    -- even if the procedure ends early.
    DECLARE Cur1 CURSOR LOCAL FAST_FORWARD FOR
        SELECT CAST(mYWords AS varchar(max)) FROM sometable

    OPEN Cur1
    FETCH NEXT FROM Cur1 INTO @str

    WHILE @@FETCH_STATUS = 0
      BEGIN
        -- restart the scan for every row
        SELECT @start = 1, @x = -1

        WHILE (@x <> 0)
            BEGIN
                SET @x = CHARINDEX(' ', @str, @start)

                IF @x <> 0
                    -- word ends just before the space that was found
                    SET @word = SUBSTRING(@str, @start, @x - @start)
                ELSE
                    -- no space left: the word is the rest of the string.
                    -- LEN(@str) is only an upper bound for the length, so
                    -- the length argument can never be negative.
                    SET @word = SUBSTRING(@str, @start, LEN(@str))

                -- consecutive or trailing spaces produce empty pieces: skip them
                IF LEN(@word) > 0
                   AND NOT EXISTS (SELECT * FROM WordList WHERE Word = @word)
                    INSERT INTO WordList(Word) SELECT @word

                SET @start = @x + 1
            END

        FETCH NEXT FROM Cur1 INTO @str
      END

    CLOSE Cur1
    DEALLOCATE Cur1
    SET NOCOUNT OFF
    RETURN 0
END
GO

-- Fill WordList from sometable.
EXEC isp_INS_WORD_LIST
GO

-- Show the collected words alphabetically; columns are listed explicitly
-- (same order as the table definition) instead of SELECT *.
SELECT Word, WordId, Add_Dt
FROM WordList
ORDER BY Word
GO

-- Remove the demo objects again.
DROP PROC isp_INS_WORD_LIST
DROP TABLE WordList, sometable
GO