为了从大型数据集中检索社区的方法,我发现了一篇关于算法的文章,该文章似乎适用于大型数据集。无论如何,数据存储了两个表:用户(节点)和连接,我想在没有自定义应用程序帮助的情况下通过纯SQL查询检索社区(我正在使用SQL Server 2008)。
检索cliques的算法如下:
Read the graph G
Generate set neighbors(v) for every vertex of G
for each vertex v of G
call recursive_find_cliques(v, neighbors(v))
end for
Function recursive_find_cliques(x, n)
for each vertex t ∈ n by ascending order calculate set sigma
if sigma is not empty
extend x with t
call recursive_find_cliques(x, sigma)
end if
end for
其中sigma是可以用v及其邻居构成三角形的顶点集。
我已经创建了一个存储过程,它返回所选节点的邻居表,但到目前为止我还没有使用sql函数和高级查询进行解析,所以问题如下:
有谁知道如何改写 以上算法在sql中获取 集团的集合?作为问题 我可能会有点抽象 指出主要问题是 创建一个递归函数 (recursive_find_cliques(x,n))其中 将表(n)作为参数)。
谢谢!
编辑:
这是迄今为止创建的存储过程:
CREATE PROCEDURE [dbo].[Peamc_Test]
AS
BEGIN
SET XACT_ABORT ON
BEGIN TRAN
SET NOCOUNT ON;
CREATE TABLE #Users
(
UserId int NOT NULL,
userLabel varchar(50) PRIMARY KEY NOT NULL,
Observed bit NOT NULL
)
CREATE TABLE #Neighbors
(
UserId int NOT NULL,
userLabel varchar(50) NOT NULL PRIMARY KEY,
Retrieved bit NOT NULL
)
CREATE TABLE #ConnectedVertices
(
UserId int NOT NULL,
userLabel varchar(50) NOT NULL PRIMARY KEY,
)
CREATE TABLE #Cliques
(
CliqueId int NOT NULL,
UserId varchar(50) NOT NULL,
)
DECLARE @UsersCount int
DECLARE @ii int
DECLARE @User varchar(50)
DECLARE @NeighborsCount int
INSERT INTO #Users(UserId, userLabel, Observed) SELECT user_id, userLabel, 0 FROM dbo.test_users WHERE user_id IS NOT NULL
SELECT @UsersCount = COUNT(*) FROM #Users
SELECT @ii = 1
WHILE @ii <= @UsersCount
BEGIN
--select user
SELECT TOP 1 @User = userLabel FROM #Users WHERE Observed = 0 ORDER BY UserId
UPDATE #Users SET Observed = 1 WHERE userLabel = @User
--Get user's neighbors
DELETE FROM #Neighbors
INSERT INTO #Neighbors(UserId, userLabel, Retrieved)
SELECT u.user_id, t2.neighbor, 0 FROM ( SELECT CALLING_NEIGHBORS.neighbor FROM ( SELECT mc.calling_party AS neighbor FROM monthly_connections_test mc WHERE mc.called_party = @User) AS CALLING_NEIGHBORS INNER JOIN (SELECT mc.called_party AS neighbor FROM monthly_connections_test mc WHERE mc.calling_party = @User) AS CALLED_NEIGHBORS ON CALLING_NEIGHBORS.neighbor = CALLED_NEIGHBORS.neighbor) AS t2 INNER JOIN test_users u ON t2.neighbor = u.userLabel
SELECT @NeighborsCount = COUNT(*) FROM #Neighbors
SELECT @ii = @ii + 1
--HERE the function recursive_find_cliques has to search for cliques and insert the found ones in #cliques
END
SELECT * FROM #Cliques
END
它没有返回任何东西,因为它没有完成。它虽然检索当前所选节点的所有邻居,但下一步是实现recursive_find_cliques函数。
答案 0 :(得分:1)
我意识到我的第一个答案只适用于每个集团至少有一个用户没有被该集团中任何其他人引用的情况。换句话说,将找不到像A-B,B-C,C-A这样的封闭派系。
这是解决此问题的解决方案。我们再次拥有ID的用户,现在是1..20。有几种相邻关系需要处理:
与简单案例相比,为每个集团找到一个独特的启动器更加困难。 我们用一点点手法实现了这个目标:
对邻居进行重新排序,以便对于所有参考A-B,A小于B,忽略任何A = B.
如果存在任何可能导致循环的X-A,请删除所有A-X引用。这永远不会完全删除对A的引用,因为X-A仍然存在并且A-X将在递归中添加。
结果集是'起始'用户,我们用它们来填充CTE:
-- Get all pairs, where UserA < UserB, dropping any A=B or B=A
WITH LRNeighbours(A, B) AS (
SELECT
Neighbours.UserA, Neighbours.UserB
FROM
Neighbours
WHERE
Neighbours.UserA < Neighbours.UserB
UNION ALL
SELECT DISTINCT
Neighbours.UserB, Neighbours.UserA
FROM
Neighbours
WHERE
Neighbours.UserA > Neighbours.UserB
),
-- Isolate those that are not referred to by a higher numbered key
Starters(userid) AS (
SELECT DISTINCT
A
FROM
LRNeighbours
WHERE
A NOT IN (
SELECT
B
FROM
LRNeighbours
)
),
-- The recursive Common Table Expression
cliques(userid, clique) AS (
-- Number starters 1..N
SELECT
userid, ROW_NUMBER() OVER(ORDER BY userid) AS clique
FROM
Starters
UNION ALL
-- Recurse, adding users referred by siblings, avoiding starters themselves
SELECT
B, clique
FROM
LRNeighbours INNER JOIN
cliques ON
LRNeighbours.A = cliques.userid
AND B NOT IN (
SELECT
userid
FROM
starters
)
)
SELECT DISTINCT
clique, userid
FROM
cliques
ORDER BY
clique, userid
结果:
1 1
1 2
2 3
2 4
3 5
3 6
3 7
3 8
4 9
4 10
4 11
4 12
4 13
5 14
5 15
5 16
5 17
5 18
5 19
5 20
答案 1 :(得分:0)
CREATE TABLE [dbo].[Users](
[UserID] [int] IDENTITY(1,1) NOT NULL,
[UserName] [varchar](50) NOT NULL
) ON [PRIMARY]
CREATE TABLE [dbo].[Neighbours](
[UserA] [int] NOT NULL,
[UserB] [int] NOT NULL
) ON [PRIMARY]
用1..8和邻居
填充的用户UserA UserB
1 2
2 3
4 5
4 6
5 7
7 8
然后:
WITH cliques(userid, clique) AS (
SELECT
userid, ROW_NUMBER() OVER(ORDER BY userid) AS clique
FROM
Users
WHERE
users.UserID NOT IN (
SELECT
UserB
FROM
Neighbours
)
UNION ALL
SELECT
Neighbours.UserB, clique
FROM
neighbours
INNER JOIN cliques
ON Neighbours.UserA = cliques.userid
)
SELECT
clique, cliques.userid
FROM
cliques
ORDER BY
clique, userid
结果:
clique userid
1 1
1 2
1 3
2 4
2 5
2 6
2 7
2 8
答案 2 :(得分:0)
我添加了两个LABELS和两个GOTO语句
CREATE PROCEDURE [dbo].[Peamc_Test]
AS
BEGIN
SET XACT_ABORT ON
BEGIN TRAN
SET NOCOUNT ON;
CREATE TABLE #Users
(
UserId int NOT NULL,
userLabel varchar(50) PRIMARY KEY NOT NULL,
Observed bit NOT NULL
)
CREATE TABLE #Neighbors
(
UserId int NOT NULL,
userLabel varchar(50) NOT NULL PRIMARY KEY,
Retrieved bit NOT NULL
)
CREATE TABLE #ConnectedVertices
(
UserId int NOT NULL,
userLabel varchar(50) NOT NULL PRIMARY KEY,
)
CREATE TABLE #Cliques
(
CliqueId int NOT NULL,
UserId varchar(50) NOT NULL,
)
DECLARE @UsersCount int
DECLARE @ii int
DECLARE @User varchar(50)
DECLARE @NeighborsCount int
INSERT INTO #Users(UserId, userLabel, Observed) SELECT user_id, userLabel, 0 FROM dbo.test_users WHERE user_id IS NOT NULL
SELECT @UsersCount = COUNT(*) FROM #Users
SELECT @ii = 1
WHILE @ii <= @UsersCount
BEGIN
--select user
SELECT TOP 1 @User = userLabel FROM #Users WHERE Observed = 0 ORDER BY UserId
UPDATE #Users SET Observed = 1 WHERE userLabel = @User
--Get user's neighbors
DELETE FROM #Neighbors
INSERT INTO #Neighbors(UserId, userLabel, Retrieved)
SELECT u.user_id, t2.neighbor, 0 FROM ( SELECT CALLING_NEIGHBORS.neighbor FROM ( SELECT mc.calling_party AS neighbor FROM monthly_connections_test mc WHERE mc.called_party = @User) AS CALLING_NEIGHBORS INNER JOIN (SELECT mc.called_party AS neighbor FROM monthly_connections_test mc WHERE mc.calling_party = @User) AS CALLED_NEIGHBORS ON CALLING_NEIGHBORS.neighbor = CALLED_NEIGHBORS.neighbor) AS t2 INNER JOIN test_users u ON t2.neighbor = u.userLabel
SELECT @NeighborsCount = COUNT(*) FROM #Neighbors
SELECT @ii = @ii + 1
GOTO Clique_Find
--HERE the function recursive_find_cliques has to search for cliques and insert the found ones in #cliques
--------------------
Clique_Return:
--------------------
END
SELECT * FROM #Cliques
END
--------------------
Clique_Find:
--------------------
-- Code goes here
-- Code goes here
-- Code goes here
-- Code goes here
-- Code goes here
-- Code goes here
GOTO Clique_Return