我认为这是一个图论问题:在两组点之间可以画出多少条连线……但我对图论并不熟悉。
例如
# Toy example: four cities, two in each state.
df <- data.frame(
  city = c("Boston", "Cambridge", "Long Island", "NYC"),
  state = rep(c("MA", "NY"), each = 2)
)
city state
1 Boston MA
2 Cambridge MA
3 Long Island NY
4 NYC NY
城市按州分配/分组。如何获得
Boston - Long Island
Boston - NYC
Cambridge - Long Island
Cambridge - NYC
换句话说,我想生成所有由分属不同州的两个城市组成的城市对。
更一般的例子:
# Fix the RNG seed so the randomly drawn group labels are reproducible.
set.seed(123)
# 100 values, each labelled with one of the 26 lowercase letters.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
df <- data.frame(
  value = 1:100,
  group = letters[sample(1:26, 100, replace = TRUE)]
)
> df
value group
1 1 e
2 2 m
3 3 g
4 4 o
5 5 p
6 6 a
7 7 i
8 8 o
9 9 i
10 10 h
11 11 p
12 12 h
... ... ...
我想要所有组合(value1,value2)或等价(index1,index2),其中value1和value2具有不同的组标签。
答案 0 :(得分:1)
虽然在 R 中不鼓励使用 for 循环,但可以用它得到所需的结果:
# Smaller example: 20 values, each randomly assigned to group "a", "b" or "c".
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
ddf <- data.frame(
  value = 1:20,
  group = letters[sample(1:3, 20, replace = TRUE)]
)
head(ddf)
value group
1 1 b
2 2 b
3 3 b
4 4 c
5 5 a
6 6 a
# For every row of ddf, print its value (column 1) followed by the values of
# all rows whose group (column 2) differs from this row's group.
# Iterating over seq_len(nrow(ddf)) instead of a hard-coded 1:20 keeps the
# loop correct for any number of rows, including zero.
for (i in seq_len(nrow(ddf))) {
  other_values <- ddf[ddf$group != ddf[i, 2], 1]
  cat(ddf[i, 1], ': ', other_values, '\n')
}
1 : 4 5 6 8 9 10 13 15 17 19 20
2 : 4 5 6 8 9 10 13 15 17 19 20
3 : 4 5 6 8 9 10 13 15 17 19 20
4 : 1 2 3 5 6 7 8 11 12 13 14 16 18 19
5 : 1 2 3 4 7 9 10 11 12 14 15 16 17 18 20
6 : 1 2 3 4 7 9 10 11 12 14 15 16 17 18 20
7 : 4 5 6 8 9 10 13 15 17 19 20
8 : 1 2 3 4 7 9 10 11 12 14 15 16 17 18 20
9 : 1 2 3 5 6 7 8 11 12 13 14 16 18 19
10 : 1 2 3 5 6 7 8 11 12 13 14 16 18 19
11 : 4 5 6 8 9 10 13 15 17 19 20
12 : 4 5 6 8 9 10 13 15 17 19 20
13 : 1 2 3 4 7 9 10 11 12 14 15 16 17 18 20
14 : 4 5 6 8 9 10 13 15 17 19 20
15 : 1 2 3 5 6 7 8 11 12 13 14 16 18 19
16 : 4 5 6 8 9 10 13 15 17 19 20
17 : 1 2 3 5 6 7 8 11 12 13 14 16 18 19
18 : 4 5 6 8 9 10 13 15 17 19 20
19 : 1 2 3 4 7 9 10 11 12 14 15 16 17 18 20
20 : 1 2 3 5 6 7 8 11 12 13 14 16 18 19
每一对都可以列出:
# Print one "first second" line for every ordered pair of values whose rows
# belong to different groups.
# seq_len()/seq_along() are used instead of 1:n so that an empty ddf, or a row
# with no partner in another group, prints nothing instead of iterating over
# c(1, 0).
for (i in seq_len(nrow(ddf))) {
  other_values <- ddf[ddf$group != ddf[i, 2], 1]
  for (j in seq_along(other_values)) {
    cat(ddf[i, 1], other_values[j], '\n')
  }
}
1 4
1 5
1 6
1 8
1 9
1 10
1 13
1 15
1 17
1 19
1 20
2 4
2 5
2 6
2 8
2 9
2 10
2 13
2 15
2 17
....
可以在另一个data.frame中轻松获得这些对。
创建另一个data.frame:
# Build a data.frame holding every ordered pair (first, second) of values
# (column 1 of ddf) whose rows carry different group labels (column 2).
# The pairs are found in one vectorised step instead of appending to outdf
# row by row, which copies the whole frame on every iteration.
group_labels <- as.character(ddf[[2]])
# is_cross[r, c] is TRUE when rows r and c belong to different groups.
is_cross <- outer(group_labels, group_labels, "!=")
# which() walks the matrix column by column, so taking the column index
# (2nd column of pair_idx) as `first` and the row index (1st column) as
# `second` yields pairs ordered by `first`, then by `second`.
pair_idx <- which(is_cross, arr.ind = TRUE)
outdf <- data.frame(
  first = ddf[[1]][pair_idx[, 2]],
  second = ddf[[1]][pair_idx[, 1]]
)
head(outdf)
first second
1 1 3
2 1 4
3 1 5
4 1 7
5 1 8
6 1 9
要删除重复项,请先对每一对进行排序:
# Order each pair so that the smaller value is in column 1 and the larger in
# column 2; mirrored pairs (a, b) / (b, a) then become identical rows.
# pmin()/pmax() do this for all rows at once and also work on an empty outdf,
# where a loop over 1:nrow(outdf) would iterate over c(1, 0) and fail.
smaller <- pmin(outdf[[1]], outdf[[2]])
larger <- pmax(outdf[[1]], outdf[[2]])
outdf[[1]] <- smaller
outdf[[2]] <- larger
outdf
对每一行排序时,下面这种更符合 R 习惯的写法可能更可取:
outdf = data.frame(t(apply(outdf, 1, sort)))
然后删除重复项:
outdf = outdf[!duplicated(outdf),]
唯一对的数量为:
nrow(outdf)
答案 1 :(得分:1)
扩展 @mso 的回答:如果你认为 a -> b 与 b -> a 相同(无向图),可以这样做:
> set.seed(123)
> n<-10 # number of value
> k<-3 # number of groups
> df = data.frame(value = 1:n, group = letters[sample(1:k, n, replace=T)])
> df
value group
1 1 a
2 2 c
3 3 b
4 4 c
5 5 c
6 6 a
7 7 b
8 8 c
9 9 b
10 10 b
> tbl<-table(df$group) # Tabulate number within each group
> tbl
a b c
2 4 4
> sum(outer(tbl,tbl)[upper.tri(outer(tbl,tbl))]) # Count number of pairs
[1] 32
> sum(apply(combn(1:length(tbl),2),2,function(x) prod(tbl[x]) )) # Another way
[1] 32
>for(i in 1:n){
tempdf = df[df$group!=df[i,2] & c(rep(F,i),rep(T,n-i)),]
cat(df[i,1],': ',tempdf[,1], '\n')
}
1 : 2 3 4 5 7 8 9 10
2 : 3 6 7 9 10
3 : 4 5 6 8
4 : 6 7 9 10
5 : 6 7 9 10
6 : 7 8 9 10
7 : 8
8 : 9 10
9 :
10 :
>count<-0
>for(i in 1:n){
tempdf = df[df$group!=df[i,2] & c(rep(F,i),rep(T,n-i)),]
if (nrow(tempdf)>0){
for(j in 1:nrow(tempdf)){
cat(df[i,1], tempdf[j,1], '\n')
count<-count+1
}
}
}
1 2
1 3
1 4
1 5
1 7
1 8
...
> count
[1] 32
答案 2 :(得分:0)
您可以使用cross_join
library(dplyr)

# Example data: each city belongs to exactly one state.
df <- data.frame(
  city = c('Boston', 'Cambridge', 'Long Island', 'NYC'),
  state = c('MA', 'MA', 'NY', 'NY')
)

# Return `df` with `prefix` and an underscore prepended to every column name,
# e.g. prefix(df, "from") turns `city` into `from_city`.
prefix <- function(df, prefix) {
  setNames(df, paste(prefix, names(df), sep = "_"))
}

# Cross join the table with itself (merge() with `by = NULL` gives the
# Cartesian product), then keep only pairs whose two cities lie in different
# states. The second condition keeps one row per unordered pair, so
# "Boston - NYC" is not repeated as "NYC - Boston".
df %>%
  prefix("from") %>%
  merge(df %>% prefix("to"), by = NULL) %>%
  filter(
    from_state != to_state,
    as.character(from_city) < as.character(to_city)
  )
答案 3 :(得分:0)
对于你的问题,如果有 k 个组,各组的元素个数分别为 $m_1, m_2, \dots, m_k$,那么不同组之间唯一对的总数是:
$$\sum_{1 \le i < j \le k} m_i m_j = (m_1 m_2 + m_1 m_3 + \dots + m_1 m_k) + (m_2 m_3 + m_2 m_4 + \dots + m_2 m_k) + \dots + m_{k-1} m_k$$
我有一个解决方案:
library(data.table)

# Example data: 10 cities spread over 3 states.
df <- data.table(
  city = 1:10,
  state = c(rep(1, 3), rep(2, 4), rep(3, 3))
)

# Enumerate every unordered pair of rows once, as row indices. Working on
# indices avoids calling combn() on a single city value, which combn() would
# interpret as "all integers up to that value".
pair_idx <- combn(seq_len(nrow(df)), 2)

# TRUE for pairs whose two rows share a state.
same_state <- df$state[pair_idx[1, ]] == df$state[pair_idx[2, ]]

# All pairs as "city1 city2" strings, plus the within-state subset.
res.all <- paste(df$city[pair_idx[1, ]], df$city[pair_idx[2, ]])
res <- res.all[same_state]

# Pairs of cities that lie in different states.
res.all[!same_state]
> df
city state
1: 1 1
2: 2 1
3: 3 1
4: 4 2
5: 5 2
6: 6 2
7: 7 2
8: 8 3
9: 9 3
10: 10 3
> as.matrix(res.all[!(res.all %in% res)])
[,1]
[1,] "1 4"
[2,] "1 5"
[3,] "1 6"
[4,] "1 7"
[5,] "1 8"
[6,] "1 9"
[7,] "1 10"
[8,] "2 4"
[9,] "2 5"
[10,] "2 6"
[11,] "2 7"
[12,] "2 8"
[13,] "2 9"
[14,] "2 10"
[15,] "3 4"
[16,] "3 5"
[17,] "3 6"
[18,] "3 7"
[19,] "3 8"
[20,] "3 9"
[21,] "3 10"
[22,] "4 8"
[23,] "4 9"
[24,] "4 10"
[25,] "5 8"
[26,] "5 9"
[27,] "5 10"
[28,] "6 8"
[29,] "6 9"
[30,] "6 10"
[31,] "7 8"
[32,] "7 9"
[33,] "7 10"