假设我的数据框中包含id
变量,date
变量和另一个变量x
(例如下面的df
)。对于x
的每对值,我想知道它们为id
和date
共享相同值的次数。因此,结果应如下所示。 a
和b
两次“合在一起”,a
和c
一起“合在一起”(共同意味着共享id
和{{ 1}})。速度有点重要。
date
答案 0 :(得分:3)
我们可以使用 dplyr
包来完成此任务。请注意,当我们定义df
时,我们会在调用stringsAsFactors = FALSE
时设置data.frame
!
library(dplyr)
# grouping by id and date, create a list of the x values
df %>%
group_by(id, date) %>%
summarise(x_list = list(sort(x)))-> df2
# unlist the x values into a data.frame
data.frame(
t(sapply(df2$x_list, function(x) c(x[1], x[2]))),
stringsAsFactors = FALSE) -> df3
# count the occurences
df3 %>% count(X1, X2)
X1 X2 n
<chr> <chr> <int>
1 a b 2
2 a c 1
这也可以通过以下更少的中间步骤来完成:
df %>%
group_by(id, date) %>%
summarise(x_list = list(sort(x))) %>% # list of x values
ungroup() %>%
select(x_list) %>% # only select the list
rowwise() %>% # by each row
mutate(var1 = x_list[1], var2 = x_list[2]) %>% # extract components of list
select(-x_list) %>% # remove list
count(var1, var2) # count the vars
var1 var2 n
<chr> <chr> <int>
1 a b 2
2 a c 1
答案 1 :(得分:2)
df <- data.frame(id = c(1,1,2,2,1,1),
date = c(rep(1,4),2,2),
x = c('a','b','a','c','a','b'))
使用基础R和tidyverse
动词的组合
myfun <- function(df) {
require(tidyverse)
df1 <- tibble(id=paste(df$id, df$date), x=df$x)
df2 <- split(df1, df1$id)
grp <- map_df(df2, ~as.data.frame(matrix(combn(.x$x,2), ncol=2, byrow=TRUE), stringsAsFactors=F)) %>%
count(V1,V2)
return(grp)
}
ans <- myfun(df)
V1 V2 n
1 a b 2
2 a c 1
newdf <- data.frame(id = c(1,1,2,2,1,1,1),
date = c(rep(1,4),2,2,1),
x = c('a','b','a','c','a','b','c'))
请注意,现在a, b, c
分享id=1 date=1
ans <- myfun(newdf)
V1 V2 n
1 a b 2
2 a c 2
3 b c 1
bouncy <- function(df) {
require(dplyr)
newdf %>%
group_by(id, date) %>%
summarise(x_list = list(sort(x))) %>% # list of x values
ungroup() %>%
select(x_list) %>% # only select the list
rowwise() %>% # by each row
mutate(var1 = x_list[1], var2 = x_list[2]) %>% # extract components of list
select(-x_list) %>% # remove list
count(var1, var2) # count the vars
}
ans <- bouncy(newdf)
var1 var2 n
1 a b 2
2 a c 1
注意有弹性功能在更复杂的情况下无法返回正确的答案
library(microbenchmark)
microbenchmark(myfun(newdf), bouncy(newdf))
expr min lq mean median uq max neval
myfun(newdf) 31.72188 32.23807 34.32655 32.70342 34.22985 112.5996 100
bouncy(newdf) 58.55471 59.41472 61.37818 60.59873 61.60430 78.5788 100
myfun 约快两倍