请帮忙。我有一个如下数据框:
df <- data.frame("G"=c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7),
"C"=c(1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0),
"SKU"=c("a","b","c","a","c","d","a","c","d","a","b","c","a","b","c","b","c","d","a","b","c"))
df
G C SKU
1 1 a
1 0 b
1 0 c
2 0 a
2 1 c
2 0 d
3 1 a
3 0 c
3 0 d
4 0 a
4 1 b
4 0 c
5 1 a
5 0 b
5 0 c
6 0 b
6 1 c
6 0 d
7 0 a
7 1 b
7 0 c
我想找到独特的&#34;块&#34;在这个数据框中。例如,这里我们有三个块:(a,b,c),(a,c,d)和(b,c,d)。我想用这些独特的块创建表,并对变量&#34; C&#34;在所有&#34; G&#34;对于某个区块中的每个SKU。最后得到这样的数据框:
New_G SKU New_C
1 1 a 2
2 1 b 2
3 1 c 0
4 2 a 1
5 2 c 1
6 2 d 0
7 3 b 0
8 3 c 1
9 3 d 0
正如我所说,这里有三个独特的块,每个块的New_G - 识别器,以及New_C - &#34; C&#34;的总和。对于某个区块中的每个SKU。 (例如,请参阅第一行.SKU =&#34; a&#34;,NEW_C = 2 - 这意味着在旧数据框SKU&#34; a&#34;同时在块中(a,b) ,c)变量&#34; C&#34; = 1两次)(另一个例子:见第四行。再次SKU =&#34; a&#34;,但NEW_C = 1 - 这意味着在旧数据框SKU&#34; a&#34;在块(a,c,d)中有变量&#34; C&#34; = 1一次)
如果我的问题不明确请立即告诉我。
答案 0 :(得分:4)
您可以使用toString
创建索引,并从那里按组使用简单求和。挑战在于获得独特群体的索引:
ind <- df %>% group_by(G) %>% summarise(temp=toString(SKU)) %>% mutate(fac=as.numeric(as.factor(temp)))
ind <- rep(ind$fac, each=3)
df$ind <- ind
df %>% group_by(ind, SKU) %>% summarise(New_C = sum(C))
# Source: local data frame [9 x 3]
# Groups: ind [?]
#
# ind SKU New_C
# (dbl) (fctr) (dbl)
# 1 1 a 2
# 2 1 b 2
# 3 1 c 0
# 4 2 a 1
# 5 2 c 1
# 6 2 d 0
# 7 3 b 0
# 8 3 c 1
# 9 3 d 0
修改强>
这可能更快:
df %>% group_by(G) %>%
mutate(temp=toString(SKU)) %>%
group_by(temp, SKU) %>%
summarise(New_C = sum(C))
答案 1 :(得分:2)
使用dplyr:
library(dplyr)
df %>%
group_by(G) %>%
summarize(bin = paste(SKU, collapse=',')) %>%
left_join(df, by=c('G' = 'G')) %>%
group_by(bin, SKU) %>%
summarize(New_C = sum(C))
输出:
bin SKU New_C
(chr) (fctr) (dbl)
1 a,b,c a 2
2 a,b,c b 2
3 a,b,c c 0
4 a,c,d a 1
5 a,c,d c 1
6 a,c,d d 0
7 b,c,d b 0
8 b,c,d c 1
9 b,c,d d 0
答案 2 :(得分:2)
这是基础R的解决方案。
Grp <- vapply(unique(df$G), function(x) paste(df$SKU[which(df$G==x)], collapse = ""), "abc", USE.NAMES = FALSE)
ID <- vapply(1:nrow(df), function(x) paste(df$SKU[x],Grp[df$G[x]], collapse=""), "a abc", USE.NAMES = FALSE)
UniG <- unique(Grp)
New_G <- do.call(c, lapply(1:length(UniG), function(x) rep(x, nchar(UniG[x]))))
Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x,split = " ")[[1]][1], New_C = sum(df$C[which(ID==x)])), USE.NAMES = FALSE)))
> Newdf
New_G SKU New_C
1 1 a 2
2 1 b 2
3 1 c 0
4 2 a 1
5 2 c 1
6 2 d 0
7 3 b 0
8 3 c 1
9 3 d 0
Pierre Lafortune和Edward R. Mazurek提供的dplyr
解决方案要快得多。下面的BuildRandomDF
构建的数据框与OP发布的数据框非常相似。
library(gtools)
BuildRandomDF <- function(n) {
set.seed(117)
samp1 <- sample(3:5, n, replace = TRUE)
Len5 <- length(which(samp1==5))
Len4 <- length(which(samp1==4))
Len3 <- length(which(samp1==3))
perm5 <- permutations(5,5,letters[1:5])
perm4 <- permutations(4,4,letters[1:4])
perm3 <- permutations(3,3,letters[1:3])
sampPerm5 <- sample(nrow(perm5), Len5, replace = TRUE)
sampPerm4 <- sample(nrow(perm4), Len4, replace = TRUE)
sampPerm3 <- sample(nrow(perm3), Len3, replace = TRUE)
G <- do.call(c, lapply(1:n, function(x) rep(x, samp1[x])))
i <- j <- k <- 0L
SKU <- do.call(c, lapply(1:n, function(x) {
if (samp1[x]==3) {
perm3[sampPerm3[j <<- j+1L],]
} else if (samp1[x]==4) {
perm4[sampPerm4[k <<- k+1L],]
} else {
perm5[sampPerm5[i <<- i+1L],]
}}))
C <- sample(0:1, length(SKU), replace = TRUE)
data.frame(G, C, SKU)
}
以下是功能:
library(dplyr)
DplyrTest <- function(df) {
df %>% group_by(G) %>%
mutate(temp=toString(SKU)) %>%
group_by(temp, SKU) %>%
summarise(New_C = sum(C))
}
DplyrCheck2 <- function(df) {
df %>%
group_by(G) %>%
summarize(bin = paste(SKU, collapse=',')) %>%
left_join(df, by=c('G' = 'G')) %>%
group_by(bin, SKU) %>%
summarize(New_C = sum(C))
}
BaseTest <- function(df) {
Grp <- vapply(unique(df$G), function(x) paste(df$SKU[which(df$G==x)], collapse = ""), "abc", USE.NAMES = FALSE)
ID <- vapply(1:nrow(df), function(x) paste(df$SKU[x],Grp[df$G[x]], collapse=""), "a abc", USE.NAMES = FALSE)
UniG <- unique(Grp)
New_G <- do.call(c, lapply(1:length(UniG), function(x) rep(x, nchar(UniG[x]))))
Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x,split = " ")[[1]][1], New_C = sum(df$C[which(ID==x)])), USE.NAMES = FALSE)))
Newdf
}
以下是时间:
df <- BuildRandomDF(10^4)
system.time(df1 <- DplyrCheck(df))
user system elapsed
0.43 0.00 0.43
system.time(df2 <- DplyrCheck2(df))
user system elapsed
0.39 0.00 0.39
system.time(df3 <- BaseTest(df))
user system elapsed
5.15 0.00 5.19
all(sort(unlist(df3$New_C))==sort(df1$New_C))
[1] TRUE
all(sort(df1$New_C)==sort(df2$New_C))
[1] TRUE