数据框中每组的自定义分箱 - R函数

时间:2017-09-11 19:32:26

标签: r dplyr

我想在函数内为每个类别name创建变量numbers的分箱(bin)。但是我在使用作为函数参数传入的类别名称时遇到了困难。也许data.table方法会更好。

# Reproducible example data: 10 normal draws (mean 10, sd 1), each assigned
# to one of the groups "A", "BB", "CCC" drawn with random probabilities.
set.seed(10)
b <- rnorm(10, mean = 10, sd = 1)
y <- runif(3)
pr <- y / sum(y)
# Repeat each of the first three letters 1, 2 and 3 times and glue the pieces
# together; the result is a character vector named by the source letter.
names <- vapply(
  mapply(rep, LETTERS[1:3], 1:3),
  paste0,
  character(1),
  collapse = ""
)
x <- sample(names, size = 10, replace = TRUE, prob = pr)
df <- data.frame(name = x, numbers = b)
df

# Bins the column named by `varcount` within each `colgroup1` group.
# Works, but every group shares the same fixed bin limits (7 to 15);
# limits derived per category are not implemented (not desired).
#
# df         data frame to bin
# colgroup1  name (string) of the grouping column
# varcount   name (string) of the numeric column to bin
# binsize    width of each bin
#
# Returns `df` grouped by `colgroup1`, with an added factor column `bin`
# holding the 1-based bin number.
binfunction1 <- function(df, colgroup1, varcount, binsize) {
  # Build breaks and labels up front. Assigning `breaks` inside the cut()
  # call only worked because cut() happens to force `breaks` before `labels`.
  breaks <- seq(7, 15, by = binsize)
  bin_labels <- seq_len(length(breaks) - 1)
  df %>%
    # `.data[[name]]` looks a column up by its string name; it replaces the
    # deprecated group_by_() and the hard-coded `numbers` column.
    group_by(.data[[colgroup1]]) %>%
    mutate(bin = cut(.data[[varcount]], breaks = breaks, labels = bin_labels))
}
binfunction1(df,"name","numbers",0.5)
     name   numbers    bin
   <fctr>     <dbl> <fctr>
 1     BB 10.018746      7
 2      A  9.815747      6
 3    CCC  8.628669      4
 4    CCC  9.400832      5
 5     BB 10.294545      7
 6    CCC 10.389794      7
 7      A  8.791924      4
 8      A  9.636324      6
 9      A  8.373327      3
10      A  9.743522      6

2 个答案:

答案 0 :(得分:1)

这不是最优雅的解决方案,但这个结果是你想要的吗?(我不太确定是否理解了你的问题)

# Adds a per-group bin number to a data frame.
#
# For every group of `colgroup1`, the values of `varcount` are cut into bins
# of width `binsize`. The bins start 10% below the group minimum and run to
# at least the group maximum (nominally 10% above it).
#
# x          data frame
# colgroup1  name (string) of the grouping column
# varcount   name (string) of the numeric column to bin
# binsize    bin width, a single positive number
#
# Returns `x` with its rows, row order and row names unchanged, plus a factor
# column `bin` whose levels are "1", "2", ... up to the largest bin count of
# any group. Rows with a missing group or missing value get `NA`.
binfunction3 <- function(x, colgroup1, varcount, binsize) {
  stopifnot(
    is.data.frame(x),
    colgroup1 %in% names(x),
    varcount %in% names(x),
    is.numeric(x[[varcount]]),
    is.numeric(binsize), length(binsize) == 1L,
    is.finite(binsize), binsize > 0
  )

  # Breaks for one group's values. The margins are sign-aware so that zero
  # and negative values still fall inside the break range.
  group_breaks <- function(v) {
    lo <- min(v, na.rm = TRUE)
    hi <- max(v, na.rm = TRUE)
    lower <- if (lo >= 0) lo * 0.9 else lo * 1.1
    upper <- if (hi >= 0) hi * 1.1 else hi * 0.9
    breaks <- seq(lower, upper, by = binsize)
    last <- breaks[length(breaks)]
    # seq() stops at or below `upper`. When binsize is large relative to the
    # margin, the last break can fall short of the maximum, or only a single
    # break is produced; one more break closes the range.
    if (length(breaks) < 2L || last < hi) {
      breaks <- c(breaks, last + binsize)
    }
    breaks
  }

  values <- x[[varcount]]
  # Work on row positions rather than row names. Rebuilding the order from
  # the row names that rbind() generates fails for single-row groups and for
  # group names that are not purely alphabetic.
  rows_by_group <- split(seq_len(nrow(x)), x[[colgroup1]], drop = TRUE)

  bin_code <- rep(NA_integer_, nrow(x))
  n_bins <- 0L
  for (rows in rows_by_group) {
    v <- values[rows]
    if (all(is.na(v))) {
      next
    }
    breaks <- group_breaks(v)
    # include.lowest only matters when a value equals the first break
    # (i.e. a group minimum of exactly zero).
    bin_code[rows] <- cut(v, breaks, labels = FALSE, include.lowest = TRUE)
    n_bins <- max(n_bins, length(breaks) - 1L)
  }

  cbind(x, bin = factor(bin_code, levels = seq_len(n_bins)))
}

binfunction3(df,"name","numbers",0.5)

   #    name   numbers bin
   # 1     A 10.018746   5
   # 2   CCC  9.815747   5
   # 3   CCC  8.628669   2
   # 4    BB  9.400832   2
   # 5     A 10.294545   6
   # 6    BB 10.389794   4
   # 7     A  8.791924   3
   # 8   CCC  9.636324   4
   # 9     A  8.373327   2
   # 10    A  9.743522   5

答案 1 :(得分:0)

我的答案基于Mikko的答案,但可以更好地控制分割点(breaks)的最小和最大界限以及分箱大小。

# Adds a per-group integer bin number, with the bin width given as a power
# of ten.
#
# x         data frame
# colgroup  name(s) of the grouping column(s)
# varcount  name (string) of the numeric column to bin
# binexp    exponent controlling the resolution: the bin width is 10^(-binexp);
#           the lower margin is 10^(-(binexp + 1)) times the group minimum and
#           the upper margin is 10^(-(binexp - 2)) times the group maximum
#
# Returns the rows of `x` stacked group by group (row names are the ones
# rbind() builds from the group names), with an added integer column `bin`.
binfunctionnew <- function(x, colgroup, varcount, binexp) {
  bin_width <- 10^(-binexp)
  lower_margin <- 10^(-(binexp + 1))
  upper_margin <- 10^(-(binexp - 2))

  bin_group <- function(k) {
    values <- k[[varcount]]  # `[[` also works when `k` is a tibble
    lo <- min(values)
    hi <- max(values)
    # Sign-aware margins: for non-negative data these are the original
    # formulas; for negative data they still widen the range.
    lower <- lo * (if (lo >= 0) 1 - lower_margin else 1 + lower_margin)
    upper <- hi * (if (hi >= 0) 1 + upper_margin else 1 - upper_margin)
    breaks <- seq(lower, upper, by = bin_width)
    last <- breaks[length(breaks)]
    # For very small values the margin is narrower than one bin, so seq()
    # can stop below the maximum or return a single break; add one more.
    if (length(breaks) < 2L || last < hi) {
      breaks <- c(breaks, last + bin_width)
    }
    # include.lowest only matters when a value equals the first break
    # (i.e. a group minimum of exactly zero).
    bin <- cut(values, breaks = breaks, labels = FALSE, include.lowest = TRUE)
    cbind(k, data.frame(bin = bin))
  }

  pieces <- lapply(split(x, x[colgroup], drop = TRUE), bin_group)
  do.call(rbind, pieces)
}
#example (bin width 10^-1)            or (bin width 10^0); ";" separates the two calls
binfunctionnew(df,"name","numbers",1); binfunctionnew(df,"name","numbers",0)
#       name   numbers bin                  name   numbers bin            
# A.1      A 10.018746  18            A.1      A 10.018746   3
# A.5      A 10.294545  21            A.5      A 10.294545   3
# A.7      A  8.791924   6            A.7      A  8.791924   2
# A.9      A  8.373327   1            A.9      A  8.373327   1
# A.10     A  9.743522  15            A.10     A  9.743522   3
# BB.4    BB  9.400832   1            BB.4    BB  9.400832   1
# BB.6    BB 10.389794  11            BB.6    BB 10.389794   2
# CCC.2  CCC  9.815747  13            CCC.2  CCC  9.815747   3
# CCC.3  CCC  8.628669   1            CCC.3  CCC  8.628669   1
# CCC.8  CCC  9.636324  11            CCC.8  CCC  9.636324   2