在R data.frame中形成Bin

时间:2014-01-10 13:58:28

标签: r binning

我有一个包含两列的data.frame:

category quantity
a          20
b          30
c          100
d          10
e          1
f          23
g          3
h          200

我需要编写一个带有两个参数的函数:dataframebin_size在数量列上运行cumsum,如果{{1}则对后续行进行拆分超过cumsum并添加一个正在运行的bin编号作为附加列。

说,输入:

bin_size
上面例子中的

应该给我:

function(dataframe, 50)

说明:

category    quantity    cumsum  bin_nbr
a            20        20         1
b            30        50         1
c            50        50         2
c            50        50         3
d            10        10         4
e            1         11         4
f           23         34         4
g            3         37         4
h            13        50         4
h            50        50         5
h            50        50         6
h            50        50         7
h            37        37         8

3 个答案:

答案 0 :(得分:1)

我无法想到使用apply / data.table等执行此操作的简洁方法,因为您具有行间依赖性和更改大小的数据框。你可以用迭代/递归的方式做到这一点,但我觉得想出写循环会更快。一个挑战是难以知道对象的最终大小,因此这可能很慢。如果性能在此应用程序中存在问题,您可以通过从df切换到矩阵(代码应该可以正常工作,转换位除外)来缓解这个问题。

fun <- function(df, binsize){
  df$cumsum <- cumsum(df$quantity)
  df$bin <- 1
  i <- 1
  repeat {
    if((extra <- (df[i, "cumsum"] - binsize)) > 0) { # Bin finished halfway through
      top <- if(i > 1L) df[1L:(i - 1L), ] else df[0L, ]
      mid <- transform(df[i, ], quantity=quantity-extra, cumsum=cumsum-extra)
      bot <- transform(df[i, ], quantity=extra, cumsum=extra, bin=bin + 1L)
      end <- if(i >= nrow(df)) df[0L, ] else df[(i + 1L):nrow(df), ]
      end <- transform(end, cumsum=cumsum(end$quantity) + extra, bin=bin + 1L)
      df <- rbind(top, mid, bot, end)
    } else if (extra == 0 && nrow(df) > i) {  # Bin finished cleanly
      df[(i + 1L):nrow(df), ]$cumsum <- df[(i + 1L):nrow(df), ]$cumsum - binsize
      df[(i + 1L):nrow(df), ]$bin <- df[(i + 1L):nrow(df), ]$bin + 1L
    }
    if(nrow(df) < (i <- i + 1)) break
  }
  rownames(df) <- seq(len=nrow(df))
  df
}
fun(df, binsize) 

#    category quantity cumsum bin
# 1         a       20     20   1
# 2         b       30     50   1
# 3         c       50     50   2
# 4         c       50     50   3
# 5         d       10     10   4
# 6         e        1     11   4
# 7         f       23     34   4
# 8         g        3     37   4
# 9         h       13     50   4
# 10        h       50     50   5
# 11        h       50     50   6
# 12        h       50     50   7
# 13        h       37     37   8

答案 1 :(得分:0)

另一个带循环的解决方案:

DF <- read.table(text="category quantity
a          20
b          30
c          100
d          10
e          1
f          23
g          3
h          200", header=TRUE)

bin_size <- 50
n_bin <- ceiling(sum(DF$quantity)/bin_size)

DF$bin <- findInterval(cumsum(DF$quantity)-1, c(0, seq_len(n_bin)*50))
DF$cumsum <- cumsum(DF$quantity)

result <- lapply(seq_along(DF[,1]), function(i, df) {
  if (i==1) {
    d <- df[i, "bin"]
  } else {
    d <- df[i, "bin"]-df[i-1, "bin"]
  }
  if (d > 1) {    
    res <- data.frame(
      category = df[i, "category"],
      bin_nbr = df[i, "bin"]-seq_len(d+1)+1
    )        
    res[,"quantity"] <- bin_size
    if (i!=1) {
      res[nrow(res),"quantity"] <- df[i-1, "bin"]*bin_size-df[i-1, "cumsum"]
    }  else {
      res[nrow(res),"quantity"] <- 0
    }
    res[1,"quantity"] <- df[i, "quantity"]-sum(res[-1,"quantity"])
    return(res[res$quantity > 0,])
  } else {
    return(data.frame(
      category = df[i, "category"],
      quantity = df[i, "quantity"],
      bin_nbr = df[i, "bin"]
    ))
  }
}, df=DF)

res <- do.call(rbind, result)
res <- res[order(res$category, res$bin_nbr),]
library(plyr)
res <- ddply(res, .(bin_nbr), transform, cumsum=cumsum(quantity))
res

#    category quantity bin_nbr cumsum
# 1         a       20       1     20
# 2         b       30       1     50
# 3         c       50       2     50
# 4         c       50       3     50
# 5         d       10       4     10
# 6         e        1       4     11
# 7         f       23       4     34
# 8         g        3       4     37
# 9         h       13       4     50
# 10        h       50       5     50
# 11        h       50       6     50
# 12        h       50       7     50
# 13        h       37       8     37

答案 2 :(得分:0)

这相当于将bin边界与提供此无循环解决方案的数据合并:

library(zoo)

fun <- function(DF, binsize = 50) {
  nr <- nrow(DF)
  DF2 <- data.frame(cumsum = seq(0, sum(DF$quantity), binsize) + binsize, bin_nbr = 1:nr)
  DF.cs <- transform(DF, cumsum = cumsum(DF$quantity))
  m <- na.locf(merge(DF.cs, DF2, all = TRUE), fromLast = TRUE)
  m$bin_nbr <- as.numeric(m$bin_nbr)
  cs <- as.numeric(m$cumsum)
  m$quantity <- c(cs[1], diff(cs))
  m$cumsum <- ave(m$quantity, m$bin_nbr, FUN = cumsum)
  na.omit(m)[c("category", "quantity", "cumsum", "bin_nbr")]
}

,并提供:

> fun(DF)
   category quantity cumsum bin_nbr
1         a       20     20       1
2         b       30     50       1
3         c       50     50       2
4         c       50     50       3
5         d       10     10       4
6         e        1     11       4
7         f       23     34       4
8         g        3     37       4
9         h       13     50       4
10        h       50     50       5
11        h       50     50       6
12        h       50     50       7
13        h       37     37       8

注意:为了复制上面的结果,这是我们使用的输入:

Lines <- "category quantity
a          20
b          30
c          100
d          10
e          1
f          23
g          3
h          200
"
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)

REVISION 代码中的错误已得到纠正。