使用另一个变量的分箱分割点(breaks)对变量进行分箱

时间:2015-03-26 19:31:57

标签: r

我有两个分布相似但并不完全相同的变量。

# Two integer sequences whose ranges overlap but are not the same.
var1 <- seq_len(20L)
var2 <- seq.int(from = 5L, to = 25L)

然后我对 var1 进行分箱(binning),并创建一个新变量:

# Split var1 into two equal-width intervals; the result is a factor.
bin.var1 <- cut(var1, breaks = 2)

现在我想使用与 var1 相同的分割点(breaks)对 var2 进行分箱。有没有办法不用手动指定分割点就能做到这一点?

1 个答案:

答案 0 :(得分:2)

您可以从cut.default源代码中提取相关部分:

var1 <- 1:20
var2 <- 5:25

# Reproduce the break points that cut.default() computes when `breaks` is
# given as a single number (the requested count of equal-width intervals).
#
# Arguments:
#   x      numeric vector whose range defines the bins; NAs are ignored,
#          as cut() itself does.
#   n_bins number of intervals; a single number >= 2.
# Returns:
#   A numeric vector of n_bins + 1 increasing break points.
equal_width_breaks <- function(x, n_bins) {
  stopifnot(
    is.numeric(x),
    is.numeric(n_bins), length(n_bins) == 1L, !is.na(n_bins), n_bins >= 2
  )
  nb <- as.integer(n_bins + 1)
  rx <- range(x, na.rm = TRUE)
  dx <- diff(rx)
  if (dx == 0) {
    # Constant input: widen the range around the single value so that the
    # break points stay distinct.
    dx <- if (rx[1L] != 0) abs(rx[1L]) else 1
    seq.int(rx[1L] - dx / 1000, rx[2L] + dx / 1000, length.out = nb)
  } else {
    out <- seq.int(rx[1L], rx[2L], length.out = nb)
    # Push the outer breaks out by 0.1% of the range so that the minimum and
    # maximum of x fall strictly inside the first and last interval.
    out[c(1L, nb)] <- c(rx[1L] - dx / 1000, rx[2L] + dx / 1000)
    out
  }
}

breaks <- equal_width_breaks(var1, 2)

breaks
# [1]  0.981 10.500 20.019

cut(var1, 2)
# [1] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5]
# [7] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5] (10.5,20]    (10.5,20]
# [13] (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]
# [19] (10.5,20]    (10.5,20]
# Levels: (0.981,10.5] (10.5,20]

# The explicit break vector yields exactly the same factor as cut(var1, 2).
identical(cut(var1, 2), cut(var1, breaks))
# [1] TRUE


# Bin var2 with the breaks derived from var1. Values of var2 above the last
# break (21 to 25 here) fall outside every interval and become NA.
cut(var2, breaks)
# [1] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5]
# [7] (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]
# [13] (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]    <NA>         <NA>
# [19] <NA>         <NA>         <NA>
# Levels: (0.981,10.5] (10.5,20]

或者,正如 @Henrik 提到的,可以参照 ?cut 帮助页中的最后一个示例,从因子水平的标签中提取分割点:

## One way to recover the break points: parse them out of the interval labels.
## By default cut() formats the labels with only 3 significant digits
## (dig.lab = 3), so the parsed values would be rounded versions of the real
## breaks. Ask for more digits so the recovered breaks match the ones cut()
## actually used much more closely.
labs <- levels(cut(var1, 2, dig.lab = 15))
(br <- cbind(
  lower = as.numeric(sub("\\((.+),.*", "\\1", labs)),
  upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs))
))

#       lower  upper
# [1,]  0.981 10.500
# [2,] 10.500 20.019

# Adjacent intervals share a boundary, so drop the duplicated values before
# passing the breaks to cut(). Values of var2 above the last break become NA.
cut(var2, unique(c(br)))

# [1] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5] (0.981,10.5]
# [7] (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]
# [13] (10.5,20]    (10.5,20]    (10.5,20]    (10.5,20]    <NA>         <NA>
# [19] <NA>         <NA>         <NA>
# Levels: (0.981,10.5] (10.5,20]