我将一些值分为以下几个间隔:
> set.seed(22)
> a <- rnorm(50)
> b <- as.data.frame(table(Hmisc::cut2(a, m = 10)))
> b
Var1 Freq
1 [-1.616,-0.793) 10
2 [-0.793,-0.200) 10
3 [-0.200, 0.301) 10
4 [ 0.301, 0.937) 10
5 [ 0.937, 3.253] 10
我想取另一个向量 c 中的值,并将它们分配到由 b$Var1 中的区间所定义的分箱(bin)中。
> c <- runif(50, -1, 3)
> c
[1] -0.36167553 -0.42019310 1.80365545 1.45542530 -0.72798537 0.32368285 1.68209984 -0.07971160 2.69304696 -0.84131974 0.89430681
[12] -0.38260232 2.78302235 2.91256761 -0.20692439 2.21367929 2.40534034 0.26349751 0.51897997 0.10485985 -0.14338538 1.65355414
[23] 2.68974930 -0.38767144 0.75481723 2.98473148 0.79046750 2.26079307 -0.24748383 -0.18502040 2.82674089 2.97552886 1.25323374
[34] 2.11271998 2.92941982 -0.62746180 0.53751411 1.34383497 0.02002254 2.04000343 0.23576506 1.67230419 0.68045395 -0.32637800
[45] 0.33067028 -0.58080654 0.38844488 -0.34026266 1.54217623 2.51062797
当我使用findInterval尝试此操作时,c中的值将分配给以下bin:
> interval_c <- findInterval(c, b$Var1)
> interval_c
[1] 0 0 1 1 0 0 1 0 2 0 0 0 2 2 0 2 2 0 0 0 0 1 2 0 0 2 0 2 0 0 2 2 1 2 2 0 0 1 0 2 0 1 0 0 0 0 0 0 1 2
我曾期望看到以下内容:
> interval_c
[1] 2 2 5 5 2 4 5 3 5 1 4
...等
有没有一种方法可以将c中的值分配到b中正确的bin中?谢谢!
答案 0 :(得分:2)
在调用 Hmisc::cut2 时加上参数 onlycuts = TRUE,使其只返回切分点(请参见 ?cut2),然后对得到的切分点向量应用 findInterval:
# Reproducible example: bin the values of `c` using the cut points that
# Hmisc::cut2() derives from `a`.
# Fix the RNG seed so the random draws below are reproducible.
set.seed(22)
# The outer parentheses make R print the value that was just assigned.
(a <- rnorm(50))
#> [1] -0.512139088 2.485183678 1.007826150 0.292814572 -0.208959361
#> [6] 1.858092390 -0.066026405 -0.162764952 -0.199860680 0.300561734
#> [11] -0.763907283 0.081961904 0.743028275 -0.084022194 -0.792894517
#> [16] -0.922153631 0.861562379 2.002942188 0.936551013 -1.615734872
#> [21] -0.575056589 -0.003973089 -0.676112603 -1.049628275 -0.543280568
#> [26] 0.556144530 0.252837717 -0.901814675 0.824391356 -1.560279752
#> [31] 0.537994003 -1.268353887 0.640519828 -0.535761818 -1.019642817
#> [36] -0.807881506 0.056825225 0.950211404 -1.126763499 -0.201168295
#> [41] -0.228495853 0.558716260 0.748745433 1.918204369 1.007207812
#> [46] 3.253349400 -0.161748014 0.333755546 -1.178672976 1.077604331
# The values to be binned: 50 uniform draws between -1 and 3.
# NOTE(review): `c` is also the name of base R's c() function; the name is
# kept here to match the question, but a different name would be clearer.
(c <- runif(50, -1, 3))
#> [1] -0.36167553 -0.42019310 1.80365545 1.45542530 -0.72798537
#> [6] 0.32368285 1.68209984 -0.07971160 2.69304696 -0.84131974
#> [11] 0.89430681 -0.38260232 2.78302235 2.91256761 -0.20692439
#> [16] 2.21367929 2.40534034 0.26349751 0.51897997 0.10485985
#> [21] -0.14338538 1.65355414 2.68974930 -0.38767144 0.75481723
#> [26] 2.98473148 0.79046750 2.26079307 -0.24748383 -0.18502040
#> [31] 2.82674089 2.97552886 1.25323374 2.11271998 2.92941982
#> [36] -0.62746180 0.53751411 1.34383497 0.02002254 2.04000343
#> [41] 0.23576506 1.67230419 0.68045395 -0.32637800 0.33067028
#> [46] -0.58080654 0.38844488 -0.34026266 1.54217623 2.51062797
# onlycuts = TRUE makes cut2() return the numeric cut points themselves
# (six break values here) rather than a factor of interval labels.
(cuts <- Hmisc::cut2(a, m = 10, onlycuts = TRUE))
#> [1] -1.6157349 -0.7928945 -0.1998607 0.3005617 0.9365510 3.2533494
# For each element of `c`, report the index of the interval of `cuts` it
# falls into. Every draw lies between the first and last cut, so all
# indices are in 1..5.
findInterval(c, cuts)
#> [1] 2 2 5 5 2 4 5 3 5 1 4 2 5 5 2 5 5 3 4 3 3 5 5 2 4 5 4 5 2 3 5 5 5 5 5
#> [36] 2 4 5 3 5 3 5 4 2 4 2 4 2 5 5
由 reprex package(v0.3.0)创建于 2019-06-13
答案 1 :(得分:0)
findInterval 需要一个向量来对值进行分箱。下面是将 b$Var1 转换为向量的一种方法-
数据-
# Recover numeric break points from the interval labels stored in b$Var1
# (e.g. "[-1.616,-0.793)") so that they can be passed to findInterval().
# NOTE: the labels are rounded for display, so these breaks only approximate
# the cuts that produced `b`; values very close to a boundary may land in a
# neighbouring bin.
library(tidyverse)
vec <- b %>%
  # Drop the opening and closing bracket of each label.
  mutate(b_tmp = str_sub(Var1, 2, -2)) %>%
  # Split "lower,upper" into two columns.
  separate(b_tmp, c("minI", "maxI"), sep = ",") %>%
  # across() replaces the superseded mutate_at().
  mutate(across(c(minI, maxI), as.numeric)) %>%
  # Pool both endpoints; shared boundaries are de-duplicated by unique().
  {sort(c(pull(., minI), pull(., maxI)))} %>%
  unique()
vec
#> [1] -1.616 -0.793 -0.200  0.301  0.937  3.253
# c is a function in R so not a good idea to use it as object; using vec_c instead
vec_c <- c(-0.36167553, -0.42019310, 1.80365545, 1.45542530, -0.72798537, 0.32368285)
# Outer parentheses print the result of the assignment.
(interval_c <- findInterval(vec_c, vec))
#> [1] 2 2 5 5 2 4