我有一个如下所示的数据框:
x y
1 (0,4] 1
2 (0,4] 2
3 (0,4] 3
4 (0,4] 4
5 (4,5] 5
6 (5,10] 6
7 (5,10] 7
8 (5,10] 8
9 (5,10] 9
10 (5,10] 10
11 (10,20] 11
12 (10,20] 12
13 (10,20] 13
14 (10,20] 14
15 (10,20] 15
16 (10,20] 16
17 (10,20] 17
18 (10,20] 18
19 (10,20] 19
20 (10,20] 20
21 (20,40] 21
22 (20,40] 22
23 (20,40] 23
24 (20,40] 24
25 (20,40] 25
26 (20,40] 26
27 (20,40] 27
28 (20,40] 28
29 (20,40] 29
30 (20,40] 30
我想按照 x 列中已有的不规则分组区间,对 y 列进行同样的分箱(cut),而不必把每个切分点逐一硬编码。有没有办法做到这一点?
提前致谢
编辑:期望的输出如下:
x y
1 (0,4] (0,4]
2 (0,4] (0,4]
3 (0,4] (0,4]
4 (0,4] (0,4]
5 (4,5] (4,5]
6 (5,10] (5,10]
7 (5,10] (5,10]
8 (5,10] (5,10]
9 (5,10] (5,10]
10 (5,10] (5,10]
11 (10,20] (10,20]
12 (10,20] (10,20]
13 (10,20] (10,20]
14 (10,20] (10,20]
15 (10,20] (10,20]
16 (10,20] (10,20]
17 (10,20] (10,20]
18 (10,20] (10,20]
19 (10,20] (10,20]
20 (10,20] (10,20]
21 (20,40] (20,40]
22 (20,40] (20,40]
23 (20,40] (20,40]
24 (20,40] (20,40]
25 (20,40] (20,40]
26 (20,40] (20,40]
27 (20,40] (20,40]
28 (20,40] (20,40]
29 (20,40] (20,40]
30 (20,40] (20,40]
答案 0 :(得分:3)
从现有分界点提取数字:
library(stringr)
# Recover the numeric interval endpoints from the labels stored in df$x.
# The pattern accepts an optional minus sign, a decimal part and an exponent,
# so labels such as "(-1.5,0]" or "(1e+03,2e+03]" yield the right numbers
# instead of being split into separate digit runs.
number_pattern <- "-?\\d+(?:\\.\\d+)?(?:[eE][+-]?\\d+)?"
# Only the distinct labels need to be parsed, not every row.
interval_labels <- unique(as.character(df$x))
endpoints <- as.numeric(unlist(str_extract_all(interval_labels, number_pattern)))
# cut() requires unique break points; sort() also drops any NA.
cutpoints <- sort(unique(endpoints))
使用这些切点进行切割
# Re-bin y using the break points recovered from the labels in x.
df[["y"]] <- cut(df[["y"]], breaks = cutpoints)
使用这个可重复的数据:
# Reproducible example data: x is a factor of interval labels (levels in
# alphabetical order, as dput() emitted them) and y holds the integers 1..30.
df <- structure(
  list(
    x = structure(
      # Level codes per row: 4 x "(0,4]", 1 x "(4,5]", 5 x "(5,10]",
      # 10 x "(10,20]", 10 x "(20,40]".
      rep(c(1L, 4L, 5L, 2L, 3L), times = c(4L, 1L, 5L, 10L, 10L)),
      levels = c("(0,4]", "(10,20]", "(20,40]", "(4,5]", "(5,10]"),
      class = "factor"
    ),
    y = 1:30
  ),
  names = c("x", "y"),
  class = "data.frame",
  row.names = as.character(1:30)
)
答案 1 :(得分:1)
我们可以从 'x' 中提取最后一个数字子字符串,将其转换为 numeric,取其中的 unique 元素,并将其用作 cut 的 breaks 参数:
# Derive the break points from BOTH endpoints of every interval label in x,
# so the lowest bound is read from the data instead of being hard-coded as 0.
# Stripping the brackets and splitting on the comma leaves plain number
# strings, which as.numeric() parses even when they are negative, decimal or
# in exponent notation.
interval_labels <- unique(as.character(df1$x))
endpoints <- as.numeric(unlist(
  strsplit(gsub("[][()]", "", interval_labels), ",", fixed = TRUE)
))
cut(df1$y, breaks = sort(unique(endpoints)))
#[1] (0,4] (0,4] (0,4] (0,4] (4,5] (5,10] (5,10] (5,10] (5,10]
#[10] (5,10] (10,20] (10,20] (10,20] (10,20] (10,20] (10,20] (10,20] (10,20]
#[19] (10,20] (10,20] (20,40] (20,40] (20,40] (20,40] (20,40] (20,40] (20,40]
#[28] (20,40] (20,40] (20,40]
#Levels: (0,4] (4,5] (5,10] (10,20] (20,40]