我有一个 data.frame,其中的数据条目以 1,2,3,10 这样的格式输入。
也就是说,它们是逗号分隔的整数,取值范围为 0 到 20,并且不一定连续。目前每一列都被当作因子(factor)。我有四个包含这些值的变量,我想创建一个新变量:只有当某个整数出现在四个变量中的至少三个里时,新变量才包含该整数;如果没有任何整数出现三次,则使用 0。
M1 M2 M3 M4 M_NEW
1 1,2 0 1 1
3,4 3,4 1,2,3,4 4 3,4
我不确定如何处理这些逗号分隔的整数。如果它们是单个整数,我可以做类似下面这样的事情:
# modified from https://stackoverflow.com/a/14114085/1670053
# For each row of df (single-integer entries), find the value(s) that occur
# at least three times in that row; yields 0 for a row with no such value.
sapply(seq_len(nrow(df)), function(idx) {
  # count how many times each entry occurs in this row
  counts <- table(t(df[idx, ]))
  # no value reaches three occurrences (or the row is empty): report 0
  if (length(counts) == 0L || max(counts) < 3) {
    return(0)
  }
  # all values whose count equals the maximum count
  as.numeric(names(counts)[counts == max(counts)])
})
但是,由于这里的多个值是用逗号分隔的,我不确定该从哪里入手。
# Example data and expected output (dput-style literal).
# df has 14 rows and five factor columns:
#   M1..M4 - input variables; each level is a string of comma-separated
#            integers, with "0" as one of the levels.
#   M_NEW  - the hand-written expected result for each row.
# NOTE(review): the M_NEW level "1,2,3," (used by row 13) has a trailing
# comma - presumably a typo in the expected output; confirm before relying
# on it.
df <- structure(list(M1 = structure(c(3L, 2L, 2L, 5L, 3L, 1L, 7L, 1L,
8L, 1L, 3L, 4L, 3L, 6L), .Label = c("0", "1", "1,2", "1,2,3",
"1,2,3,4", "1,2,3,4,5", "3,4,5,6,7", "6,7,8,9,10,11,12,13,14,15,16"
), class = "factor"), M2 = structure(c(5L, 2L, 2L, 4L, 4L, 1L,
11L, 8L, 7L, 9L, 3L, 6L, 3L, 10L), .Label = c("0", "1,2", "1,2,3",
"1,2,3,4", "1,2,3,4,5", "1,2,3,4,5,6,7", "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16",
"2", "2,3,4,5", "4,5,6", "4,5,6,7,8,9,10,11,12,13,14"), class = "factor"),
M3 = structure(c(4L, 1L, 1L, 8L, 3L, 1L, 6L, 1L, 7L, 3L,
2L, 5L, 9L, 3L), .Label = c("0", "1,2", "1,2,3,4", "1,2,3,4,5",
"1,2,3,4,5,6", "1,2,3,4,5,6,7,8", "1,2,3,4,5,6,7,8,9,10,11,12,13,14",
"3,4", "3,4,5"), class = "factor"), M4 = structure(c(5L,
1L, 2L, 8L, 2L, 1L, 6L, 3L, 4L, 1L, 3L, 3L, 7L, 9L), .Label = c("0",
"1", "1,2", "1,2,3,4,5,12,13,14,15,16,17", "1,2,3,4,5,6",
"1,2,3,4,5,6,7,8,9,10,11,12", "3,4", "4", "4,5"), class = "factor"),
M_NEW = structure(c(6L, 1L, 2L, 8L, 3L, 1L, 9L, 1L, 7L, 1L,
3L, 4L, 5L, 10L), .Label = c("0", "1", "1,2", "1,2,3", "1,2,3,",
"1,2,3,4,5", "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16", "3,4",
"3,4,5,6,7,8", "4,5"), class = "factor")), .Names = c("M1",
"M2", "M3", "M4", "M_NEW"), class = "data.frame", row.names = c(NA,
-14L))
答案 0 :(得分:2)
# Collapse one row of comma-separated integer strings into the values that
# occur in at least `n` of the row's entries.
#
# x: character vector (or factor), one element per variable,
#    e.g. c("1,2", "1", "0", "1").
# n: minimum number of entries a value must appear in (default 3).
# Returns a single string: the qualifying values in ascending numeric order,
# joined by ",", or "0" when no value qualifies.
f <- function(x, n = 3) {
  cells <- strsplit(as.character(x), ",", fixed = TRUE)
  # Count a value once per entry so a repeat within one entry cannot
  # inflate its tally.
  values <- unlist(lapply(cells, function(cell) unique(trimws(cell))))
  values <- as.character(values)
  values <- values[!is.na(values) & nzchar(values)]
  counts <- table(values)
  keep <- names(counts)[counts >= n]
  if (length(keep) == 0L) {
    return("0")
  }
  # table() orders its names as strings ("1", "10", "2", ...); put the
  # result back into numeric order.
  paste(keep[order(as.numeric(keep))], collapse = ",")
}
# Apply f to the four input columns of every row; the outer parentheses
# print the new column after assigning it.
(df[, 5] <- apply(as.matrix(df[, 1:4]), 1, f))
# [1] "1,2,3,4,5"
# [2] "0"
# [3] "1"
# [4] "3,4"
# [5] "1,2"
# [6] "0"
# [7] "3,4,5,6,7,8"
# [8] "0"
# [9] "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"
# [10] "0"
# [11] "1,2"
# [12] "1,2,3"
# [13] "3"
# [14] "4,5"