我想在 R 中比较长度不同的各列中的数据。
这是我的数据集。
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 5 29 53 AA2 11 56 34 AA2
129 516 34 BB5 44 217 42 BB1 36 190 39 BB1
107 436 44 AA3 29 147 53 CC7 30 155 31 CC1
57 244 53 BB6 21 108 53 BB2 14 77 61 BB4
57 227 29 AA1 21 104 39 AA6 9 48 44 BB6
80 318 47 AA2 18 89 47 CC3 37 200 44 DD3
128 529 56 BB4 43 222 54 CC1 36 202 50 CC3
31 127 53 CC1 7 38 53 DD4
18 73 47 CC2
我使用了 duplicated 函数,把得到的 TRUE 和 FALSE 结果作为新的一列添加到数据后面,然后用 grep 对结果进行筛选,但它不起作用。这是我的代码:
# Asker's attempt (reported as not working).
# NOTE(review): duplicated() examines only its first argument; the 2nd and 3rd
# positional arguments are not additional vectors to compare -- confirm against
# ?duplicated. The data set shown above also has no column named `filename`
# (only pp_filename, nn_filename and mm_filename).
v=duplicated(data$pp_filename, data$filename, data$filename)
# Append the logical result to the data as a new column named `dup`.
b=cbind(data, dup=v)
# Keep the rows whose `dup` value matches the text "FALSE", i.e. the rows that
# were not flagged as duplicates.
dupl=(b[grep("FALSE", b$dup),])
这就是我想要的(再次以适当的格式):
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 18 89 47 CC3 11 56 34 AA2
80 318 47 AA2 43 222 54 CC1 30 155 31 CC1
31 127 53 CC1 5 29 53 AA2 36 202 50 CC3
答案 0 :(得分:2)
对我来说,简单的下标操作似乎比grep更合适。但我仍然找到了使用grep的借口。
# Example data: three column groups (pp_, nn_, mm_), each made of three value
# columns followed by a "<prefix>_filename" column.
b <- read.table(header = TRUE, text = "
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 5 29 53 AA2 11 56 34 AA2
129 516 34 BB5 44 217 42 BB1 36 190 39 BB1
107 436 44 AA3 29 147 53 CC7 30 155 31 CC1
57 244 53 BB6 21 108 53 BB2 14 77 61 BB4
57 227 29 AA1 21 104 39 AA6 9 48 44 BB6
80 318 47 AA2 18 89 47 CC3 37 200 44 DD3
128 529 56 BB4 43 222 54 CC1 36 202 50 CC3
31 127 53 CC1 7 38 53 DD4 18 73 47 CC2
")

# Any duplicated values among the "xx_filename" columns?
# The pattern is anchored with `$` so only names that END in "_filename" match.
filenameCols <- grep("_filename$", names(b))

# For each row, anyDuplicated() gives 0 when all file names in that row are
# distinct, otherwise the position of the first repeated one.
# drop = FALSE keeps a two-dimensional object even when a single column
# matched, so apply() never receives a bare vector; as.matrix() makes the
# data-frame-to-matrix conversion explicit.
v <- apply(as.matrix(b[, filenameCols, drop = FALSE]), 1, FUN = anyDuplicated)

# Save rows with colliding xx_filename columns
dupl <- b[v != 0, , drop = FALSE]
答案 1 :(得分:1)
为了便于操作,这是对数据格式的更多建议。 我想,更好的数据格式是:
# Split the wide data frame into one data frame per column group; the group
# key is the first two characters of each column name.
spl <- split.default(DF, substring(names(DF), 1, 2))

# Stack the groups into one long data frame. For every group: keep only the
# rows without missing values, prepend the group key as a "type" column, and
# strip the leading "<prefix>_" from the column names so that all groups
# share the same header.
lDF <- do.call(
  rbind,
  lapply(seq_along(spl), function(k) {
    group <- spl[[k]]
    filled <- group[complete.cases(group), ]
    stacked <- cbind(names(spl)[k], filled)
    setNames(stacked, c("type", gsub("^(.*?)_", "", names(group))))
  })
)
lDF
# type value_1 value_2 value_3 filename
#1 mm 11 56 34 AA2
#2 mm 36 190 39 BB1
#3 mm 30 155 31 CC1
#4 mm 14 77 61 BB4
#5 mm 9 48 44 BB6
#6 mm 37 200 44 DD3
#7 mm 36 202 50 CC3
#8 nn 5 29 53 AA2
#9 nn 44 217 42 BB1
#....
然后,您可以继续(至少从我对问题的理解):
# File names present in every type group: split the file names by type, then
# fold the groups together with intersect().
commons <- Reduce(intersect, split(lDF[["filename"]], lDF[["type"]]))
# Show only the rows whose file name is shared by all groups.
lDF[is.element(lDF[["filename"]], commons), ]
# type value_1 value_2 value_3 filename
#1 mm 11 56 34 AA2
#3 mm 30 155 31 CC1
#7 mm 36 202 50 CC3
#8 nn 5 29 53 AA2
#13 nn 18 89 47 CC3
#14 nn 43 222 54 CC1
#16 pp 17 73 53 CC3
#21 pp 80 318 47 AA2
#23 pp 31 127 53 CC1
如果您想得到问题中所示的那种格式,可以使用一些变通方法。例如:
# Rows whose file name occurs in every type group.
res <- lDF[lDF$filename %in% commons, ]
# One data frame per type, with the leading type column itself dropped.
tmp <- split(res[-1], res[[1]])
# Put the type back in front of each column name ("<type>_<column>") and bind
# the per-type pieces side by side to get the wide layout again.
do.call(
  cbind,
  lapply(seq_along(tmp), function(k) {
    piece <- tmp[[k]]
    names(piece) <- paste(names(tmp)[k], names(piece), sep = "_")
    piece
  })
)
“DF”是:
# Input data as a literal data frame with 9 rows and three column groups
# (pp_, nn_, mm_), each holding three integer value columns and a factor
# "<prefix>_filename" column. The nn_ group has 8 filled rows and the mm_ group
# 7; the remaining rows of those groups are NA.
DF = structure(list(pp_value_1 = c(17L, 129L, 107L, 57L, 57L, 80L,
128L, 31L, 18L), pp_value_2 = c(73L, 516L, 436L, 244L, 227L,
318L, 529L, 127L, 73L), pp_value_3 = c(53L, 34L, 44L, 53L, 29L,
47L, 56L, 53L, 47L), pp_filename = structure(c(9L, 5L, 3L, 6L,
1L, 2L, 4L, 7L, 8L), .Label = c("AA1", "AA2", "AA3", "BB4", "BB5",
"BB6", "CC1", "CC2", "CC3"), class = "factor"), nn_value_1 = c(5L,
44L, 29L, 21L, 21L, 18L, 43L, 7L, NA), nn_value_2 = c(29L, 217L,
147L, 108L, 104L, 89L, 222L, 38L, NA), nn_value_3 = c(53L, 42L,
53L, 53L, 39L, 47L, 54L, 53L, NA), nn_filename = structure(c(1L,
3L, 7L, 4L, 2L, 6L, 5L, 8L, NA), .Label = c("AA2", "AA6", "BB1",
"BB2", "CC1", "CC3", "CC7", "DD4"), class = "factor"), mm_value_1 = c(11L,
36L, 30L, 14L, 9L, 37L, 36L, NA, NA), mm_value_2 = c(56L, 190L,
155L, 77L, 48L, 200L, 202L, NA, NA), mm_value_3 = c(34L, 39L,
31L, 61L, 44L, 44L, 50L, NA, NA), mm_filename = structure(c(1L,
2L, 5L, 3L, 4L, 7L, 6L, NA, NA), .Label = c("AA2", "BB1", "BB4",
"BB6", "CC1", "CC3", "DD3"), class = "factor")), .Names = c("pp_value_1",
"pp_value_2", "pp_value_3", "pp_filename", "nn_value_1", "nn_value_2",
"nn_value_3", "nn_filename", "mm_value_1", "mm_value_2", "mm_value_3",
"mm_filename"), class = "data.frame", row.names = c(NA, -9L))