我想针对df中column1的每个取值各选出5行,并且每个column1分组的输出中,column2的每个唯一值都至少出现一次。此外,输出中不应有任何重复项。
编辑:输出中的(column1,column3)组合不应重复,即对于column1的每个取值,所选行的column3值都应互不相同。
# Example data: two groups ("a" and "b") of 12 rows each.
column1 <- rep(c("a", "b"), times = c(12, 12))
# Twelve values (1, 2, 3 repeated four times each); data.frame() recycles
# them across both groups.
column2 <- rep(c(1, 2, 3), times = c(4, 4, 4))
# Labels "x1".."x9"; several labels repeat within a group on purpose.
column3 <- paste0(
  "x",
  c(1, 2, 3, 4, 5, 3, 6, 7, 8, 1, 9, 5,
    6, 2, 3, 4, 7, 5, 6, 1, 4, 1, 6, 9)
)
df <- data.frame(column1 = column1, column2 = column2, column3 = column3)
下面是一个满足上述要求的输出示例:
# One acceptable result: five rows per column1 group, every column2 value
# (1, 2, 3) present in each group, and no column3 label repeated within a
# group.
sample_output_1 <- data.frame(
  column1 = rep(c("a", "b"), times = c(5, 5)),
  column2 = rep(rep(c(1, 2, 3), times = c(2, 2, 1)), times = 2),
  column3 = paste0("x", c(1, 2, 5, 3, 8, 6, 2, 5, 1, 9))
)
答案 0 :(得分:2)
可以试试下面的函数:
# Randomly pick `n` rows of `a_df` such that
#   * no (column 1, column 3) pair occurs more than once in the result, and
#   * every distinct value found in column 2 of `a_df` occurs at least once.
#
# Arguments:
#   a_df      data frame with at least three columns; columns are addressed
#             by position (1 = group, 2 = value to cover, 3 = value that must
#             be unique together with column 1).
#   n         number of rows to return (default 5).
#   max_tries upper bound on the number of random draws before giving up.
#
# Returns the selected rows of `a_df` (original row names preserved).
# Stops with an error when the constraints obviously cannot be met, or when
# no valid draw was found within `max_tries` attempts.
#
# Implementation note: this is plain rejection sampling. A draw is accepted
# only when BOTH constraints hold, i.e. it is retried while EITHER one is
# violated.
foo <- function(a_df, n = 5L, max_tries = 1000L) {
  n_rows <- NROW(a_df)
  required <- unique(a_df[[2]])
  n_pairs <- sum(!duplicated(a_df[, c(1, 3), drop = FALSE]))

  # Cheap feasibility checks: we need at least `n` distinct pairs to choose
  # from, and at least one slot per required column-2 value.
  if (n > n_pairs) {
    stop(
      "cannot select ", n, " rows: only ", n_pairs,
      " distinct (column 1, column 3) pairs available",
      call. = FALSE
    )
  }
  if (n < length(required)) {
    stop(
      "cannot cover ", length(required),
      " distinct column 2 values with only ", n, " rows",
      call. = FALSE
    )
  }

  for (attempt in seq_len(max_tries)) {
    # sample.int() avoids the sample(<single number>, ...) surprise.
    my_inds <- sample.int(n_rows, n)
    picked <- a_df[my_inds, , drop = FALSE]

    has_dups <- any(duplicated(picked[, c(1, 3), drop = FALSE]))
    covers_all <- all(required %in% picked[[2]])

    if (!has_dups && covers_all) {
      return(picked)
    }
  }

  stop(
    "no valid selection found after ", max_tries, " attempts",
    call. = FALSE
  )
}
# Fix the RNG seed so the draw is reproducible, then apply foo() to each
# column1 group and stack the per-group results into one data frame.
set.seed(42)
do.call(rbind, lapply(split(df, df[["column1"]]), foo))
# Recorded output of one run (exact rows depend on the RNG stream and on the
# implementation of foo()).
# NOTE(review): in this recorded output rows a.3/a.6 both carry x3 and rows
# b.19/b.23 both carry x6, so it does not satisfy the requirement that
# column3 be unique within each column1 group.
# column1 column2 column3
# a.11 a 3 x9
# a.12 a 3 x5
# a.3 a 1 x3
# a.8 a 2 x7
# a.6 a 2 x3
# b.19 b 2 x6
# b.21 b 3 x4
# b.14 b 1 x2
# b.18 b 2 x5
# b.23 b 3 x6