我想消除用于在函数内部m
中创建数据的嵌套循环。 missing_entries()
函数正在尝试识别并返回col1
中的组col2
丢失的值。有没有其他方法可以解决这个问题,以提高效率?
missing_entries <- function( data, x, y )
{
# find missing entries in x for the group in y
# by comparing other groups in y
require( 'data.table' )
# require class of data to be data.table
stopifnot( "data.table" %in% class(data) )
# the outer loop with i refers to each unique value of column y
# the inner loop with j refers to all unique values of column y
# except the value in i under current iteration
uniq_col2 <- unique( data[, get(y) ] )
m <- lapply( uniq_col2, function(i){
lapply( setdiff( uniq_col2, i ), function( j ) {
z <- setdiff( data[ get(y) == i, get(x)], data[ get(y) == j, get(x)])
if( length(z) > 0 ){
return( data.frame( v1 = z, v2 = j,
stringsAsFactors = FALSE ) )
} else{
return()
}
} )
})
# row bind
m <- lapply( m, function(k) {
if(!is.null(k)) {
rbindlist(l = k)
}})
# collect only not null data
m <- rbindlist( l = m[lapply(m, nrow) > 0] )
colnames( m ) <- c( x, y )
return( m )
}
# testing
test_data <- structure(list(cardnty = c("many", "many", "many", "many", "many", "many",
"many", "many", "many", "many", "many", "many",
"many", "many", "many", "many", "many", "many",
"many", "many", "many", "many", "many", "many"),
col1 = c(2L, 4L, 3L, 13L, 5L, 6L, 7L, 17L, 9L, 4L, 3L,
2L, 8L, 5L, 6L, 7L, 14L, 17L, 19L, 13L,
9L, 12L, 11L, 20L),
N = c(599L, 43L, 111L, 12L, 11L, 5L, 4L,
8L, 2L, 72L, 230L, 617L, 13L, 58L, 19L, 9L,
5L, 3L, 2L, 1L, 11L, 1L, 1L, 1L),
col2 = c("cat", "cat", "cat", "cat", "cat",
"cat", "cat", "cat", "cat", "dog",
"dog", "dog", "dog",
"dog", "dog", "dog",
"dog", "dog", "dog",
"dog", "dog", "dog",
"dog", "dog" )),
.Names = c("cardnty", "col1", "N", "col2"),
row.names = c(NA, -24L),
class = "data.frame")
require('data.table')
setDT(test_data)
results <- missing_entries(data = test_data, x = "col1", y = "col2")
setDF(results)
test_results <- structure(list(col1 = c(8L, 14L, 19L, 12L, 11L, 20L),
col2 = c("cat", "cat", "cat", "cat", "cat", "cat")),
.Names = c("col1", "col2"),
row.names = c(NA, -6L),
class = "data.frame")
identical( results, test_results)
# TRUE
答案 0 :(得分:1)
这里有一个建议:使用dcast将数据帧转换为宽,将0填入缺失值,然后转换回long并拉出零。
library(reshape2)
df <- dcast(test_data,col1 ~ col2 ,fill=0,value.var="col2",fun.aggregate=length)
df2 <- melt(df,id.vars="col1")
results <- df2[which(df2$value==0),c("col1","variable")]
时间比较表明这会更快一些。
start_time <- Sys.time()
for (x in c(1:10000)){
results <- missing_entries(data = test_data, x = "col1", y = "col2")
setDF(results)
}
end_time <- Sys.time()
timeA <- end_time-start_time
# Time difference of 1.725317 mins
start_time <- Sys.time()
for (x in c(1:10000)){
df <- dcast(test_data,col1 ~ col2 ,fill=0,value.var="col2",fun.aggregate=length)
df2 <- melt(df,id.vars="col1")
results <- df2[which(df2$value==0),c("col1","variable")]
}
end_time <- Sys.time()
timeB <- end_time-start_time
# Time difference of 1.368845 mins
答案 1 :(得分:1)
据我所知,OP正在寻找col1
中col2
和test_data
的缺失组合。
我们可以使用col1
&#39; s col2
(交叉加入)或{获得data.table
和CJ()
的所有独特组合从基础R {1}}然后我们可以使用反连接找到缺少的元素,它会删除已存在的组合。
expand.grid()
library(data.table) setDT(test_data)[, CJ(col1 = col1, col2 = col2, unique = TRUE)][ !test_data, on = .(col1, col2)]