Question

我正在进行测试覆盖率分析，想重新排列矩阵的列，使各列按每个测试带来的“额外”失败次数排序。

例如，我有一个由 TRUE 和 FALSE 组成的矩阵，其中 TRUE 表示失败。

# Example failure matrix: 10 rows x 3 tests (columns); TRUE marks a failure.
df <- matrix(
  c(
    rep(c(TRUE, FALSE), times = c(6, 4)), # t1: TRUE in rows 1-6
    rep(c(TRUE, FALSE), times = c(8, 2)), # t2: TRUE in rows 1-8
    rep(c(FALSE, TRUE), times = c(7, 3))  # t3: TRUE in rows 8-10
  ),
  nrow = 10L,
  ncol = 3L,
  dimnames = list(NULL, c("t1", "t2", "t3"))
)

t2 的失败次数最多，应该排在第一列。t1 的失败次数次高，但它的所有失败（按行看）都已被 t2 覆盖。t3 的失败次数虽然较少，但其最后两次失败不在 t2 的覆盖范围内，因此 t3 应该排在第二列。

基于失败覆盖率的所需列顺序：

# Desired result: the same data with columns reordered to t2, t3, t1.
df <- matrix(
  c(
    rep(c(TRUE, FALSE), times = c(8, 2)), # t2: TRUE in rows 1-8
    rep(c(FALSE, TRUE), times = c(7, 3)), # t3: TRUE in rows 8-10
    rep(c(TRUE, FALSE), times = c(6, 4))  # t1: TRUE in rows 1-6
  ),
  nrow = 10L,
  ncol = 3L,
  dimnames = list(NULL, c("t2", "t3", "t1"))
)

结合使用 for 循环和 apply 函数，我可以得到每个测试的“额外”失败次数，但当数据集的行和列都很多时，性能确实很差。不过，我真正想要的是重新排列各列，以便进行进一步处理。

# Greedy pass over the columns of `out`: each iteration picks the column with
# the largest sum, records it, clears the rows it covers, then removes it.
# NOTE(review): `out`, `out.2`, `col.list` and `val.list` are created outside
# this snippet; their initial contents are not shown here.
for (n in 2:ncol(out)) {
  # Position of the column with the largest sum (NA ignored); the result
  # carries the column name when `out` has column names.
  idx <- which.max(apply(out, 2, sum, na.rm = T))
  col.list <- c(col.list, names(idx))
  # NOTE(review): `idx` is a position within the shrinking `out`; if `out.2`
  # still holds all original columns, the same position may refer to a
  # different column after the first removal below -- confirm.
  val.list <- c(val.list, sum(out.2[ ,idx], na.rm = T))
  # Rows where the chosen column is TRUE are set to FALSE in every column, so
  # later sums only count rows not yet covered.
  out[out[ ,idx] == T, ] <- F
  # Drop the chosen column by position.
  # NOTE(review): no `drop = FALSE` here, so `out` stops being a matrix once
  # only one column would remain.
  out <- out[ ,-idx]
}

有人可以建议一种更好的方法吗？也许不使用for循环？

谢谢。

Answer 1

这里与OP的方法有些相似，但我希望它的性能会稍好一些（不过未经测试）：

# Greedy column ordering for the logical failure matrix `df` (needs column
# names): start with the column that has the most TRUE values, then repeatedly
# append the column with the most TRUE values in rows where none of the
# already-selected columns is TRUE.
select_cols <- names(tail(sort(colSums(df)), 1)) # first col
for (i in seq_len(ncol(df) - 1)) {
  remaining_cols <- setdiff(colnames(df), select_cols)
  # Rows in which at least one already-selected column is TRUE.
  idx <- rowSums(df[, select_cols, drop = FALSE]) > 0
  # Count failures only in the rows that are still uncovered.
  new_counts <- colSums(df[!idx, remaining_cols, drop = FALSE])
  # tail(sort(...), 1) takes a column holding the maximum count.
  select_cols <- c(select_cols, names(tail(sort(new_counts), 1)))
}
# drop = FALSE keeps the result a matrix even when `df` has a single column.
df <- df[, select_cols, drop = FALSE]
df

#        t2    t3    t1
# [1,]  TRUE FALSE  TRUE
# [2,]  TRUE FALSE  TRUE
# [3,]  TRUE FALSE  TRUE
# [4,]  TRUE FALSE  TRUE
# [5,]  TRUE FALSE  TRUE
# [6,]  TRUE FALSE  TRUE
# [7,]  TRUE FALSE FALSE
# [8,]  TRUE  TRUE FALSE
# [9,] FALSE  TRUE FALSE
# [10,] FALSE  TRUE FALSE

更新：尝试这个经过稍微修改的版本-速度要快得多，我认为它将产生正确的结果：

  # Faster variant of the greedy column ordering for the logical matrix `m`
  # (needs column names). A running coverage mask `idx` is kept so that rows
  # already covered by a selected column are never re-examined.
  select_cols <- names(tail(sort(colSums(m)), 1)) # first col
  # idx[r] is TRUE once some selected column is TRUE in row r.
  idx <- rowSums(m[, select_cols, drop = FALSE]) > 0
  for (i in seq_len(ncol(m) - 1)) {
    remaining_cols <- setdiff(colnames(m), select_cols)
    # Re-check only the rows that are still uncovered; covered rows stay TRUE.
    idx[!idx] <- rowSums(m[!idx, select_cols, drop = FALSE]) > 0
    # Next column: the one with the most TRUE values among uncovered rows.
    select_cols <- c(
      select_cols,
      names(tail(sort(colSums(m[!idx, remaining_cols, drop = FALSE])), 1))
    )
  }
  # drop = FALSE keeps the result a matrix even when `m` has a single column.
  m <- m[, select_cols, drop = FALSE]
  m

两者之间的主要区别是这一行：

# Refresh the coverage mask only for rows where `idx` is still FALSE.
idx[!idx] <- rowSums(m[!idx, select_cols, drop=FALSE]) > 0

这意味着我们无需为先前选择的任何列均已为真的行计算rowSums。

Answer 2

这是我基于快捷方式的解决方案。

# Greedy column ordering on a data frame of logical failure flags: pick the
# column with the most TRUE values, drop the rows it covers, and repeat on the
# remaining rows. `df` is consumed (filtered) by the loop; `df_new` keeps the
# full data and is what gets reordered at the end.
df <- as.data.frame(df)
df_new <- df
index <- character(0)
for (i in seq_len(ncol(df_new))) {
  # Only columns that have not been picked yet are candidates. Without this,
  # an already-selected column (whose count is now 0) can win a tie at 0 and
  # be selected a second time.
  candidates <- setdiff(names(df), index)
  counts <- colSums(df[, candidates, drop = FALSE])
  var <- names(sort(counts, decreasing = TRUE))[1]
  index <- c(index, var)
  # Keep only rows not yet covered; drop = FALSE keeps `df` a data frame even
  # when it has a single column.
  df <- df[!df[[var]], , drop = FALSE]
}
df_new[, index, drop = FALSE]

如果仅计算新的故障，我们可以通过以下方式迭代循环：

1. 找出失败次数最多的变量（列）。
2. 删除该变量已经失败的行。
3. 在剩余的数据中再找出失败次数最多的变量。

第2步可以使循环更快，第1步和第3步基于apply。

希望有帮助！

Answer 3

这是处理长格式数据的替代方法。我使用data.table函数，但是如果需要，可以将其调整为base。我希望我能正确理解您的逻辑；）至少我会尝试在注释的代码中解释我的理解。

# Reorder the columns of `df` using a long-format data.table.
# Requires the data.table package (as.data.table, melt, uniqueN, setorder,
# setcolorder and `:=` are data.table functions/operators).

# convert matrix to data.table
dt <- as.data.table(df)

# add row index, 'ri'
dt[ , ri := 1:.N]

# melt to long format: one row per (ri, ci) pair, with the cell in 'value'
d <- melt(dt, id.vars = "ri", variable.factor = FALSE, variable.name = "ci")

# determine first column
# for each 'ci' (columns in 'df'), count number of TRUE
# select 'ci' with max count
first_col <- d[ , sum(value), by = ci][which.max(V1), ci]

# among the TRUE cells only, for each 'ri' (rows in 'df'),
# check if number of unique 'ci' is one (i.e. "additional" test failures)
d[(value), new := uniqueN(ci) == 1, by = ri]

# select rows where 'new' is TRUE
# for each 'ci', count the number of rows, i.e the number of 'new'
# -> number of rows in 'df' where this column is the only TRUE
d_new <- d[(new), .(n_new = .N), ci]

# set order to descending 'n_new'
setorder(d_new, -n_new)

# combine first column and columns which contribute with additional TRUE
cols <- c(first_col, setdiff(d_new[ , ci], first_col)) 

# set column order.
# First 'cols', then any columns which haven't contributed with new values
# (none in the test data, but needed for more general cases)
setcolorder(dt, c(cols, setdiff(names(dt), cols)))

dt
#        t2    t3    t1 ri
#  1:  TRUE FALSE  TRUE  1
#  2:  TRUE FALSE  TRUE  2
#  3:  TRUE FALSE  TRUE  3
#  4:  TRUE FALSE  TRUE  4
#  5:  TRUE FALSE  TRUE  5
#  6:  TRUE FALSE  TRUE  6
#  7:  TRUE FALSE FALSE  7
#  8:  TRUE  TRUE FALSE  8
#  9: FALSE  TRUE FALSE  9
# 10: FALSE  TRUE FALSE 10

在评论中提到的那种大小的矩阵上进行了尝试：

# Reproducible benchmark input: a 14000 x 1400 logical matrix of randomly
# sampled TRUE/FALSE values, with columns named t1 ... t1400.
set.seed(1)
nr <- 14000
nc <- 1400
df <- matrix(
  sample(c(TRUE, FALSE), nr * nc, replace = TRUE),
  nrow = nr,
  ncol = nc,
  dimnames = list(NULL, paste0("t", seq_len(nc)))
)

在<5秒内完成。

根据先前列的覆盖范围重新排列列

3 个答案: