Question

我有一个data.table，其中包含几列，在这些列中，一列旨在接收每行的组号（group_id）。首先，group_id列填充了行号。除group_id列外，该列上的值可能为“ NA”。我有一个庞大的数据集，所以我不能使用循环来做，因为它太慢了。

下面是我的data.table的一个小例子：

library(data.table)
group_id = c(1,2,3,4)
type1 = c(1,5,7,5)
type2 = c(3,3,4,NA)
type3 = c(6,7,NA,NA)
df <- data.table(group_id, type1, type2, type3)
df

   group_id type1 type2 type3
1:        1     1     3     6
2:        2     5     3     7
3:        3     7     4    NA
4:        4     5    NA    NA

我要实现的是根据每行与另一列的比较（如果相等）来更改group_id。较低的group_id值始终为1。对于前面的示例，预期结果将是：

   group_id type1 type2 type3
1:        1     1     3     6
2:        1     5     3     7
3:        3     7     4    NA
4:        1     5    NA    NA

group_id更改为1，因为第2行共享相同的type2值（type2 == 3）
第3行的group_id不变
group_id更改为1，因为第4行共享相同的type1值（type1 == 5）

这是我的第二个问题，因此，如果我犯了错误，请立即告诉我，谢谢。

Answer 1

这是使用igraph的一种可能方法。

对于每个^type*列，请先删除NA。然后，针对此^type*列中的每个唯一类型值，创建一个网络，每个顶点都与每个其他顶点相连（即完整的引文图）。

然后，合并所有这些子网以创建集群，其中同一集群中的group_id个共享一个或多个相同类型值。

下一步，在每个群集中找到最早的group_id。

最后，查找每个group_id所在的集群。

library(igraph)
cols <- paste0("type", 1:3)
lg <- list()

#for each type column
for (x in cols) {
    lg <- c(lg, DT[!is.na(get(x)), #remove NAs
        {
            #create graph and label vertices
            gix <- unique(group_id)
            cg <- make_full_citation_graph(length(gix), FALSE)
            V(cg)$name <- as.character(gix)
            .(.(cg))
        }, 
        by=x]$V1)
}

#union all subgraphs
ug <- do.call(union, c(lg, list(byname=TRUE)))
#plot(ug)

#find the earliest group_id for each cluster
clu <- clusters(ug)$membership
split(clu, clu) <- lapply(split(clu, clu), function(x) min(names(x)))

#lookup to update the original dataset
DT[, new_gid := clu[as.character(group_id)]]
DT

输出：

   group_id type1 type2 type3 new_gid
1:        1     1     3     6       1
2:        2     5     3     7       1
3:        3     7     4    NA       3
4:        4     5    NA    NA       1

数据：

library(data.table)
group_id = c(1,2,3,4)
type1 = c(1,5,7,5)
type2 = c(3,3,4,NA)
type3 = c(6,7,NA,NA)
DT <- data.table(group_id, type1, type2, type3)

编辑：使用igraph可能会过大。这个Rcpp版本应该更快

library(Rcpp)
cppFunction("
IntegerVector gclu(IntegerVector id, IntegerVector typ1, IntegerVector typ2, IntegerVector typ3) {
    int i, j, sz = id.size();

    for (i=0; i<sz; i++) {
        for (j=0; j<=i; j++) {
            if ((!IntegerVector::is_na(typ1[i]) && !IntegerVector::is_na(typ1[j]) && typ1[i]==typ1[j]) ||
                (!IntegerVector::is_na(typ2[i]) && !IntegerVector::is_na(typ2[j]) && typ2[i]==typ2[j]) ||
                (!IntegerVector::is_na(typ3[i]) && !IntegerVector::is_na(typ3[j]) && typ3[i]==typ3[j])) {

                id[i] = id[j];
                break;
            }
        }
    }

    return(id);
}
")
DT[, gclu(group_id, type1, type2, type3)]

Answer 2

这是我实现对行进行分组的方法，这可能很笨拙，我是R编码的新手：

group <- function(table) {
  # Index in table
  group.id <- 1

  # Length of table ie number of row
  length.table <- length(table[[1]])

  # Loop on the table O(n) = n(n-1)/2 with n the number of row
  for (i in 1:(length.table - 1)) {
    for (j in (i + 1):length.table) {
      # Go to the next comparison if the two row are already grouped
      if (table[[group.id]][i] == table[[group.id]][j]) {
        next
      } else {
        for (k in (group.id + 1):length.table) {
          # If the two value are equal (and not NA)
          if (!is.na(table[[k]][i]) &
              !is.na(table[[k]][j]) &
              table[[k]][i] == table[[k]][j]) {
            # Then group them with the lesser value of group.id
            if (table[[group.id]][i] < table[[group.id]][j]) {
              table[[group.id]][j] <- table[[group.id]][i]
            } else {
              table[[group.id]][i] <- table[[group.id]][j]
            }
          }
        }
      }
    }
    # If all the row are grouped then return the result
    if (uniqueN(table[[group.id]]) == 1) {
      return(table[[group.id]])
    } 
  }
  return(table[[group.id]])
}

dt[, group.id := group(c(list(group.id, type1, type2, type3)))]

print(dt)

输出：

   group.id type1 type2 type3
1:        1     1     3     6
2:        1     5     3     7
3:        3     7     4    NA
4:        1     5    NA    NA

数据：

library(data.table)
group.id <- c(1, 2, 3, 4)
type1 <- c(1, 5, 7, 5)
type2 <- c(3, 3, 4, NA)
type3 <- c(6, 7, NA, NA)
dt <- data.table(group.id, type1, type2, type3)

在data.table中，比较行，并基于此比较在每行中添加组号，而R中不循环

2 个答案: