Question

我有一个节点对data.table，其中父级在树上比子级高。

我需要从这些规则中提取所有单独的链，例如，如果我的规则格式为 parent>child：（a>b，b>c，b>e，c>d），则得到的链为（a>b>c>d，a>b>e）。

我已经用一些虚拟数据作为例子来说明我想做什么。如果有任何关于如何实现的建议，我将不胜感激。这感觉应该很简单，但我一直想不出该从哪里入手。谢谢：）

a>b>e

Answer 1

这是另一种可能的解决方案——不过也有些凌乱。

输出

# Example call. `input` is the Parent/Child edge table from the question
# (its definition is not shown in this file).
# The printed result has one row per node of every extracted chain:
# tree_nums identifies the chain, elems is the node name, and hierarchy is
# the generation assigned to that node (1 = a node that is never a Child).
output(input)
#    tree_nums elems hierarchy
#  1:         1     a         1
#  2:         1     b         2
#  3:         1     c         3
#  4:         1     d         4
#  5:         2     e         1
#  6:         2     b         2
#  7:         2     c         3
#  8:         2     d         4
#  9:         3     a         1
# 10:         3     b         2
# 11:         3     f         3
# 12:         4     e         1
# 13:         4     b         2
# 14:         4     f         3   
#

函数

# Expand a table of Parent/Child node pairs into root-to-leaf chains.
#
# Arguments:
#   input  A data.table / data.frame holding exactly the two columns `Parent`
#          and `Child`, in that order (the columns are pasted together in
#          column order to recognise valid edges). Values are assumed to be
#          character strings -- TODO confirm for factor input.
#
# Returns:
#   A data.table with one row per node of every chain:
#     tree_nums  integer id of the chain,
#     elems      node name,
#     hierarchy  generation of the node (1 = node that is never a Child).
#
# Limitations:
#   * Chains are stored as the concatenation of their node names and are
#     split back with strsplit(x, ''), so every node name must be a single
#     character.
#   * The loop stops as soon as every node has been given a generation.
#
# Errors:
#   Stops when a generation has no children while some nodes are still
#   unassigned (e.g. the edges form a cycle without a root); the original
#   loop could not make progress in that situation.
output <- function(input) {
  # init
  helper <- do.call(paste0, input)        # "ParentChild" key for every edge
  elements <- unique(unlist(input))       # all distinct node names
  res <- integer(length(elements))        # generation per node, 0 = unset
  ind <- elements %in% input$Child
  # first generation: nodes that never appear as a Child
  parents <- elements[!ind]
  res[!ind] <- 1L
  # later generations
  val <- 1L
  # names(trees) carries the chains built so far, the values are chain ids
  trees <- setNames(as.list(seq_along(parents)), parents)
  while (any(res == 0L)) {
    val <- val + 1L
    children <- unique(input$Child[input$Parent %in% parents])
    if (length(children) == 0L) {
      stop(
        "no children found for generation ", val,
        "; `input` has no root node or contains nodes that cannot be ",
        "reached from a root (e.g. a cycle)",
        call. = FALSE
      )
    }
    res[elements %in% children] <- val

    # create the tree
    # candidate parent/child pairs, restricted to edges present in `input`
    nextHelper <- expand.grid(parents, children)
    nextHelper$conc <- do.call(paste0, nextHelper)
    nextHelper <- nextHelper[nextHelper$conc %in% helper,]

    # one row per chain, one column per node; extend each chain by joining
    # its last node against the parent column of the valid edges
    df_1 <- do.call(rbind, strsplit(names(trees),''))
    df_2 <- base::merge(df_1, nextHelper[,-3L], by.x = ncol(df_1), by.y = 'Var1', all.x = TRUE)
    n1 <- ncol(df_2)
    # merge() moves the join column to the front; put it back before the
    # newly appended child column
    if (n1 > 2L) df_2 <- df_2[,c(2:(n1-1),1L,n1)]
    df_2$Var2 <- as.character(df_2$Var2)
    # chains whose last node has no child stay as they are
    df_2$Var2[is.na(df_2$Var2)] <- ''


    treeNames <- do.call(paste0, df_2)
    trees <- setNames(as.list(seq_along(treeNames)), treeNames)
    parents <- children
  }

  # long format: one row per (chain, node)
  elems <- strsplit(names(trees),'')
  tree_nums <- rep(as.integer(trees), lengths(elems))
  elems <- unlist(elems)
  output <- data.table::data.table(tree_nums,elems)
  out <- data.table::data.table(elements, res)
  output$hierarchy <- out$res[match(output$elems, out$elements)]
  output
}

Answer 2

经过一番苦战之后，我得到了一个解决方案，但如果有更高效的方案，我更愿意采用。

library(data.table)
library(stringi)

# Builds every chain of nodes from `input`, which is assumed to be a
# data.table of edges with the columns Parent and Child (defined elsewhere).
# Chains are kept as strings "a>>b>>c" and grown one generation per loop
# iteration until no chain ends in a node that still has children.
# NOTE(review): the loop has no cycle detection -- presumably `input` is
# expected to be acyclic; confirm before using on arbitrary data.

# convert to string
# key on Parent so that input[J(nodes)] returns the edges leaving `nodes`
setkey(input, Parent)
sep <- ">>"
# Matches the last node of a chain: everything after the final separator.
# The character class excludes "(", ")" and the characters of `sep`, so node
# names must not contain any of them.
split_regex <- "(?<=%1$s)[^(%1$s)]*$"
# start with one chain per edge
trees <- sprintf("%s%s%s", input$Parent, sep, input$Child)
# get the base nodes, the children
children <- stri_extract_first_regex(trees, sprintf(split_regex, sep))
# find that which are parents
grid <- input[J(unique(children)), ][!is.na(Child), ]
update <- unique(grid$Parent)
N <- nrow(grid)

while (N > 0) {

  # add the children on for the ones at the base of the chains, might mean
  # making more tree splits
  all_trees <- unique(unlist(lapply(update, function(x) {
    pos <- children == x
    y <- grid[Parent %in% x, Child]
    # name the CJ() columns explicitly: newer data.table versions auto-name
    # them from the arguments, so V1/V2 cannot be relied upon
    grown <- CJ(V1 = trees[pos], V2 = y)[, sprintf("%s%s%s", V1, sep, V2)]
    c(trees[!pos], grown)
  })))
  # I have some trees embedded now, so remove these ones
  # (a chain is dropped when it occurs as a substring of another chain)
  trim <- vapply(seq_along(all_trees), function(i) {
    any(stri_detect_fixed(all_trees[-i], all_trees[i]))
  }, logical(1))
  trees <- all_trees[!trim]

  # update operations on expanded trees until no children remain with a dependency
  children <- stri_extract_first_regex(trees, sprintf(split_regex, sep))
  grid <- input[J(unique(children)), ][!is.na(Child), ]
  update <- unique(grid$Parent)
  N <- nrow(grid)
}

# re-structure to appropriate format
# one row per (chain, node): Tree = chain id, List = node, Hierarchy = depth
output <- data.table(pattern = trees)
output[, Tree := seq_len(.N)]
# `sep` is a literal separator, so split on the fixed string
output[, split := stri_split_fixed(pattern, sep)]
output <- output[, .(List = split[[1]],
                     Hierarchy = seq_along(split[[1]])), by = Tree]
output[]

用节点对构建树

2 个答案: