How to parse a data.frame into a tree?

时间:2015-11-30 22:55:59

标签: r parsing tree

Here's a simple taxonomy (labels and IDs):

# Flat taxonomy table: one row per leaf category. cat_id is the leaf's id and
# cat_h1..cat_h4 hold the labels on the path to it, padded with NA.
test_data <- data.frame(
  cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
  cat_h1 = rep(c("Animals", "Plants"), times = c(5, 3)),
  cat_h2 = c(
    rep(c("Mammals", "Birds"), times = c(3, 2)),
    "Wheat", "Grass", "Other"
  ),
  cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other", NA, NA, NA),
  cat_h4 = c("Big", "Little", rep(NA, times = 6))
)

The parsed structure should match the following:

# Target shape: one level of list nesting per label column; each leaf value is
# the cat_id of the row whose labels spell out the path to it.
list(
  Animals = list(Mammals = list(Dogs  = list(Big = 661, Little = 197), Other = 228),
                 Birds   = list(Hawks = 650, Other = 126)),
  Plants  = list(Wheat = 912, Grass = 949, Other = 428))

3 个答案:

答案 0 :(得分:6)

如果可以接受结果中的顺序略有不同，下面是一个按列处理的递归解决方案：

# Recursively turn a taxonomy data frame into a nested named list.
#   x: data frame with the id in column 1 and one label column per level after
#      it (the layout of test_data) -- assumed from how columns are indexed.
#   d: working frame; defaults to x with one all-NA column appended, so column
#      3 can always be read as "the next level down".
# Returns a list holding the sub-branches first (grouped by by(), so their
# order may differ from the input row order), then the leaves of this level.
f <- function(x, d=cbind(x,NA)) {
    c( 
       # call f by branch
       # (skipped, yielding NULL, once only id + one label + the NA column remain)
       if(ncol(d) > 3) local({
         # keep only rows that continue to a deeper level
         x <- d[!is.na(d[[3]]),] 
         # group by the current label (column 2) and recurse without that
         # column; x=NA occupies f's first parameter so the group subset that
         # by() passes positionally is bound to d
         by( x[-2], droplevels(x[2]), f, x=NA, simplify=FALSE) 
       }), 
       # leaf nodes
       # ids named by the current label, kept only where the next level is NA
       setNames(as.list(d[[1]]), d[[2]])[is.na(d[[3]])] 
    )
}

会给出这个:

> str(f(test_data))
List of 2
 $ Animals:List of 2
  ..$ Birds  :List of 2
  .. ..$ Hawks: num 650
  .. ..$ Other: num 126
  ..$ Mammals:List of 2
  .. ..$ Dogs :List of 2
  .. .. ..$ Big   : num 661
  .. .. ..$ Little: num 197
  .. ..$ Other: num 228
 $ Plants :List of 3
  ..$ Wheat: num 912
  ..$ Grass: num 949
  ..$ Other: num 428

答案 1 :(得分:3)

也许不是最有效的,但也不是太难:

创建数据:

# Flat taxonomy table: one row per leaf category. cat_id is the leaf's id and
# cat_h1..cat_h4 hold the labels on the path to it, padded with NA.
test_data <- data.frame(
  cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
  cat_h1 = rep(c("Animals", "Plants"), times = c(5, 3)),
  cat_h2 = c(
    rep(c("Mammals", "Birds"), times = c(3, 2)),
    "Wheat", "Grass", "Other"
  ),
  cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other", NA, NA, NA),
  cat_h4 = c("Big", "Little", rep(NA, times = 6))
)

循环遍历数据框并构建列表/树:

## Build the nested list `tax` from test_data, one row at a time.
## Each row contributes one leaf: its non-NA labels (every column except the
## first) form the path from the root, and its cat_id is stored at the end of
## that path. Intermediate nodes are created as empty lists on first use.
tax <- list()  ## root of the tree
for (i in seq_len(nrow(test_data))) {
    ## labels of this row as a named character vector; drop = FALSE keeps the
    ## one-row data frame even when there is only a single label column
    taxdat <- vapply(test_data[i, -1, drop = FALSE], as.character, character(1))
    ## keep only the levels that are actually present, wherever the NAs are
    taxdat <- taxdat[!is.na(taxdat)]
    ntax <- length(taxdat)
    if (ntax == 0L) {
        ## a row without any label has no position in the tree
        warning("row ", i, " has no taxonomy labels; skipped", call. = FALSE)
        next
    }
    taxstr <- character(0)  ## path from the root to the current node
    for (j in seq_len(ntax)) {
        taxstr <- c(taxstr, taxdat[j])  ## descend one level
        ## `[[` with a character vector indexes recursively along the path
        if (is.null(tax[[taxstr]])) {
            tax[[taxstr]] <- list()  ## create the node on first visit
        }
    }
    tax[[taxstr]] <- test_data$cat_id[i]  ## replace the tip with the id
}

将结果与期望值进行比较:

# Expected tree, written out by hand so it can be compared with the built one.
res <- list(
  Animals = list(
    Mammals = list(
      Dogs = list(Big = 661, Little = 197),
      Other = 228
    ),
    Birds = list(Hawks = 650, Other = 126)
  ),
  Plants = list(Wheat = 912, Grass = 949, Other = 428)
)

all.equal(res,tax)  ## yields TRUE when the built tree matches the expected one

答案 2 :(得分:1)

我会避免使用列表结构，而是对数据进行整理。下面是一种减少数据冗余的方法。

.text
# Reads two integers and prints (larger + 5) - (smaller * 2).
# Syscall numbers follow the SPIM/MARS convention: 1 = print integer,
# 4 = print string, 5 = read integer, 10 = exit.

# First input - prompt, read, and park the value in $s0
la  $a0, input
li  $v0, 4          # print the prompt at $a0
syscall

li  $v0, 5          # read integer into $v0
syscall
move    $s0, $v0    # keep the 1st number in $s0 while $v0/$a0 are reused

# Second input - prompt and read; ends with 1st in $t1, 2nd in $t2
la  $a0, input2
li  $v0, 4
syscall

li  $v0, 5
syscall
move    $t1, $s0    # $t1 = 1st number
move    $t2, $v0    # $t2 = 2nd number

# Compare the two inputs. Equal inputs take neither branch and fall through
# into Bigger; both formulas give the same result in that case.
bgt $t1, $t2, Bigger
blt $t1, $t2, Smaller

# 1st >= 2nd: result = ($t1 + 5) - ($t2 * 2)
Bigger:
    add $t4, $t1, 5 # $t4 = $t1 + 5
    mul $t5, $t2, 2 # $t5 = $t2 * 2
    sub     $t7, $t4, $t5   # $t7 = $t4 - $t5

    li  $v0, 1      # print the result
    move    $a0, $t7
    syscall

    j   Halt

# 1st < 2nd: result = ($t2 + 5) - ($t1 * 2)
Smaller:
    add $t4, $t2, 5 # $t4 = $t2 + 5
    mul $t5, $t1, 2 # $t5 = $t1 * 2
    sub     $t7, $t4, $t5   # $t7 = $t4 - $t5

    li  $v0, 1      # print the result
    move    $a0, $t7
    syscall
    # falls through into Halt

# Single exit point. Kept in .text: instructions placed after .data are
# assembled into the data segment and cannot be executed.
Halt:   li  $v0, 10
        syscall


.data

input:  .asciiz "Enter the First Integer: "

input2: .asciiz "Enter the Second Integer: "

原始数据很容易由这些表重新组合出来：

library(dplyr)

# Lookup table of distinct (cat_h1, cat_h2) parent/child label pairs;
# rows without a cat_h2 label are dropped.
h1_h2 <- test_data %>%
  select(cat_h1, cat_h2) %>%
  distinct() %>%
  filter(!is.na(cat_h2))

# Lookup table of distinct (cat_h2, cat_h3) parent/child label pairs;
# rows without a cat_h3 label are dropped.
h2_h3 <- test_data %>%
  select(cat_h2, cat_h3) %>%
  distinct() %>%
  filter(!is.na(cat_h3))

# Lookup table of distinct (cat_h3, cat_h4) parent/child label pairs;
# rows without a cat_h4 label are dropped.
h3_h4 <- test_data %>%
  select(cat_h3, cat_h4) %>%
  distinct() %>%
  filter(!is.na(cat_h4))

编辑:这是一种自动化整个过程的方法。

# Rebuild the wide hierarchy table by chaining the parent/child lookups.
# The join keys are spelled out so the result does not depend on which column
# names the tables happen to share, and dplyr has no key to guess or announce.
h1_h2 %>%
  left_join(h2_h3, by = "cat_h2") %>%
  left_join(h3_h4, by = "cat_h3")