有效检索嵌套列表的结构信息

时间:2014-11-30 01:43:02

标签: r list recursion data-structures nested-lists

实际问题

有没有什么方法可以在rapply()之上使用或构建,以便为通过以下函数获得的任意结构化列表导出相同类型的结构信息。

定义:

getRawStructure_2 <- function(input) {
  .getRawStructure <- function(x, level = 0) {
    level <- level + 1
    out <- lapply(seq(along = x), function(el) {
      name <- names(x[el])
      value <- x[[el]]
      cls <- class(value)
      .dim <- if (any(cls %in% c("data.frame", "matrix"))) {
        paste(dim(value), collapse = " ")
      } else {
        length(value)
      }  
      out <- data.frame(
        level = level,
        name = if (is.null(name) || name == "") NA else name,
        class = cls,
        dim = .dim,
        stringsAsFactors = FALSE
      )
      if (any(cls == "list")) {
        deep <- .getRawStructure(x = value, level = level)
        c(list(out), unlist(deep, recursive = FALSE))
      } else {
        list(out)
      }
    })  
  }
  tmp <- do.call("rbind", unlist(.getRawStructure(x = input), recursive = FALSE))
  subs <- tmp$level
  subs_2 <- lapply(1:subs[which.max(subs)], function(ii) {
    out <- subs == ii
    out[out] <- 1
    out
  })
  names(subs_2) <- 1:length(subs_2)
  data.frame(subs_2, tmp, stringsAsFactors = FALSE)
}

输出:

> getRawStructure_2(input)
   X1 X2 X3 level name       class dim
1   1  0  0     1   x1        list   2
2   0  1  0     2  x11        list   2
3   0  0  1     3 x111     numeric   1
4   0  0  1     3 x112     numeric   1
5   0  1  0     2  x12        list   2
6   0  0  1     3 x121     numeric   1
7   0  0  1     3 x122     numeric   1
8   1  0  0     1   x2        list   1
9   0  1  0     2  x21   character   1
10  1  0  0     1   x3        list   1
11  0  1  0     2 <NA>   character   1
12  1  0  0     1   x4   character   1
13  1  0  0     1   x5        list   1
14  0  1  0     2 <NA>     integer   3
15  1  0  0     1 <NA>        list   2
16  0  1  0     2 <NA>        list   2
17  0  0  1     3 <NA>     numeric   1
18  0  0  1     3 <NA>     numeric   1
19  0  1  0     2 <NA>        list   2
20  0  0  1     3 <NA>     numeric   1
21  0  0  1     3 <NA>     numeric   1
22  1  0  0     1 <NA>        list   1
23  0  1  0     2 <NA>     integer   3
24  1  0  0     1 <NA>   character   1
25  1  0  0     1 <NA>   character   1
26  1  0  0     1 <NA>   character   3
27  1  0  0     1 <NA>     numeric   3
28  1  0  0     1 <NA>     logical   1
29  1  0  0     1 <NA> environment   0
30  1  0  0     1 <NA>  data.frame 3 2

对于相当小的列表来说很好,但是对于较大的列表需要一些时间(参见下面的示例)。


背景

考虑以下列表:

input <- list(
  x1 = list(x11 = list(x111 = 1, x112 = 1), x12 = list(x121 = 1, x122 = 1)),
  x2 = list(x21 = "x21"),
  x3 = list("x31"),
  x4 = "x4",
  x5 = list(1:3),
  list(list(1, 2), list(3, 4)),
  list(1:3),
  "char 1",
  "char 2",
  letters[1:3],
  c(1,3,5),
  TRUE,
  new.env(),
  data.frame(x = 1:3, y = 1:3)
)

最后,我希望能够以二维形式表示列表,例如listr::getStructure()返回的列表:

> listr::getStructure(input)
   X1 X2 X3 type           index        oindex        path ppath level name       class dim
1   1 NA NA    1           [[1]]           [1]          x1     1     1   x1        list   2
2   1  1 NA    2      [[1]][[1]]      [[1]][1]      x1/x11   1/1     2  x11        list   2
3   1  1  1    2 [[1]][[1]][[1]] [[1]][[1]][1] x1/x11/x111 1/1/1     3 x111     numeric   1
4   1  1  2    2 [[1]][[1]][[2]] [[1]][[1]][2] x1/x11/x112 1/1/2     3 x112     numeric   1
5   1  2 NA    2      [[1]][[2]]      [[1]][2]      x1/x12   1/2     2  x12        list   2
6   1  2  1    2 [[1]][[2]][[1]] [[1]][[2]][1] x1/x12/x111 1/2/1     3 x121     numeric   1
7   1  2  2    2 [[1]][[2]][[2]] [[1]][[2]][2] x1/x12/x112 1/2/2     3 x122     numeric   1
8   2 NA NA    1           [[2]]           [2]          x2     2     1   x2        list   1
9   2  1 NA    2      [[2]][[1]]      [[2]][1]      x2/x21   2/1     2  x21   character   1
10  3 NA NA    1           [[3]]           [3]          x3     3     1   x3        list   1
11  3  1 NA    2      [[3]][[1]]      [[3]][1]        x3/1   3/1     2 <NA>   character   1
12  4 NA NA    3           [[4]]           [4]          x4     4     1   x4   character   1
13  5 NA NA    1           [[5]]           [5]          x5     5     1   x5        list   1
14  5  1 NA    2      [[5]][[1]]      [[5]][1]        x5/1   5/1     2 <NA>     integer   3
15  6 NA NA    1           [[6]]           [6]           6     6     1 <NA>        list   2
16  6  1 NA    2      [[6]][[1]]      [[6]][1]         6/1   6/1     2 <NA>        list   2
17  6  1  1    2 [[6]][[1]][[1]] [[6]][[1]][1]       6/1/1 6/1/1     3 <NA>     numeric   1
18  6  1  2    2 [[6]][[1]][[2]] [[6]][[1]][2]       6/1/2 6/1/2     3 <NA>     numeric   1
19  6  2 NA    2      [[6]][[2]]      [[6]][2]         6/2   6/2     2 <NA>        list   2
20  6  2  1    2 [[6]][[2]][[1]] [[6]][[2]][1]       6/2/1 6/2/1     3 <NA>     numeric   1
21  6  2  2    2 [[6]][[2]][[2]] [[6]][[2]][2]       6/2/2 6/2/2     3 <NA>     numeric   1
22  7 NA NA    1           [[7]]           [7]           7     7     1 <NA>        list   1
23  7  1 NA    2      [[7]][[1]]      [[7]][1]         7/1   7/1     2 <NA>     integer   3
24  8 NA NA    3           [[8]]           [8]           8     8     1 <NA>   character   1
25  9 NA NA    3           [[9]]           [9]           9     9     1 <NA>   character   1
26 10 NA NA    3          [[10]]          [10]          10    10     1 <NA>   character   3
27 11 NA NA    3          [[11]]          [11]          11    11     1 <NA>     numeric   3
28 12 NA NA    3          [[12]]          [12]          12    12     1 <NA>     logical   1
29 13 NA NA    3          [[13]]          [13]          13    13     1 <NA> environment   0
30 14 NA NA    3          [[14]]          [14]          14    14     1 <NA>  data.frame 3 2

为了做到这一点,我尝试了几种不同的方法。 rapply()的速度似乎非常有趣(参见下面的基准),但AFAICT我无法真正将其用于我的目的,因为f专门应用于叶子/底部值,因此我&#39 ; m&#34;失去任何&#34;中间&#34;关于分支机构或列表结构本身的信息,对吗?

如果有任何方法可以使用rapply() - 或者调整版本 - 不知何故,我真的很感激任何指针。

说明:

require("stringr")
getRawStructure_1 <- function(input) {
  struc <- capture.output(str(input, list.len = length(input)))
  struc <- unlist(strsplit(struc, split = "\n"))
  tops <- str_count(struc, "\\s\\$\\s")
  subs <- str_count(struc, "((\\.\\.)(\\s|\\$))")

  ## Clean //
  idx_out <- which(tops == 0 & subs == 0)
  if (length(idx_out)) {
    tops <- tops[-idx_out]
    subs <- subs[-idx_out]
    struc <- struc[-idx_out]
  }

  ## Types //
  types <- gsub(".*\\$(\\s*:|\\s*\\w+:)", "", struc)
  types <- gsub("(?<=\\w)\\s.*|\\:.*", "", types, perl = TRUE)
  types <- tolower(gsub("\\s|<|'", "", types))

  ## Names //
  idx_names <- str_detect(struc, "\\$\\s\\w+.*:\\s")
  nms <- if (length(idx_names)) {
    gsub("\\$\\s", "", str_extract(struc, "\\$\\s\\w+"))
  } else {
    NA
  }

  ## Levels //
  subs_2 <- lapply(0:subs[which.max(subs)], function(ii) {
    out <- subs == ii
    out[out] <- 1
    out
  })
  names(subs_2) <- 1:length(subs_2)
  data.frame(
    subs_2, 
    name = nms, 
    class = types, 
    str = struc, 
    stringsAsFactors = FALSE
  )  
}

getRawStructure_3 <- function(input) {
  .getRawStructure <- function(x) {
    list(level = length(x), class = class(x))
  }
  rapply(input, .getRawStructure, how = "list")
}

getRawStructure_1(input)
getRawStructure_2(input)
getRawStructure_3(input)

基准:

require("microbenchmark")
res <- microbenchmark(
  "0.1" = str(input, list.len = length(input)),
  "0.2" = capture.output(str(input, list.len = length(input))),
  "1" = getRawStructure_1(input),
  "2" = getRawStructure_2(input),
  "3" = getRawStructure_3(input),
  unit = "ms"
)

> res
Unit: milliseconds
 expr       min        lq       mean   median        uq       max neval
  0.1 22.526810 24.431485 25.3388115 25.04150 26.160263 29.466200   100
  0.2 15.678034 17.549247 17.9725577 17.79237 18.328352 22.353281   100
    1 17.977740 20.286033 21.1265184 20.59993 21.396797 35.075989   100
    2 17.647265 19.622121 20.2261508 20.03640 20.860219 25.066969   100
    3  0.076993  0.092095  0.1055568  0.09861  0.112527  0.251706   100

更大的名单:

scope <- 1000
input <- lapply(1:scope, function(ii) {
  list(name = paste(sample(letters, 4), collapse = ""), value = Sys.time())
})
head(input)

res <- microbenchmark(
  "0.1" = str(input, list.len = length(input)),
  "0.2" = capture.output(str(input, list.len = length(input))),
  "1" = getRawStructure_1(input),
  "2" = getRawStructure_2(input),
  "3" = getRawStructure_3(input),
  unit = "ms",
  times = 3
)

> res
Unit: milliseconds
 expr        min          lq        mean      median          uq         max neval
  0.1 2449.80358 2481.302677 2494.523763 2512.801773 2516.883854 2520.965936     3
  0.2 1591.30210 1662.759397 1696.973466 1734.216693 1749.809148 1765.401602     3
    1 1600.41742 1647.050567 1669.736494 1693.683716 1704.396033 1715.108349     3
    2 2051.56185 2052.655737 2077.485806 2053.749622 2090.447782 2127.145943     3
    3    7.98708    8.559489    8.765691    9.131898    9.154996    9.178094     3

0 个答案:

没有答案