Question

我有一个很大的 JSON 文件，其中包含许多调查及其结果数据。该文件的结构非常复杂，不过我还是用嵌套循环生成了所需的表。这种做法显然非常耗时，我相信用 purrr 或 dt（data.table）可以找到更好的方法。遗憾的是，我无法共享真实数据，因此附上一个可以用该循环处理的示例：

{
  "research": {
    "re.id": "id"
  },
  "surveys": [
    {
      "id": "10017",
      "participant": "31804190",
      "answers": [
        {
          "item": "685022",
          "results": [
            {
              "items": "o",
              "value": 2
            },
            {
              "items": "pb",
              "value": 3
            },
            {
              "items": "r",
              "value": 2
            },
            {
              "items": "s",
              "value": 0
            },
            {
              "items": "t",
              "value": 0
            },
            {
              "items": "w",
              "value": 0
            },
            {
              "items": "z",
              "value": 0
            },
            {
              "items": "f",
              "value": 2
            },
            {
              "items": "e",
              "value": 1
            },
            {
              "items": "l",
              "value": 0
            }
          ]
        },
        {
          "item": "90118",
          "results": [
            {
              "items": "o",
              "value": 0
            },
            {
              "items": "pb",
              "value": 3
            },
            {
              "items": "r",
              "value": 1
            },
            {
              "items": "s",
              "value": 0
            },
            {
              "items": "t",
              "value": 0
            },
            {
              "items": "w",
              "value": 0
            },
            {
              "items": "z",
              "value": 0
            },
            {
              "items": "f",
              "value": 1
            },
            {
              "items": "e",
              "value": 1
            },
            {
              "items": "l",
              "value": 0
            }
          ]
        },
        {
          "item": "30094",
          "results": [
            {
              "items": "o",
              "value": 0
            },
            {
              "items": "pb",
              "value": 2
            },
            {
              "items": "r",
              "value": 0
            },
            {
              "items": "s",
              "value": 1
            },
            {
              "items": "t",
              "value": 2
            },
            {
              "items": "w",
              "value": 1
            },
            {
              "items": "z",
              "value": 0
            },
            {
              "items": "f",
              "value": 0
            },
            {
              "items": "e",
              "value": -3
            },
            {
              "items": "l",
              "value": 3
}]}]}]}

请注意，357 个调查中的每一个都包含 13 个数据点，其中最后一个是由 25 个答案组成的列表，每个答案又包含 10 个结果。最外层循环遍历所有调查并跳过未完成的调查，第二层循环遍历每个调查的所有答案，第三层循环遍历每个答案的所有结果。此外，结果是以随机顺序给出的，因此需要对其排序，排序在嵌套循环的末尾进行。

以下是该JSON的示例：https://jsonblob.com/2bffde45-d8c0-11e9-9ec2-759b3e404be9

library(rjson)

# Flatten the nested survey JSON into one row per answer:
# survey id, participant id, item id, then that answer's result values.
survey.report <- fromJSON(file = "sample.json")
surveys <- survey.report[["surveys"]]

# Item id of every processed answer, in processing order. A survey without
# answers (unfinished) contributes a single 0 placeholder and no output row.
listofitems <- list()

# One character vector per answer. The rows are bound into a data frame once,
# after the loops, because rbind() inside the loop copies the whole frame on
# every iteration.
rows <- list()

# Iterating over the list elements directly is safe for empty lists, unlike
# 1:length(x), which yields c(1, 0) when x is empty.
for (survey.entry in surveys) {
  answers <- survey.entry[["answers"]]

  if (length(answers) == 0) {
    listofitems[[length(listofitems) + 1L]] <- 0
    next
  }

  survey <- survey.entry[["id"]]
  participant <- survey.entry[["participant"]]

  for (answer in answers) {
    item <- answer[["item"]]
    listofitems[[length(listofitems) + 1L]] <- item

    # Look fields up by name rather than by position so the extraction does
    # not depend on the key order inside each result object.
    results <- answer[["results"]]
    result.items <- vapply(
      results, function(x) as.character(x[["items"]]), character(1)
    )
    result.values <- vapply(
      results, function(x) as.character(x[["value"]]), character(1)
    )

    # Results arrive in arbitrary order. Reorder the values by item code and
    # keep the reordered vector (calling order() without assigning the result
    # leaves the data unsorted), so every row lists its values in the same
    # alphabetical item order.
    result.values <- result.values[order(result.items)]

    rows[[length(rows) + 1L]] <- c(survey, participant, item, result.values)
  }
}

if (length(rows) > 0) {
  values.df <- as.data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
  names(values.df)[1:2] <- c("survey", "participant")
} else {
  # No completed survey: keep the empty 13-column frame.
  values.df <- data.frame(matrix(ncol = 13, nrow = 0))
}

values.df

循环将生成包含13列的数据框：调查ID，参与者，项目和10个结果。该示例只有3行：

  survey participant     V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
1  10017    31804190 685022  2  3  2  0  0  0   0   2   1   0
2  10017    31804190  90118  0  3  1  0  0  0   0   1   1   0
3  10017    31804190  30094  0  2  0  1  2  1   0   0  -3   3

问题在于主json确实很大（10GB），因此循环将需要数周;）。

Answer 1

您可以利用 tidyr（属于 tidyverse）中相对较新的函数 unnest_wider 和 unnest_longer。关于如何从深层嵌套结构中提取信息的更多内容，请参阅关于 "rectangling" 的 vignette。

library(tidyverse)

# Read the file as plain nested lists; simplifyVector = FALSE asks jsonlite
# not to simplify the JSON arrays into vectors or data frames.
json <- jsonlite::fromJSON("sample.json", simplifyVector = FALSE)

# One row per survey, each survey kept as a list inside a list-column.
surveys <- tibble(survey = json[["surveys"]])

# Stage 1: spread the survey fields into columns, then one row per answer
# with the answer fields spread into columns.
per_answer <- surveys %>%
  unnest_wider(survey) %>%
  unnest_longer(answers) %>%
  unnest_wider(answers)

# Stage 2: one row per result, with `items` and `value` as columns.
per_result <- per_answer %>%
  unnest_longer(results) %>%
  unnest_wider(results)

# Stage 3: back to one row per answer, one column per item code.
per_result %>%
  pivot_wider(names_from = items, values_from = value)

# # A tibble: 3 x 13
#   id    participant item       o    pb     r     s     t     w     z     f     e     l
#   <chr> <chr>       <chr>  <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 10017 31804190    685022     2     3     2     0     0     0     0     2     1     0
# 2 10017 31804190    90118      0     3     1     0     0     0     0     1     1     0
# 3 10017 31804190    30094      0     2     0     1     2     1     0     0    -3     3

从R

1 个答案: