Question

我正在尝试使用R将嵌套的JSON文件转换为二维数据帧。

我的JSON文件有一个嵌套结构。但是，不同级别的名称和属性是相同的。

{"name":"A", "value":"1", "c":
  [{"name":"a1", "value":"11", "c":
    [{"name":"a11", "value":"111"}, 
    {"name":"a12", "value":"112"}]
  }, 
  {"name":"a2", "value":"12"}]
}

所需的数据集看起来像这样。虽然确切的列名可能不同。

name    value   c_name  c_value c_c_name    c_c_value
A       1       a1      11      a11         111
A       1       a1      11      a12         112
A       1       a2      12

到目前为止我的代码使数据变得平坦，但它似乎只适用于第一级（请参阅输出的屏幕截图）。

library(jsonlite)

# Sample document: every level uses the same keys ("name", "value") and
# keeps its children in an array stored under "c".
json_file <- '    {"name":"A", "value":"1", "c":
      [{"name":"a1", "value":"11", "c":
[{"name":"a11", "value":"111"}, 
{"name":"a12", "value":"112"}]
}, 
{"name":"a2", "value":"12"}]
}'

# Parse the JSON text, asking jsonlite to flatten nested data frames.
data <- fromJSON(txt = json_file, flatten = TRUE)

# Inspect the parsed result in the interactive viewer.
View(data)

enter image description here

我尝试了多个软件包，包括jsonlite和RJSONIO，过去5个小时一直在调试这个问题并尝试各种在线教程，但都没有成功。谢谢你的帮助！

Answer 1

首先，这是一些丑陋的JSON;如果你有办法避免它，那就这样做。因此，接下来的内容也非常难看 - 我通常不会发布它，但我现在正在这样做，希望有些方法可能会有用。如果它冒犯了你的眼睛，请告诉我，我会删除它。

library(jsonlite)    # for fromJSON
library(reshape2)    # for melt
library(dplyr)       # for inner_join, select

# Flatten the two-level nested JSON held in `json_file` (defined in the
# question, not in this snippet) into one row per innermost node.
# NOTE(review): every step below relies on the exact column layout that
# fromJSON/as.data.frame/melt produce for this particular document; verify
# against the installed package versions before reusing it on other input.
jlist <- fromJSON(json_file)
# Presumably yields columns name, value, c.name, c.value, c.c (nested) — confirm.
jdf <- as.data.frame(jlist)
jdf$c.value <- as.numeric(jdf$c.value)          # JSON values are strings; convert to numeric
jdf$L1 <- as.integer(factor(jdf$c.name))        # integer key per c.name; meant to line up with the L1 column melt() adds below
ccdf <- melt(jdf$c.c)                           # get nested list into usable form
names(ccdf)[1:2] <- c('c.c.name', 'c.c.value')  # rename the first two columns so they don't clash with jdf's columns in the join
# jdf[, -5] drops column 5 by position (the nested column, per the original
# author). No `by` is given, so the join key is whatever columns both frames
# share — expected to be L1 only.
df3 <- inner_join(jdf[, -5], ccdf)              # join, take out nested column
df3$c.c.value <- as.numeric(df3$c.c.value)      # convert to numeric
# NOTE(review): `-c` targets a column named `c`; confirm such a column exists here.
df3 <- df3 %>% select(-L1, -c)                  # drop the helper key and leftover column

最终得到

> df3
  name value c.name c.value c.c.name c.c.value
1    A     1     a1      11      a11       111
2    A     1     a1      11      a12       112
3    A     1     a2      12     <NA>        NA

各列的类型也都合理。如果你愿意，也可以不使用这些软件包。

这是可扩展的吗？嗯，并不是——除非再写更多同样混乱的代码。如果其他人有一个不那么令人讨厌、更可扩展的方法来处理这种令人讨厌的JSON，请发布它；我会和OP一样感激。

Answer 2

我想我找到了一种方法来做到这一点，它似乎也适用于较大的树。思路是用 unlist() 展开 JSON，并利用展开后各元素的 names 属性。在此示例中，如果一个节点有一个父节点，其 names 属性将以 "c." 开头；如果它既有父节点又有"祖父节点"，则以 "c.c." 开头……依此类推。因此，下面的代码利用这一结构来确定嵌套层级，并将节点放入相应的列中。其余代码补上父节点的属性，并删除由此产生的多余行。我知道它不够优雅，但我认为它可能对其他人有用。

library(stringr)
library(jsonlite)

json_file <- '    {"name":"A", "value":"1", "c":
                  [{"name":"a1", "value":"11", "c":
                  [{"name":"a11", "value":"111"}, 
                  {"name":"a12", "value":"112"}]
                  }, 
                  {"name":"a2", "value":"12"}]
                  }'

# Flatten a tree-shaped JSON document, in which every node carries the same
# scalar attributes and keeps its children under the key "c", into a
# character matrix X with one row per leaf path and one group of
# nAttrPerNode columns per nesting level.
#
# The approach relies on unlist(): it walks the nested list depth-first and
# prefixes each element name with "c." once per nesting level, so the number
# of "c." occurrences in a name gives the depth of the node it belongs to.

# Parse without simplification so the result stays a plain nested list.
nestedjson <- fromJSON(json_file, simplifyVector = FALSE)
nAttrPerNode <- 2 # number of scalar attributes per node (here: name, value)
strChild <- "c."  # name prefix contributed by each level of nesting

unnestedjson <- unlist(nestedjson)    # depth-first vector of all attributes
unnestednames <- names(unnestedjson)  # e.g. "name", "c.value", "c.c.name"

# The row layout below only makes sense when the attributes come in complete
# groups of nAttrPerNode; fail loudly instead of silently misaligning rows.
if (length(unnestedjson) == 0 ||
    length(unnestedjson) %% nAttrPerNode != 0) {
  stop(
    "expected a non-empty tree whose nodes each carry exactly ",
    nAttrPerNode, " scalar attributes",
    call. = FALSE
  )
}

# fixed(): count the literal prefix. As a regular expression the "." would
# match any character, so a key such as "code" would inflate the depth.
childCount <- unname(str_count(unnestednames, fixed(strChild)))

maxDepth <- max(childCount) + 1                # deepest nesting level
depthTree <- maxDepth * nAttrPerNode           # number of columns of X
htTree <- length(unnestednames) / nAttrPerNode # number of nodes (rows)

# Depth of each node, read from the name of the node's last attribute.
nodeDepth <- childCount[seq_len(htTree) * nAttrPerNode] + 1

# Place every node's attributes in the column group of its own depth.
X <- array("", c(htTree, depthTree))
attrOffset <- seq_len(nAttrPerNode)
for (nodeht in seq_len(htTree)) {
  cols <- (nodeDepth[nodeht] - 1) * nAttrPerNode + attrOffset
  X[nodeht, cols] <- unnestedjson[(nodeht - 1) * nAttrPerNode + attrOffset]
}

# Repeat the ancestors' attributes on every descendant row. Rows are in
# depth-first order, so the row above already holds the complete ancestor
# chain for every level shallower than the current node.
for (nodeht in seq_len(htTree)[-1]) {
  for (level in seq_len(maxDepth - 1)) {
    cols <- (level - 1) * nAttrPerNode + attrOffset
    if (nzchar(X[nodeht, cols[1]])) {
      break # reached the node's own column group
    }
    X[nodeht, cols] <- X[nodeht - 1, cols]
  }
}

# Finally drop the rows of nodes that have children: their attributes are
# already repeated on the child rows. In depth-first order a node has
# children exactly when the next node is deeper. Comparing depths avoids the
# false positives of comparing pasted row strings (e.g. "a" + "1" is a string
# prefix of "a" + "11" although the two nodes are siblings).
deleteRows <- which(diff(nodeDepth) > 0)
deleteRows
# Select the rows to keep rather than negating deleteRows: a negative index
# of length zero would select no rows at all. drop = FALSE keeps X a matrix
# even when a single row remains.
X <- X[setdiff(seq_len(htTree), deleteRows), , drop = FALSE]

在R

2 个答案: