如何将复杂的JSON数据转换为单个数据帧?

时间:2016-09-28 12:39:01

标签: json r

我不知道如何将我的JSON数据正确转换为有用的数据帧。这是显示我的数据结构的一些示例数据:

{
"data":[
{"track":[
{"time":"2015","midpoint":{"x":6,"y":8},"realworld":{"x":1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":6,"y":8},"realworld":{"x":1,"y":3},"coordinate":{"x":16,"y":37}},
{"time":"2016","midpoint":{"x":6,"y":9},"realworld":{"x":2,"y":3},"coordinate":{"x":16,"y":38}}
]},
{"track":[
{"time":"2015","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2016","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":3,"y":15},"realworld":{"x":-9,"y":2},"coordinate":{"x":17,"y":38}}
]},
{"track":[
{"time":"2015","midpoint":{"x":6,"y":7},"realworld":{"x":-2,"y":3},"coordinate":{"x":16,"y":39}}
]}]}

我有很多条轨迹(track),我希望数据集看起来像这样:

track   time   midpoint   realworld   coordinate 
1
1
1
2
2
2
2
3

到目前为止,我有这个:

# Name of the JSON file to load.
json_file <- "testdata.json"
# Hand the file name to fromJSON.
# NOTE(review): the package providing fromJSON is not shown here — confirm.
data <- fromJSON(json_file)
# Try to stack the parsed list into a single data frame.
# NOTE(review): list.stack is presumably from the rlist package — confirm.
data2 <- list.stack(data, fill=TRUE) 

现在得到的结果是这样的:

data output

如何以正确的格式获得此信息?

2 个答案:

答案 0 :(得分:4)

使用fromJSON读取时添加flatten = TRUE参数。这将得到一个嵌套列表,其最深一层包含三个数据帧。使用:

library(jsonlite)

# Parse the JSON text. flatten = TRUE flattens the nested objects so that
# their fields become columns such as midpoint.x and midpoint.y.
jsondata <- fromJSON(txt, flatten = TRUE)

# The nested 'track' element holds one data frame per track.
tracks <- jsondata$data$track

# Bind the per-track data frames together into a single data frame.
dat <- do.call(rbind, tracks)

# Add a track variable: each track index is repeated once per row of that
# track. seq_along() stays correct for an empty list (1:length(x) would give
# c(1, 0)), and vapply() guarantees one integer row count per track.
dat$track <- rep(seq_along(tracks), vapply(tracks, nrow, integer(1)))

给出:

> dat
  time midpoint.x midpoint.y realworld.x realworld.y coordinate.x coordinate.y track
1 2015          6          8           1           3           16           38     1
2 2015          6          8           1           3           16           37     1
3 2016          6          9           2           3           16           38     1
4 2015          5          9          -1           3           16           38     2
5 2015          5          9          -1           3           16           38     2
6 2016          5          9          -1           3           16           38     2
7 2015          3         15          -9           2           17           38     2
8 2015          6          7          -2           3           16           39     3

另一种更短的方法是将jsonlite与data.table包中的rbindlist结合使用:

library(jsonlite)
library(data.table)

# Parse the JSON text with the nested objects flattened.
jsondata <- fromJSON(txt, flatten = TRUE)

# Stack the data frames held in the nested 'track' list and, in the same
# call, add an id column named "track".
dat <- rbindlist(jsondata[["data"]][["track"]], idcol = "track")

或者以类似的方式使用dplyr包中的bind_rows:

library(dplyr)

# Stack the data frames held in the nested 'track' list, with an id column
# named "track".
# NOTE(review): the id column produced here is presumably character rather
# than integer — confirm against the dplyr documentation.
dat <- bind_rows(jsondata[["data"]][["track"]], .id = "track")

使用的数据:

# Sample JSON used by the examples: a top-level "data" array with three
# entries, each holding a "track" array of records (3, 4 and 1 records).
# Every record has a "time" string and "midpoint", "realworld" and
# "coordinate" objects with "x" and "y" fields.
txt <- '{
"data":[
{"track":[
{"time":"2015","midpoint":{"x":6,"y":8},"realworld":{"x":1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":6,"y":8},"realworld":{"x":1,"y":3},"coordinate":{"x":16,"y":37}},
{"time":"2016","midpoint":{"x":6,"y":9},"realworld":{"x":2,"y":3},"coordinate":{"x":16,"y":38}}
]},
{"track":[
{"time":"2015","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2016","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":3,"y":15},"realworld":{"x":-9,"y":2},"coordinate":{"x":17,"y":38}}
]},
{"track":[
{"time":"2015","midpoint":{"x":6,"y":7},"realworld":{"x":-2,"y":3},"coordinate":{"x":16,"y":39}}
]}]}'

答案 1 :(得分:2)

Sahil的答案(如果它尚未被删除)具有误导性,因为stream_in是针对ndjson的,而你的数据并不是ndjson。你只需要对嵌套列表稍作整理。我认为下面的代码还可以写得更简短,但这是一个快速、直接的权宜做法:

library(jsonlite)
library(purrr)
library(readr)

# Read the JSON as plain nested lists (no simplification to data frames).
dat <- fromJSON(txt, simplifyVector = FALSE)

map(dat$data, "track") %>%                 # pull the "track" list out of each "data" element
  map_df(function(track) {                 # build one data frame per track
    # Flatten each record (.x) on its own. Referring to `track` here would
    # flatten the whole track for every record, so every row of a track
    # would show the same values.
    map_df(track, ~ as.list(unlist(.x)))
  }, .id = "track") %>%                    # add in the track "id"
  type_convert()                           # convert mangled types
## # A tibble: 8 × 8
##   track  time midpoint.x midpoint.y realworld.x realworld.y coordinate.x coordinate.y
##   <int> <int>      <int>      <int>       <int>       <int>        <int>        <int>
## 1     1  2015          6          8           1           3           16           38
## 2     1  2015          6          8           1           3           16           37
## 3     1  2016          6          9           2           3           16           38
## 4     2  2015          5          9          -1           3           16           38
## 5     2  2015          5          9          -1           3           16           38
## 6     2  2016          5          9          -1           3           16           38
## 7     2  2015          3         15          -9           2           17           38
## 8     3  2015          6          7          -2           3           16           39

这也会给出合适的列类型,不过您可能希望使用readr::type_convert的col_types参数将time转换为字符向量。

或者:

library(jsonlite)
library(purrr)
library(tibble)

# Parse the JSON text with the nested objects flattened.
dat <- fromJSON(txt, flatten = TRUE)

# Convert each element of the nested 'track' list with as_tibble and
# row-bind the results, with an id column named "track".
dat[["data"]][["track"]] %>%
  map_df(as_tibble, .id = "track")