R:将JSON向量转换为数据框(data.frame)

时间:2016-09-06 08:19:31

标签: json r jsonlite

我有一个由JSON字符串组成的向量(各元素结构相同),想把它转换为data.frame。以下示例的结果正是我想要的。

# Attach the packages used below. library() stops with an error when a package
# is missing, whereas require() only returns FALSE and lets the script fail
# later with a less obvious message.
library(jsonlite)   # fromJSON()
library(magrittr)   # for the pipeline only
library(data.table) # rbindlist()

# Three JSON documents sharing one structure; "list" is a nested object.
jsons <- c('{"num":1,"char":"a","list":{"x":1,"y":2}}',
           '{"num":2,"char":"b","list":{"x":1,"y":2}}',
           '{"num":3,"char":"c","list":{"x":1,"y":2}}')

# Parse each document, turn each parsed list into a data frame, then stack the
# per-document frames. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
df <- jsons %>%
  lapply(fromJSON) %>%
  lapply(as.data.frame.list, stringsAsFactors = FALSE) %>%
  rbindlist(fill = TRUE)

JSON中的一些元素是对象,也就是说,用fromJSON()转换之后,得到的列表中有些元素本身也是列表。由于各变量的类型不同,我不能对每个列表使用unlist(),所以改用as.data.frame.list()函数。然而,对每个JSON逐一执行这一步实在太慢了。有没有更高效的做法?

# A larger sample document (a JSON Schema describing a "Product set") used for
# the timing below.
json <- '{"$schema":"http://json-schema.org/draft-04/schema#","title":"Product set","type":"array","items":{"title":"Product","type":"object","properties":{"id":{"description":"The unique identifier for a product","type":"number"},"name":{"type":"string"},"price":{"type":"number","minimum":0,"exclusiveMinimum":true},"tags":{"type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]},"warehouseLocation":{"description":"Coordinates of the warehouse with the product","$ref":"http://json-schema.org/geo"}},"required":["id","name","price"]}}'
# Time the per-document approach on 1000 copies of the sample document.
# TRUE/FALSE are spelled out because T and F can be reassigned.
system.time(
  df <- json %>% rep(1000) %>%
    lapply(fromJSON) %>%
    lapply(as.data.frame.list, stringsAsFactors = FALSE) %>%
    rbindlist(fill = TRUE)
) # 2.72 (timing reported by the author; presumably elapsed seconds)

我知道有很多类似的问题,但我看到的大部分答案都只是讲如何使用as.data.frame()或data.frame(),没有人提到速度。也许并没有更好的解决方案。

2 个答案:

答案 0 :(得分:2)

我终于找到了答案。它很快就会发布到CRAN上:

# Install tidyjson from the GitHub repository jeremystan/tidyjson.
devtools::install_github("jeremystan/tidyjson")
# The function this answer refers to. NOTE(review): it is shown here without
# arguments; presumably it is meant to be applied to the JSON strings — confirm
# against the tidyjson documentation.
tidyjson::spread_all()

这个函数比我上面的示例快10倍。

答案 1 :(得分:0)

尝试把所有JSON合并成一个字符串(即一个JSON数组),这样只需解析一次。下面用一个示例演示该方案:

require(jsonlite)
require(data.table)

# Sample document (a JSON Schema describing a "Product set") that is repeated
# to form the benchmark input.
json <- '{"$schema":"http://json-schema.org/draft-04/schema#","title":"Product set","type":"array","items":{"title":"Product","type":"object","properties":{"id":{"description":"The unique identifier for a product","type":"number"},"name":{"type":"string"},"price":{"type":"number","minimum":0,"exclusiveMinimum":true},"tags":{"type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]},"warehouseLocation":{"description":"Coordinates of the warehouse with the product","$ref":"http://json-schema.org/geo"}},"required":["id","name","price"]}}'
# Number of copies of the sample document in the benchmark input.
n <- 1000
# Built from n so the benchmark size is controlled in one place.
ex <- rep(json, n)

# Baseline approach: handle every JSON string separately.
#
# x: character vector of JSON documents.
# Returns: the result of rbindlist() over the per-document data frames, with
#   missing columns filled.
f1 <- function(x) {
    # Parse one document and convert the parsed list to a data frame.
    per_doc <- lapply(x, function(doc) {
        as.data.frame.list(fromJSON(doc), stringsAsFactors = FALSE)
    })
    rbindlist(per_doc, fill = TRUE)
}
# Faster approach: join all documents into one JSON array and parse once.
#
# x: character vector of JSON documents (assumed to be JSON objects sharing
#    one structure, as in the example input).
# Returns: a flat data frame built from the parsed array; an empty data frame
#   when x has length zero.
f2 <- function(x) {
    # With no documents the parsed "[]" is not a data frame, so none of the
    # column handling below applies; answer the empty case directly.
    if (length(x) == 0L) {
        return(data.frame())
    }

    # Wrap the documents in a single JSON array so the parser runs once.
    json_array <- paste0("[", paste(x, collapse = ","), "]")
    res <- jsonlite::fromJSON(json_array, flatten = TRUE)

    # vapply() always yields a logical vector, whereas sapply() changes its
    # return type with the input.
    is_list_col <- vapply(res, is.list, logical(1))

    # Replace each list column by a table with one column per position.
    # The calls are namespace-qualified because purrr also exports
    # transpose() and flatten() and would mask them if attached later.
    if (any(is_list_col)) {
        res[is_list_col] <- lapply(res[is_list_col], function(col) {
            data.table::as.data.table(data.table::transpose(col))
        })
    }

    # Lift the nested tables into ordinary top-level columns.
    jsonlite::flatten(res)
}

# Benchmark both implementations on the same input, with a minimum of 100
# iterations each. check = FALSE turns off bench's comparison of the two
# results.
bench::mark(
    f1(ex),
    f2(ex),
    min_iterations = 100,
    check = FALSE
)
#> # A tibble: 2 x 14
#>   expression     min    mean  median      max `itr/sec` mem_alloc  n_gc n_itr total_time result memory time
#>   <chr>      <bch:t> <bch:t> <bch:t> <bch:tm>     <dbl> <bch:byt> <dbl> <int>   <bch:tm> <list> <list> <lis>
#> 1 f1(ex)       2.27s   2.35s   2.32s    2.49s     0.425        0B  5397   100      3.92m <data… <Rpro… <bch…
#> 2 f2(ex)     48.85ms 63.78ms 57.88ms 116.19ms    15.7          0B   143   100      6.38s <data… <Rpro… <bch…
#> # … with 1 more variable: gc <list>