下面附上一段简短的示例代码,用来说明我正在处理的数据:
library(jsonlite)
my_JSONS
[1] "{\"8\":{\"type\":\"Team\",\"value\":298536},\"12\":{\"type\":null,\"value\":\"1\"}}"
[2] "{\"1\":{\"type\":\"Player\",\"value\":2326300}}"
[3] "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88181118425883,\"y\":0.42416450778345},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}"
[4] "{\"1\":{\"type\":\"Player\",\"value\":1575886}}"
[5] "{\"1\":{\"type\":\"Player\",\"value\":1575886}}"
[6] "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}"
[7] "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}"
[8] "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88526475286559,\"y\":0.52056553227969},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}"
[9] "{\"8\":{\"type\":\"Team\",\"value\":116222}}"
[10] "{\"1\":{\"type\":\"Player\",\"value\":1575876},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.31542593275055,\"y\":0.58226218795729},\"region\":4}},\"36\":{\"type\":null,\"value\":\"Unknown\"},\"38\":{\"type\":null,\"value\":\"Unknown\"}}"
[11] "{\"1\":{\"type\":\"Player\",\"value\":1575886}}"
[12] "{\"1\":{\"type\":\"Player\",\"value\":1575886},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.088641594240118,\"y\":0.6998714378427},\"region\":3}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null},\"38\":{\"type\":null,\"value\":\"Unknown\"}}"
[13] "{\"8\":{\"type\":\"Team\",\"value\":298536}}"
[14] "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88986951100793,\"y\":0.56683802403789},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}"
[15] "{\"8\":{\"type\":\"Team\",\"value\":116222}}"
[16] "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":1575884}}"
[17] "{\"8\":{\"type\":\"Team\",\"value\":116222}}"
[18] "{\"1\":{\"type\":\"Player\",\"value\":1384076}}"
[19] "{\"1\":{\"type\":\"Player\",\"value\":1384076},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.10015348959598,\"y\":0.58611822893714},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}"
[20] "{\"1\":{\"type\":\"Player\",\"value\":2326299}}"
> dput(my_JSONS)
c("{\"8\":{\"type\":\"Team\",\"value\":298536},\"12\":{\"type\":null,\"value\":\"1\"}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326300}}", "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88181118425883,\"y\":0.42416450778345},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}",
"{\"1\":{\"type\":\"Player\",\"value\":1575886}}", "{\"1\":{\"type\":\"Player\",\"value\":1575886}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88526475286559,\"y\":0.52056553227969},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}",
"{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":1575876},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.31542593275055,\"y\":0.58226218795729},\"region\":4}},\"36\":{\"type\":null,\"value\":\"Unknown\"},\"38\":{\"type\":null,\"value\":\"Unknown\"}}",
"{\"1\":{\"type\":\"Player\",\"value\":1575886}}", "{\"1\":{\"type\":\"Player\",\"value\":1575886},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.088641594240118,\"y\":0.6998714378427},\"region\":3}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null},\"38\":{\"type\":null,\"value\":\"Unknown\"}}",
"{\"8\":{\"type\":\"Team\",\"value\":298536}}", "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88986951100793,\"y\":0.56683802403789},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}",
"{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":1575884}}",
"{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":1384076}}",
"{\"1\":{\"type\":\"Player\",\"value\":1384076},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.10015348959598,\"y\":0.58611822893714},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326299}}")
。
为了尽可能快地对这个向量使用fromJSON函数,我先用逗号把各个JSON连接成一个巨大的字符串,然后在整个字符串的两端加上方括号[],使其成为一个JSON数组:
。
# Build one JSON array string from the vector of JSON objects so that
# fromJSON() can parse everything in a single call:
#   1. join the individual objects with commas,
#   2. wrap the joined text in square brackets.
# (The opening "[" literal was previously missing its closing quote, which
# made the line unparseable.)
my_JSONS_string <- paste(my_JSONS, collapse = ", ")
my_JSONS_string <- paste0("[", my_JSONS_string, "]")
。
最后,我对这个字符串调用fromJSON:
# Parse the combined JSON array string in one call, then auto-print the
# result (shown below as a data frame with one row per input JSON object).
JSON_dataframe <- fromJSON(my_JSONS_string)
JSON_dataframe
8.type 8.value 12.type 12.value 1.type 1.value 2.type 2.value 26.type 26.value.coordinates.x 26.value.coordinates.y 26.value.region 36.type 36.value 37.type 37.value 3.type 3.value 38.type 38.value
1 Team 298536 NA 1 <NA> NA <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
2 <NA> NA NA <NA> Player 2326300 <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
3 <NA> NA NA <NA> Player 2326295 Player NA NA 0.88181118 0.4241645 2 NA Unknown NA <NA> <NA> NA NA <NA>
4 <NA> NA NA <NA> Player 1575886 <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
5 <NA> NA NA <NA> Player 1575886 <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
6 <NA> NA NA <NA> Player 2326295 Player 1575886 NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
7 <NA> NA NA <NA> Player 2326295 Player 1575886 NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
8 <NA> NA NA <NA> Player 2326295 Player NA NA 0.88526475 0.5205655 2 NA <NA> NA Unknown Player NA NA <NA>
9 Team 116222 NA <NA> <NA> NA <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
10 <NA> NA NA <NA> Player 1575876 Player NA NA 0.31542593 0.5822622 4 NA Unknown NA <NA> <NA> NA NA Unknown
11 <NA> NA NA <NA> Player 1575886 <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
12 <NA> NA NA <NA> Player 1575886 Player NA NA 0.08864159 0.6998714 3 NA <NA> NA Unknown Player NA NA Unknown
13 Team 298536 NA <NA> <NA> NA <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
14 <NA> NA NA <NA> Player 2326297 Player NA NA 0.88986951 0.5668380 2 NA <NA> NA Unknown Player NA NA <NA>
15 Team 116222 NA <NA> <NA> NA <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
16 <NA> NA NA <NA> Player 2326297 Player 1575884 NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
17 Team 116222 NA <NA> <NA> NA <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
18 <NA> NA NA <NA> Player 1384076 <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
19 <NA> NA NA <NA> Player 1384076 Player NA NA 0.10015349 0.5861182 2 NA Unknown NA <NA> <NA> NA NA <NA>
20 <NA> NA NA <NA> Player 2326299 <NA> NA NA NA NA NA NA <NA> NA <NA> <NA> NA NA <NA>
。
这样就得到了想要的结果。但是,对于包含1000万个条目的my_JSONS向量,运行fromJSON函数要花相当长的时间,大约在15-20分钟左右,这对我正在编写的代码来说非常不方便。
欢迎对此提出任何意见/建议,包括换用其他的R包或其他的数据结构是否能加快这一过程。
编辑:附加信息 - 我只需要JSON_dataframe中的第1,2,和27列,如果值得一提的话。感谢
答案 0 :(得分:1)
嗯,这个结果出乎我的意料:
library(jsonlite)
library(microbenchmark)
library(purrr)
my_jsons <- c("{\"8\":{\"type\":\"Team\",\"value\":298536},\"12\":{\"type\":null,\"value\":\"1\"}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326300}}", "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88181118425883,\"y\":0.42416450778345},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}",
"{\"1\":{\"type\":\"Player\",\"value\":1575886}}", "{\"1\":{\"type\":\"Player\",\"value\":1575886}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88526475286559,\"y\":0.52056553227969},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}",
"{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":1575876},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.31542593275055,\"y\":0.58226218795729},\"region\":4}},\"36\":{\"type\":null,\"value\":\"Unknown\"},\"38\":{\"type\":null,\"value\":\"Unknown\"}}",
"{\"1\":{\"type\":\"Player\",\"value\":1575886}}", "{\"1\":{\"type\":\"Player\",\"value\":1575886},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.088641594240118,\"y\":0.6998714378427},\"region\":3}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null},\"38\":{\"type\":null,\"value\":\"Unknown\"}}",
"{\"8\":{\"type\":\"Team\",\"value\":298536}}", "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88986951100793,\"y\":0.56683802403789},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}",
"{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":1575884}}",
"{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":1384076}}",
"{\"1\":{\"type\":\"Player\",\"value\":1384076},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.10015348959598,\"y\":0.58611822893714},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}",
"{\"1\":{\"type\":\"Player\",\"value\":2326299}}")
# Repeat the 20 sample strings 1000 times to get a larger benchmark input.
my_jsons <- rep(my_jsons, times = 1000)
length(my_jsons)
## [1] 20000
# Time three ways of turning the JSON strings into a data frame, 10 runs each:
#   stream_in - pass the strings to stream_in() through a text connection
#   purrr     - parse each string separately, flatten it, and row-bind
#   string    - paste everything into one JSON array and parse it once
# NOTE(review): textConnection() is created inline and never explicitly
# closed here; confirm whether this leaves a connection open per run.
mb <- microbenchmark(
  stream_in = {
    stream_in(textConnection(my_jsons), pagesize = 10000, verbose = FALSE)
  },
  purrr = {
    map_df(my_jsons, ~ as.list(unlist(fromJSON(.))))
  },
  string = {
    fromJSON(sprintf("[%s]", paste0(my_jsons, collapse = ",")))
  },
  times = 10
)
mb
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## stream_in 3730.7919 3919.0835 3995.3708 3986.5055 4014.3850 4298.6312 10 b
## purrr 9700.8605 10019.6934 10191.8872 10095.6281 10396.6715 10808.0865 10 c
## string 635.0473 753.4842 814.1994 851.3218 870.9981 932.7041 10 a
或许可以把您的数据切分成n个部分,然后并行地提取/转换它们?
更新
我基于C++11的仅头文件JSON库 https://github.com/nlohmann/json 拼凑了一个小的测试包,并验证了它生成的数据框结构与您正在使用的(最快的)字符串版本相同。我把JSON元素的数量增加到200,000,并用该包做了两组测试:一组使用dplyr::bind_rows(),另一组使用data.table::rbindlist():
## Unit: seconds
## expr min lq mean median uq max neval cld
## ndjson_dplyr 7.618801 7.618801 7.718406 7.718406 7.818010 7.818010 2 b
## ndjson_datatable 2.547322 2.547322 2.852176 2.852176 3.157031 3.157031 2 a
## string 7.801338 7.801338 8.031613 8.031613 8.261888 8.261888 2 b
如果你有兴趣,我可以把它清理干净并把它放在github上。