R - 尽可能快地转换1000万个JSON字符串的向量

时间:2016-08-25 16:06:52

标签: json r

下面附上一段简短的代码示例,用来展示我正在处理的数据:

library(jsonlite)
my_JSONS

 [1] "{\"8\":{\"type\":\"Team\",\"value\":298536},\"12\":{\"type\":null,\"value\":\"1\"}}"                                                                                                                                                                                                                                                            
 [2] "{\"1\":{\"type\":\"Player\",\"value\":2326300}}"                                                                                                                                                                                                                                                                                                
 [3] "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88181118425883,\"y\":0.42416450778345},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}"                                                                                        
 [4] "{\"1\":{\"type\":\"Player\",\"value\":1575886}}"                                                                                                                                                                                                                                                                                                
 [5] "{\"1\":{\"type\":\"Player\",\"value\":1575886}}"                                                                                                                                                                                                                                                                                                
 [6] "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}"                                                                                                                                                                                                                                                  
 [7] "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}"                                                                                                                                                                                                                                                  
 [8] "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88526475286559,\"y\":0.52056553227969},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}"                                             
 [9] "{\"8\":{\"type\":\"Team\",\"value\":116222}}"                                                                                                                                                                                                                                                                                                   
[10] "{\"1\":{\"type\":\"Player\",\"value\":1575876},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.31542593275055,\"y\":0.58226218795729},\"region\":4}},\"36\":{\"type\":null,\"value\":\"Unknown\"},\"38\":{\"type\":null,\"value\":\"Unknown\"}}"                                           
[11] "{\"1\":{\"type\":\"Player\",\"value\":1575886}}"                                                                                                                                                                                                                                                                                                
[12] "{\"1\":{\"type\":\"Player\",\"value\":1575886},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.088641594240118,\"y\":0.6998714378427},\"region\":3}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null},\"38\":{\"type\":null,\"value\":\"Unknown\"}}"
[13] "{\"8\":{\"type\":\"Team\",\"value\":298536}}"                                                                                                                                                                                                                                                                                                   
[14] "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88986951100793,\"y\":0.56683802403789},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}"                                             
[15] "{\"8\":{\"type\":\"Team\",\"value\":116222}}"                                                                                                                                                                                                                                                                                                   
[16] "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":1575884}}"                                                                                                                                                                                                                                                  
[17] "{\"8\":{\"type\":\"Team\",\"value\":116222}}"                                                                                                                                                                                                                                                                                                   
[18] "{\"1\":{\"type\":\"Player\",\"value\":1384076}}"                                                                                                                                                                                                                                                                                                
[19] "{\"1\":{\"type\":\"Player\",\"value\":1384076},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.10015348959598,\"y\":0.58611822893714},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}"                                                                                        
[20] "{\"1\":{\"type\":\"Player\",\"value\":2326299}}"     


> dput(my_JSONS)
c("{\"8\":{\"type\":\"Team\",\"value\":298536},\"12\":{\"type\":null,\"value\":\"1\"}}", 
"{\"1\":{\"type\":\"Player\",\"value\":2326300}}", "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88181118425883,\"y\":0.42416450778345},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}", 
"{\"1\":{\"type\":\"Player\",\"value\":1575886}}", "{\"1\":{\"type\":\"Player\",\"value\":1575886}}", 
"{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}", 
"{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}", 
"{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88526475286559,\"y\":0.52056553227969},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}", 
"{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":1575876},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.31542593275055,\"y\":0.58226218795729},\"region\":4}},\"36\":{\"type\":null,\"value\":\"Unknown\"},\"38\":{\"type\":null,\"value\":\"Unknown\"}}", 
"{\"1\":{\"type\":\"Player\",\"value\":1575886}}", "{\"1\":{\"type\":\"Player\",\"value\":1575886},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.088641594240118,\"y\":0.6998714378427},\"region\":3}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null},\"38\":{\"type\":null,\"value\":\"Unknown\"}}", 
"{\"8\":{\"type\":\"Team\",\"value\":298536}}", "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88986951100793,\"y\":0.56683802403789},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}", 
"{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":1575884}}", 
"{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":1384076}}", 
"{\"1\":{\"type\":\"Player\",\"value\":1384076},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.10015348959598,\"y\":0.58611822893714},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}", 
"{\"1\":{\"type\":\"Player\",\"value\":2326299}}")


为了尽可能快地在这个向量上使用fromJSON函数,我先把整个向量折叠成一个巨大的字符串(各个JSON之间用逗号分隔),然后在整个字符串的两端加上方括号[],使其成为一个JSON数组:

# Join the individual JSON objects with commas, then wrap the result in
# square brackets so the whole vector parses as a single JSON array.
my_JSONS_string <- paste(my_JSONS, collapse = ", ")
# Fixed: the opening "[" literal was missing its closing quote, which made
# this line a syntax error.
my_JSONS_string <- paste0("[", my_JSONS_string, "]")


然后最后我使用fromJSON

# Parse the bracketed string with one fromJSON() call, then display the result.
JSON_dataframe <- fromJSON(my_JSONS_string)
JSON_dataframe

 8.type 8.value 12.type 12.value 1.type 1.value 2.type 2.value 26.type 26.value.coordinates.x 26.value.coordinates.y 26.value.region 36.type 36.value 37.type 37.value 3.type 3.value 38.type 38.value
1    Team  298536      NA        1   <NA>      NA   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
2    <NA>      NA      NA     <NA> Player 2326300   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
3    <NA>      NA      NA     <NA> Player 2326295 Player      NA      NA             0.88181118              0.4241645               2      NA  Unknown      NA     <NA>   <NA>      NA      NA     <NA>
4    <NA>      NA      NA     <NA> Player 1575886   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
5    <NA>      NA      NA     <NA> Player 1575886   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
6    <NA>      NA      NA     <NA> Player 2326295 Player 1575886      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
7    <NA>      NA      NA     <NA> Player 2326295 Player 1575886      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
8    <NA>      NA      NA     <NA> Player 2326295 Player      NA      NA             0.88526475              0.5205655               2      NA     <NA>      NA  Unknown Player      NA      NA     <NA>
9    Team  116222      NA     <NA>   <NA>      NA   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
10   <NA>      NA      NA     <NA> Player 1575876 Player      NA      NA             0.31542593              0.5822622               4      NA  Unknown      NA     <NA>   <NA>      NA      NA  Unknown
11   <NA>      NA      NA     <NA> Player 1575886   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
12   <NA>      NA      NA     <NA> Player 1575886 Player      NA      NA             0.08864159              0.6998714               3      NA     <NA>      NA  Unknown Player      NA      NA  Unknown
13   Team  298536      NA     <NA>   <NA>      NA   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
14   <NA>      NA      NA     <NA> Player 2326297 Player      NA      NA             0.88986951              0.5668380               2      NA     <NA>      NA  Unknown Player      NA      NA     <NA>
15   Team  116222      NA     <NA>   <NA>      NA   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
16   <NA>      NA      NA     <NA> Player 2326297 Player 1575884      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
17   Team  116222      NA     <NA>   <NA>      NA   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
18   <NA>      NA      NA     <NA> Player 1384076   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>
19   <NA>      NA      NA     <NA> Player 1384076 Player      NA      NA             0.10015349              0.5861182               2      NA  Unknown      NA     <NA>   <NA>      NA      NA     <NA>
20   <NA>      NA      NA     <NA> Player 2326299   <NA>      NA      NA                     NA                     NA              NA      NA     <NA>      NA     <NA>   <NA>      NA      NA     <NA>


这样就得到了想要的结果。但是,对于包含1000万个条目的my_JSONS向量,运行fromJSON函数需要花费相当长的时间,大约15-20分钟,这对我正在编写的代码来说非常不方便。

感谢任何意见/建议,包括改用其他R包或其他数据结构是否能加快这一过程。

编辑:补充信息——我只需要JSON_dataframe中的第1、2和27列(如果这一点值得一提的话)。谢谢。

1 个答案:

答案 0 :(得分:1)

嗯,这个结果出乎我的意料:

library(jsonlite)
library(microbenchmark)
library(purrr)

my_jsons <- c("{\"8\":{\"type\":\"Team\",\"value\":298536},\"12\":{\"type\":null,\"value\":\"1\"}}", 
              "{\"1\":{\"type\":\"Player\",\"value\":2326300}}", "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88181118425883,\"y\":0.42416450778345},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}", 
              "{\"1\":{\"type\":\"Player\",\"value\":1575886}}", "{\"1\":{\"type\":\"Player\",\"value\":1575886}}", 
              "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}", 
              "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":1575886}}", 
              "{\"1\":{\"type\":\"Player\",\"value\":2326295},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88526475286559,\"y\":0.52056553227969},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}", 
              "{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":1575876},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.31542593275055,\"y\":0.58226218795729},\"region\":4}},\"36\":{\"type\":null,\"value\":\"Unknown\"},\"38\":{\"type\":null,\"value\":\"Unknown\"}}", 
              "{\"1\":{\"type\":\"Player\",\"value\":1575886}}", "{\"1\":{\"type\":\"Player\",\"value\":1575886},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.088641594240118,\"y\":0.6998714378427},\"region\":3}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null},\"38\":{\"type\":null,\"value\":\"Unknown\"}}", 
              "{\"8\":{\"type\":\"Team\",\"value\":298536}}", "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.88986951100793,\"y\":0.56683802403789},\"region\":2}},\"37\":{\"type\":null,\"value\":\"Unknown\"},\"3\":{\"type\":\"Player\",\"value\":null}}", 
              "{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":2326297},\"2\":{\"type\":\"Player\",\"value\":1575884}}", 
              "{\"8\":{\"type\":\"Team\",\"value\":116222}}", "{\"1\":{\"type\":\"Player\",\"value\":1384076}}", 
              "{\"1\":{\"type\":\"Player\",\"value\":1384076},\"2\":{\"type\":\"Player\",\"value\":null},\"26\":{\"type\":null,\"value\":{\"coordinates\":{\"x\":0.10015348959598,\"y\":0.58611822893714},\"region\":2}},\"36\":{\"type\":null,\"value\":\"Unknown\"}}", 
              "{\"1\":{\"type\":\"Player\",\"value\":2326299}}")

# Repeat the 20 sample strings 1000 times to get a 20,000-element test vector.
my_jsons <- rep(my_jsons, 1000)

length(my_jsons)
## [1] 20000

# Time three ways of turning the character vector into a data frame,
# ten runs each.
mb <- microbenchmark(
  # Stream the strings through a text connection. The connection is closed
  # explicitly: textConnection() returns an already-open connection, and
  # stream_in() only closes connections that it opened itself.
  stream_in = {
    con <- textConnection(my_jsons)
    stream_in(con, pagesize = 10000, verbose = FALSE)
    close(con)
  },
  # Parse each string separately and row-bind the flattened results.
  purrr = {
    map_df(my_jsons, ~ as.list(unlist(fromJSON(.))))
  },
  # Collapse everything into one JSON array string and parse it once.
  string = {
    fromJSON(sprintf("[%s]", paste0(my_jsons, collapse = ",")))
  },
  times = 10
)

mb
## Unit: milliseconds
##       expr       min         lq       mean     median         uq        max neval cld
##  stream_in 3730.7919  3919.0835  3995.3708  3986.5055  4014.3850  4298.6312    10  b 
##      purrr 9700.8605 10019.6934 10191.8872 10095.6281 10396.6715 10808.0865    10   c
##     string  635.0473   753.4842   814.1994   851.3218   870.9981   932.7041    10 a  

enter image description here

或许将您的文件切成n个部分,然后并行提取/转换它们?

更新

我基于C++11的仅头文件JSON库 https://github.com/nlohmann/json 拼凑了一个小的测试包,并验证了它生成的数据框结构与你正在使用的最快的字符串版本相同。我将JSON元素的数量增加到200,000,并用该包做了两组测试:一组使用dplyr::bind_rows(),另一组使用data.table::rbindlist():

## Unit: seconds
##              expr      min       lq     mean   median       uq      max neval cld
##      ndjson_dplyr 7.618801 7.618801 7.718406 7.718406 7.818010 7.818010     2   b
##  ndjson_datatable 2.547322 2.547322 2.852176 2.852176 3.157031 3.157031     2  a 
##            string 7.801338 7.801338 8.031613 8.031613 8.261888 8.261888     2   b

如果你有兴趣,我可以把它清理干净并把它放在github上。