如何将目录中多个JSON文件的数据加载和处理到R中的数据框?

时间:2017-06-21 16:22:16

标签: json r

我们在目录中存储了几个JSON文件。这些JSON文件具有嵌套结构。我们编写了以下代码来读取每个JSON文件中的数据:

library("jsonlite")

# list.files() interprets `pattern` as a regular expression, not a shell glob,
# so match the extension explicitly. ignore.case = TRUE accepts both ".JSON"
# and ".json" file names.
temp <- list.files(pattern = "\\.json$", ignore.case = TRUE)

for (files in temp) {
  # `data` is overwritten on every iteration, so it only ever holds the parsed
  # content of the file currently being processed.
  data <- fromJSON(files, flatten = TRUE)
  ...
}

class(data)现在显示data是"list"。其结构可以描述如下:names(data)给出列名:"a"、"b"、"c"、"d"、"e"、"f"……等等。

列“a”具有嵌套,以便:names(data$a)给出:“nest1”“nest2”“nest3”......等。

我们希望编写逻辑来读取所有JSON文件:如果data$e == 1且data$a$nest1 == TRUE,则count_nest1 += 1。最后,我们希望统计所有nest1 == TRUE的实例数量,同样统计所有nest2 == TRUE的实例数量,依此类推……

实际数据文件1:

{"scans": {"Bkav": {"detected": false, "version": "1.3.0.8876", "result": null, "update": "20170613"}, "TotalDefense": {"detected": false, "version": "37.1.62.1", "result": null, "update": "20170613"}, "MicroWorld-eScan": {"detected": false, "version": "12.0.250.0", "result": null, "update": "20170613"}, "nProtect": {"detected": false, "version": "2017-06-13.02", "result": null, "update": "20170613"}, "CMC": {"detected": false, "version": "1.1.0.977", "result": null, "update": "20170613"}, "CAT-QuickHeal": {"detected": false, "version": "14.00", "result": null, "update": "20170613"}, "McAfee": {"detected": false, "version": "6.0.6.653", "result": null, "update": "20170613"}, "Malwarebytes": {"detected": false, "version": "2.1.1.1115", "result": null, "update": "20170613"}, "Zillya": {"detected": false, "version": "2.0.0.3311", "result": null, "update": "20170613"}, "SUPERAntiSpyware": {"detected": false, "version": "5.6.0.1032", "result": null, "update": "20170613"}, "TheHacker": {"detected": false, "version": "6.8.0.5.1623", "result": null, "update": "20170612"}, "K7GW": {"detected": false, "version": "10.15.23651", "result": null, "update": "20170613"}, "K7AntiVirus": {"detected": false, "version": "10.15.23640", "result": null, "update": "20170613"}, "Arcabit": {"detected": false, "version": "1.0.0.806", "result": null, "update": "20170613"}, "Baidu": {"detected": false, "version": "1.0.0.2", "result": null, "update": "20170613"}, "F-Prot": {"detected": false, "version": "4.7.1.166", "result": null, "update": "20170613"}, "Symantec": {"detected": false, "version": "1.3.1.0", "result": null, "update": "20170613"}, "ESET-NOD32": {"detected": false, "version": "15577", "result": null, "update": "20170613"}, "TrendMicro-HouseCall": {"detected": false, "version": "9.900.0.1004", "result": null, "update": "20170613"}, "Avast": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170613"}, "ClamAV": {"detected": false, "version": "0.99.2.0", 
"result": null, "update": "20170613"}, "Kaspersky": {"detected": false, "version": "15.0.1.13", "result": null, "update": "20170613"}, "BitDefender": {"detected": false, "version": "7.2", "result": null, "update": "20170613"}, "NANO-Antivirus": {"detected": false, "version": "1.0.76.17389", "result": null, "update": "20170613"}, "Paloalto": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "ViRobot": {"detected": false, "version": "2014.3.20.0", "result": null, "update": "20170613"}, "Tencent": {"detected": false, "version": "1.0.0.1", "result": null, "update": "20170613"}, "Ad-Aware": {"detected": false, "version": "3.0.3.1010", "result": null, "update": "20170613"}, "Emsisoft": {"detected": false, "version": "4.0.1.883", "result": null, "update": "20170613"}, "Comodo": {"detected": false, "version": "27271", "result": null, "update": "20170613"}, "F-Secure": {"detected": false, "version": "11.0.19100.45", "result": null, "update": "20170613"}, "DrWeb": {"detected": false, "version": "7.0.28.2020", "result": null, "update": "20170613"}, "VIPRE": {"detected": false, "version": "58800", "result": null, "update": "20170613"}, "Invincea": {"detected": false, "version": "6.3.0.25415", "result": null, "update": "20170607"}, "McAfee-GW-Edition": {"detected": false, "version": "v2015", "result": null, "update": "20170613"}, "Sophos": {"detected": false, "version": "4.98.0", "result": null, "update": "20170613"}, "Ikarus": {"detected": false, "version": "0.1.5.2", "result": null, "update": "20170613"}, "Cyren": {"detected": false, "version": "5.4.30.7", "result": null, "update": "20170613"}, "Jiangmin": {"detected": false, "version": "16.0.100", "result": null, "update": "20170613"}, "Webroot": {"detected": false, "version": "1.0.0.207", "result": null, "update": "20170613"}, "Avira": {"detected": false, "version": "8.3.3.4", "result": null, "update": "20170613"}, "Kingsoft": {"detected": false, "version": "2013.8.14.323", "result": null, 
"update": "20170613"}, "Endgame": {"detected": false, "version": "0.7.0", "result": null, "update": "20170612"}, "Microsoft": {"detected": false, "version": "1.1.13804.0", "result": null, "update": "20170613"}, "AegisLab": {"detected": false, "version": "4.2", "result": null, "update": "20170613"}, "ZoneAlarm": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "GData": {"detected": false, "version": "A:25.12848B:25.9761", "result": null, "update": "20170613"}, "AhnLab-V3": {"detected": false, "version": "3.9.1.17781", "result": null, "update": "20170613"}, "ALYac": {"detected": false, "version": "1.0.1.9", "result": null, "update": "20170613"}, "AVware": {"detected": false, "version": "1.5.0.42", "result": null, "update": "20170613"}, "VBA32": {"detected": false, "version": "3.12.26.4", "result": null, "update": "20170613"}, "Zoner": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "Rising": {"detected": false, "version": "28.0.0.1", "result": null, "update": "20170613"}, "Yandex": {"detected": false, "version": "5.5.1.3", "result": null, "update": "20170608"}, "SentinelOne": {"detected": false, "version": "1.0.0.12", "result": null, "update": "20170516"}, "Fortinet": {"detected": false, "version": "5.4.233.0", "result": null, "update": "20170613"}, "AVG": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170613"}, "Panda": {"detected": false, "version": "4.6.4.2", "result": null, "update": "20170613"}, "CrowdStrike": {"detected": false, "version": "1.0", "result": null, "update": "20170420"}, "Qihoo-360": {"detected": false, "version": "1.0.0.1120", "result": null, "update": "20170613"}}, "scan_id": "00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7-1497385194", "sha1": "c6a6e3977402e76379f48f09a052f0f3c50f5964", "resource": "00D9D7D8E563AE71DCECC808F35F7D0845FFD91A1731D3F69E6EA5204FD7A3D7", "response_code": 1, "scan_date": "2017-06-13 20:19:54", "permalink": 
"https://www.virustotal.com/file/00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7/analysis/1497385194/", "verbose_msg": "Scan finished, information embedded", "total": 60, "positives": 0, "sha256": "00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7", "md5": "8d95236c637c042ff7df7fd7cc502ddb"}

实际数据文件2:

{"scans": {"MicroWorld-eScan": {"detected": false, "version": "12.0.250.0", "result": null, "update": "20170610"}, "nProtect": {"detected": false, "version": "2017-06-10.02", "result": null, "update": "20170610"}, "CMC": {"detected": false, "version": "1.1.0.977", "result": null, "update": "20170610"}, "CAT-QuickHeal": {"detected": true, "version": "14.00", "result": "TrojDownloader.NSIS.Genome.V", "update": "20170610"}, "ALYac": {"detected": false, "version": "1.0.1.9", "result": null, "update": "20170610"}, "Malwarebytes": {"detected": true, "version": "2.1.1.1115", "result": "PUP.Optional.MyPCBackup", "update": "20170610"}, "Zillya": {"detected": false, "version": "2.0.0.3308", "result": null, "update": "20170610"}, "AegisLab": {"detected": false, "version": "4.2", "result": null, "update": "20170610"}, "TheHacker": {"detected": false, "version": "6.8.0.5.1596", "result": null, "update": "20170607"}, "K7GW": {"detected": false, "version": "10.14.23624", "result": null, "update": "20170610"}, "K7AntiVirus": {"detected": false, "version": "10.14.23624", "result": null, "update": "20170610"}, "Arcabit": {"detected": false, "version": "1.0.0.806", "result": null, "update": "20170610"}, "TrendMicro": {"detected": false, "version": "9.740.0.1012", "result": null, "update": "20170610"}, "Baidu": {"detected": true, "version": "1.0.0.2", "result": "Win32.Trojan.WisdomEyes.16070401.9500.9976", "update": "20170608"}, "F-Prot": {"detected": false, "version": "4.7.1.166", "result": null, "update": "20170610"}, "Symantec": {"detected": true, "version": "1.3.1.0", "result": "PUA.MyPCBackup", "update": "20170610"}, "TotalDefense": {"detected": false, "version": "37.1.62.1", "result": null, "update": "20170610"}, "TrendMicro-HouseCall": {"detected": false, "version": "9.900.0.1004", "result": null, "update": "20170610"}, "Paloalto": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "ClamAV": {"detected": false, "version": "0.99.2.0", "result": null, 
"update": "20170610"}, "Kaspersky": {"detected": false, "version": "15.0.1.13", "result": null, "update": "20170610"}, "BitDefender": {"detected": false, "version": "7.2", "result": null, "update": "20170610"}, "NANO-Antivirus": {"detected": true, "version": "1.0.76.17389", "result": "Riskware.Win32.Unwanted.dmgktv", "update": "20170610"}, "SUPERAntiSpyware": {"detected": false, "version": "5.6.0.1032", "result": null, "update": "20170610"}, "Avast": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170610"}, "Tencent": {"detected": false, "version": "1.0.0.1", "result": null, "update": "20170610"}, "Ad-Aware": {"detected": false, "version": "3.0.3.1010", "result": null, "update": "20170610"}, "Emsisoft": {"detected": false, "version": "4.0.1.883", "result": null, "update": "20170610"}, "Comodo": {"detected": false, "version": "27254", "result": null, "update": "20170610"}, "F-Secure": {"detected": false, "version": "11.0.19100.45", "result": null, "update": "20170610"}, "DrWeb": {"detected": true, "version": "7.0.28.2020", "result": "Program.Unwanted.567", "update": "20170610"}, "VIPRE": {"detected": false, "version": "58730", "result": null, "update": "20170610"}, "Invincea": {"detected": false, "version": "6.3.0.25415", "result": null, "update": "20170607"}, "McAfee-GW-Edition": {"detected": false, "version": "v2015", "result": null, "update": "20170610"}, "Sophos": {"detected": false, "version": "4.98.0", "result": null, "update": "20170610"}, "Ikarus": {"detected": false, "version": "0.1.5.2", "result": null, "update": "20170610"}, "Cyren": {"detected": false, "version": "5.4.30.7", "result": null, "update": "20170610"}, "Jiangmin": {"detected": false, "version": "16.0.100", "result": null, "update": "20170610"}, "Webroot": {"detected": false, "version": "1.0.0.207", "result": null, "update": "20170610"}, "Avira": {"detected": true, "version": "8.3.3.4", "result": "PUA/MyPCBackup.Gen", "update": "20170610"}, "Kingsoft": {"detected": 
false, "version": "2013.8.14.323", "result": null, "update": "20170610"}, "Endgame": {"detected": false, "version": "0.5.0", "result": null, "update": "20170515"}, "Microsoft": {"detected": false, "version": "1.1.13804.0", "result": null, "update": "20170610"}, "ViRobot": {"detected": false, "version": "2014.3.20.0", "result": null, "update": "20170610"}, "ZoneAlarm": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "GData": {"detected": true, "version": "A:25.12800B:25.9740", "result": "NSIS.Adware.MyPCBackup.E", "update": "20170610"}, "AhnLab-V3": {"detected": false, "version": "3.9.0.17697", "result": null, "update": "20170610"}, "McAfee": {"detected": false, "version": "6.0.6.653", "result": null, "update": "20170610"}, "AVware": {"detected": false, "version": "1.5.0.42", "result": null, "update": "20170610"}, "VBA32": {"detected": false, "version": "3.12.26.4", "result": null, "update": "20170609"}, "Zoner": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "ESET-NOD32": {"detected": true, "version": "15562", "result": "MSIL/MyPCBackup.D potentially unwanted", "update": "20170610"}, "Rising": {"detected": true, "version": "28.0.0.1", "result": "Malware.Undefined!8.C (cloud:I1YBt1VpobT) ", "update": "20170610"}, "Yandex": {"detected": true, "version": "5.5.1.3", "result": "Riskware.Agent!", "update": "20170608"}, "SentinelOne": {"detected": false, "version": "1.0.0.12", "result": null, "update": "20170516"}, "Fortinet": {"detected": false, "version": "5.4.233.0", "result": null, "update": "20170610"}, "AVG": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170610"}, "Panda": {"detected": false, "version": "4.6.4.2", "result": null, "update": "20170610"}, "CrowdStrike": {"detected": false, "version": "1.0", "result": null, "update": "20170420"}, "Qihoo-360": {"detected": false, "version": "1.0.0.1120", "result": null, "update": "20170610"}}, "scan_id": 
"00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873-1497129945", "sha1": "7b890323abfe8f3bd33be0bc439076b5525d03b0", "resource": "00D468FA26813736CD14FF91E84F5E31FE30EAEF6B35AF44CAFE540870EA7873", "response_code": 1, "scan_date": "2017-06-10 21:25:45", "permalink": "https://www.virustotal.com/file/00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873/analysis/1497129945/", "verbose_msg": "Scan finished, information embedded", "total": 60, "positives": 11, "sha256": "00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873", "md5": "45922155c9628e11441aa869c6287bb7"}

实际数据文件3:

{"response_code": 0, "resource": "0E28BEDFBA37CEE5BD639AC86AC08A422C8944C3749CD2C5D7F5A0C2B37115B3", "verbose_msg": "The requested resource is not among the finished, queued or pending scans"}

我们读取文件并检查响应代码。如果响应代码为"0",则count_not_detected += 1;否则读取JSON数据并统计每种防病毒软件检测到的样本数量,以便最后我们可以说:防病毒软件A检测到了500个文件中的323个,防病毒软件B检测到了500个文件中的224个,等等。

如果某些内容可以完全展平数据并将其全部存储在数据框中,那就太棒了。我们查看了tidyjson包,但没有成功。

1 个答案:

答案 0 :(得分:0)

虽然这些更改尚未发布到CRAN,但我认为tidyjson的开发版本可以很好地满足您的需求。您可以使用devtools::install_github('jeremystan/tidyjson')安装最新的稳定开发版本。

话虽如此,我不太确定您具体想要得到什么。如果您希望了解对象的大小/结构,可以使用json_structure()、json_lengths()和json_types()进行查看:

# Attach the packages while silencing the messages they emit on load.
suppressMessages({
  library(jsonlite)
  library(dplyr)
  library(tidyjson)
})

# Build one tbl_json object per sample document; each string below is passed
# to as.tbl_json() unchanged.
rawjson1 <- as.tbl_json("raw_json_1.json")
rawjson2 <- as.tbl_json("raw_json_2.json")
rawjson3 <- as.tbl_json("raw_json_3.json")

# Tabulate the structure of the first document (ids, nesting level, index, ...).
json_structure(rawjson1)
#> # A tbl_json: 313 x 9 tibble with a "JSON" attribute
#>          `attr(., "JSON")` document.id parent.id level index child.id
#>                      <chr>       <int>     <chr> <int> <int>    <chr>
#>  1 "{\"scans\":{\"Bkav..."           1      <NA>     0     1        1
#>  2 "{\"Bkav\":{\"detec..."           1         1     1     1      1.1
#>  3   "\"00d9d7d8e563ae..."           1         1     1     2      1.2
#>  4   "\"c6a6e3977402e7..."           1         1     1     3      1.3
#>  5   "\"00D9D7D8E563AE..."           1         1     1     4      1.4
#>  6                       1           1         1     1     5      1.5
#>  7   "\"2017-06-13 20:..."           1         1     1     6      1.6
#>  8   "\"https://www.vi..."           1         1     1     7      1.7
#>  9   "\"Scan finished,..."           1         1     1     8      1.8
#> 10                      60           1         1     1     9      1.9
#> # ... with 303 more rows, and 4 more variables: seq <list>, name <chr>,
#> #   type <fctr>, length <int>


# Gather the top-level keys into rows, then report the length of each value.
rawjson1 %>%
  gather_object() %>%
  json_lengths()
#> # A tbl_json: 12 x 3 tibble with a "JSON" attribute
#>          `attr(., "JSON")` document.id          name length
#>                      <chr>       <int>         <chr>  <int>
#>  1 "{\"Bkav\":{\"detec..."           1         scans     60
#>  2   "\"00d9d7d8e563ae..."           1       scan_id      1
#>  3   "\"c6a6e3977402e7..."           1          sha1      1
#>  4   "\"00D9D7D8E563AE..."           1      resource      1
#>  5                       1           1 response_code      1
#>  6   "\"2017-06-13 20:..."           1     scan_date      1
#>  7   "\"https://www.vi..."           1     permalink      1
#>  8   "\"Scan finished,..."           1   verbose_msg      1
#>  9                      60           1         total      1
#> 10                       0           1     positives      1
#> 11   "\"00d9d7d8e563ae..."           1        sha256      1
#> 12   "\"8d95236c637c04..."           1           md5      1

# Gather the top-level keys into rows, then report the JSON type of each value.
rawjson1 %>%
  gather_object() %>%
  json_types()
#> # A tbl_json: 12 x 3 tibble with a "JSON" attribute
#>          `attr(., "JSON")` document.id          name   type
#>                      <chr>       <int>         <chr> <fctr>
#>  1 "{\"Bkav\":{\"detec..."           1         scans object
#>  2   "\"00d9d7d8e563ae..."           1       scan_id string
#>  3   "\"c6a6e3977402e7..."           1          sha1 string
#>  4   "\"00D9D7D8E563AE..."           1      resource string
#>  5                       1           1 response_code number
#>  6   "\"2017-06-13 20:..."           1     scan_date string
#>  7   "\"https://www.vi..."           1     permalink string
#>  8   "\"Scan finished,..."           1   verbose_msg string
#>  9                      60           1         total number
#> 10                       0           1     positives number
#> 11   "\"00d9d7d8e563ae..."           1        sha256 string
#> 12   "\"8d95236c637c04..."           1           md5 string

话虽如此,如果您的最终目标是获得一个数据框以供进一步分析,您可以使用spread_all()得到一个非常宽的数据框,或者得到一个我认为更有用的数据集(一级键作为列,每次扫描占一行)。请注意,我在同时处理多个文件(每个文件都会获得唯一的document.id)。

# Documents to process together; each one receives its own document.id.
files <- c("raw_json_1.json", "raw_json_2.json")

j <- as.tbl_json(files)

# Spread the level-1 keys into columns, descend into "scans", gather its
# entries into rows (the key ends up in `name`), and finally spread the
# fields of each scan entry into columns as well.
clean <- j %>%
  spread_all(recursive = FALSE) %>%
  enter_object("scans") %>%
  gather_object() %>%
  spread_all(recursive = FALSE)

names(clean)
#>  [1] "document.id"   "scan_id"       "sha1"          "resource"     
#>  [5] "response_code" "scan_date"     "permalink"     "verbose_msg"  
#>  [9] "total"         "positives"     "sha256"        "md5"          
#> [13] "name"          "detected"      "version"       "result"       
#> [17] "update"

# Once parsing is finished, tbl_df() strips the JSON component. Then count,
# per document, the number of scan rows and how many have detected == TRUE.
clean %>%
  tbl_df() %>%
  group_by(document.id) %>%
  summarize(
    count = n(),
    detected_count = sum(detected)
  )
#> # A tibble: 2 x 3
#>   document.id count detected_count
#>         <int> <int>          <int>
#> 1           1    60              0
#> 2           2    60             11

# Keep only the scan rows where `detected` is TRUE and show the document,
# scanner name, scanner version and reported result for each.
clean %>%
  tbl_df() %>%
  filter(detected) %>%
  select(document.id, name, version, result)
#> # A tbl_json: 11 x 4 tibble with a "JSON" attribute
#>         `attr(., "JSON")` document.id           name             version
#>                     <chr>       <int>          <chr>               <chr>
#>  1 "{\"detected\":tru..."           2  CAT-QuickHeal               14.00
#>  2 "{\"detected\":tru..."           2   Malwarebytes          2.1.1.1115
#>  3 "{\"detected\":tru..."           2          Baidu             1.0.0.2
#>  4 "{\"detected\":tru..."           2       Symantec             1.3.1.0
#>  5 "{\"detected\":tru..."           2 NANO-Antivirus        1.0.76.17389
#>  6 "{\"detected\":tru..."           2          DrWeb         7.0.28.2020
#>  7 "{\"detected\":tru..."           2          Avira             8.3.3.4
#>  8 "{\"detected\":tru..."           2          GData A:25.12800B:25.9740
#>  9 "{\"detected\":tru..."           2     ESET-NOD32               15562
#> 10 "{\"detected\":tru..."           2         Rising            28.0.0.1
#> 11 "{\"detected\":tru..."           2         Yandex             5.5.1.3
#> # ... with 1 more variables: result <chr>