我们在目录中存储了几个JSON文件。这些JSON文件具有嵌套结构。我们编写了以下代码来读取每个JSON文件中的数据:
# Read every JSON file in the working directory with jsonlite.
library("jsonlite")
# `pattern` is a regular expression, not a shell glob. In "*.JSON" the `.`
# matches any character and nothing anchors the match to the end of the file
# name, so names such as "xJSON.bak" would be picked up as well. Match the
# literal ".JSON" extension instead.
temp <- list.files(pattern = "\\.JSON$")
for (files in temp) {
  # Parse one file; flatten = TRUE asks jsonlite to flatten nested data frames.
  # NOTE: `data` is overwritten on every iteration, so after the loop it only
  # holds the content of the last file that was read.
  data <- fromJSON(files, flatten = TRUE)
  # ... (per-file processing elided in the original post)
}
class(data)
结果显示data的类型是"list"
。这些数据的结构可以描述如下:names(data)
给出列名:“a”“b”“c”“d”“e”“f”......等等。
列“a”是嵌套的,因此:names(data$a)
给出:“nest1”“nest2”“nest3”......等。
我们希望编写逻辑来读取所有JSON文件,然后if data$e == 1 and data$a$nest1 == TRUE
,然后count_nest1 += 1
。最后,我们希望对所有nest1 == TRUE的实例进行计数,并计算所有nest2 == TRUE,等等......
实际数据文件1:
{"scans": {"Bkav": {"detected": false, "version": "1.3.0.8876", "result": null, "update": "20170613"}, "TotalDefense": {"detected": false, "version": "37.1.62.1", "result": null, "update": "20170613"}, "MicroWorld-eScan": {"detected": false, "version": "12.0.250.0", "result": null, "update": "20170613"}, "nProtect": {"detected": false, "version": "2017-06-13.02", "result": null, "update": "20170613"}, "CMC": {"detected": false, "version": "1.1.0.977", "result": null, "update": "20170613"}, "CAT-QuickHeal": {"detected": false, "version": "14.00", "result": null, "update": "20170613"}, "McAfee": {"detected": false, "version": "6.0.6.653", "result": null, "update": "20170613"}, "Malwarebytes": {"detected": false, "version": "2.1.1.1115", "result": null, "update": "20170613"}, "Zillya": {"detected": false, "version": "2.0.0.3311", "result": null, "update": "20170613"}, "SUPERAntiSpyware": {"detected": false, "version": "5.6.0.1032", "result": null, "update": "20170613"}, "TheHacker": {"detected": false, "version": "6.8.0.5.1623", "result": null, "update": "20170612"}, "K7GW": {"detected": false, "version": "10.15.23651", "result": null, "update": "20170613"}, "K7AntiVirus": {"detected": false, "version": "10.15.23640", "result": null, "update": "20170613"}, "Arcabit": {"detected": false, "version": "1.0.0.806", "result": null, "update": "20170613"}, "Baidu": {"detected": false, "version": "1.0.0.2", "result": null, "update": "20170613"}, "F-Prot": {"detected": false, "version": "4.7.1.166", "result": null, "update": "20170613"}, "Symantec": {"detected": false, "version": "1.3.1.0", "result": null, "update": "20170613"}, "ESET-NOD32": {"detected": false, "version": "15577", "result": null, "update": "20170613"}, "TrendMicro-HouseCall": {"detected": false, "version": "9.900.0.1004", "result": null, "update": "20170613"}, "Avast": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170613"}, "ClamAV": {"detected": false, "version": "0.99.2.0", 
"result": null, "update": "20170613"}, "Kaspersky": {"detected": false, "version": "15.0.1.13", "result": null, "update": "20170613"}, "BitDefender": {"detected": false, "version": "7.2", "result": null, "update": "20170613"}, "NANO-Antivirus": {"detected": false, "version": "1.0.76.17389", "result": null, "update": "20170613"}, "Paloalto": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "ViRobot": {"detected": false, "version": "2014.3.20.0", "result": null, "update": "20170613"}, "Tencent": {"detected": false, "version": "1.0.0.1", "result": null, "update": "20170613"}, "Ad-Aware": {"detected": false, "version": "3.0.3.1010", "result": null, "update": "20170613"}, "Emsisoft": {"detected": false, "version": "4.0.1.883", "result": null, "update": "20170613"}, "Comodo": {"detected": false, "version": "27271", "result": null, "update": "20170613"}, "F-Secure": {"detected": false, "version": "11.0.19100.45", "result": null, "update": "20170613"}, "DrWeb": {"detected": false, "version": "7.0.28.2020", "result": null, "update": "20170613"}, "VIPRE": {"detected": false, "version": "58800", "result": null, "update": "20170613"}, "Invincea": {"detected": false, "version": "6.3.0.25415", "result": null, "update": "20170607"}, "McAfee-GW-Edition": {"detected": false, "version": "v2015", "result": null, "update": "20170613"}, "Sophos": {"detected": false, "version": "4.98.0", "result": null, "update": "20170613"}, "Ikarus": {"detected": false, "version": "0.1.5.2", "result": null, "update": "20170613"}, "Cyren": {"detected": false, "version": "5.4.30.7", "result": null, "update": "20170613"}, "Jiangmin": {"detected": false, "version": "16.0.100", "result": null, "update": "20170613"}, "Webroot": {"detected": false, "version": "1.0.0.207", "result": null, "update": "20170613"}, "Avira": {"detected": false, "version": "8.3.3.4", "result": null, "update": "20170613"}, "Kingsoft": {"detected": false, "version": "2013.8.14.323", "result": null, 
"update": "20170613"}, "Endgame": {"detected": false, "version": "0.7.0", "result": null, "update": "20170612"}, "Microsoft": {"detected": false, "version": "1.1.13804.0", "result": null, "update": "20170613"}, "AegisLab": {"detected": false, "version": "4.2", "result": null, "update": "20170613"}, "ZoneAlarm": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "GData": {"detected": false, "version": "A:25.12848B:25.9761", "result": null, "update": "20170613"}, "AhnLab-V3": {"detected": false, "version": "3.9.1.17781", "result": null, "update": "20170613"}, "ALYac": {"detected": false, "version": "1.0.1.9", "result": null, "update": "20170613"}, "AVware": {"detected": false, "version": "1.5.0.42", "result": null, "update": "20170613"}, "VBA32": {"detected": false, "version": "3.12.26.4", "result": null, "update": "20170613"}, "Zoner": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "Rising": {"detected": false, "version": "28.0.0.1", "result": null, "update": "20170613"}, "Yandex": {"detected": false, "version": "5.5.1.3", "result": null, "update": "20170608"}, "SentinelOne": {"detected": false, "version": "1.0.0.12", "result": null, "update": "20170516"}, "Fortinet": {"detected": false, "version": "5.4.233.0", "result": null, "update": "20170613"}, "AVG": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170613"}, "Panda": {"detected": false, "version": "4.6.4.2", "result": null, "update": "20170613"}, "CrowdStrike": {"detected": false, "version": "1.0", "result": null, "update": "20170420"}, "Qihoo-360": {"detected": false, "version": "1.0.0.1120", "result": null, "update": "20170613"}}, "scan_id": "00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7-1497385194", "sha1": "c6a6e3977402e76379f48f09a052f0f3c50f5964", "resource": "00D9D7D8E563AE71DCECC808F35F7D0845FFD91A1731D3F69E6EA5204FD7A3D7", "response_code": 1, "scan_date": "2017-06-13 20:19:54", "permalink": 
"https://www.virustotal.com/file/00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7/analysis/1497385194/", "verbose_msg": "Scan finished, information embedded", "total": 60, "positives": 0, "sha256": "00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7", "md5": "8d95236c637c042ff7df7fd7cc502ddb"}
实际数据文件2:
{"scans": {"MicroWorld-eScan": {"detected": false, "version": "12.0.250.0", "result": null, "update": "20170610"}, "nProtect": {"detected": false, "version": "2017-06-10.02", "result": null, "update": "20170610"}, "CMC": {"detected": false, "version": "1.1.0.977", "result": null, "update": "20170610"}, "CAT-QuickHeal": {"detected": true, "version": "14.00", "result": "TrojDownloader.NSIS.Genome.V", "update": "20170610"}, "ALYac": {"detected": false, "version": "1.0.1.9", "result": null, "update": "20170610"}, "Malwarebytes": {"detected": true, "version": "2.1.1.1115", "result": "PUP.Optional.MyPCBackup", "update": "20170610"}, "Zillya": {"detected": false, "version": "2.0.0.3308", "result": null, "update": "20170610"}, "AegisLab": {"detected": false, "version": "4.2", "result": null, "update": "20170610"}, "TheHacker": {"detected": false, "version": "6.8.0.5.1596", "result": null, "update": "20170607"}, "K7GW": {"detected": false, "version": "10.14.23624", "result": null, "update": "20170610"}, "K7AntiVirus": {"detected": false, "version": "10.14.23624", "result": null, "update": "20170610"}, "Arcabit": {"detected": false, "version": "1.0.0.806", "result": null, "update": "20170610"}, "TrendMicro": {"detected": false, "version": "9.740.0.1012", "result": null, "update": "20170610"}, "Baidu": {"detected": true, "version": "1.0.0.2", "result": "Win32.Trojan.WisdomEyes.16070401.9500.9976", "update": "20170608"}, "F-Prot": {"detected": false, "version": "4.7.1.166", "result": null, "update": "20170610"}, "Symantec": {"detected": true, "version": "1.3.1.0", "result": "PUA.MyPCBackup", "update": "20170610"}, "TotalDefense": {"detected": false, "version": "37.1.62.1", "result": null, "update": "20170610"}, "TrendMicro-HouseCall": {"detected": false, "version": "9.900.0.1004", "result": null, "update": "20170610"}, "Paloalto": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "ClamAV": {"detected": false, "version": "0.99.2.0", "result": null, 
"update": "20170610"}, "Kaspersky": {"detected": false, "version": "15.0.1.13", "result": null, "update": "20170610"}, "BitDefender": {"detected": false, "version": "7.2", "result": null, "update": "20170610"}, "NANO-Antivirus": {"detected": true, "version": "1.0.76.17389", "result": "Riskware.Win32.Unwanted.dmgktv", "update": "20170610"}, "SUPERAntiSpyware": {"detected": false, "version": "5.6.0.1032", "result": null, "update": "20170610"}, "Avast": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170610"}, "Tencent": {"detected": false, "version": "1.0.0.1", "result": null, "update": "20170610"}, "Ad-Aware": {"detected": false, "version": "3.0.3.1010", "result": null, "update": "20170610"}, "Emsisoft": {"detected": false, "version": "4.0.1.883", "result": null, "update": "20170610"}, "Comodo": {"detected": false, "version": "27254", "result": null, "update": "20170610"}, "F-Secure": {"detected": false, "version": "11.0.19100.45", "result": null, "update": "20170610"}, "DrWeb": {"detected": true, "version": "7.0.28.2020", "result": "Program.Unwanted.567", "update": "20170610"}, "VIPRE": {"detected": false, "version": "58730", "result": null, "update": "20170610"}, "Invincea": {"detected": false, "version": "6.3.0.25415", "result": null, "update": "20170607"}, "McAfee-GW-Edition": {"detected": false, "version": "v2015", "result": null, "update": "20170610"}, "Sophos": {"detected": false, "version": "4.98.0", "result": null, "update": "20170610"}, "Ikarus": {"detected": false, "version": "0.1.5.2", "result": null, "update": "20170610"}, "Cyren": {"detected": false, "version": "5.4.30.7", "result": null, "update": "20170610"}, "Jiangmin": {"detected": false, "version": "16.0.100", "result": null, "update": "20170610"}, "Webroot": {"detected": false, "version": "1.0.0.207", "result": null, "update": "20170610"}, "Avira": {"detected": true, "version": "8.3.3.4", "result": "PUA/MyPCBackup.Gen", "update": "20170610"}, "Kingsoft": {"detected": 
false, "version": "2013.8.14.323", "result": null, "update": "20170610"}, "Endgame": {"detected": false, "version": "0.5.0", "result": null, "update": "20170515"}, "Microsoft": {"detected": false, "version": "1.1.13804.0", "result": null, "update": "20170610"}, "ViRobot": {"detected": false, "version": "2014.3.20.0", "result": null, "update": "20170610"}, "ZoneAlarm": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "GData": {"detected": true, "version": "A:25.12800B:25.9740", "result": "NSIS.Adware.MyPCBackup.E", "update": "20170610"}, "AhnLab-V3": {"detected": false, "version": "3.9.0.17697", "result": null, "update": "20170610"}, "McAfee": {"detected": false, "version": "6.0.6.653", "result": null, "update": "20170610"}, "AVware": {"detected": false, "version": "1.5.0.42", "result": null, "update": "20170610"}, "VBA32": {"detected": false, "version": "3.12.26.4", "result": null, "update": "20170609"}, "Zoner": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "ESET-NOD32": {"detected": true, "version": "15562", "result": "MSIL/MyPCBackup.D potentially unwanted", "update": "20170610"}, "Rising": {"detected": true, "version": "28.0.0.1", "result": "Malware.Undefined!8.C (cloud:I1YBt1VpobT) ", "update": "20170610"}, "Yandex": {"detected": true, "version": "5.5.1.3", "result": "Riskware.Agent!", "update": "20170608"}, "SentinelOne": {"detected": false, "version": "1.0.0.12", "result": null, "update": "20170516"}, "Fortinet": {"detected": false, "version": "5.4.233.0", "result": null, "update": "20170610"}, "AVG": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170610"}, "Panda": {"detected": false, "version": "4.6.4.2", "result": null, "update": "20170610"}, "CrowdStrike": {"detected": false, "version": "1.0", "result": null, "update": "20170420"}, "Qihoo-360": {"detected": false, "version": "1.0.0.1120", "result": null, "update": "20170610"}}, "scan_id": 
"00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873-1497129945", "sha1": "7b890323abfe8f3bd33be0bc439076b5525d03b0", "resource": "00D468FA26813736CD14FF91E84F5E31FE30EAEF6B35AF44CAFE540870EA7873", "response_code": 1, "scan_date": "2017-06-10 21:25:45", "permalink": "https://www.virustotal.com/file/00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873/analysis/1497129945/", "verbose_msg": "Scan finished, information embedded", "total": 60, "positives": 11, "sha256": "00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873", "md5": "45922155c9628e11441aa869c6287bb7"}
实际数据文件3:
{"response_code": 0, "resource": "0E28BEDFBA37CEE5BD639AC86AC08A422C8944C3749CD2C5D7F5A0C2B37115B3", "verbose_msg": "The requested resource is not among the finished, queued or pending scans"}
我们读取文件并检查响应代码。如果响应代码为“0”,则count_not_detected += 1
读取JSON数据并统计每种防病毒引擎检测到的样本数量,以便最后我们可以说:防病毒A检测到了500个文件中的323个,防病毒B检测到了500个文件中的224个,等等。
如果有某种方法可以完全展平数据并将其全部存储在数据框中,那就太棒了。我们尝试过tidyjson
包,但没有成功。
答案 0 :(得分:0)
虽然这些更改尚未发布到CRAN,但我认为tidyjson
的开发版本可以很好地满足您的需求。您可以使用devtools::install_github('jeremystan/tidyjson')
安装最新的稳定开发版本。
话虽如此,我不太确定您具体想要实现什么。如果您希望了解对象的大小/结构,可以使用json_structure()
,json_lengths()
或json_types()
进行调查:
suppressMessages({
library(jsonlite)
library(dplyr)
library(tidyjson)
})
# Convert each sample file into its own tbl_json object.
rawjson1 <- as.tbl_json("raw_json_1.json")
rawjson2 <- as.tbl_json("raw_json_2.json")
rawjson3 <- as.tbl_json("raw_json_3.json")
# Inspect the structure of the first document.
json_structure(rawjson1)
#> # A tbl_json: 313 x 9 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id parent.id level index child.id
#> <chr> <int> <chr> <int> <int> <chr>
#> 1 "{\"scans\":{\"Bkav..." 1 <NA> 0 1 1
#> 2 "{\"Bkav\":{\"detec..." 1 1 1 1 1.1
#> 3 "\"00d9d7d8e563ae..." 1 1 1 2 1.2
#> 4 "\"c6a6e3977402e7..." 1 1 1 3 1.3
#> 5 "\"00D9D7D8E563AE..." 1 1 1 4 1.4
#> 6 1 1 1 1 5 1.5
#> 7 "\"2017-06-13 20:..." 1 1 1 6 1.6
#> 8 "\"https://www.vi..." 1 1 1 7 1.7
#> 9 "\"Scan finished,..." 1 1 1 8 1.8
#> 10 60 1 1 1 9 1.9
#> # ... with 303 more rows, and 4 more variables: seq <list>, name <chr>,
#> # type <fctr>, length <int>
# Length of each top-level value in the first document.
json_lengths(gather_object(rawjson1))
#> # A tbl_json: 12 x 3 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id name length
#> <chr> <int> <chr> <int>
#> 1 "{\"Bkav\":{\"detec..." 1 scans 60
#> 2 "\"00d9d7d8e563ae..." 1 scan_id 1
#> 3 "\"c6a6e3977402e7..." 1 sha1 1
#> 4 "\"00D9D7D8E563AE..." 1 resource 1
#> 5 1 1 response_code 1
#> 6 "\"2017-06-13 20:..." 1 scan_date 1
#> 7 "\"https://www.vi..." 1 permalink 1
#> 8 "\"Scan finished,..." 1 verbose_msg 1
#> 9 60 1 total 1
#> 10 0 1 positives 1
#> 11 "\"00d9d7d8e563ae..." 1 sha256 1
#> 12 "\"8d95236c637c04..." 1 md5 1
# JSON type of each top-level value in the first document.
json_types(gather_object(rawjson1))
#> # A tbl_json: 12 x 3 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id name type
#> <chr> <int> <chr> <fctr>
#> 1 "{\"Bkav\":{\"detec..." 1 scans object
#> 2 "\"00d9d7d8e563ae..." 1 scan_id string
#> 3 "\"c6a6e3977402e7..." 1 sha1 string
#> 4 "\"00D9D7D8E563AE..." 1 resource string
#> 5 1 1 response_code number
#> 6 "\"2017-06-13 20:..." 1 scan_date string
#> 7 "\"https://www.vi..." 1 permalink string
#> 8 "\"Scan finished,..." 1 verbose_msg string
#> 9 60 1 total number
#> 10 0 1 positives number
#> 11 "\"00d9d7d8e563ae..." 1 sha256 string
#> 12 "\"8d95236c637c04..." 1 md5 string
话虽如此,如果您的最终目标是获得一个数据框以供进一步分析,您可以得到一个非常宽的数据框,方法是使用spread_all()
;或者得到一个我认为更有用的数据集(一级键作为列,每次扫描占一行)。请注意,我同时处理了多个文件(每个文件都会获得一个唯一的document.id
)。
# Parse both sample files in one go.
files <- c("raw_json_1.json", "raw_json_2.json")
j <- as.tbl_json(files)

# Build one row per scan entry, keeping the level 1 keys as columns.
clean <- j %>%
  spread_all(recursive = FALSE) %>%  # get the level 1 keys
  enter_object('scans') %>%          # enter the scans object ...
  gather_object() %>%                # ... and gather its entries
  spread_all(recursive = FALSE)      # spread the scans out

names(clean)
#> [1] "document.id" "scan_id" "sha1" "resource"
#> [5] "response_code" "scan_date" "permalink" "verbose_msg"
#> [9] "total" "positives" "sha256" "md5"
#> [13] "name" "detected" "version" "result"
#> [17] "update"
## once parsing is done, use tbl_df to strip the JSON component, then
## count the rows and the detections for each document
clean %>%
  tbl_df() %>%
  group_by(document.id) %>%
  summarize(
    count = n(),
    detected_count = sum(detected)
  )
#> # A tibble: 2 x 3
#> document.id count detected_count
#> <int> <int> <int>
#> 1 1 60 0
#> 2 2 60 11
## keep only the scans with detected == TRUE
clean %>%
  tbl_df() %>%
  filter(detected) %>%
  select(document.id, name, version, result)
#> # A tbl_json: 11 x 4 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id name version
#> <chr> <int> <chr> <chr>
#> 1 "{\"detected\":tru..." 2 CAT-QuickHeal 14.00
#> 2 "{\"detected\":tru..." 2 Malwarebytes 2.1.1.1115
#> 3 "{\"detected\":tru..." 2 Baidu 1.0.0.2
#> 4 "{\"detected\":tru..." 2 Symantec 1.3.1.0
#> 5 "{\"detected\":tru..." 2 NANO-Antivirus 1.0.76.17389
#> 6 "{\"detected\":tru..." 2 DrWeb 7.0.28.2020
#> 7 "{\"detected\":tru..." 2 Avira 8.3.3.4
#> 8 "{\"detected\":tru..." 2 GData A:25.12800B:25.9740
#> 9 "{\"detected\":tru..." 2 ESET-NOD32 15562
#> 10 "{\"detected\":tru..." 2 Rising 28.0.0.1
#> 11 "{\"detected\":tru..." 2 Yandex 5.5.1.3
#> # ... with 1 more variables: result <chr>