我正在尝试使用 R 根据此站点的数据创建一个新的子集数据框:
#load libraries
library(dplyr)
library(jsonlite)
library(tidyr)
# Source: query against the Socrata catalog API for NASA datasets.
# NOTE(review): the URL below has a space where the "?" query separator
# belongs ("v1 q=nasa" instead of "v1?q=nasa"). It is left unchanged here
# because the answer that follows discusses exactly this problem.
url = "http://api.us.socrata.com/api/catalog/v1 q=nasa&domains=data.nasa.gov&offset=0&limit=500"
# Fetch the URL and parse the JSON response (jsonlite::fromJSON).
metadata <- fromJSON(url)
# Build a flat data frame from selected fields of `metadata$results`:
# `resource` supplies id/name/description/download_count, `classification`
# supplies domain_category, and link/permalink are read from `results` itself.
# Note that the output column is named "permlink" while the field it reads
# is "permalink".
nasa_api <- data.frame(id = metadata$results$resource$id,
title = metadata$results$resource$name,
description = metadata$results$resource$description,
download_count = metadata$results$resource$download_count,
domain_category = metadata$results$classification$domain_category,
link = metadata$results$link,
permlink = metadata$results$permalink)
我注意到 metadata 对象包含嵌套列表。我需要为 classification 创建一个新的数据集,它是嵌套在 metadata 中的一个数据框。理想情况下,我希望这个新数据框包含 "id" 列,这样我以后就可以连接(join)这两个数据集了。
我本以为这会是一项很简单的任务,但我是 R 的新手。能帮帮我吗?
答案 0 :(得分:0)
我注意到您的网址存在问题(v1 q=nasa 应为 v1?q=nasa)。因此,我在下面演示了如何使用 tidyjson 包来解决这个问题。它可能需要多敲一些代码,但最终会给您一个可靠、整洁的数据框。我推荐使用 devtools::install_github('jeremystan/tidyjson') 安装 {{3}},因为其中某些功能在 CRAN 版本中尚未提供。
在任何情况下,由于您没有明确说明对哪个嵌套数组感兴趣,我只选了其中一个(即 classification/domain_metadata,代码及输出如下:
## devtools::install_github('jeremystan/tidyjson')
library(dplyr)
library(tidyjson)
# Read the catalog response as a tbl_json. Note the corrected URL: "v1?q=nasa"
# (with the "?" query separator).
j <- as.tbl_json("http://api.us.socrata.com/api/catalog/v1?q=nasa&domains=data.nasa.gov&offset=0&limit=500")

# Step into the top-level "results" array and stack it: one row per result.
base <- j %>%
  enter_object(results) %>%
  gather_array()

# Spread the scalar fields of interest into columns, one per line so each
# JSON path is easy to audit.
nasa_api <- base %>%
  spread_values(
    id = jstring(resource, id),
    title = jstring(resource, name),
    description = jstring(resource, description),
    download_count = jstring(resource, download_count),
    domain_category = jstring(classification, domain_category),
    link = jstring(link),
    # The JSON key is "permalink"; looking up "permlink" matches nothing and
    # would leave this column all NA. The column keeps the name "permlink".
    permlink = jstring(permalink)
  )

print(nasa_api)
#> # A tbl_json: 500 x 9 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id array.index id
#> <chr> <int> <int> <chr>
#> 1 "{\"resource\":{\"d..." 1 1 gvk9-iz74
#> 2 "{\"resource\":{\"d..." 1 2 scmi-np9r
#> 3 "{\"resource\":{\"d..." 1 3 gquh-watm
#> 4 "{\"resource\":{\"d..." 1 4 dtgb-tk9p
#> 5 "{\"resource\":{\"d..." 1 5 j6wr-4xhn
#> 6 "{\"resource\":{\"d..." 1 6 357b-ra7j
#> 7 "{\"resource\":{\"d..." 1 7 e2ud-kf5m
#> 8 "{\"resource\":{\"d..." 1 8 uwnx-gns8
#> 9 "{\"resource\":{\"d..." 1 9 fzmj-dfnj
#> 10 "{\"resource\":{\"d..." 1 10 szzb-kefa
#> # ... with 490 more rows, and 6 more variables: title <chr>,
#> # description <chr>, download_count <chr>, domain_category <chr>,
#> # link <chr>, permlink <chr>
## inspect the JSON types found inside the first result's "classification" object
base %>%
  enter_object("classification") %>%
  .[1, ] %>%
  gather_object() %>%
  json_types()
#> # A tbl_json: 5 x 4 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id array.index name
#> <chr> <int> <int> <chr>
#> 1 [] 1 1 categories
#> 2 [] 1 1 tags
#> 3 "\"Management/Ope..." 1 1 domain_category
#> 4 [] 1 1 domain_tags
#> 5 "[{\"value\":\"\",\"k..." 1 1 domain_metadata
#> # ... with 1 more variables: type <fctr>
## ancillary table example: one row per classification/domain_metadata entry,
## carrying the dataset id so it can be joined back to nasa_api later
base %>%
  spread_values(id = jstring(resource, id)) %>%
  enter_object("classification") %>%
  enter_object("domain_metadata") %>%
  gather_array("domain_metadata_id") %>%
  spread_values(key = jstring(key), value = jstring(value)) %>%
  select(document.id, array.index, id, key, value) %>%
  as_data_frame()
#> # A tibble: 6,343 x 5
#> document.id array.index id key
#> * <int> <int> <chr> <chr>
#> 1 1 1 gvk9-iz74 Common-Core_Contact-Email
#> 2 1 1 gvk9-iz74 Common-Core_License
#> 3 1 1 gvk9-iz74 Common-Core_System-of-Records
#> 4 1 1 gvk9-iz74 Common-Core_Program-Code
#> 5 1 1 gvk9-iz74 Common-Core_Described-By
#> 6 1 1 gvk9-iz74 Common-Core_Public-Access-Level
#> 7 1 1 gvk9-iz74 Common-Core_Temporal-Applicability
#> 8 1 1 gvk9-iz74 Common-Core_Is-Quality-Data
#> 9 1 1 gvk9-iz74 Common-Core_Language
#> 10 1 1 gvk9-iz74 Common-Core_References
#> # ... with 6,333 more rows, and 1 more variables: value <chr>
)。
// Free-function adapter: forwards the received int unchanged to
// myclass::my_handler and returns nothing.
// NOTE(review): presumably exists so it can be registered where a plain
// function pointer is required (e.g. as a signal handler) -- confirm at the
// registration site.
void signal_handler_wrapper(int signal_number)
{
    myclass::my_handler(signal_number);
}