我的数据来自json API。 结构如下:
我想取消这些列表的嵌套,并创建一个数据框,其中每个文档是一行,每个docvar是一列,缺少的docvar应该是NA
library(rjson)
data = rjson::fromJSON(file="http://search.worldbank.org/api/v2/wds?format=json&fl=abstracts,admreg,alt_title,authr,available_in,bdmdt,chronical_docm_id,closedt,colti,count,credit_no,disclosure_date,disclosure_type,disclosure_type_date,disclstat,display_title,docdt,docm_id,docna,docty,dois,entityid,envcat,geo_reg,geo_reg,geo_reg_and_mdk,guid,historic_topic,id,isbn,ispublicdocs,issn,keywd,lang,listing_relative_url,lndinstr,loan_no,majdocty,majtheme,ml_abstract,ml_display_title,new_url,owner,pdfurl,prdln,projectid,projn,publishtoextweb_dt,repnb,repnme,seccl,sectr,src_cit,subsc,subtopic,teratopic,theme,topic,topicv3,totvolnb,trustfund,txturl,unregnbr,url_friendly_title,versiontyp,versiontyp_key,virt_coll,vol_title,volnb&str_docdt=1986-01-01&end_docdt=2000-12-31&rows=500&os=1&srt=docdt&order=desc")
有很多这样的问题,但是在这种情况下,没有一种解决方案可行。例如:
Unnesting a list of lists in a data frame column
library(tidyverse)
tidy <- data$documents %>% bind_rows(data$documents) %>% # make larger sample data
mutate_if(is.list, simplify_all) %>% # flatten each list element internally
unnest() # expand
Error in bind_rows_(x, .id) : Argument 36 must be length 1, not 2
Unnest one of several list columns in dataframe
Convert list of lists to dataframe
R convert list of lists to dataframe
R: How to extract a list from a dataframe?
Extracting data.frames from a list using for loop
R, dpylr: Converting list of lists of differing lenghts within dataframe into long format dataframe
最后一个即将到来,但是我有多个docvar,其中许多我不知道名字。
我的另一种尝试是使用循环:
df <- data.frame()
df_s <- data.frame()
s=0
#Desired API
for(l in 1:100){
print(l)
s=s+500
url <- paste0("http://search.worldbank.org/api/v2/wds?format=json&fl=abstracts,admreg,alt_title,authr,available_in,bdmdt,chronical_docm_id,closedt,colti,count,credit_no,disclosure_date,disclosure_type,disclosure_type_date,disclstat,display_title,docdt,docm_id,docna,docty,dois,entityid,envcat,geo_reg,geo_reg,geo_reg_and_mdk,guid,historic_topic,id,isbn,ispublicdocs,issn,keywd,lang,listing_relative_url,lndinstr,loan_no,majdocty,majtheme,ml_abstract,ml_display_title,new_url,owner,pdfurl,prdln,projectid,projn,publishtoextweb_dt,repnb,repnme,seccl,sectr,src_cit,subsc,subtopic,teratopic,theme,topic,topicv3,totvolnb,trustfund,txturl,unregnbr,url_friendly_title,versiontyp,versiontyp_key,virt_coll,vol_title,volnb&str_docdt=1986-01-01&end_docdt=2000-12-31&rows=500&os=",s,"&srt=docdt&order=desc")
WBeLib_content = rjson::fromJSON(file= url)
stop <- WBeLib_content$rows
#df <- data.frame()
for(i in 1:500 ){
docu <- WBeLib_content$documents[i]
df[i,1] <- docu[[1]]$url
df[i,2] <- docu[[1]]$txturl
df[i,3] <- docu[[1]]$docdt
df[i,4] <- docu[[1]]$disclstat
df[i,5] <- docu[[1]]$disclosure_date
df[i,6] <- docu[[1]]$versiontyp
df[i,7] <- docu[[1]]$docty
df[i,8] <- docu[[1]]$subtopic
df[i,9] <- docu[[1]]$count
df[i,10] <- docu[[1]]$colti
df[i,11] <- docu[[1]]$historic_topic
df[i,12] <- docu[[1]]$seccl
df[i,13] <- docu[[1]]$lang
df[i,14] <- docu[[1]]$majdocty
df[i,15] <- docu[[1]]$owner
df[i,16] <- docu[[1]]$guid
df[i,17] <- docu[[1]]$repnb
df[i,18] <- docu[[1]]$admreg
df[i,19] <- docu[[1]]$pdfurl
df[i,20] <- docu[[1]]$docm_id
}
if(i>1){ df_s <- rbind(df,df_s) } else { df_s <- df}
}
但是,由于并非每个文档都存在所有docvar,因此超出范围。可以按位置定向,但是列不再按顺序排列。
答案 0 :(得分:0)
这会为您工作吗?
library(jsonlite)
library(tidyverse)
data = fromJSON("http://search.worldbank.org/api/v2/wds?format=json&fl=abstracts,admreg,alt_title,authr,available_in,bdmdt,chronical_docm_id,closedt,colti,count,credit_no,disclosure_date,disclosure_type,disclosure_type_date,disclstat,display_title,docdt,docm_id,docna,docty,dois,entityid,envcat,geo_reg,geo_reg,geo_reg_and_mdk,guid,historic_topic,id,isbn,ispublicdocs,issn,keywd,lang,listing_relative_url,lndinstr,loan_no,majdocty,majtheme,ml_abstract,ml_display_title,new_url,owner,pdfurl,prdln,projectid,projn,publishtoextweb_dt,repnb,repnme,seccl,sectr,src_cit,subsc,subtopic,teratopic,theme,topic,topicv3,totvolnb,trustfund,txturl,unregnbr,url_friendly_title,versiontyp,versiontyp_key,virt_coll,vol_title,volnb&str_docdt=1986-01-01&end_docdt=2000-12-31&rows=500&os=1&srt=docdt&order=desc")
df <-
data$documents %>%
head(-1) %>% # remove facet element
transpose %>% # transpose so each subelement is now a main element
as_tibble %>% # convert to table
purrr::modify(~replace(.x,lengths(.x)==0,list(NA))) %>% # replace empty elements by list(NA) so they have length 1 too
modify_if(~all(lengths(.x)==1),unlist) # unlist lists that contain only items of length 1
仅保留一个列表列:
names(df)[map_chr(df,class) == "list"]
# [1] "keywd"
因为它包含长度为1或2的项目
table(lengths(df$keywd))
# 1 2
# 224 276
输出内容如下:
glimpse(df)
# Observations: 500
# Variables: 38
# $ url <chr> "http://documents.worldbank.org/curated/en/903231468764970044/Attacking-rural-poverty-strategy-and-public-actions", "...
# $ available_in <chr> "English", "English", "English", "English", "English", "English,French,Spanish,Portuguese", "Portuguese,Chinese,Engli...
# $ url_friendly_title <chr> "http://documents.worldbank.org/curated/en/903231468764970044/Attacking-rural-poverty-strategy-and-public-actions", "...
# $ new_url <chr> "2000/12/1000476/Attacking-rural-poverty-strategy-and-public-actions", "2000/12/1000501/State-policies-and-womens-aut...
# $ guid <chr> "903231468764970044", "429001468753367328", "985531468746683502", "890081468757236671", "922151468776107524", "324581...
# $ disclosure_date <chr> "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z...
# $ disclosure_type <chr> "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA...
# $ disclosure_type_date <chr> "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z...
# $ publishtoextweb_dt <chr> "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z", "2010-07-01T00:00:00Z...
# $ docm_id <chr> "090224b0828c737a", "090224b0828ac316", "090224b0828bd3f7", "090224b0828ac343", "090224b0828cf43d", "090224b0828cf42b...
# $ chronical_docm_id <chr> "090224b0828c737a", "090224b0828ac316", "090224b0828bd3f7", "090224b0828ac343", "090224b0828cf43d", "090224b0828cf42b...
# $ txturl <chr> "http://documents.worldbank.org/curated/en/903231468764970044/text/multi-page.txt", "http://documents.worldbank.org/c...
# $ pdfurl <chr> "http://documents.worldbank.org/curated/en/903231468764970044/pdf/multi-page.pdf", "http://documents.worldbank.org/cu...
# $ docdt <chr> "2000-12-31T00:00:00Z", "2000-12-31T00:00:00Z", "2000-12-31T00:00:00Z", "2000-12-31T00:00:00Z", "2000-12-31T00:00:00Z...
# $ totvolnb <chr> "1", "1", "1", "1", "5", "1", "1", "14", "1", "1", "1", "1", "14", "14", "14", "14", "14", "14", "14", "14", "14", "1...
# $ versiontyp <chr> "Final", "Final", "Final", "Final", "Final", "Final", "Final", "Final", "Final", "Final", "Final", "Final", "Final", ...
# $ versiontyp_key <chr> "1309935", "1309935", "1309935", "1309935", "1309935", "1309935", "1309935", "1309935", "1309935", "1309935", "130993...
# $ volnb <chr> "1", "1", "1", "1", "4", "1", "1", "8", "1", "1", "1", "1", "13", "4", "9", "12", "3", "2", "7", "10", "1", "6", "11"...
# $ repnme <chr> "Attacking rural poverty : strategy and\n public actions", "State policies and women's autonomy in\n ...
# $ abstracts <chr> "Poverty remains pervasive, and its\n incidence and intensity are usually higher in rural than in\n ...
# $ display_title <chr> "Attacking rural poverty :\n strategy and public actions", "State policies and women's\n autono...
# $ listing_relative_url <chr> "/research/2000/12/1000476/attacking-rural-poverty-strategy-public-actions", "/research/2000/12/1000501/state-policie...
# $ docty <chr> "Newsletter", "Working Paper (Numbered Series)", "Publication", "Poverty Reduction Strategy Paper (PRSP)", "Environme...
# $ subtopic <chr> "Economic Theory & Research,Rural Settlements,Industrial Economics,Nutrition,Educational Sciences,Economic Growth,Agr...
# $ docna <chr> "Attacking rural poverty : strategy and\n public actions", "State policies and women's autonomy in\n ...
# $ teratopic <chr> "Poverty Reduction", "Education", "Energy", "Poverty Reduction", "Industry,Transport,Water Resources", NA, "Governanc...
# $ authors <chr> "Okidegbe, Nwanze", "Zhang, Xiaodan", "Bogach, V. Susan", NA, "Carl Brothers International Inc.", "World Bank", "Mann...
# $ entityids <chr> "000094946_01022305364180", "000094946_01022705322025", "000094946_01011005520622", "000094946_0102240538258", "00009...
# $ subsc <chr> "Macro/Non-Trade", "Human Development", "(Historic)Other power and energy conversion", "(Historic)Macro/non-trade", "...
# $ lang <chr> "English", "English", "English", "English", "English", "Portuguese", "English", "English", "Chinese", "English", "Eng...
# $ historic_topic <chr> "Poverty Reduction", "Education", "Energy", "Poverty Reduction", "Industry,Transport,Water Resources", NA, "Governanc...
# $ seccl <chr> "Public", "Public", "Public", "Public", "Public", "Public", "Public", "Public", "Public", "Public", "Public", "Public...
# $ sectr <chr> "(Historic)Economic Policy", "(Historic)Multisector", "(Historic)Electric Power & Other Energy", "(Historic)Economic ...
# $ majdocty <chr> "Publications & Research", "Publications & Research", "Publications,Publications & Research", "Country Focus", "Proje...
# $ src_cit <chr> "Rural development note. -- No. 6 (December 2000)", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
# $ keywd <list> [[["Rural Poor;medium term expenditure\n framework;rural poverty reduction strategy;rural\n ar...
# $ owner <chr> "Environ & Soc Sustainable Dev VP (ESD)", "Off of Sr VP Dev Econ/Chief Econ (DECVP)", "Energy & Mining Sector Unit (E...
# $ repnb <chr> "21649", "21743", "WTP492", "21834", "E287", "27779", "21604", "E425", "21604", "22194", "21837", "22903", "E425", "E...