如何从深层嵌套列表(其中的子列表还包含数据框)创建数据框?

时间:2019-06-10 21:58:15

标签: r list dataframe nested

我正在使用借助API和R获取的TMDb数据。输出是一个包含n个列表的列表,这n个列表本身又包含数据框。我无法直接处理嵌套列表,只能处理单个数据框。

感觉就像我已经尝试了一切,但没有任何效果。

我的丑陋列表的可复现代码(假设实际的列表包含超过400,000个子列表):

# Movie 601: genre, cast and crew tables.
genres <- data.frame(
  id = c(35L, 40L),
  name = c("Horror", "Comedy"),
  stringsAsFactors = FALSE
)

cast <- data.frame(
  id = c(1L, 2L),
  name = c("Bruce Willis", "Demi Moore"),
  stringsAsFactors = FALSE
)

crew <- data.frame(
  job = c("Director", "Producer", "Screenwriter"),
  id = c(1L, 2L, 3L),
  name = c("Steven Spielberg", "Peter Pan", "Arnold Schwarzenegger"),
  gender = c(0L, 0L, 0L),
  stringsAsFactors = FALSE
)

# Movie 602: genre, cast and crew tables.
genres2 <- data.frame(
  id = c(42L, 43L),
  name = c("Drama", "Lovestory"),
  stringsAsFactors = FALSE
)

cast2 <- data.frame(
  id = c(3L, 4L),
  name = c("Johnny Depp", "Leonardo Di Caprio"),
  stringsAsFactors = FALSE
)

crew2 <- data.frame(
  job = c("Director", "Producer", "Producer", "Screenwriter"),
  id = c(1L, 6L, 7L, 8L),
  name = c("Steven Spielberg", "Dumbo", "Cinderella", "Micky Mouse"),
  gender = c(0L, 0L, 1L, 0L),
  stringsAsFactors = FALSE
)

# Movie 603 only has genres; its cast and crew are empty lists.
genres3 <- data.frame(
  id = c(35L, 42L),
  name = c("Horror", "Drama"),
  stringsAsFactors = FALSE
)

# One entry per movie: scalar fields, a genre table, and a nested
# list holding the cast and crew tables.
lst <- list(
  list(
    id = 601L,
    revenue = 15000L,
    genre = genres,
    cast = list(cast = cast, crew = crew)
  ),
  list(
    id = 602L,
    revenue = 20000L,
    genre = genres2,
    cast = list(cast = cast2, crew = crew2)
  ),
  list(
    id = 603L,
    revenue = 12000L,
    genre = genres3,
    cast = list(cast = list(), crew = list())
  )
)

我只想展平这个嵌套列表并从中获取3个数据帧。我只对制片人和导演感兴趣。

df1

id    revenue   genre               Producer            Director
601   15000     Horror, Comedy      Peter Pan           Steven Spielberg
602   20000     Drama, Lovestory    Dumbo, Cinderella   Steven Spielberg
603   12000     Horror, Drama       NA                  NA

df2 - So here I need the collaborations from Producer and Director

id    Producer    Director
601   Peter Pan   Steven Spielberg
602   Dumbo       Steven Spielberg
602   Cinderella  Steven Spielberg

df3 - Only information about people

Name              Gender    ID
Peter Pan         0         2 
Steven Spielberg  0         1
Cinderella        1         7
.
.
.

希望您能以任何方式帮助我。我希望这是可以理解的。

2 个答案:

答案 0 :(得分:0)

这是一种极其复杂的方法来完成您想要的事情。您所需的df2只是删除了几列的df1。至于包含性别的df3,您应该能够用类似于这些函数的方式来获取它。

# Build one row per top-level element of `lst` (the nested list from the
# question). The original passed the base function `list` here by mistake,
# which makes sapply() fail instead of iterating over the movies.
df <- data.frame(t(sapply(lst, c)))

# Collapse each row's genre table into one comma-separated string.
#
# Args:
#   df_input: data frame with a list column `genre`; each element is read
#     through `$name` (the genre labels of that row).
#
# Returns (invisibly) `df_input` with `genre` replaced by a character vector
# such as "Horror, Comedy"; rows without any genre name get NA.
#
# Side effect: the result is also written to the global `df_final` via `<<-`,
# because the calling script reads that variable rather than the return value.
f_genre <- function(df_input) {
    df_input$genre <- vapply(
        df_input$genre,
        function(genre_tbl) {
            labels <- as.character(genre_tbl$name)
            if (length(labels) == 0L) {
                return(NA_character_)
            }
            # paste(collapse =) copes with any number of genres; the earlier
            # Reduce(merge, ...) approach only worked for exactly two.
            paste(labels, collapse = ", ")
        },
        character(1),
        USE.NAMES = FALSE
    )
    df_final <<- df_input
    invisible(df_input)
}


# Add a `Producer` column listing every crew member whose job is "Producer".
#
# Args:
#   df_input: data frame with a list column `cast`; each element is a list
#     whose `crew` entry is either a table with `job` and `name` columns or
#     an empty list.
#
# Returns (invisibly) `df_input` with a character column `Producer` holding
# the names joined by ", ". The value is NA when the crew is empty or
# contains no producer.
#
# Side effect: the result is also written to the global `df_final` via `<<-`,
# because the calling script reads that variable rather than the return value.
f_producer <- function(df_input)
{
    df_input$Producer <- vapply(
        df_input$cast,
        function(credits)
        {
            crew <- credits$crew
            # An empty crew means nothing is known for this row.
            if (length(crew) == 0L)
            {
                return(NA_character_)
            }
            # %in% never yields NA, so entries with a missing job are skipped
            # instead of breaking the comparison.
            producers <- as.character(crew$name[crew$job %in% "Producer"])
            if (length(producers) == 0L)
            {
                return(NA_character_)
            }
            paste(producers, collapse = ", ")
        },
        character(1),
        USE.NAMES = FALSE
    )
    df_final <<- df_input
    invisible(df_input)
}


# Add a `Director` column listing every crew member whose job is "Director".
#
# Args:
#   df_input: data frame with a list column `cast`; each element is a list
#     whose `crew` entry is either a table with `job` and `name` columns or
#     an empty list.
#
# Returns (invisibly) `df_input` with a character column `Director` holding
# the names joined by ", ". The value is NA when the crew is empty or
# contains no director.
#
# Side effect: the result is also written to the global `df_final` via `<<-`,
# because the calling script reads that variable rather than the return value.
f_director <- function(df_input)
{
    df_input$Director <- vapply(
        df_input$cast,
        function(credits)
        {
            crew <- credits$crew
            # An empty crew means nothing is known for this row.
            if (length(crew) == 0L)
            {
                return(NA_character_)
            }
            # %in% never yields NA, so entries with a missing job are skipped
            # instead of breaking the comparison.
            directors <- as.character(crew$name[crew$job %in% "Director"])
            if (length(directors) == 0L)
            {
                return(NA_character_)
            }
            paste(directors, collapse = ", ")
        },
        character(1),
        USE.NAMES = FALSE
    )
    df_final <<- df_input
    invisible(df_input)
}



# Each helper stores its result in the global `df_final`, so the steps are
# chained through that variable.
f_genre(df)
f_producer(df_final)
f_director(df_final)

# Drop the raw nested `cast` column by name rather than by position.
df1 <- df_final[, setdiff(names(df_final), "cast"), drop = FALSE]

# Print the result (the original evaluated an undefined name, `dframe`).
df1

答案 1 :(得分:0)

我认为您的主要问题是获取一个干净的数据框,因此我们可以使用map_df遍历 lst 并创建一个数据框。之后,您可以使用select、dplyr::filter、tidyr::spread、separate_rows来获取df1、df2、df3

library(purrr)
library(dplyr)
library(tidyr)
# Flatten `lst` into one long table: map_df() builds a tibble per element of
# `lst` (id, revenue, genre, and a list column `cast`), unnests `cast`, and
# row-binds the pieces. The result is stored in `df` by the trailing `->`.
map_df(lst,  
        ~ tibble(id = .x$id, 
                 revenue = .x$revenue, 
                 # Genre names are joined with ',' when there is more than one.
                 genre = ifelse(length(.x$genre$name)>1, paste(.x$genre$name, collapse = ','), .x$genre$name), 
                 # Inside imap() `.x` is one element of the outer `.x$cast` and `.y` is
                 # that element's name, recorded in `term`. An element without ids is
                 # replaced by a two-row placeholder (Director / Producer) that is
                 # always labelled term = 'crew'.
                 # NOTE(review): the placeholder uses `id1` while real tables carry
                 # `id`; this presumably relies on unnest() renaming an inner `id`
                 # that clashes with the outer `id` -- confirm against the installed
                 # tidyr version.
                 cast = imap(.x$cast, 
                            ~ if(length(.x$id)==0) data.frame(id1=NA, name=NA, term='crew', job=c("Director", "Producer"), gender=NA, stringsAsFactors = FALSE) else 
                              data.frame(.x, term=.y, stringsAsFactors = FALSE))
                ) %>% 
          unnest(cast)
        ) -> df

# df1: keep the Director / Producer crew rows, join the names within each
# (id, job) group, keep one row per group, and spread the jobs into columns.
df1 <- df %>%
  filter(term == "crew", job %in% c("Director", "Producer")) %>%
  group_by(id, job) %>%
  mutate(name = paste(name[!is.na(name)], collapse = ",")) %>%
  slice(1) %>%
  dplyr::select(-id1, -term, -gender) %>%
  spread(key = job, value = name) %>%
  ungroup()

# df2: one row per producer. The comma-joined `Producer` values of df1 are
# split into separate rows; the result is stored (it was previously only
# printed and then lost) and still printed.
df2 <- separate_rows(df1, Producer, sep = ',')
df2