我正在使用借助 API 在 R 中获取的 TMDb 数据。输出是一个包含 n 个子列表的列表,而每个子列表自身又包含数据框。我无法直接处理这种嵌套列表,只能处理普通的数据框。
感觉就像我已经尝试了一切,但没有任何效果。
下面是可重现这个丑陋列表的代码(假设实际列表本身包含超过 400,000 个子列表):
# Reproducible sample data: three movies, each with genres and (optionally)
# cast and crew tables, nested the same way the API response is.

# --- Movie 601 ----
genres <- data.frame(
  id = c(35L, 40L),
  name = c("Horror", "Comedy"),
  stringsAsFactors = FALSE
)
cast <- data.frame(
  id = c(1L, 2L),
  name = c("Bruce Willis", "Demi Moore"),
  stringsAsFactors = FALSE
)
crew <- data.frame(
  job = c("Director", "Producer", "Screenwriter"),
  id = c(1L, 2L, 3L),
  name = c("Steven Spielberg", "Peter Pan", "Arnold Schwarzenegger"),
  gender = c(0L, 0L, 0L),
  stringsAsFactors = FALSE
)

# --- Movie 602 ----
genres2 <- data.frame(
  id = c(42L, 43L),
  name = c("Drama", "Lovestory"),
  stringsAsFactors = FALSE
)
cast2 <- data.frame(
  id = c(3L, 4L),
  name = c("Johnny Depp", "Leonardo Di Caprio"),
  stringsAsFactors = FALSE
)
crew2 <- data.frame(
  job = c("Director", "Producer", "Producer", "Screenwriter"),
  id = c(1L, 6L, 7L, 8L),
  name = c("Steven Spielberg", "Dumbo", "Cinderella", "Micky Mouse"),
  gender = c(0L, 0L, 1L, 0L),
  stringsAsFactors = FALSE
)

# --- Movie 603 (no cast or crew information) ----
genres3 <- data.frame(
  id = c(35L, 42L),
  name = c("Horror", "Drama"),
  stringsAsFactors = FALSE
)

# The nested list: one entry per movie; `cast` bundles the cast and crew tables.
lst <- list(
  list(
    id = 601L,
    revenue = 15000L,
    genre = genres,
    cast = list(cast = cast, crew = crew)
  ),
  list(
    id = 602L,
    revenue = 20000L,
    genre = genres2,
    cast = list(cast = cast2, crew = crew2)
  ),
  list(
    id = 603L,
    revenue = 12000L,
    genre = genres3,
    cast = list(cast = list(), crew = list())
  )
)
我只想展平这个嵌套列表并从中获取3个数据帧。我只对制片人和导演感兴趣。
df1
id revenue genre Producer Director
601 15000 Horror, Comedy Peter Pan Steven Spielberg
602 20000 Drama, Lovestory Dumbo, Cinderella Steven Spielberg
603 12000 Horror, Drama NA NA
df2 - So here I need the collaborations from Producer and Director
id Producer Director
601 Peter Pan Steven Spielberg
602 Dumbo Steven Spielberg
602 Cinderella Steven Spielberg
df3 - Only information about people
Name Gender ID
Peter Pan 0 2
Steven Spielberg 0 1
Cinderella 1 7
.
.
.
希望您能以任何方式帮助我。我希望这是可以理解的。
答案 0 :(得分:0)
下面是一种极其复杂的方法,用来完成您想要的事情。您所需的 `df2` 只是删除了几列的 `df1`。至于包含性别的 `df3`,您应该能够用类似于下面这些函数的方式来获取其中的信息。
# Turn the nested list `lst` into a data frame with one row per movie.
# sapply() stacks the per-movie lists into a list-matrix (fields x movies),
# t() flips it to movies x fields, and data.frame() keeps every field as a
# list-column, so nested tables such as `genre` and `cast` stay intact.
# Fix: the data object is `lst`; the original passed the base function `list`.
df <- data.frame(t(sapply(lst, c)))
# Collapse each movie's genre names into a single comma-separated string.
#
# Args:
#   df_input: data frame with a list-column `genre`; every element is expected
#     to expose a `name` vector (for example a data frame with a `name`
#     column).
#
# Returns:
#   `df_input` (invisibly) with each `genre` entry replaced by one string such
#   as "Horror, Comedy", or NA when the entry has no names. The result is also
#   written to the global `df_final`, which the calling script reads.
f_genre <- function(df_input) {
  # seq_len() avoids iterating over c(1, 0) when the input has no rows.
  for (i in seq_len(nrow(df_input))) {
    genre_names <- df_input$genre[[i]]$name
    # paste(collapse = ) works for any number of genres; the previous
    # merge()-based approach only handled exactly two.
    df_input$genre[[i]] <- if (length(genre_names) == 0L) {
      NA_character_
    } else {
      paste(genre_names, collapse = ", ")
    }
  }
  # Kept for backward compatibility: callers read the result from `df_final`.
  df_final <<- df_input
  invisible(df_input)
}
# Add a `Producer` column holding each movie's producer names.
#
# Args:
#   df_input: data frame with a list-column `cast`; each element is a list
#     whose `crew` entry is either a data frame with `job` and `name` columns
#     or an empty list.
#
# Returns:
#   `df_input` (invisibly) with a character column `Producer`: the producer
#   names joined by ", ", or NA when the movie has no crew data or no
#   producer. The result is also written to the global `df_final`, which the
#   calling script reads.
f_producer <- function(df_input) {
  producers <- vapply(
    df_input$cast,
    function(movie_cast) {
      crew <- movie_cast[["crew"]]
      # %in% never yields NA and copes with an empty crew (NULL job column).
      found <- crew[["name"]][crew[["job"]] %in% "Producer"]
      # No crew, or a crew without this job, is reported as NA instead of
      # failing with "replacement has length zero".
      if (length(found) == 0L) {
        NA_character_
      } else {
        paste(found, collapse = ", ")
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
  df_input$Producer <- producers
  # Kept for backward compatibility: callers read the result from `df_final`.
  df_final <<- df_input
  invisible(df_input)
}
# Add a `Director` column holding each movie's director names.
#
# Args:
#   df_input: data frame with a list-column `cast`; each element is a list
#     whose `crew` entry is either a data frame with `job` and `name` columns
#     or an empty list.
#
# Returns:
#   `df_input` (invisibly) with a character column `Director`: the director
#   names joined by ", ", or NA when the movie has no crew data or no
#   director. The result is also written to the global `df_final`, which the
#   calling script reads.
f_director <- function(df_input) {
  directors <- vapply(
    df_input$cast,
    function(movie_cast) {
      crew <- movie_cast[["crew"]]
      # %in% never yields NA and copes with an empty crew (NULL job column).
      found <- crew[["name"]][crew[["job"]] %in% "Director"]
      # No crew, or a crew without this job, is reported as NA instead of
      # failing with "replacement has length zero".
      if (length(found) == 0L) {
        NA_character_
      } else {
        paste(found, collapse = ", ")
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
  df_input$Director <- directors
  # Kept for backward compatibility: callers read the result from `df_final`.
  df_final <<- df_input
  invisible(df_input)
}
# Run the three enrichment steps in order. Each helper publishes its result in
# the global `df_final`, so the next step picks it up from there.
f_genre(df)
f_producer(df_final)
f_director(df_final)
# Drop the nested `cast` column by name rather than by position (4), so the
# result does not silently change if the column order ever does.
df1 <- df_final[, names(df_final) != "cast", drop = FALSE]
答案 1 :(得分:0)
我认为您的主要问题是得到一个干净的数据框,为此我们可以使用 `map_df` 遍历 `lst` 并创建一个数据框。之后,您可以使用 `select`、`dplyr::filter`、`tidyr::spread` 和 `separate_rows` 来获取 df1、df2、df3。
library(purrr)
library(dplyr)
library(tidyr)
# Build one long data frame from `lst`: for every movie, a tibble holding its
# id, revenue, joined genre names and a list-column `cast`, which is then
# unnested. The combined result is stored in `df` by the right-assignment on
# the last line.
map_df(lst,
~ tibble(id = .x$id,
revenue = .x$revenue,
# More than one genre name: join them with ','; otherwise take the name as is.
genre = ifelse(length(.x$genre$name)>1, paste(.x$genre$name, collapse = ','), .x$genre$name),
# One data frame per element of .x$cast; `term` records that element's name.
cast = imap(.x$cast,
# An element without any ids gets placeholder rows instead: one row per job of
# interest, tagged as 'crew', with NA for the person fields.
# NOTE(review): `id1` presumably mirrors the name unnest() gives the nested
# `id` column when it clashes with the movie `id` - confirm against the
# installed tidyr version.
~ if(length(.x$id)==0) data.frame(id1=NA, name=NA, term='crew', job=c("Director", "Producer"), gender=NA, stringsAsFactors = FALSE) else
data.frame(.x, term=.y, stringsAsFactors = FALSE))
) %>%
unnest(cast)
) -> df
# df1: one row per movie with its Producer and Director names.
# Keep only crew rows for the two jobs of interest, join the names within each
# (movie, job) group, keep one row per group and spread the jobs into columns.
df1 <- filter(df, term == "crew" & job %in% c("Director", "Producer")) %>%
  group_by(id, job) %>%
  # A group without any known name becomes NA. paste() on an empty vector
  # would otherwise return "" and hide the missing value.
  mutate(
    name = if (all(is.na(name))) {
      NA_character_
    } else {
      paste(name[!is.na(name)], collapse = ",")
    }
  ) %>%
  slice(1) %>%
  dplyr::select(-id1, -term, -gender) %>%
  spread(key = job, value = name) %>%
  ungroup()
# df2: one row per (movie, producer) pair, obtained by splitting the joined
# Producer string back into separate rows.
separate_rows(df1, Producer, sep = ",")