我之前问过类似的问题(将列表转换为数据框),但这次遇到的是一个嵌套的“列表的列表”,我想把它转换为数据框。这些数据来自R中的一次API调用,这也是我不得不处理这种嵌套列表结构的原因。下面是该API返回对象的一个小例子(5场比赛的数据):
dput(soccer_data)
list(structure(list(id = 1603158L, league_id = 779L, season_id = 914L,
stage_id = 1810L, round_id = 29156L, group_id = NULL, aggregate_id = NULL,
venue_id = 139L, referee_id = 656L, localteam_id = 607L,
visitorteam_id = 3639L, weather_report = NULL, commentaries = TRUE,
attendance = NULL, pitch = NULL, winning_odds_calculated = FALSE,
formations = structure(list(localteam_formation = "4-2-3-1",
visitorteam_formation = "4-1-4-1"), .Names = c("localteam_formation",
"visitorteam_formation")), scores = structure(list(localteam_score = 5L,
visitorteam_score = 1L, localteam_pen_score = 0L, visitorteam_pen_score = 0L,
ht_score = "1-0", ft_score = "5-1", et_score = NULL), .Names = c("localteam_score",
"visitorteam_score", "localteam_pen_score", "visitorteam_pen_score",
"ht_score", "ft_score", "et_score")), time = structure(list(
status = "FT", starting_at = structure(list(date_time = "2017-03-04 05:30:00",
date = "2017-03-04", time = "05:30:00", timestamp = 1488605400L,
timezone = "UTC"), .Names = c("date_time", "date",
"time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL,
injury_time = NULL), .Names = c("status", "starting_at",
"minute", "extra_minute", "injury_time")), coaches = structure(list(
localteam_coach_id = 429924L, visitorteam_coach_id = 429940L), .Names = c("localteam_coach_id",
"visitorteam_coach_id")), standings = structure(list(localteam_position = NULL,
visitorteam_position = NULL), .Names = c("localteam_position",
"visitorteam_position")), deleted = FALSE), .Names = c("id",
"league_id", "season_id", "stage_id", "round_id", "group_id",
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id",
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated",
"formations", "scores", "time", "coaches", "standings", "deleted"
)), structure(list(id = 1603159L, league_id = 779L, season_id = 914L,
stage_id = 1810L, round_id = 29156L, group_id = NULL, aggregate_id = NULL,
venue_id = 113L, referee_id = 3614L, localteam_id = 577L,
visitorteam_id = 75L, weather_report = NULL, commentaries = FALSE,
attendance = NULL, pitch = NULL, winning_odds_calculated = FALSE,
formations = structure(list(localteam_formation = "4-2-3-1",
visitorteam_formation = "4-2-3-1"), .Names = c("localteam_formation",
"visitorteam_formation")), scores = structure(list(localteam_score = 1L,
visitorteam_score = 1L, localteam_pen_score = 0L, visitorteam_pen_score = 0L,
ht_score = "1-0", ft_score = "1-1", et_score = NULL), .Names = c("localteam_score",
"visitorteam_score", "localteam_pen_score", "visitorteam_pen_score",
"ht_score", "ft_score", "et_score")), time = structure(list(
status = "FT", starting_at = structure(list(date_time = "2017-03-04 22:00:00",
date = "2017-03-04", time = "22:00:00", timestamp = 1488664800L,
timezone = "UTC"), .Names = c("date_time", "date",
"time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL,
injury_time = NULL), .Names = c("status", "starting_at",
"minute", "extra_minute", "injury_time")), coaches = structure(list(
localteam_coach_id = 455860L, visitorteam_coach_id = 176760L), .Names = c("localteam_coach_id",
"visitorteam_coach_id")), standings = structure(list(localteam_position = NULL,
visitorteam_position = NULL), .Names = c("localteam_position",
"visitorteam_position")), deleted = FALSE), .Names = c("id",
"league_id", "season_id", "stage_id", "round_id", "group_id",
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id",
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated",
"formations", "scores", "time", "coaches", "standings", "deleted"
)), structure(list(id = 1603160L, league_id = 779L, season_id = 914L,
stage_id = 1810L, round_id = 29156L, group_id = NULL, aggregate_id = NULL,
venue_id = 28L, referee_id = 555L, localteam_id = 413L, visitorteam_id = 583L,
weather_report = NULL, commentaries = FALSE, attendance = 23554L,
pitch = NULL, winning_odds_calculated = FALSE, formations = structure(list(
localteam_formation = "4-4-1-1", visitorteam_formation = "4-4-2"), .Names = c("localteam_formation",
"visitorteam_formation")), scores = structure(list(localteam_score = 1L,
visitorteam_score = 2L, localteam_pen_score = 0L, visitorteam_pen_score = 0L,
ht_score = "0-0", ft_score = "1-2", et_score = NULL), .Names = c("localteam_score",
"visitorteam_score", "localteam_pen_score", "visitorteam_pen_score",
"ht_score", "ft_score", "et_score")), time = structure(list(
status = "FT", starting_at = structure(list(date_time = "2017-03-05 00:00:00",
date = "2017-03-05", time = "00:00:00", timestamp = 1488672000L,
timezone = "UTC"), .Names = c("date_time", "date",
"time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL,
injury_time = NULL), .Names = c("status", "starting_at",
"minute", "extra_minute", "injury_time")), coaches = structure(list(
localteam_coach_id = 429914L, visitorteam_coach_id = 429917L), .Names = c("localteam_coach_id",
"visitorteam_coach_id")), standings = structure(list(localteam_position = NULL,
visitorteam_position = NULL), .Names = c("localteam_position",
"visitorteam_position")), deleted = FALSE), .Names = c("id",
"league_id", "season_id", "stage_id", "round_id", "group_id",
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id",
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated",
"formations", "scores", "time", "coaches", "standings", "deleted"
)), structure(list(id = 1603161L, league_id = 779L, season_id = 914L,
stage_id = 1810L, round_id = 29156L, group_id = NULL, aggregate_id = NULL,
venue_id = 411L, referee_id = 274L, localteam_id = 1062L,
visitorteam_id = 111L, weather_report = NULL, commentaries = FALSE,
attendance = NULL, pitch = NULL, winning_odds_calculated = FALSE,
formations = structure(list(localteam_formation = "4-2-3-1",
visitorteam_formation = "3-5-2"), .Names = c("localteam_formation",
"visitorteam_formation")), scores = structure(list(localteam_score = 0L,
visitorteam_score = 0L, localteam_pen_score = 0L, visitorteam_pen_score = 0L,
ht_score = "0-0", ft_score = "0-0", et_score = NULL), .Names = c("localteam_score",
"visitorteam_score", "localteam_pen_score", "visitorteam_pen_score",
"ht_score", "ft_score", "et_score")), time = structure(list(
status = "FT", starting_at = structure(list(date_time = "2017-03-05 00:30:00",
date = "2017-03-05", time = "00:30:00", timestamp = 1488673800L,
timezone = "UTC"), .Names = c("date_time", "date",
"time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL,
injury_time = NULL), .Names = c("status", "starting_at",
"minute", "extra_minute", "injury_time")), coaches = structure(list(
localteam_coach_id = 456638L, visitorteam_coach_id = 516577L), .Names = c("localteam_coach_id",
"visitorteam_coach_id")), standings = structure(list(localteam_position = NULL,
visitorteam_position = NULL), .Names = c("localteam_position",
"visitorteam_position")), deleted = FALSE), .Names = c("id",
"league_id", "season_id", "stage_id", "round_id", "group_id",
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id",
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated",
"formations", "scores", "time", "coaches", "standings", "deleted"
)), structure(list(id = 1603162L, league_id = 779L, season_id = 914L,
stage_id = 1810L, round_id = 29157L, group_id = NULL, aggregate_id = NULL,
venue_id = 11573L, referee_id = 370L, localteam_id = 179L,
visitorteam_id = 641L, weather_report = NULL, commentaries = FALSE,
attendance = NULL, pitch = NULL, winning_odds_calculated = FALSE,
formations = structure(list(localteam_formation = "4-2-3-1",
visitorteam_formation = "4-3-1-2"), .Names = c("localteam_formation",
"visitorteam_formation")), scores = structure(list(localteam_score = 1L,
visitorteam_score = 0L, localteam_pen_score = 0L, visitorteam_pen_score = 0L,
ht_score = "0-0", ft_score = "1-0", et_score = NULL), .Names = c("localteam_score",
"visitorteam_score", "localteam_pen_score", "visitorteam_pen_score",
"ht_score", "ft_score", "et_score")), time = structure(list(
status = "FT", starting_at = structure(list(date_time = "2017-03-05 02:00:00",
date = "2017-03-05", time = "02:00:00", timestamp = 1488679200L,
timezone = "UTC"), .Names = c("date_time", "date",
"time", "timestamp", "timezone")), minute = 90L, extra_minute = NULL,
injury_time = NULL), .Names = c("status", "starting_at",
"minute", "extra_minute", "injury_time")), coaches = structure(list(
localteam_coach_id = 524071L, visitorteam_coach_id = 261458L), .Names = c("localteam_coach_id",
"visitorteam_coach_id")), standings = structure(list(localteam_position = NULL,
visitorteam_position = NULL), .Names = c("localteam_position",
"visitorteam_position")), deleted = FALSE), .Names = c("id",
"league_id", "season_id", "stage_id", "round_id", "group_id",
"aggregate_id", "venue_id", "referee_id", "localteam_id", "visitorteam_id",
"weather_report", "commentaries", "attendance", "pitch", "winning_odds_calculated",
"formations", "scores", "time", "coaches", "standings", "deleted"
)))
soccer_data包含5场MLS足球比赛的数据,以下是我目前将其转换为数据框的做法:
# Grab the "scores" info from the nested list $scores of each game.
# One single-row data frame is built per game and all rows are bound in a
# single call, instead of growing a data frame inside a `1:length()` loop.
score_rows <- lapply(soccer_data, function(game) {
  # unlist() flattens the scores into a named vector (NULL entries vanish);
  # t() turns that vector into a single row.
  game_scores <- as.data.frame(
    t(unlist(game$scores)),
    stringsAsFactors = FALSE
  )
  game_scores$date <- as.Date(game$time$starting_at$date)
  game_scores
})
# rbind.fill() accepts a list of data frames and pads columns that are
# missing from some games with NA.
season_scores <- plyr::rbind.fill(score_rows) %>%
  readr::type_convert()
# Create a data frame of the games, drop the nested / unwanted columns, and
# attach the score columns.
# NOTE: do.call(rbind, soccer_data) binds the per-game lists into a matrix of
# mode "list", so the columns it yields are list columns; the trailing
# type_convert() does not turn them into atomic columns.
season_boxscores <- do.call(rbind, soccer_data) %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  dplyr::select(
    -dplyr::one_of(
      c("scores", "group_id", "aggregate_id", "time", "standings")
    )
  ) %>%
  cbind(season_scores) %>%
  readr::type_convert()
不幸的是,这种方法的问题在于最后一次type_convert()调用并没有达到我想要的效果:结果season_boxscores数据框中大多数列的类仍然是list(class == "list")。
# Inspect the class of every column of the combined data frame
sapply(X = season_boxscores, FUN = class)
我的问题是:
提前致谢!
编辑:如果所有嵌套列表(soccer_data中有几个:formations、scores、time、coaches、standings)都能像我在for循环中对scores所做的那样被展开成各自的列,那就更好了。
编辑2:很抱歉,仅仅5场比赛就贴出了这么大的列表对象。对于这种列表的列表(或类似的大型嵌套对象),我其实不知道如何从每个嵌套列表中删除相同的项目,否则我会在发帖前先精简一下(例如从soccer_data[[i]]中删除league_id、round_id等)。如果有人知道怎么做,那就太棒了!
编辑3:由于soccer_data不是简单的列表的列表,而是列表的列表的列表(每一层中还包含其他非列表对象),“Force list of lists into dataframe”这个问题下的解决方案都不适用于soccer_data。
答案 0 :(得分:2)
我还在努力学习这些东西。我测试了一百万件事,这是我能想到的最简单的事情:
library(tidyverse)
# For every game: flatten the nested list into one named vector, transpose it
# into a single-row matrix, and turn that row into a tibble. The per-game
# tibbles are then stacked; keys absent from a game are filled with NA.
soccer_data %>%
  map(function(game) as_tibble(t(unlist(game)))) %>%
  bind_rows()
思路:取列表soccer_data,将unlist映射到它的每个元素上(也就是在第二层进行展开,这样每场比赛仍然作为顶层列表中的独立元素保留)。然后用map应用转置函数t,把每个元素变成类似一行的形式,再将其转换为tibble,最后用bind_rows把所有行合并成完整的表。
结果:
# A tibble: 5 x 30
id league_id season_id stage_id round_id venue_id referee_id localteam_id
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1603158 779 914 1810 29156 139 656 607
2 1603159 779 914 1810 29156 113 3614 577
3 1603160 779 914 1810 29156 28 555 413
4 1603161 779 914 1810 29156 411 274 1062
5 1603162 779 914 1810 29157 11573 370 179
# ... with 22 more variables: visitorteam_id <chr>, commentaries <chr>,
# winning_odds_calculated <chr>, formations.localteam_formation <chr>,
# formations.visitorteam_formation <chr>, scores.localteam_score <chr>,
# scores.visitorteam_score <chr>, scores.localteam_pen_score <chr>,
# scores.visitorteam_pen_score <chr>, scores.ht_score <chr>, scores.ft_score <chr>,
# time.status <chr>, time.starting_at.date_time <chr>, time.starting_at.date <chr>,
# time.starting_at.time <chr>, time.starting_at.timestamp <chr>,
# time.starting_at.timezone <chr>, time.minute <chr>,
# coaches.localteam_coach_id <chr>, coaches.visitorteam_coach_id <chr>,
# deleted <chr>, attendance <chr>
看起来不错吗?祝你好运!
答案 1 :(得分:1)
下面是一个基于base R的方法(使用unlist):
将“列表的列表的列表”折叠为由字符(char)向量组成的列表:
# Flatten every game (a list of lists of lists) into one named character vector
lst <- lapply(X = soccer_data, FUN = unlist)
确保所有列表条目具有相同的键。例如,在示例数据中只有第3个list条目具有键attendance。
# Collect the union of keys over all entries so each entry can be aligned to it
keys <- unique(unlist(lapply(X = lst, FUN = names)))
使用NA填充缺失的条目:
# Reorder every entry to follow `keys`; a key the entry lacks becomes NA
# (match() returns NA for it, and indexing with NA yields an NA value)
lst <- lapply(lst, function(entry) {
  positions <- match(keys, names(entry))
  entry[positions]
})
用rbind合并为data.frame:
# Combine the aligned vectors row-wise into one data frame, label the columns
# with the shared keys, and show the result
df <- do.call(what = rbind.data.frame, args = lst)
colnames(df) <- keys
df
#id league_id season_id stage_id round_id venue_id referee_id
#1 1603158 779 914 1810 29156 139 656
#2 1603159 779 914 1810 29156 113 3614
#3 1603160 779 914 1810 29156 28 555
#4 1603161 779 914 1810 29156 411 274
#5 1603162 779 914 1810 29157 11573 370
#localteam_id visitorteam_id commentaries winning_odds_calculated
#1 607 3639 TRUE FALSE
#2 577 75 FALSE FALSE
#3 413 583 FALSE FALSE
#4 1062 111 FALSE FALSE
#5 179 641 FALSE FALSE
#formations.localteam_formation formations.visitorteam_formation
#1 4-2-3-1 4-1-4-1
#2 4-2-3-1 4-2-3-1
#3 4-4-1-1 4-4-2
#4 4-2-3-1 3-5-2
#5 4-2-3-1 4-3-1-2
#scores.localteam_score scores.visitorteam_score scores.localteam_pen_score
#1 5 1 0
#2 1 1 0
#3 1 2 0
#4 0 0 0
#5 1 0 0
#scores.visitorteam_pen_score scores.ht_score scores.ft_score time.status
#1 0 1-0 5-1 FT
#2 0 1-0 1-1 FT
#3 0 0-0 1-2 FT
#4 0 0-0 0-0 FT
#5 0 0-0 1-0 FT
#time.starting_at.date_time time.starting_at.date time.starting_at.time
#1 2017-03-04 05:30:00 2017-03-04 05:30:00
#2 2017-03-04 22:00:00 2017-03-04 22:00:00
#3 2017-03-05 00:00:00 2017-03-05 00:00:00
#4 2017-03-05 00:30:00 2017-03-05 00:30:00
#5 2017-03-05 02:00:00 2017-03-05 02:00:00
#time.starting_at.timestamp time.starting_at.timezone time.minute
#1 1488605400 UTC 90
#2 1488664800 UTC 90
#3 1488672000 UTC 90
#4 1488673800 UTC 90
#5 1488679200 UTC 90
#coaches.localteam_coach_id coaches.visitorteam_coach_id deleted attendance
#1 429924 429940 FALSE <NA>
#2 455860 176760 FALSE <NA>
#3 429914 429917 FALSE 23554
#4 456638 516577 FALSE <NA>
#5 524071 261458 FALSE <NA>
如果删除所有多余的文字/解释,这很短。
不幸的是,由于使用了unlist,列的类型会丢失。您可以通过以下方式将factors转换回numeric:
# Smart-convert to numeric: a column qualifies when every non-missing value
# parses as a number.
# The columns are visited with vapply()/lapply() rather than apply(): apply()
# coerces the data frame to a matrix, and `df[, is.num]` collapses to a plain
# vector when exactly one column qualifies, which makes apply() fail.
is.num <- vapply(df, function(x) {
  x <- x[!is.na(x)]
  # as.character() first so factors are parsed by label, not by integer code;
  # the coercion warnings are expected for non-numeric columns.
  all(!is.na(suppressWarnings(as.numeric(as.character(x)))))
}, logical(1))
# List-style assignment keeps working for zero, one, or many selected columns.
df[is.num] <- lapply(df[is.num], function(x) as.numeric(as.character(x)))
它有点乱,但有效。