Question

我使用jsonlite展平了json数据，并添加了一些奇特的列名称。请参阅以下示例数据：

df <- data.frame("ID" = c(1,2,3,4))
df$`events.location.John.round.1` = 
list(list("A","B","C"),list("B","C","E"),list("A","C"),list("D","E","B"))
df$`events.location.John.round.2` = 
list(list("A","D","E"),NA,list("B","C"),list("B","E","C"))
df$`events.location.Mary.round.1` = list(NA,NA,list("B","C"),list("E","A"))
df$`events.location.Mary.round.2` = list(list("A","B","E"),NA,list("B","A"),list("D","E","C"))

LocationList <- c("A","B","C","D","E")
PersonList <- c("John", "Mary")

我想遍历我的位置和人员列表，以便在我的数据框中生成新变量。这是一些示例输出：

df$`NumLocationsJohnRound1` = c(3,3,2,3)
df$`NumLocationsMaryRound1` = c(0,0,2,2)
df$`B.JohnRound1` = c(1,1,0,1)
df$`B.MaryRound1` = c(0,0,1,0)

在英语中，第一个操作是＃34;对于PersonList中的每个人，找到包含人名和可能的其他文本的列名，并返回该单元的列表的长度＆＃ 34;

第二个操作是＃34;对于PersonList中的每个人，找到包含该人姓名的列名，并为LocationList 中的每个位置创建一个新的二进制字段，如果该列包含该位置，则为1。

基本上我需要的所有新变量都是将一个函数应用于匹配列，或者在单元格中搜索某个值的列表。这里的关键是一种灵活的迭代列表，按名称查找列，并根据列表生成新列的方法。

我认为解决方案取决于Regex/grep()，但我不知道如何将列表项插入到Regex字符串中（可能使用paste？）。 select(contains())可能是其中一个步骤。

解决方案可能会引发purrr::map()，dplyr::mutate()，或者可能涉及这些问题的自定义函数。我想避免完全依赖for循环。

我知道这是一个具有挑战性的问题。深入了解它的任何部分（如何在名称中查找包含列表项的列，如何使用基于列表项的名称创建新列，如何搜索列表列）将非常有用。

Answer 1

目前尚不完全清楚，但基于“Numlocations＆＃39;输出，可能会有所帮助

library(dplyr)
library(purrr)
nm1 <- sub("events", "Num", names(df)[-1])
df[nm1] <-  df[-1] %>% 
                  map(., ~lengths(.) *map_lgl(., ~ !all(is.na(.))))

对于第二部分，我们可以使用mtabulate

获取二进制输出

library(qdapTools)
nm2 <-rep(paste0(names(df)[2:5], sub("events.location", "", names(df)[2:5])), each = 5) 
df[nm2] <- df[2:5] %>% 
                map(mtabulate) %>% 
                bind_cols

或者这可以在链中完成

nm3 <- sub("events.location", "", names(df)[2:5])
df[-1] %>%
  map_df(., ~lengths(.) *map_lgl(., ~ !all(is.na(.)))) %>% 
           rename_all(~nm1) %>%
           bind_cols(df, ., 
             df[-1] %>%
               map(., ~map(., ~factor(., levels = LETTERS[1:5]))) %>% 
               map(~as.data.frame.matrix(table(melt(.)[2:1]))) %>% 
               map2(., nm3, ~setNames(.x,  paste0(names(.x), .y))))

给出输出

#ID events.location.John.round.1 events.location.John.round.2 events.location.Mary.round.1 events.location.Mary.round.2 Num.location.John.round.1
#1  1                      A, B, C                      A, D, E                           NA                      A, B, E                         3
#2  2                      B, C, E                           NA                           NA                           NA                         3
#3  3                         A, C                         B, C                         B, C                         B, A                         2
#4  4                      D, E, B                      B, E, C                         E, A                      D, E, C                         3
#  Num.location.John.round.2 Num.location.Mary.round.1 Num.location.Mary.round.2 A.John.round.1 B.John.round.1 C.John.round.1 D.John.round.1 E.John.round.1
#1                         3                         0                         3              1              1              1              0              0
#2                         0                         0                         0              0              1              1              0              1
#3                         2                         2                         2              1              0              1              0              0
#4                         3                         2                         3              0              1              0              1              1
#  A.John.round.2 B.John.round.2 C.John.round.2 D.John.round.2 E.John.round.2 A.Mary.round.1 B.Mary.round.1 C.Mary.round.1 D.Mary.round.1 E.Mary.round.1 A.Mary.round.2
#1              1              0              0              1              1              0              0              0              0              0              1
#2              0              0              0              0              0              0              0              0              0              0              0
#3              0              1              1              0              0              0              1              1              0              0              1
#4              0              1              1              0              1              1              0              0              0              1              0
#  B.Mary.round.2 C.Mary.round.2 D.Mary.round.2 E.Mary.round.2
#1              1              0              0              1
#2              0              0              0              0
#3              1              0              0              0
#4              0              1              1              1

Answer 2

使用dplyr和purrr

的解决方案

首先，使用mutate_at计算以＆＃34; events＆＃34;开头的所有列的列表长度。

library(dplyr)
library(purrr)

df2 <- df %>%
  mutate_at(vars(starts_with("events")), funs(`Len` = map(., ~length(.x[!is.na(.x)]))))

之后，设计一个报告二进制结果的函数。将该函数应用于LocationList中的所有元素。将结果存储在loc_results。

中

match_fun <- function(Location, df){
  df2 <- df %>%
    mutate_at(vars(starts_with("events")), 
              funs(!!Location := map_int(., ~as.integer(Location %in% unlist(.x))))) %>%
    select(ID, contains("_"))
  return(df2)
}

loc_results <- map(LocationList, match_fun, df = df)

最后，将loc_results中的所有数据框合并为df3，然后将df2和df3加入df4。 df4是最终输出。

df3 <- reduce(loc_results, left_join, by = "ID")
df4 <- df2 %>% left_join(df3, by = "ID")

此解决方案考虑了命名约定。下面是结果数据框。如您所见，以_Len结尾的列显示列表的长度，而列以_A，_B，_C，_D和{{结尾1}}显示二进制结果。

_E

R：循环列表以比较列名并在数据框

2 个答案: