pivot_wider 输出 <S3: vctrs_list_of> 对象

时间:2019-09-20 21:39:40

标签: r tidyr

我想把以下数据集从长格式转换为宽格式(spread)。

    #create df    
df <- structure(list(file_number = c("3098129", "3096451", "3096774", 
"3095276", "3095464", "3096846", "3097132", "3096355", "3096951", 
"3096328", "3095441", "3096325", "3094412", "3096366", "3096372", 
"3096507", "3098510", "3096335", "3096403", "3094343", "3096941", 
"3096419", "3094431", "3096495", "3094647", "3094487", "3094947", 
"3094398", "3094386", "3094367", "3097480", "3096425", "3095193", 
"3095839a", "3097197", "3098453", "3098549", "3098428", "3096427", 
"3096895", "3096434", "3094835", "3096312", "3094517", "3094372", 
"3096387", "3096480", "3098504", "3096338", "3094615", "3096382", 
"3096638", "3096750", "3096418", "3094734", "3098503", "3096311", 
"3097197", "3094353", "3098442", "3097111", "3097325", "3096531", 
"3096405", "3096301", "3096692", "3096495", "3098406", "3098422", 
"3096315", "3096951", "3094491", "3096304", "3098416", "3096332", 
"3098404", "3098419", "3095225", "3094404", "3096374", "3098411", 
"3098556", "3096398", "3094421b", "3098477", "3094369b", "3098463", 
"3096893", "3098514", "3098477", "3098465", "3094560", "3098409", 
"3096434", "3097557", "3095061", "3098419", "3096404", "3095441", 
"3096537", "3098503", "3098400", "3097808", "3096389b", "3098446", 
"3096330", "3095533", "3094421a", "3094339", "3095578", "3094404", 
"3098552", "3098514", "3096630", "3096941", "3097027", "3096322", 
"3096514", "3098484", "3097038", "3096672", "3098483", "3094373", 
"3096774", "3096677", "3096408", "3096664", "3096365", "3096491", 
"3096820", "3096514", "3096556", "3096292", "3096495", "3094781", 
"3094344", "3094487", "3094690", "3098504", "3096503"), reader = c("aa", 
"aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa", 
"aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa", "ae", "ae", "ae", 
"ae", "ae", "ae", "ae", "ae", "ae", "ae", "ae", "ae", "ae", "ae", 
"ae", "ae", "ae", "ae", "ae", "ae", "db", "db", "db", "db", "db", 
"db", "db", "db", "db", "db", "db", "db", "db", "db", "db", "db", 
"db", "db", "db", "db", "dl", "dl", "dl", "dl", "dl", "dl", "dl", 
"dl", "dl", "dl", "dl", "dl", "dl", "dl", "dl", "dl", "dl", "dl", 
"dl", "dl", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", 
"mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", 
"mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", 
"mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "np", "np", 
"np", "np", "np", "np", "np", "np", "np", "np", "np", "np", "np", 
"np", "np", "np", "np", "np", "np", "np"), event = c("fail", 
"fail", "fail", "fail", "pass", "fail", "fail", "pass", "fail", 
"fail", "pass", "pass", "pass", "fail", "fail", "pass", "pass", 
"fail", "pass", "pass", "pass", "pass", "pass", "pass", "fail", 
"fail", "pass", "pass", "fail", "pass", "pass", "pass", "pass", 
"pass", "fail", "pass", "fail", "fail", "fail", "pass", "pass", 
"pass", "fail", "pass", "pass", "fail", "pass", "fail", "fail", 
"pass", "fail", "fail", "pass", "fail", "pass", "fail", "pass", 
"fail", "fail", "fail", "fail", "pass", "pass", "fail", "pass", 
"pass", "fail", "pass", "fail", "pass", "pass", "fail", "pass", 
"fail", "fail", "pass", "pass", "fail", "pass", "pass", "fail", 
"pass", "fail", "pass", "fail", "pass", "pass", "pass", "pass", 
"fail", "pass", "pass", "fail", "pass", "fail", "pass", "fail", 
"pass", "pass", "fail", "pass", "pass", "fail", "pass", "pass", 
"fail", "pass", "fail", "fail", "fail", "pass", "pass", "pass", 
"fail", "fail", "fail", "fail", "fail", "fail", "fail", "fail", 
"fail", "pass", "fail", "fail", "fail", "pass", "pass", "pass", 
"pass", "fail", "pass", "pass", "fail", "fail", "pass", "pass", 
"fail", "fail", "fail")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -140L))

>head(df)
file_number reader  event
3098129     aa      fail        
3096451     aa      fail        
3096774     aa      fail        
3095276     aa      fail

但是,当我运行以下tidyr::pivot_wider时,我得到的输出是<S3: vctrs_list_of>。我认为这与在names_from列中具有多个相同类型的值有关。

df %>%
  tidyr::pivot_wider(id_cols = file_number, names_from = reader, values_from = event)

id                       aa                 ae
3098129         <S3: vctrs_list_of> <S3: vctrs_list_of>     
3096451         <S3: vctrs_list_of> <S3: vctrs_list_of>     

伴随以下警告:

Values in `event` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list(event = list)` to suppress this warning.
* Use `values_fn = list(event = length)` to identify where the duplicates arise
* Use `values_fn = list(event = summary_fun)` to summarise duplicates

我的问题是:为什么pivot_wider输出S3矢量列表?

编辑:- 添加了更好的可重现示例。- 重新表述了问题。

3 个答案:

答案 0 :(得分:2)

通常,如果 names_from 列对应的行存在重复,而又没有用于区分这些重复行的序列标识符,就可能出现这种情况:

library(tidyr)
library(dplyr)
df %>%        
    pivot_wider(names_from = reader, values_from = event)
# A tibble: 124 x 8
#   file_number          aa          ae          db          dl          mk          mm          np
#   <chr>       <list<chr>> <list<chr>> <list<chr>> <list<chr>> <list<chr>> <list<chr>> <list<chr>>
# 1 3098129             [1]         [0]         [0]         [0]         [0]         [0]         [0]
# 2 3096451             [1]         [0]         [0]         [0]         [0]         [0]         [0]
# 3 3096774             [1]         [0]         [0]         [0]         [0]         [0]         [1]
# 4 3095276             [1]         [0]         [0]         [0]         [0]         [0]         [0]
# 5 3095464             [1]         [0]         [0]         [0]         [0]         [0]         [0]
# 6 3096846             [1]         [0]         [0]         [0]         [0]         [0]         [0]
# 7 3097132             [1]         [0]         [0]         [0]         [0]         [0]         [0]
# 8 3096355             [1]         [0]         [0]         [0]         [0]         [0]         [0]
# 9 3096951             [1]         [0]         [0]         [1]         [0]         [0]         [0]
#10 3096328             [1]         [0]         [0]         [0]         [0]         [0]         [0]
# … with 114 more rows

因此,在这种情况下,我们需要通过分组变量来创建序列

df %>%        
    group_by(reader) %>%
    mutate(rn = row_number()) %>% # recreated unique identifier column
    pivot_wider(names_from = reader, values_from = event)
# A tibble: 139 x 9
#   file_number    rn aa    ae    db    dl    mk    mm    np   
#   <chr>       <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 3098129         1 fail  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
# 2 3096451         2 fail  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
# 3 3096774         3 fail  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
# 4 3095276         4 fail  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
# 5 3095464         5 pass  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
# 6 3096846         6 fail  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
# 7 3097132         7 fail  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
# 8 3096355         8 pass  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
# 9 3096951         9 fail  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#10 3096328        10 fail  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
# … with 129 more rows

所有列均为 factor,因为在 data.frame 调用中如果没有指定 stringsAsFactors = FALSE,其默认值为 TRUE(R 4.0.0 之前的版本如此)

str(df)
#'data.frame':  10 obs. of  3 variables:
# $ id    : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5 1 2 3 4 5
# $ reader: Factor w/ 2 levels "aa","bb": 1 1 1 1 1 2 2 2 2 2
# $ event : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 2 1 2

相反,请指定stringsAsFactors = FALSE,列将为character

df <- data.frame(id = as.character(rep(seq(1:5),2)), 
             reader = c("aa","aa","aa","aa","aa","bb","bb","bb","bb","bb"), 
             event = as.character(rbinom(10, size = 1, prob=0.5)),
        stringsAsFactors = FALSE
             )

答案 1 :(得分:0)

我可以通过在S3矢量对象上使用tidyr::unnest函数来解决此问题。

df %>% ungroup() %>% pivot_wider(names_from = reader, values_from = event) %>% tidyr::unnest()
id  aa  bb
1   0   0       
2   0   1       
3   1   0       
4   1   1       
5   0   1

注意:现在所有变量都是因子(factor)

答案 2 :(得分:0)

TL;DR

如果您最终得到的值无法组成向量,您将得到一个列表。

例如,pivot_wider 会在以下情况下把多个值组合到一个列表中:它无法唯一标识一条记录;这些值并非都是相同的基本类型;或者其中某个值不是基本类型、无法正确组合成向量(例如 NULL)。

更多详情:

在您的示例中,您有一个重复的记录:

df %>%
  filter(duplicated(.))    

# # A tibble: 1 x 3
# file_number reader event
# <chr>       <chr>  <chr>
# 1 3098477     mk     fail

因为同一个 file_number + reader 对应多个 event,pivot_wider 除了将它们组合在一个列表中之外不知道如何处理,于是对应的列就变成了包含这些值的列表列,正如警告 Values in `event` are not uniquely identified; output will contain list-cols. 所说:

df %>%
  pivot_wider(names_from = reader, values_from = event) %>%
  filter(file_number == "3098477") %>%
  select(mk) %>%
  glimpse

# Warning: Values are not uniquely identified; output will contain list-cols.
# * Use `values_fn = list` to suppress this warning.
# * Use `values_fn = length` to identify where the duplicates arise
# * Use `values_fn = {summary_fun}` to summarise duplicates
# Rows: 1
# Columns: 1
# $ mk <list> <"fail", "fail">

如果这是错误的,或者如果您真的不关心重复记录,您可以:

df %>%
  unique %>%
  pivot_wider(names_from = reader, values_from = event)

# # A tibble: 124 x 8
# file_number aa    ae    db    dl    mk    mm    np
# <chr>       <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 3098129     fail  NA    NA    NA    NA    NA    NA
# 2 3096451     fail  NA    NA    NA    NA    NA    NA
# 3 3096774     fail  NA    NA    NA    NA    NA    fail
# 4 3095276     fail  NA    NA    NA    NA    NA    NA
# 5 3095464     pass  NA    NA    NA    NA    NA    NA
# 6 3096846     fail  NA    NA    NA    NA    NA    NA
# 7 3097132     fail  NA    NA    NA    NA    NA    NA
# 8 3096355     pass  NA    NA    NA    NA    NA    NA
# 9 3096951     fail  NA    NA    pass  NA    NA    NA
# 10 3096328     fail  NA    NA    NA    NA    NA    NA
# # … with 114 more rows

或者,如果您确实希望同一个 file_number + reader 有重复甚至多个不同的值,您可以教 pivot_wider 如何将这些值与函数结合:

df %>%
  pivot_wider(id_cols = file_number, names_from = reader, values_from = event, values_fn = function(values) paste(values, collapse = ", ")) %>%
  filter(file_number == "3098477")

# # A tibble: 1 x 8
# file_number aa    ae    db    dl    mk         mm    np
# <chr>       <chr> <chr> <chr> <chr> <chr>      <chr> <chr>
# 1 3098477     NA    NA    NA    NA    fail, fail NA    NA

最后,如果您想为每个 file_number + reader 的每个 event 各保留一个条目,那么添加一列人工生成的唯一标识符就可以了。

>