尝试将嵌套的子列表转换为数据框

时间:2018-10-16 22:52:53

标签: r list dataframe

我有一个嵌套了好几层的列表:

[[12]]
[[12]][[1]]
                  estimateName                    insuredName                      priceList 
        "KING"                 "IDIF8X_MAY18" 
                      laborEff                    claimNumber                   policyNumber 
 "Restoration/Service/Remodel"                 "000000-01"               "0000000" 
                    typeOfLoss                     roofDamage                        deprMat 
                        "Hail"                            "0"                            "1" 
                    deprNonMat                      deprOandP                      deprTaxes 
                           "1"                            "1"                            "1" 
                        onsite            recipientsXNAddress                      carrierId 
                           "1"              "CO"                      "00000" 
                  estimateType 
                       "Mixed" 

[[12]][[2]]
                                       type                                     lineNum 
                                        "I"                                        "12" 
                                        cat                                         sel 
                                      "SFG"                                      "GUTA" 
                                        act                                   actPrefix 
                                        "&"                                       "R&R" 
                                   "1" 


[[13]]
[[13]][[1]]
                  eName                    iName                      priceList 
        "KING"                 "MAY18" 
                      laborEff                    claimNumber                   policyNumber 
 "Restoration/Service/Remodel"                 "00000-01"               "000000000" 
                    typeOfLoss                     roofDamage                        deprMat 
                        "Hail"                            "0"                            "1" 
                    deprNonMat                      deprOandP                      deprTaxes 
                           "1"                            "1"                            "1" 
                        onsite            recipientsXNAddress                      carrierId 
                           "1"              "MRP.BRIGHTON.CO"                      "2570112" 
                  estimateType 
                       "Mixed" 

[[13]][[2]]
                                        type                                      lineNum 
                                         "I"                                         "13" 
                                         cat                                          sel 
                                       "FEN"                                      "VNLS6" 
                                         act                                    actPrefix 
                                         "&"                                        "R&R" 

这只是一个示例。我想创建一个数据框,以各个属性名作为列标题,每一行则是对应的值。我尝试过unlist和其他一些函数,但都没有得到正确的格式。

因此,所需的输出将是两行带有列名称的值。

list(list(structure(c("DARIAN_KING_&_CASSI1", "DARIAN KING & CASSIDY R KING", 
"IDIF8X_MAY18", "Restoration/Service/Remodel", "037262569-01", 
"H3726819012070", "Hail", "0", "1", "1", "1", "1", "1", "MRP.BRIGHTON.CO", 
"2570112", "Mixed"), .Names = c("estimateName", "insuredName", 
"priceList", "laborEff", "claimNumber", "policyNumber", "typeOfLoss", 
"roofDamage", "deprMat", "deprNonMat", "deprOandP", "deprTaxes", 
"onsite", "recipientsXNAddress", "carrierId", "estimateType")), 
    structure(c("I", "12", "SFG", "GUTA", "&", "R&R", "Gutter / downspout - aluminum - up to 5\"", 
    "12", "12", "LF", "Dwelling", "0.32", "4.23", "54.6", "36.33", 
    "2", "1", "9", "25", "18.27", "1.49", "37.28", "18.81", "56.09", 
    "1", "29.76", "10.08", "6.36", "13.32", "0.58", "24.84", 
    "1"), .Names = c("type", "lineNum", "cat", "sel", "act", 
    "actPrefix", "desc", "calc", "qty", "unit", "coverageName", 
    "remove", "replace", "total", "acv", "deprType", "recoverable", 
    "age", "lifeExp", "depr", "tax", "acvTotal", "deprTotal", 
    "rcvTotal", "isPartOfInitSettle", "laborTotal", "laborBase", 
    "laborBurden", "laborMarkup", "laborHours", "material", "containsBSCDontApply"
    ))), list(structure(c("DARIAN_KING_&_CASSI1", "DARIAN KING & CASSIDY R KING", 
"IDIF8X_MAY18", "Restoration/Service/Remodel", "037262569-01", 
"H3726819012070", "Hail", "0", "1", "1", "1", "1", "1", "MRP.BRIGHTON.CO", 
"2570112", "Mixed"), .Names = c("estimateName", "insuredName", 
"priceList", "laborEff", "claimNumber", "policyNumber", "typeOfLoss", 
"roofDamage", "deprMat", "deprNonMat", "deprOandP", "deprTaxes", 
"onsite", "recipientsXNAddress", "carrierId", "estimateType")), 
    structure(c("I", "13", "FEN", "VNLS6", "&", "R&R", "Vinyl (PVC) fence, 5'- 6' high - full slat", 
    "8*3", "24", "LF", "Other Structures", "4.01", "27.71", "761.28", 
    "721.38", "2", "9", "150", "39.9", "29.61", "749.21", "41.68", 
    "790.89", "1", "267.84", "141.36", "80.64", "45.84", "9.1", 
    "493.44", "1"), .Names = c("type", "lineNum", "cat", "sel", 
    "act", "actPrefix", "desc", "calc", "qty", "unit", "coverageName", 
    "remove", "replace", "total", "acv", "deprType", "age", "lifeExp", 
    "depr", "tax", "acvTotal", "deprTotal", "rcvTotal", "isPartOfInitSettle", 
    "laborTotal", "laborBase", "laborBurden", "laborMarkup", 
    "laborHours", "material", "containsBSCDontApply"))))

2 个答案:

答案 0 :(得分:1)

先给出结论:

dplyr::bind_rows(lapply(datlst, function(dl) as.data.frame(as.list(unlist(dl)), stringsAsFactors=FALSE)))
#           estimateName                  insuredName    priceList                    laborEff  claimNumber   policyNumber typeOfLoss roofDamage deprMat deprNonMat deprOandP deprTaxes onsite
# 1 DARIAN_KING_&_CASSI1 DARIAN KING & CASSIDY R KING IDIF8X_MAY18 Restoration/Service/Remodel 037262569-01 H3726819012070       Hail          0       1          1         1         1      1
# 2 DARIAN_KING_&_CASSI1 DARIAN KING & CASSIDY R KING IDIF8X_MAY18 Restoration/Service/Remodel 037262569-01 H3726819012070       Hail          0       1          1         1         1      1
#   recipientsXNAddress carrierId estimateType type lineNum cat   sel act actPrefix                                       desc calc qty unit     coverageName remove replace  total    acv deprType
# 1     MRP.BRIGHTON.CO   2570112        Mixed    I      12 SFG  GUTA   &       R&R   Gutter / downspout - aluminum - up to 5"   12  12   LF         Dwelling   0.32    4.23   54.6  36.33        2
# 2     MRP.BRIGHTON.CO   2570112        Mixed    I      13 FEN VNLS6   &       R&R Vinyl (PVC) fence, 5'- 6' high - full slat  8*3  24   LF Other Structures   4.01   27.71 761.28 721.38        2
#   recoverable age lifeExp  depr   tax acvTotal deprTotal rcvTotal isPartOfInitSettle laborTotal laborBase laborBurden laborMarkup laborHours material containsBSCDontApply
# 1           1   9      25 18.27  1.49    37.28     18.81    56.09                  1      29.76     10.08        6.36       13.32       0.58    24.84                    1
# 2        <NA>   9     150  39.9 29.61   749.21     41.68   790.89                  1     267.84    141.36       80.64       45.84        9.1   493.44                    1

让我们逐步分解。每个顶层元素中都包含两个内容截然不同的子列表,因此我猜测它们需要放在同一行中。我们可以简单地用unlist(...)把它们合并在一起。(我将在需要的地方使用str并对输出做截断来辅助演示。)

str(unlist(datlst[[1]]))
#  Named chr [1:48] "DARIAN_KING_&_CASSI1" "DARIAN KING & CASSIDY R KING" "IDIF8X_MAY18" "Restoration/Service/Remodel" "037262569-01" "H3726819012070" "Hail" "0" "1" "1" "1" "1" "1" ...
#  - attr(*, "names")= chr [1:48] "estimateName" "insuredName" "priceList" "laborEff" ...

既然命名的list可以很容易地转换为data.frame,我们先把这个命名向量转换为命名列表:

str(as.list(unlist(datlst[[1]])))
# List of 48
#  $ estimateName        : chr "DARIAN_KING_&_CASSI1"
#  $ insuredName         : chr "DARIAN KING & CASSIDY R KING"
#  $ priceList           : chr "IDIF8X_MAY18"
#  ...snip...
#  $ laborHours          : chr "0.58"
#  $ material            : chr "24.84"
#  $ containsBSCDontApply: chr "1"

因此,我们可以使用lapply将其应用于datlst的每个元素。接下来,我们需要把所有结果rbind到一起。在base R中,这通常是用do.call(rbind, lapply(...))完成的,但是我注意到有一个字段(recoverable)只出现在其中一个元素中,而另一个元素中没有,这就暴露了rbind的限制:列名必须相同且顺序一致。不过,有两个(非base的)工具可以解决此问题:

  • dplyr::bind_rows(...)
  • data.table::rbindlist(..., fill=TRUE)

我演示的是第一个,但第二个同样有效……使用您已经安装的任何一个即可。

答案 1 :(得分:0)

> microbenchmark(
+     test1 <- datlst %>% ldply(., function(x) c(x, recursive=TRUE) %>% t %>% as_data_frame),
+     test2 <- dplyr::bind_rows(lapply(datlst, function(dl) as.data.frame(as.list(unlist(dl)), stringsAsFactors=FALSE))), 
+     test3 <- datlst %>% llply(., function(x) c(x, recursive=TRUE) %>% t %>% as_data_frame) %>% bind_rows)
Unit: milliseconds
                                                                                                                      expr
 test1 <- datlst %>% ldply(., function(x) c(x, recursive = TRUE) %>% t %>% as_data_frame)
 test2 <- dplyr::bind_rows(lapply(datlst, function(dl) as.data.frame(as.list(unlist(dl)), stringsAsFactors = FALSE)))
 test3 <- datlst %>% llply(., function(x) c(x, recursive = TRUE) %>% t %>% as_data_frame) %>% bind_rows                       
      min       lq      mean   median       uq      max neval cld
 3.984349 4.463147  6.516226 4.674322 4.851113 35.71595   100  a 
 6.804798 7.246649 11.556857 7.455770 7.750850 49.91981   100   b
 2.805568 3.040350  4.628043 3.153763 3.326192 36.70896   100  a