Question

我正在尝试将数据框解析为一种列表格式：A 列是一组“bin”，它们链接到 B 列中的一组 bin，C 列是从 bin A 移到 bin B 的比例。手动创建这种列表格式很容易，例如：

# Hand-built target format: a named list with one single-row data frame per
# source bin; each column is a destination bin and holds the share moved there.
df.list <- list(
  litter = data.frame(lost = 50, pot = 15, pool = 35),
  lost = data.frame(pool = 30, pot = 10, lost = 60),
  bin = data.frame(uncollected = 4, collected = 96),
  dump = data.frame(litter = 50, burn = 20),
  uncollected = data.frame(litter = 25, burn = 55, dump = 20),
  collected = data.frame(litter = 3.7, dump = 54.3, recycle = 30, burn = 12)
)

但是我有一个大型数据库（csv 格式），其中包含多个年份和多个位置；我需要按年份和位置把数据分开，然后为每一年生成一个类似于以下输出的列表。

> df.list
$litter
  lost pot pool
1   50  15   35

$lost
  pool pot lost
1   30  10   60

$bin
  uncollected collected
1           4        96

$dump
  litter burn
1     50   20

$uncollected
  litter burn dump
1     25   55   20

$collected
  litter dump recycle burn
1    3.7 54.3      30   12

我尝试了其他已发布的示例，但是我不知道如何分离 item 列中的每个“bin”，然后以列表格式把对应的 node 及其关联的 value 嵌套在该 item 之下……下面是我正在使用的数据框的模拟示例：

# Mock data: the same 15 item -> node flows repeated for 2010, 2011 and 2012
# (45 rows in total, 15 consecutive rows per year).
df <- data.frame(
  year = rep(c(2010, 2011, 2012), each = 15),
  item = rep(
    rep(
      c("litter", "lost", "bin", "uncollected", "collected"),
      times = c(3, 3, 2, 3, 4)
    ),
    times = 3
  ),
  node = rep(
    c(
      "lost", "pot", "pool",
      "pool", "pot", "lost",
      "collected", "uncollected",
      "litter", "burn", "dump",
      "litter", "dump", "recycle", "burn"
    ),
    times = 3
  ),
  value = rep(
    c(
      50.0, 15.0, 35.0,
      30.0, 10.0, 60.0,
      96.0, 4.0,
      25.0, 55.0, 20.0,
      3.7, 54.3, 30.0, 12.0
    ),
    times = 3
  )
)

非常感谢！

Answer 1

这是我临时拼凑出来的，但应该可以满足您的要求。这里我笨拙地使用了循环，不过我相信可以改用 lapply 来提高效率。

# Build a flat list holding one single-row data frame per item/year pair.
# Elements are named "<item>.<year>"; the columns are the node names and the
# single row holds the corresponding numeric values.
outlist <- list()
for (i in unique(df$year)) {

  for (j in unique(df$item)) {

    list.item.name <- paste(j, i, sep = ".")

    tdf <- df[which(df$year == i & df$item == j), c("node", "value")]

    # Skip item/year pairs that have no rows in df.
    if (nrow(tdf) == 0) next

    # Transpose only the numeric values so they stay numeric. Transposing both
    # columns would coerce everything to character and leave no `node` column
    # to take the names from (`newdf$node` would be NULL), producing an
    # unnamed two-row character frame instead of one named numeric row.
    newdf <- data.frame(t(tdf$value))
    names(newdf) <- as.character(tdf$node)

    outlist[[list.item.name]] <- newdf

  }

}

这将产生：

> outlist
$`litter.2010`

1 lost pot pool
2   50  15   35

$lost.2010

1 pool pot lost
2   30  10   60

$bin.2010

1 collected uncollected
2        96           4

$uncollected.2010

1 litter burn dump
2     25   55   20

$collected.2010

1 litter dump recycle burn
2    3.7 54.3    30.0 12.0

$litter.2011

1 lost pot pool
2   50  15   35

$lost.2011

1 pool pot lost
2   30  10   60

$bin.2011

1 collected uncollected
2        96           4

$uncollected.2011

1 litter burn dump
2     25   55   20

$collected.2011

1 litter dump recycle burn
2    3.7 54.3    30.0 12.0

$litter.2012

1 lost pot pool
2   50  15   35

$lost.2012

1 pool pot lost
2   30  10   60

$bin.2012

1 collected uncollected
2        96           4

$uncollected.2012

1 litter burn dump
2     25   55   20

$collected.2012

1 litter dump recycle burn
2    3.7 54.3    30.0 12.0

Answer 2

两次调用dlply()是否可以提供所需的输出？

# Nest the data twice: the outer list is keyed by year, and within each year
# the rows (with the year column dropped) are split into one data frame per
# item.
df1 <- dlply(df, .(year), function(year_rows) {
  dlply(select(year_rows, -year), .(item))
})

# Print the nested list.
df1

$`2010`
$bin
  item        node value
1  bin   collected    96
2  bin uncollected     4

$collected
       item    node value
1 collected  litter   3.7
2 collected    dump  54.3
3 collected recycle  30.0
4 collected    burn  12.0

$litter
    item node value
1 litter lost    50
2 litter  pot    15
3 litter pool    35

$lost
  item node value
1 lost pool    30
2 lost  pot    10
3 lost lost    60
...

# Show the structure of the nested list (years at the top level, items below).
str(df1)
List of 3
 $ 2010:List of 5
  ..$ bin        :'data.frame': 2 obs. of  3 variables:
  .. ..$ item : Factor w/ 5 levels "bin","collected",..: 1 1
  .. ..$ node : Factor w/ 9 levels "burn","collected",..: 2 9
  .. ..$ value: num [1:2] 96 4
  .. ..- attr(*, "vars")= chr "item"
  ..$ collected  :'data.frame': 4 obs. of  3 variables:
  .. ..$ item : Factor w/ 5 levels "bin","collected",..: 2 2 2 2
  .. ..$ node : Factor w/ 9 levels "burn","collected",..: 4 3 8 1
  .. ..$ value: num [1:4] 3.7 54.3 30 12
  .. ..- attr(*, "vars")= chr "item"
  ..$ litter     :'data.frame': 3 obs. of  3 variables:
  .. ..$ item : Factor w/ 5 levels "bin","collected",..: 3 3 3
  .. ..$ node : Factor w/ 9 levels "burn","collected",..: 5 7 6
  .. ..$ value: num [1:3] 50 15 35
  .. ..- attr(*, "vars")= chr "item"
  ..$ lost       :'data.frame': 3 obs. of  3 variables:
  .. ..$ item : Factor w/ 5 levels "bin","collected",..: 4 4 4
  .. ..$ node : Factor w/ 9 levels "burn","collected",..: 6 7 5
  .. ..$ value: num [1:3] 30 10 60
  .. ..- attr(*, "vars")= chr "item"
  ..$ uncollected:'data.frame': 3 obs. of  3 variables:
  .. ..$ item : Factor w/ 5 levels "bin","collected",..: 5 5 5
  .. ..$ node : Factor w/ 9 levels "burn","collected",..: 4 1 3
  .. ..$ value: num [1:3] 25 55 20
  .. ..- attr(*, "vars")= chr "item"
  ..- attr(*, "class")= chr [1:2] "split" "list"
...

编辑
首先尝试使用spread()，然后将其传递给两个dlply()调用。我认为这与您所需的输出有点接近。您可能可以修改dlply()中的调用以删除具有NA值的变量。

# Widen first (one column per node, one row per year/item), then nest by year
# and by item. Nodes that do not belong to an item show up as NA columns.
dlply(spread(df, node, value), .(year), function(year_rows) {
  dlply(select(year_rows, -year), .(item))
})
$`2010`
$bin
  item burn collected dump litter lost pool pot recycle uncollected
1  bin   NA        96   NA     NA   NA   NA  NA      NA           4

$collected
       item burn collected dump litter lost pool pot recycle uncollected
1 collected   12        NA 54.3    3.7   NA   NA  NA      30          NA

$litter
    item burn collected dump litter lost pool pot recycle uncollected
1 litter   NA        NA   NA     NA   50   35  15      NA          NA

$lost
  item burn collected dump litter lost pool pot recycle uncollected
1 lost   NA        NA   NA     NA   60   30  10      NA          NA

$uncollected
         item burn collected dump litter lost pool pot recycle uncollected
1 uncollected   55        NA   20     25   NA   NA  NA      NA          NA


$`2011`
$bin
  item burn collected dump litter lost pool pot recycle uncollected
1  bin   NA        96   NA     NA   NA   NA  NA      NA           4

$collected
       item burn collected dump litter lost pool pot recycle uncollected
1 collected   12        NA 54.3    3.7   NA   NA  NA      30          NA

$litter
    item burn collected dump litter lost pool pot recycle uncollected
1 litter   NA        NA   NA     NA   50   35  15      NA          NA

$lost
  item burn collected dump litter lost pool pot recycle uncollected
1 lost   NA        NA   NA     NA   60   30  10      NA          NA

$uncollected
         item burn collected dump litter lost pool pot recycle uncollected
1 uncollected   55        NA   20     25   NA   NA  NA      NA          NA


$`2012`
$bin
  item burn collected dump litter lost pool pot recycle uncollected
1  bin   NA        96   NA     NA   NA   NA  NA      NA           4

$collected
       item burn collected dump litter lost pool pot recycle uncollected
1 collected   12        NA 54.3    3.7   NA   NA  NA      30          NA

$litter
    item burn collected dump litter lost pool pot recycle uncollected
1 litter   NA        NA   NA     NA   50   35  15      NA          NA

$lost
  item burn collected dump litter lost pool pot recycle uncollected
1 lost   NA        NA   NA     NA   60   30  10      NA          NA

$uncollected
         item burn collected dump litter lost pool pot recycle uncollected
1 uncollected   55        NA   20     25   NA   NA  NA      NA          NA

Answer 3

编辑：不确定为什么被投了反对票。无论如何，下面的代码给出类似的输出，只是得到的是向量而不是 data.frame：

# For every item/year combination, return the values as a vector whose
# element names are the node labels.
by(
  df[, -(1:2)],
  INDICES = list(df$item, df$year),
  FUN = function(DF) setNames(DF$value, as.character(DF$node))
)

下面的代码也给出相似的输出，只是名称后附加了年份，并且输出是一组 data.frame。

# One data frame per item/year pair, named "<item>.<year>".
split(df, df[c("item", "year")])

$bin.2010
  year item        node value
7 2010  bin   collected    96
8 2010  bin uncollected     4

$collected.2010
   year      item    node value
12 2010 collected  litter   3.7
13 2010 collected    dump  54.3
14 2010 collected recycle  30.0
15 2010 collected    burn  12.0

$litter.2010
  year   item node value
1 2010 litter lost    50
2 2010 litter  pot    15
3 2010 litter pool    35

$lost.2010
  year item node value
4 2010 lost pool    30
5 2010 lost  pot    10
6 2010 lost lost    60

$uncollected.2010
   year        item   node value
9  2010 uncollected litter    25
10 2010 uncollected   burn    55
11 2010 uncollected   dump    20

$bin.2011
   year item        node value
22 2011  bin   collected    96
23 2011  bin uncollected     4

#rest truncated

从数据框中列出：A列是变量，B列与C列中的值相关联

3 个答案: