如何将列值替换为列名

时间:2016-01-31 19:55:49

标签: r dataframe reshape

我创建了一个最小的例子来演示我的问题。

我有一个数据框,我想根据各列中的值来重新组织它:让原来的单元格值成为新的列名,而原来的列名成为对应单元格的值。换句话说,我想把这样的数据框:

structure(list(index = c("a", "b", "c"), A = c("zz", "yy", "xx"), B = c("yy", NA, "vv"), C = c("xx", "ww", "vv")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L), .Names = c("index", "A", "B", "C"))

转换成类似这样的数据框:

structure(list(index = c("a", "b", "c", "c"), vv = c(NA, NA, "B", "C"), ww = c(NA, "C", NA, NA), xx = c("C", NA, "A", NA),     yy = c("B", "A", NA, NA), zz = c("A", NA, NA, NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -4L), .Names = c("index", "vv", "ww", "xx", "yy", "zz"))

更新

虽然至少有一个解决方案适用于我的最小示例,但它似乎不适用于我的主应用程序。下面发布的是我实际df的片段。

structure(list(index = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L), .Label = c("16fA", "16fB", "16gA", "16gB", "16gC", "16gD", "16gE", "16gF", "16gG", "16gP"), class = "factor"), AA = structure(c(1L, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), .Label = "GEC (1)", class = "factor"),     BB = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,     NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA,     NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 
NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), .Label = "BER (3)", class ="factor"), 
CC = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,     NA, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA,     NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA,     NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "ZUR (3)", class = "factor"),     DD = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,     NA, NA, NA, NA, 2L, NA, NA, NA, 2L, NA, NA, NA, 2L, NA, NA, 
NA, 2L, NA, NA, NA, 2L, NA, NA, NA, 2L, NA, NA, NA, 2L, NA,     NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), .Label = c("LIK (3)",     "SLB (3)"), class = "factor")), .Names = c("index", "AA", "BB", "CC", "DD"), row.names = c(NA, -50L), class = "data.frame")

2 个答案:

答案 0 :(得分:2)

可以使用 reshape2 包中的 melt/dcast(或 data.table 中的同名函数)来完成此操作:

library(reshape2)
# Reshape to long format: one row per (index, variable, value) cell.
# na.rm = TRUE drops the cells whose value is NA.
d2 <- melt(d1, id.vars = "index", na.rm = TRUE)
# Number the rows within each (value, index) pair so that a value occurring
# in several columns of the same index lands on separate output rows.
d2$i1 <- with(d2, ave(seq_along(index), value, index, FUN = seq_along))
# Cast back to wide format: one column per distinct value, filled with the
# name of the column the value came from. [-1] drops the helper column i1.
dcast(d2, i1 + index ~ value, value.var = "variable")[-1]
#  index   vv   ww   xx   yy   zz
#1    a <NA> <NA>    C    B    A
#2    b <NA>    C <NA>    A <NA>
#3    c    B <NA>    A <NA> <NA>
#4    c    C <NA> <NA> <NA> <NA>

如果我们需要使用dplyr/tidyr,请使用gather/spread

library(dplyr)
library(tidyr)
# Long format: one row per non-NA cell, keeping the source column name in
# `variable` and the cell content in `value`.
gather(d1, variable, value, -index, na.rm = TRUE) %>%
  # Number duplicates of the same (value, index) pair so that spread() sees
  # a unique row identifier for every cell.
  group_by(value, index) %>%
  mutate(i1 = row_number()) %>%
  # Wide format: one column per distinct value, filled with the column name.
  spread(value, variable) %>%
  # Remove any grouping left over from group_by() so the result is a plain
  # table for downstream code.
  ungroup() %>%
  select(-i1)
#  index    vv    ww    xx    yy    zz
#   (chr) (chr) (chr) (chr) (chr) (chr)
# 1     a    NA    NA     C     B     A
# 2     b    NA     C    NA     A    NA
# 3     c     B    NA     A    NA    NA
# 4     c     C    NA    NA    NA    NA

根据更新的数据集,dcast输出

# Same wide cast as for the minimal example: one column per distinct value,
# with [-1] dropping the helper column i1.
# NOTE(review): presumably d2 has first been rebuilt from the updated data
# with the melt()/ave() steps shown earlier -- confirm before running.
dcast(d2, i1+index~value, value.var='variable')[-1]
#    index BER (3) GEC (1) LIK (3) SLB (3) ZUR (3)
#1   16fA    <NA>      AA    <NA>    <NA>    <NA>
#2   16fB    <NA>      AA    <NA>    <NA>    <NA>
#3   16gA    <NA>      AA    <NA>      DD      CC
#4   16gB    <NA>      AA    <NA>      DD      CC
#5   16gC      BB      AA    <NA>      DD    <NA>
#6   16gD    <NA>      AA    <NA>      DD    <NA>
#7   16gE    <NA>      AA    <NA>      DD    <NA>
#8   16gF    <NA>      AA    <NA>      DD      CC
#9   16gG      BB      AA    <NA>      DD    <NA>
#10  16gP      BB      AA      DD    <NA>    <NA>

gather/spread输出

#     index BER (3) GEC (1) LIK (3) SLB (3) ZUR (3)
#   (fctr)   (chr)   (chr)   (chr)   (chr)   (chr)
#1    16fA      NA      AA      NA      NA      NA
#2    16fB      NA      AA      NA      NA      NA
#3    16gA      NA      AA      NA      DD      CC
#4    16gB      NA      AA      NA      DD      CC
#5    16gC      BB      AA      NA      DD      NA
#6    16gD      NA      AA      NA      DD      NA
#7    16gE      NA      AA      NA      DD      NA
#8    16gF      NA      AA      NA      DD      CC
#9    16gG      BB      AA      NA      DD      NA
#10   16gP      BB      AA      DD      NA      NA

答案 1 :(得分:0)

假设您的初始数据框名为 d1,您可以结合使用 data.table 包中的 melt 和 dcast 操作:

library(data.table)
# melt() to long format, then dcast() to one row per index with one column per
# distinct value; toString() collapses several source columns into one cell
# such as "B, C". Cells without a match are filled with NA.
# NOTE: setDT() converts d1 to a data.table by reference (d1 itself changes).
#
# The second step splits the collapsed cells back into separate rows per
# index. toString() joins with ", " (comma + space), so the split must use
# exactly that separator; splitting on "," alone would leave a leading space
# (" C") in every element except the first.
# Length-1 columns are recycled within a group, so a value that occurs once
# for an index is repeated on every row of that index.
#
# The last step drops the column literally named "NA", which dcast() creates
# from the NA cells of the input.
dcast(melt(setDT(d1), "index"),
      index ~ value,
      value.var = "variable",
      fun.aggregate = toString,
      fill = NA_character_)[, lapply(.SD, function(x) unlist(tstrsplit(x, ", ", fixed = TRUE))),
                            by = index][, "NA" := NULL]

给出:

   index vv ww xx yy zz
1:     a NA NA  C  B  A
2:     b NA  C NA  A NA
3:     c  B NA  A NA NA
4:     c  C NA  A NA NA

使用新数据集,您可以:

# Wide table for the new dataset: one row per index, one column per distinct
# value, each cell holding the comma-separated names of the source columns.
# setDT() converts df1 to a data.table by reference.
df2 <- dcast(
  melt(setDT(df1), id.vars = "index"),
  index ~ value,
  value.var = "variable",
  fun.aggregate = toString,
  fill = NA_character_
)
# Drop, by reference, the column that dcast() created from the NA cells.
df2[, "NA" := NULL]

给出:

> df2
    index BER (3) GEC (1) LIK (3) SLB (3) ZUR (3)
 1:  16fA      NA      AA      NA      NA      NA
 2:  16fB      NA      AA      NA      NA      NA
 3:  16gA      NA      AA      NA      DD      CC
 4:  16gB      NA      AA      NA      DD      CC
 5:  16gC      BB      AA      NA      DD      NA
 6:  16gD      NA      AA      NA      DD      NA
 7:  16gE      NA      AA      NA      DD      NA
 8:  16gF      NA      AA      NA      DD      CC
 9:  16gG      BB      AA      NA      DD      NA
10:  16gP      BB      AA      DD      NA      NA