我创建了一个最小的例子来演示我的问题。
我有一个数据框,我想根据各列中的值来重排它:把单元格中的值变成新的列名,把原来的列名变成单元格的值。换句话说,我想把这样的数据框:
structure(list(index = c("a", "b", "c"), A = c("zz", "yy", "xx"), B = c("yy", NA, "vv"), C = c("xx", "ww", "vv")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L), .Names = c("index", "A", "B", "C"))
转换成类似这样的数据框:
structure(list(index = c("a", "b", "c", "c"), vv = c(NA, NA, "B", "C"), ww = c(NA, "C", NA, NA), xx = c("C", NA, "A", NA), yy = c("B", "A", NA, NA), zz = c("A", NA, NA, NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -4L), .Names = c("index", "vv", "ww", "xx", "yy", "zz"))
更新
虽然至少有一个解决方案适用于我的最小示例,但它似乎不适用于我的主应用程序。下面发布的是我实际df的片段。
structure(list(index = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L), .Label = c("16fA", "16fB", "16gA", "16gB", "16gC", "16gD", "16gE", "16gF", "16gG", "16gP"), class = "factor"), AA = structure(c(1L, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), .Label = "GEC (1)", class = "factor"), BB = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), .Label = "BER (3)", class ="factor"),
CC = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "ZUR (3)", class = "factor"), DD = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA, 2L, NA, NA, NA, 2L, NA, NA,
NA, 2L, NA, NA, NA, 2L, NA, NA, NA, 2L, NA, NA, NA, 2L, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), .Label = c("LIK (3)", "SLB (3)"), class = "factor")), .Names = c("index", "AA", "BB", "CC", "DD"), row.names = c(NA, -50L), class = "data.frame")
答案 0 :(得分:2)
可以使用 reshape2 包中的 melt/dcast(或 data.table 包中的同名函数)来完成此操作:
library(reshape2)
# Convert the dataset to long format: one row per (index, original column)
# cell, dropping the cells that are NA. `id.vars` is spelled out in full
# instead of relying on partial matching of the abbreviated `id.var`.
d2 <- melt(d1, id.vars = "index", na.rm = TRUE)
# Create a sequence column grouped by value and index, so that a value that
# occurs in several columns of the same index is numbered 1, 2, ... and ends
# up on separate rows of the wide result.
d2$i1 <- with(d2, ave(seq_along(index), value, index, FUN = seq_along))
# Convert from long to wide format: one column per distinct value, filled
# with the name of the column the value came from. `[-1]` drops the helper
# column i1 from the result.
dcast(d2, i1 + index ~ value, value.var = "variable")[-1]
# index vv ww xx yy zz
#1 a <NA> <NA> C B A
#2 b <NA> C <NA> A <NA>
#3 c B <NA> A <NA> <NA>
#4 c C <NA> <NA> <NA> <NA>
如果需要使用 dplyr/tidyr,可以使用 gather/spread:
library(dplyr)
library(tidyr)
# Reshape to long format (one row per non-NA cell), number repeated
# (value, index) pairs, then spread the values out as columns that hold the
# name of the column each value came from.
gather(d1, variable, value, -index, na.rm = TRUE) %>%
  group_by(value, index) %>%
  mutate(i1 = row_number()) %>%
  # The grouping is only needed for the counter above; drop it so that it
  # does not leak into spread()/select() or into the returned data frame.
  ungroup() %>%
  spread(value, variable) %>%
  # i1 only served to keep repeated pairs on separate rows.
  select(-i1)
# index vv ww xx yy zz
# (chr) (chr) (chr) (chr) (chr) (chr)
# 1 a NA NA C B A
# 2 b NA C NA A NA
# 3 c B NA A NA NA
# 4 c C NA NA NA NA
对于更新后的数据集,dcast 的输出为:
# Same long-to-wide cast as for the minimal example; presumably `d2` has been
# rebuilt from the updated data with the melt/ave steps first (the printed
# columns are the updated data's values). `[-1]` drops the helper column i1.
dcast(data = d2, formula = i1 + index ~ value, value.var = "variable")[-1]
# index BER (3) GEC (1) LIK (3) SLB (3) ZUR (3)
#1 16fA <NA> AA <NA> <NA> <NA>
#2 16fB <NA> AA <NA> <NA> <NA>
#3 16gA <NA> AA <NA> DD CC
#4 16gB <NA> AA <NA> DD CC
#5 16gC BB AA <NA> DD <NA>
#6 16gD <NA> AA <NA> DD <NA>
#7 16gE <NA> AA <NA> DD <NA>
#8 16gF <NA> AA <NA> DD CC
#9 16gG BB AA <NA> DD <NA>
#10 16gP BB AA DD <NA> <NA>
而 gather/spread 的输出为:
# index BER (3) GEC (1) LIK (3) SLB (3) ZUR (3)
# (fctr) (chr) (chr) (chr) (chr) (chr)
#1 16fA NA AA NA NA NA
#2 16fB NA AA NA NA NA
#3 16gA NA AA NA DD CC
#4 16gB NA AA NA DD CC
#5 16gC BB AA NA DD NA
#6 16gD NA AA NA DD NA
#7 16gE NA AA NA DD NA
#8 16gF NA AA NA DD CC
#9 16gG BB AA NA DD NA
#10 16gP BB AA DD NA NA
答案 1 :(得分:0)
假设您的初始数据框名为 d1,您可以结合使用 data.table 包中的 melt 和 dcast 操作:
library(data.table)
# Melt d1 to long format (setDT() turns d1 into a data.table first), then cast
# so that every distinct value becomes a column holding the name(s) of the
# column(s) it came from. toString() collapses several names that fall into
# the same (index, value) cell into one ", "-separated string.
#
# The collapsed strings are then split back into one row per name. The split
# pattern has to be ", " -- exactly what toString() inserts; splitting on ","
# alone leaves a leading space on every name after the first (" C", not "C").
#
# NOTE(review): within one index, a column with fewer names than the longest
# one appears to be recycled rather than padded with NA (the printed result
# repeats "A" in xx for the second `c` row) -- confirm this is acceptable.
#
# Finally, the "NA" column that the cast creates from the NA cells is removed.
dcast(melt(setDT(d1), "index"),
      index ~ value,
      value.var = "variable",
      fun.aggregate = toString,
      fill = NA_character_)[, lapply(.SD, function(x) unlist(tstrsplit(x, ", ", fixed = TRUE))),
                            by = index][, "NA" := NULL]
给出:
index vv ww xx yy zz
1: a NA NA C B A
2: b NA C NA A NA
3: c B NA A NA NA
4: c C NA A NA NA
对于新的数据集(此处命名为 df1),您可以这样做:
# Long format first, then one column per distinct value; toString() joins the
# source column names whenever several of them fall into the same
# (index, value) cell, and empty cells are filled with NA.
df2 <- dcast(
  data = melt(setDT(df1), id.vars = "index"),
  formula = index ~ value,
  value.var = "variable",
  fun.aggregate = toString,
  fill = NA_character_
)
# Remove the column that the cast generates from the NA cells; `:=` updates
# df2 by reference.
df2[, "NA" := NULL]
给出:
> df2
index BER (3) GEC (1) LIK (3) SLB (3) ZUR (3)
1: 16fA NA AA NA NA NA
2: 16fB NA AA NA NA NA
3: 16gA NA AA NA DD CC
4: 16gB NA AA NA DD CC
5: 16gC BB AA NA DD NA
6: 16gD NA AA NA DD NA
7: 16gE NA AA NA DD NA
8: 16gF NA AA NA DD CC
9: 16gG BB AA NA DD NA
10: 16gP BB AA DD NA NA