我正在研究一种清理数据的方案,但不能100%确定最佳做法是什么。我找到了一个可行的解决方案,但想知道是否有更简单的方法(特别是在需要扩展它的时候)。我想要做的是:把数据框某一列中用分号分隔的元素拆开,让拆出的每个组件分别与同一行另一列的元素配对,然后将结果合并到一个新的数据框中。示例如下:
# Example input: one row per class, with the people in each class stored as a
# single ";"-delimited string.
test <- data.frame(
  class = c("a1", "a2", "a3", "a4"),
  person = c("p1;p3;p4", "p2;p4", "p4;p5;p6", "p1;p5"),
  stringsAsFactors = FALSE
)

# Split every `person` string once; the result is a list with one character
# vector per row of `test`.
person_parts <- strsplit(test$person, ";", fixed = TRUE)

# test1: all individual persons, in row order.
test1 <- unlist(person_parts)

# test2: the class of each row, repeated once per person found in that row, so
# that test1[k] and test2[k] belong together.
test2 <- rep(test$class, times = lengths(person_parts))
答案 0 :(得分:1)
如果我完全理解你要做的事情,那么这个略显冗长的oneliner会做到这一点:
# For each row of `test`, pair the first field with every piece obtained by
# splitting the second field on ";", then stack the per-row grids into one
# data frame.
do.call(
  rbind,
  apply(test, 1, function(row) {
    people <- unlist(strsplit(row[2], split = ";"))
    expand.grid(row[1], people)
  })
)
Var1 Var2
1 a1 p1
2 a1 p3
3 a1 p4
4 a2 p2
5 a2 p4
6 a3 p4
7 a3 p5
8 a3 p6
9 a4 p1
10 a4 p5
我对原始数据框的每一行使用 expand.grid,以第一个变量和按 ; 拆分后的第二个变量作为输入。由于 apply 的输出是一个列表,我使用 do.call 和 rbind 将其重新组合成数据框。
答案 1 :(得分:1)
我们可以使用来自 splitstackshape 包的 cSplit(此处我提到了一个包,并且只使用了这一个包,没有引入其他包)按分隔符 ';' 拆分 'person' 列,并将 direction 指定为 'long',以便在拆分后重塑为"长"格式。
(原回答还提到了另一种不使用任何包的单行写法,但其代码未包含在此处。)
library(splitstackshape)
# Split the ";"-delimited `person` column and reshape to "long" format.
cSplit(test, splitCols = "person", sep = ";", direction = "long")
# class person
# 1: a1 p1
# 2: a1 p3
# 3: a1 p4
# 4: a2 p2
# 5: a2 p4
# 6: a3 p4
# 7: a3 p5
# 8: a3 p6
# 9: a4 p1
#10: a4 p5
答案 2 :(得分:1)
使用 tidyr:
library(dplyr)
library(tidyr)
# Split the ";"-delimited `person` column into one row per value.
# `separate_rows()` copes with any number of components per cell, so there is
# no need to guess a maximum column count, reshape wide-to-long, or drop a
# helper column afterwards. Passing `sep = ";"` splits on the semicolon only,
# not on every non-alphanumeric character.
test %>%
  separate_rows(person, sep = ";")
# class person
# 1 a1 p1
# 2 a1 p3
# 3 a1 p4
# 4 a2 p2
# 5 a2 p4
# 6 a3 p4
# 7 a3 p5
# 8 a3 p6
# 9 a4 p1
# 10 a4 p5