我有一个这种形式的data.frame:
C1 C2 C3 support
1 {A} {B} <NA> 1.00
2 {D} {A} <NA> 0.50
3 {F} {A} <NA> 0.30
4 {D} {F} {A} 0.75
5 {B,F} {A} <NA> 0.50
6 {D} {B,F} {A} 0.25
我希望将上面的data.frame转换为以下内容:
FROM TO support
1 {A} {B} 1.00
2 {D} {A} 0.50
3 {F} {A} 0.30
4 {D} {F} 0.75
5 {F} {A} 0.75
6 {B,F} {A} 0.50
7 {D} {B,F} 0.25
8 {B,F} {A} 0.25
也就是(`C1` 和 `C2`)以及(`C2` 和 `C3`)之间的每一个链接。
提前谢谢。
答案 0 :(得分:3)
一个选项是:取数据集中第2列到第4列的子集(只保留第三列为非NA值的行),将它与去掉第3列的完整数据集进行`rbind`。具体做法是把这两部分放入一个`list`,并使用`rbindlist`(来自`data.table`)对该`list`的元素进行行绑定。如果需要,我们可以使用`setnames`更改列名称。
library(data.table)
# Stack two sets of links and label the first two columns FROM / TO.
# The trailing [] forces the result to print, since setnames() returns invisibly.
setnames(
  rbindlist(
    list(
      df[, c(1, 2, 4)],          # column 1 -> column 2, for every row
      df[!is.na(df[, 3]), 2:4]   # column 2 -> column 3, only where column 3 is present
    )
  ),
  1:2,
  c("FROM", "TO")
)[]
# FROM TO support
#1: {A} {B} 1.00
#2: {D} {A} 0.50
#3: {F} {A} 0.30
#4: {D} {F} 0.75
#5: {B,F} {A} 0.50
#6: {D} {B,F} 0.25
#7: {F} {A} 0.75
#8: {B,F} {A} 0.25
答案 1 :(得分:2)
这是另一种可行的方法。它预先计算一个行索引向量`r`,对`3`列中具有非NA值的每一行重复一次。然后,它使用该行索引向量对data.table进行索引,并使用`ifelse()`以`duplicated(r)`作为判断条件,构建所需的`FROM`和`TO`列,从而为每一行选择正确的列值。
## Row index: every row once, plus one repeat for rows whose column `3` is non-NA.
r <- rep(seq_len(nrow(dt)), 1L + (!is.na(dt$`3`)))
## A repeated (duplicated) index marks the second link of a row: `2` -> `3`.
## The first occurrence is always the `1` -> `2` link.
dt[
  r,
  .(
    FROM = ifelse(d <- duplicated(r), `2`, `1`),
    TO = ifelse(d, `3`, `2`),
    support
  )
]
## FROM TO support
## 1: {A} {B} 1.00
## 2: {D} {A} 0.50
## 3: {F} {A} 0.30
## 4: {D} {F} 0.75
## 5: {F} {A} 0.75
## 6: {B,F} {A} 0.50
## 7: {D} {B,F} 0.25
## 8: {B,F} {A} 0.25
数据:
## Sample input: columns `1`, `2`, `3` hold the chained item sets (with NA in
## `3` for two-element chains) and `support` holds the value for each row.
dt <- data.table(
  `1` = c("{A}", "{D}", "{F}", "{D}", "{B,F}", "{D}"),
  `2` = c("{B}", "{A}", "{A}", "{F}", "{A}", "{B,F}"),
  `3` = c(NA, NA, NA, "{A}", NA, "{A}"),
  support = c(1, 0.5, 0.3, 0.75, 0.5, 0.25)
)
library(microbenchmark);
library(data.table);
## Stack the (col 1, col 2, col 4) links on top of the (col 2, col 3, col 4)
## links taken from rows whose third column is non-NA, then rename the first
## two columns to FROM / TO.
##
## df: a data.frame indexed by position (columns 1-3 are the chain, 4 the value).
## Returns the data.table produced by rbindlist(), invisibly (via setnames()).
akrun <- function(df) {
  first_links <- df[, c(1, 2, 4)]
  second_links <- df[!is.na(df[, 3]), 2:4]
  stacked <- rbindlist(list(first_links, second_links))
  setnames(stacked, 1:2, c("FROM", "TO"))
}
## Build the FROM / TO / support table by indexing `dt` with a row-index vector
## in which rows with a non-NA `3` column appear twice.
##
## dt: a data.table with columns `1`, `2`, `3` and `support`.
## Returns a data.table with columns FROM, TO and support.
bgoldst <- function(dt) {
  ## 1 copy for rows where `3` is NA, 2 copies otherwise.
  copies <- 1L + (!is.na(dt$`3`))
  r <- rep(seq_len(nrow(dt)), copies)
  ## The second (duplicated) copy of a row yields the `2` -> `3` link,
  ## the first copy yields the `1` -> `2` link.
  dt[
    r,
    .(
      FROM = ifelse(d <- duplicated(r), `2`, `1`),
      TO = ifelse(d, `3`, `2`),
      support
    )
  ]
}
## Normalize a result so that outputs differing only in class, column order or
## row order compare equal.
##
## df: a data.frame, or anything as.data.frame() accepts (e.g. a data.table).
## Returns a plain data.frame with columns sorted by name and rows sorted by
## all columns, taken left to right after the column sort.
harmonize <- function(df) {
  df <- as.data.frame(df) ## coerce to data.frame
  df <- df[order(names(df))] ## order columns
  ## unname() passes the columns positionally, so a column that happens to be
  ## called "decreasing", "na.last" or "method" is used as a sort key instead
  ## of being matched to the corresponding argument of order().
  row_order <- do.call(order, unname(as.list(df)))
  ## drop = FALSE keeps a one-column input as a data.frame instead of
  ## collapsing it to a bare vector.
  df[row_order, , drop = FALSE] ## order rows
} ## end harmonize()
## OP's example
df <- data.frame(
  `1` = c("{A}", "{D}", "{F}", "{D}", "{B,F}", "{D}"),
  `2` = c("{B}", "{A}", "{A}", "{F}", "{A}", "{B,F}"),
  `3` = c(NA, NA, NA, "{A}", NA, "{A}"),
  support = c(1, 0.5, 0.3, 0.75, 0.5, 0.25),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
dt <- as.data.table(df)
## Check that both solutions agree once class, column order and row order
## have been normalized by harmonize().
ex <- harmonize(akrun(df))
all.equal(ex, harmonize(bgoldst(dt)), check.attributes = FALSE)
## [1] TRUE
microbenchmark(akrun(df), bgoldst(dt))
## Unit: microseconds
## expr min lq mean median uq max neval
## akrun(df) 274.126 299.9995 352.6557 319.243 345.1165 1680.675 100
## bgoldst(dt) 882.247 916.4595 1022.4152 962.219 1043.0450 2317.450 100
## scale test
set.seed(1L)
NR <- 1e6L
probNA <- 4 / 6
## Random input of NR rows; column `3` is NA with probability probNA and a
## uniformly chosen letter otherwise. The sample() calls stay in the original
## order so the seeded data set is unchanged.
df <- data.frame(
  `1` = sample(LETTERS, NR, replace = TRUE),
  `2` = sample(LETTERS, NR, replace = TRUE),
  `3` = sample(
    c(NA, LETTERS),
    NR,
    replace = TRUE,
    prob = c(probNA, rep((1 - probNA) / length(LETTERS), length(LETTERS)))
  ),
  support = rnorm(NR),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
dt <- as.data.table(df)
ex <- harmonize(akrun(df))
all.equal(ex, harmonize(bgoldst(dt)), check.attributes = FALSE)
## [1] TRUE
microbenchmark(akrun(df), bgoldst(dt))
## Unit: milliseconds
## expr min lq mean median uq max neval
## akrun(df) 52.91901 58.77635 71.72132 67.36425 72.08575 138.3405 100
## bgoldst(dt) 908.32926 941.77597 980.96350 979.60482 1003.23541 1139.1633 100
结论:akrun要快得多。