R

时间:2016-07-13 08:58:26

标签: r dataframe

我有一个这种形式的data.frame:

        C1    C2   C3 support
1      {A}   {B} <NA>    1.00
2      {D}   {A} <NA>    0.50
3      {F}   {A} <NA>    0.30
4      {D}   {F}  {A}    0.75
5    {B,F}   {A} <NA>    0.50
6      {D} {B,F}  {A}    0.25

我希望将上面的data.frame转换为以下内容:

      FROM    TO      support
1      {A}   {B}         1.00
2      {D}   {A}         0.50
3      {F}   {A}         0.30
4      {D}   {F}         0.75
5      {F}   {A}         0.75
6    {B,F}   {A}         0.50
7      {D} {B,F}         0.25
8    {B,F}   {A}         0.25

也就是(`C1`, `C2`)和(`C2`, `C3`)之间的每一个链接。
提前谢谢。

2 个答案:

答案 0 :(得分:3)

一个选项是:取数据集的一个子集(第2列到第4列,且只保留第三列为非NA的行),把它与去掉第3列的完整数据集一起放进一个`list`,再用`rbindlist`(来自`data.table`)对该`list`的元素进行行绑定。如果需要,我们可以使用`setnames`更改列名称。

数据

library(data.table)
# Stack two positional slices of `df` into one FROM/TO/support table:
#   * columns 1, 2 and 4 of every row                    -> (col 1 -> col 2) links
#   * columns 2, 3 and 4 of rows whose 3rd column is set -> (col 2 -> col 3) links
# The slices carry different column names (and `2` sits at a different
# position in each), so they must be bound strictly by position;
# `use.names = FALSE` states that explicitly instead of relying on
# rbindlist()'s default name handling.
# `df[[3]]` extracts the third column as a plain vector for the NA mask.
# The trailing `[]` makes the result print, since setnames() returns invisibly.
setnames(
  rbindlist(
    list(df[, c(1, 2, 4)], df[!is.na(df[[3]]), 2:4]),
    use.names = FALSE
  ),
  1:2, c("FROM", "TO")
)[]
#    FROM    TO support
#1:   {A}   {B}    1.00
#2:   {D}   {A}    0.50
#3:   {F}   {A}    0.30
#4:   {D}   {F}    0.75
#5: {B,F}   {A}    0.50
#6:   {D} {B,F}    0.25
#7:   {F}   {A}    0.75
#8: {B,F}   {A}    0.25

答案 1 :(得分:2)

这是另一种可行的方法。它预先计算一个行索引向量`r`,其中`3`列为非NA值的每一行都会出现两次。然后用该行索引向量对data.table取子集,并用`ifelse()`构建所需的`FROM`和`TO`列,以`duplicated(r)`作为判断条件,为每一行选出正确的列值。

## Row index vector: every row appears once, and rows whose `3` column is
## non-NA appear a second time (2L - FALSE == 2 copies, 2L - TRUE == 1 copy).
r <- rep(seq_len(nrow(dt)), times = 2L - is.na(dt$`3`))
## Within the expanded table, the repeated occurrence of a row index stands for
## the (`2` -> `3`) link, the first occurrence for the (`1` -> `2`) link.
dt[r, {
  second <- duplicated(r)
  list(
    FROM = ifelse(second, `2`, `1`),
    TO = ifelse(second, `3`, `2`),
    support = support
  )
}]
##     FROM    TO support
## 1:   {A}   {B}    1.00
## 2:   {D}   {A}    0.50
## 3:   {F}   {A}    0.30
## 4:   {D}   {F}    0.75
## 5:   {F}   {A}    0.75
## 6: {B,F}   {A}    0.50
## 7:   {D} {B,F}    0.25
## 8: {B,F}   {A}    0.25

数据

## Example input: three set-valued columns named `1`, `2`, `3` (with NA in `3`
## where a row has no third element) plus the numeric `support` column.
dt <- data.table(
  `1` = c("{A}", "{D}", "{F}", "{D}", "{B,F}", "{D}"),
  `2` = c("{B}", "{A}", "{A}", "{F}", "{A}", "{B,F}"),
  `3` = c(NA, NA, NA, "{A}", NA, "{A}"),
  support = c(1, 0.5, 0.3, 0.75, 0.5, 0.25)
)

基准

library(microbenchmark);
library(data.table);

## akrun(): rbindlist-based solution.
##
## df: table with the link columns in positions 1-3 and `support` in position 4.
## Returns (invisibly, as setnames() does) a data.table with columns FROM, TO
## and support: columns 1, 2, 4 of every row, followed by columns 2, 3, 4 of
## the rows whose third column is not NA.
akrun <- function(df) {
  first_links <- df[, c(1, 2, 4)]
  ## `[[` yields the third column as a plain vector for the NA mask.
  second_links <- df[!is.na(df[[3]]), 2:4]
  ## The two slices have different column names, so bind by position and say so
  ## explicitly rather than depending on rbindlist()'s default name handling.
  links <- rbindlist(list(first_links, second_links), use.names = FALSE)
  setnames(links, 1:2, c("FROM", "TO"))
}
## bgoldst(): row-replication solution.
##
## dt: data.table with columns `1`, `2`, `3` and `support`.
## Returns a data.table with columns FROM, TO and support in which each row
## with a non-NA `3` value appears twice in a row: first as the (`1` -> `2`)
## link, then as the (`2` -> `3`) link.
bgoldst <- function(dt) {
  ## 2 copies where `3` is present, 1 copy where it is NA.
  copies <- 2L - is.na(dt$`3`)
  row_idx <- rep(seq_len(nrow(dt)), copies)
  ## TRUE on the second copy of a replicated row.
  is_second <- duplicated(row_idx)
  dt[row_idx, .(
    FROM = ifelse(is_second, `2`, `1`),
    TO = ifelse(is_second, `3`, `2`),
    support
  )]
}

## harmonize(): bring a result into a canonical form so that two results can
## be compared with all.equal() regardless of class, column order or row order.
##
## df: a data.frame, or any object as.data.frame() can convert.
## Returns a data.frame with its columns sorted by name and its rows sorted by
## all columns (in that sorted column order).
harmonize <- function(df) {
  df <- as.data.frame(df)
  df <- df[order(names(df))]
  ## Strip the column names before handing the columns to order(): with names,
  ## a column called e.g. `decreasing` or `method` would be taken as that
  ## argument of order() instead of as a sort key.
  sort_keys <- unname(as.list(df))
  ## drop = FALSE keeps a one-column input a data.frame instead of a vector.
  df[do.call(order, sort_keys), , drop = FALSE]
}
## OP's example as a plain data.frame.
## check.names = FALSE keeps the non-syntactic column names `1`, `2`, `3`
## as given; stringsAsFactors = FALSE keeps the set labels as character.
df <- data.frame(
  `1` = c("{A}", "{D}", "{F}", "{D}", "{B,F}", "{D}"),
  `2` = c("{B}", "{A}", "{A}", "{F}", "{A}", "{B,F}"),
  `3` = c(NA, NA, NA, "{A}", NA, "{A}"),
  support = c(1, 0.5, 0.3, 0.75, 0.5, 0.25),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
## data.table copy of the same data, used as input for bgoldst().
dt <- as.data.table(df)

## Both solutions must produce the same rows. harmonize() is applied to each
## result first so that class, column order and row order do not matter.
ex <- harmonize(akrun(df))
all.equal(ex, harmonize(bgoldst(dt)), check.attributes = FALSE)
## [1] TRUE

## Time both solutions on the small example.
microbenchmark(akrun(df), bgoldst(dt))
## Unit: microseconds
##         expr     min       lq      mean  median        uq      max neval
##    akrun(df) 274.126 299.9995  352.6557 319.243  345.1165 1680.675   100
##  bgoldst(dt) 882.247 916.4595 1022.4152 962.219 1043.0450 2317.450   100
## scale test
## One million random rows; the third column is NA with probability probNA and
## otherwise a uniformly chosen letter. The seed is fixed so the data is
## reproducible.
set.seed(1L)
NR <- 1e6L
probNA <- 4 / 6
df <- data.frame(
  `1` = sample(LETTERS, NR, replace = TRUE),
  `2` = sample(LETTERS, NR, replace = TRUE),
  `3` = sample(
    c(NA, LETTERS), NR,
    replace = TRUE,
    ## first weight belongs to NA, the rest is split evenly over the letters
    prob = c(probNA, rep((1 - probNA) / length(LETTERS), length(LETTERS)))
  ),
  support = rnorm(NR),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
## data.table copy of the large data set, used as input for bgoldst().
dt <- as.data.table(df)

## Same equivalence check as for the small example, now on the large data.
ex <- harmonize(akrun(df))
all.equal(ex, harmonize(bgoldst(dt)), check.attributes = FALSE)
## [1] TRUE

## Time both solutions on the large data set.
microbenchmark(akrun(df), bgoldst(dt))
## Unit: milliseconds
##         expr       min        lq      mean    median         uq       max neval
##    akrun(df)  52.91901  58.77635  71.72132  67.36425   72.08575  138.3405   100
##  bgoldst(dt) 908.32926 941.77597 980.96350 979.60482 1003.23541 1139.1633   100

结论:akrun要快得多。