如何按一个不在该data.table中的列来展开R data.table的行?

时间:2017-07-16 22:25:44

标签: r dataframe data.table

我有以下R data.table

library(data.table)
# Build the example table from an inline, whitespace-separated literal.
# `text =` states explicitly that the string is the data itself (rather than
# letting fread() sniff whether `input` is a file path, a shell command or
# data), and the long trailing padding on every row has been dropped.
DT <- fread(text = "unique_point biased    data_points   team   groupID
 up1          FALSE     3             1      xy28352
 up1          TRUE      4             22     xy28352
 up2          FALSE     1             4      xy28352
 up2          TRUE      0             3      xy28352
 up3          FALSE     12            5      xy28352
 up3          TRUE      35            7      xy28352")

打印结果如下:

DT
    unique_point biased    data_points   team   groupID                                                                                                           
 1: up1          FALSE     3             1      xy28352                                                                                                                 
 2: up1          TRUE      4             22     xy28352                                                                                                                 
 3: up2          FALSE     1             4      xy28352                                                                                                                  
 4: up2          TRUE      0             3      xy28352                                                                                                                  
 5: up3          FALSE     12            5      xy28352                                                                                                                 
 6: up3          TRUE      35            7      xy28352 
 ....  

目前,每个unique_point都有两行,分别对应biased为TRUE和FALSE。我想展开DT,使每个unique_point按以下格式变为6行:

    unique_point biased    type    data_points   team   groupID                                                                                                           
 1: up1          FALSE     A       3             1      xy28352                                                                                                                 
 2: up1          TRUE      A       4             22     xy28352                                                                                                                 
 3: up1          FALSE     B       0             1      xy28352                                                                                                                  
 4: up1          TRUE      B       0             22     xy28352                                                                                                                  
 5: up1          FALSE     C       0             1      xy28352                                                                                                                 
 6: up1          TRUE      C       0             22     xy28352 
 7: up2          FALSE     A       1             4      xy28352
 ...

也就是说,对于每个unique_point,type取A、B、C时都各有biased为FALSE和TRUE的两行。

我从以下代码开始:

# NOTE: this attempt fails because `type` exists only in the CJ() table used
# as `i`. DT -- and therefore .SD, the `x` side of the inner join -- has no
# `type` column, and the join reports any `on` column it cannot find in `x`
# ("Column(s) [...] not found in x").
> DT2 <- DT[, .SD[CJ(type=c("A", "B", "C"), biased = biased, unique = TRUE), 
                on = .(biased, type)], by = .(unique_point)][]     

我收到以下错误

Error in `[.data.table`(.SD, CJ(variants = c("SNP", "INS", "DEL"), fused = fused,  :                                                                                                                                                                                                                                                                                   
   Column(s) [variants] not found in x   

所以,我使用以下hack在DT中创建了一个名为type的新列,使其至少包含以下三个唯一值:

DT$type[2] = "A"
DT$type[4] = "B"
DT$type[6] = "C"

这样上面的代码就可以运行了。

在不使用这种"技巧"的情况下,按type的A、B、C展开DT的正确方法是什么?我现在的做法并不规范,可能会让第三方感到困惑。我的目标是写出可读的代码,并弄清楚我最初的尝试为什么不起作用。

编辑:实际上,我认为结果的维度是错误的。我的解决方案有bug。

2 个答案:

答案 0 :(得分:1)

可以考虑使用一个辅助数据框或数据表 abc_DT,将它与主表做交叉连接(cross join)来扩展行。然后使用条件ifelse,在扩展出的行的 data_points 列中填充零。

data.table

# Lookup table: one row per type, plus the zero filler used for the rows that
# the cross join adds.
abc_DT <- data.table(type = LETTERS[1:3], data_points_ = 0)

# CROSS JOIN: give both tables a constant column k and join on it, so every
# row of DT is paired with every row of abc_DT. Temporaries stay inside
# local() so only abc_DT and DT2 are created in the workspace.
DT2 <- local({
  main_keyed <- setkey(DT[, c(k = 1, .SD)], k)
  types_with_k <- abc_DT[, c(k = 1, .SD)]
  crossed <- main_keyed[types_with_k, allow.cartesian = TRUE]
  crossed[, k := NULL]
  # RE-ORDER ROWS
  crossed[order(unique_point, type, biased)]
})

# CONDITIONAL ASSIGNMENT: only type "A" rows keep their data_points; all other
# rows take the zero filler, after which the helper column is dropped.
DT2[, data_points := ifelse(type == "A", data_points, data_points_)]
DT2[, data_points_ := NULL]

# RE-ORDER COLS
setcolorder(
  DT2,
  c("unique_point", "biased", "type", "data_points", "team", "groupID")
)
DT2
#     unique_point biased type data_points team groupID
#  1:          up1  FALSE    A           3    1 xy28352
#  2:          up1   TRUE    A           4   22 xy28352
#  3:          up1  FALSE    B           0    1 xy28352
#  4:          up1   TRUE    B           0   22 xy28352
#  5:          up1  FALSE    C           0    1 xy28352
#  6:          up1   TRUE    C           0   22 xy28352
#  7:          up2  FALSE    A           1    4 xy28352
#  ...

基础R

# Base R works on a plain data.frame copy of DT. Without this definition `df`
# would resolve to the stats::df() function and merge() would fail.
df <- as.data.frame(DT)

# Lookup table: one row per type, plus the zero filler for the added rows.
abc_df <- data.frame(type = LETTERS[1:3], data_points_ = 0)

# CROSS JOIN: the tables share no columns; by = NULL makes the Cartesian
# product explicit instead of relying on an empty default `by`.
df2 <- merge(df, abc_df, by = NULL)
# RE-ORDER ROWS
df2 <- df2[order(df2$unique_point, df2$type, df2$biased), ]
row.names(df2) <- NULL

# CONDITIONAL ASSIGNMENT: only type "A" rows keep their data_points.
df2$data_points <- ifelse(df2$type == "A", df2$data_points, df2$data_points_)
# SUBSET AND RE-ORDER COLS
df2 <- df2[c("unique_point", "biased", "type", "data_points", "team", "groupID")]
df2
#    unique_point biased type data_points team groupID
# 1           up1  FALSE    A           3    1 xy28352
# 2           up1   TRUE    A           4   22 xy28352
# 3           up1  FALSE    B           0    1 xy28352
# 4           up1   TRUE    B           0   22 xy28352
# 5           up1  FALSE    C           0    1 xy28352
# 6           up1   TRUE    C           0   22 xy28352
# 7           up2  FALSE    A           1    4 xy28352
# ...

答案 1 :(得分:1)

我会尝试:

# Build every (unique_point, type, biased) combination with CJ() and join it
# back to DT on the two columns DT actually has. CJ() sorts by its arguments in
# order, so listing unique_point first groups the six rows of each point
# together, as in the requested layout.
DT2 <- DT[CJ(unique_point = unique_point, type = LETTERS[1:3], biased = biased,
             unique = TRUE),
          on = .(unique_point, biased), nomatch = 0]

# The join copies the original data_points onto every type; only type "A"
# should keep them, so the rows added for the other types are zeroed.
DT2[type != "A", data_points := 0L]

# Move `type` (appended last by the join) next to `biased`.
setcolorder(DT2, c("unique_point", "biased", "type", "data_points", "team", "groupID"))
DT2
#     unique_point biased type data_points team groupID
#  1:          up1  FALSE    A           3    1 xy28352
#  2:          up1   TRUE    A           4   22 xy28352
#  3:          up1  FALSE    B           0    1 xy28352
#  4:          up1   TRUE    B           0   22 xy28352
#  5:          up1  FALSE    C           0    1 xy28352
#  6:          up1   TRUE    C           0   22 xy28352
#  7:          up2  FALSE    A           1    4 xy28352
#  8:          up2   TRUE    A           0    3 xy28352
#  9:          up2  FALSE    B           0    4 xy28352
# 10:          up2   TRUE    B           0    3 xy28352
# 11:          up2  FALSE    C           0    4 xy28352
# 12:          up2   TRUE    C           0    3 xy28352
# 13:          up3  FALSE    A          12    5 xy28352
# 14:          up3   TRUE    A          35    7 xy28352
# 15:          up3  FALSE    B           0    5 xy28352
# 16:          up3   TRUE    B           0    7 xy28352
# 17:          up3  FALSE    C           0    5 xy28352
# 18:          up3   TRUE    C           0    7 xy28352