如何基于两个条件填充矩阵?

时间:2019-02-06 15:40:14

标签: r matrix

我有两个数据框df_1,是:

df1.reset_index().merge(df2).empty

和df_2是:

  symbol Sample_name                        
 1 MTPAP       sample_1
 2 MTPAP       sample_1
 3 MTPAP          sample_1
 4 TENT2      sample_1
 5 KIDINS220   sample_2
 6 POLR1A        sample_3
 7 CCDC138     sample_4
 8 CCDC74A     sample_5
 9 ATF2        sample_6
10 TLR9        sample_7

我想创建一个矩阵,其列名对应df_1中“Sample_name”的取值,行名对应df_2中的“symbol”。每个“symbol”/“Sample_name”组合的取值规则是:如果df_2中的“symbol”值也出现在df_1的“symbol”列中,并且在df_1中与该样本名称成对出现,则为“1”;如果给定的“symbol”/“Sample_name”组合在df_1中不存在,则为“0”:

      HGNC.ID         symbol
1  HGNC:25532           MTPAP
2  HGNC:26776           TENT2
3  HGNC:16705          TENT4A
4  HGNC:30758          TENT4B
5  HGNC:26184            TUT1
6  HGNC:28981            TUT4
7  HGNC:25817            TUT7
8  HGNC:17264          POLR1A
9  HGNC:20454          POLR1B
10 HGNC:20194          POLR1C

我用以下方法创建了一个空矩阵:

        Sample_1 Sample_2 Sample_3
MTPAP      1       0        0
TENT2      1       0        0
TENT4A     0       0        0
TENT4B     0       0        0
TUT1       0       0        0
TUT4       0       0        0
TUT7       0       0        0
POLR1A     0       0        1
POLR1B     0       0        0
POLR1C     0       0        0

我曾尝试用df_1和df_2中的数据填充它,但到目前为止,我失败了……

有人可以帮我吗?

谢谢。

3 个答案:

答案 0 :(得分:1)

一种选择是以“symbol”列为键(on)连接两个数据集,然后用dcast将数据从“长”格式转换为“宽”格式,并将fun.aggregate指定为length。

library(data.table)

# Convert df_2 to a data.table by reference, then update-join on `symbol`:
# each df_2 row receives the Sample_name of its matching df_1 row. The `i.`
# prefix states explicitly that the value comes from df_1 (the table passed
# in `i`) rather than from a column of df_2.
# NOTE: an update join stores one value per df_2 row, so a symbol that occurs
# in several samples would be recorded under only one of them.
setDT(df_2)
df_2[df_1, Sample_name := i.Sample_name, on = "symbol"]

# Make `symbol` a factor whose levels follow df_2's row order, so the
# reshaped table keeps that order.
df_2[, symbol := factor(symbol, levels = unique(symbol))]

# Long -> wide: count rows per symbol/sample pair. The column factor is
# limited to sample_1..sample_3, and drop = FALSE keeps combinations that
# never occur (they show up as 0). value.var is named explicitly instead of
# being guessed from the last column.
dcast(
  df_2,
  symbol ~ factor(Sample_name, levels = paste0("sample_", 1:3)),
  fun.aggregate = length,
  value.var = "Sample_name",
  drop = FALSE
)
#     symbol sample_1 sample_2 sample_3
# 1:  MTPAP        1        0        0
# 2:  TENT2        1        0        0
# 3: TENT4A        0        0        0
# 4: TENT4B        0        0        0
# 5:   TUT1        0        0        0
# 6:   TUT4        0        0        0
# 7:   TUT7        0        0        0
# 8: POLR1A        0        0        1
# 9: POLR1B        0        0        0
#10: POLR1C        0        0        0

数据

# Example inputs for the answer above. Both data frames carry character row
# names "1".."10", and stringsAsFactors = FALSE keeps every column a plain
# character vector.
df_1 <- data.frame(
  symbol = c(
    rep("MTPAP", 3), "TENT2", "KIDINS220", "POLR1A", "CCDC138",
    "CCDC74A", "ATF2", "TLR9"
  ),
  Sample_name = c(rep("sample_1", 4), paste0("sample_", 2:7)),
  row.names = as.character(1:10),
  stringsAsFactors = FALSE
)

df_2 <- data.frame(
  HGNC.ID = c(
    "HGNC:25532", "HGNC:26776", "HGNC:16705", "HGNC:30758", "HGNC:26184",
    "HGNC:28981", "HGNC:25817", "HGNC:17264", "HGNC:20454", "HGNC:20194"
  ),
  symbol = c(
    "MTPAP", "TENT2", "TENT4A", "TENT4B", "TUT1", "TUT4", "TUT7",
    "POLR1A", "POLR1B", "POLR1C"
  ),
  row.names = as.character(1:10),
  stringsAsFactors = FALSE
)

答案 1 :(得分:1)

先使用merge,然后使用reshape2软件包中的dcast。

# Left-join the symbols of interest (second column of df_2) with the distinct
# symbol/sample pairs of df_1. unique() removes repeated pairs so that each
# cell of the final table is a 0/1 presence flag instead of an occurrence
# count (MTPAP/sample_1 appears three times in df_1).
res <- merge(df_2[2], unique(df_1), all.x = TRUE)

# Symbols without a match have NA as Sample_name; give them an empty label so
# they are gathered in one column (printed as "Var.2" below).
res$Sample_name <- as.character(res$Sample_name)
res$Sample_name[is.na(res$Sample_name)] <- ""

# Long -> wide, counting rows per symbol/sample pair. fun.aggregate is named
# explicitly rather than left to dcast's default.
# NOTE: only samples matched by a df_2 symbol get a column, and the rows come
# out sorted by symbol rather than in df_2's order.
reshape2::dcast(res, symbol ~ Sample_name, fun.aggregate = length,
                value.var = "Sample_name")
#   symbol Var.2 sample_1 sample_3
#1   MTPAP     0        1        0
#2  POLR1A     0        0        1
#3  POLR1B     1        0        0
#4  POLR1C     1        0        0
#5   TENT2     0        1        0
#6  TENT4A     1        0        0
#7  TENT4B     1        0        0
#8    TUT1     1        0        0
#9    TUT4     1        0        0
#10   TUT7     1        0        0

dput格式的数据。

# Example inputs with every column stored as a factor. Levels are listed
# explicitly so the integer codes do not depend on locale-specific sorting.
df_1 <- structure(
  list(
    symbol = factor(
      c(rep("MTPAP", 3), "TENT2", "KIDINS220", "POLR1A", "CCDC138",
        "CCDC74A", "ATF2", "TLR9"),
      levels = c("ATF2", "CCDC138", "CCDC74A", "KIDINS220", "MTPAP",
                 "POLR1A", "TENT2", "TLR9")
    ),
    Sample_name = factor(
      c(rep("sample_1", 4), paste0("sample_", 2:7)),
      levels = paste0("sample_", 1:7)
    )
  ),
  class = "data.frame",
  row.names = as.character(1:10)
)

df_2 <- structure(
  list(
    HGNC.ID = factor(
      c("HGNC:25532", "HGNC:26776", "HGNC:16705", "HGNC:30758",
        "HGNC:26184", "HGNC:28981", "HGNC:25817", "HGNC:17264",
        "HGNC:20454", "HGNC:20194"),
      levels = c("HGNC:16705", "HGNC:17264", "HGNC:20194", "HGNC:20454",
                 "HGNC:25532", "HGNC:25817", "HGNC:26184", "HGNC:26776",
                 "HGNC:28981", "HGNC:30758")
    ),
    symbol = factor(
      c("MTPAP", "TENT2", "TENT4A", "TENT4B", "TUT1", "TUT4", "TUT7",
        "POLR1A", "POLR1B", "POLR1C"),
      levels = c("MTPAP", "POLR1A", "POLR1B", "POLR1C", "TENT2", "TENT4A",
                 "TENT4B", "TUT1", "TUT4", "TUT7")
    )
  ),
  class = "data.frame",
  row.names = as.character(1:10)
)

答案 2 :(得分:0)

我知道您已经有两个答案,但这就是我的做法:)

数据:

# Example inputs parsed from inline text. In both tables the header line has
# one field fewer than the data lines, so read.table() uses the first column
# (1..10) as row names. stringsAsFactors is spelled TRUE rather than T
# because T is an ordinary variable that can be reassigned.
df_1 <- read.table(text = "symbol Sample_name                        
                   1 MTPAP       sample_1
                   2 MTPAP       sample_1
                   3 MTPAP          sample_1
                   4 TENT2      sample_1
                   5 KIDINS220   sample_2
                   6 POLR1A        sample_3
                   7 CCDC138     sample_4
                   8 CCDC74A     sample_5
                   9 ATF2        sample_6
                   10 TLR9        sample_7", header = TRUE,
                   stringsAsFactors = TRUE)
df_2 <- read.table(text = "      HGNC.ID         symbol
                   1  HGNC:25532           MTPAP
                   2  HGNC:26776           TENT2
                   3  HGNC:16705          TENT4A
                   4  HGNC:30758          TENT4B
                   5  HGNC:26184            TUT1
                   6  HGNC:28981            TUT4
                   7  HGNC:25817            TUT7
                   8  HGNC:17264          POLR1A
                   9  HGNC:20454          POLR1B
                   10 HGNC:20194          POLR1C", header = TRUE,
                   stringsAsFactors = TRUE)

首先创建一个“空”矩阵(并非真正为空,而是用默认值0填充):

# Zero-filled matrix with one row per distinct df_2 symbol and one column per
# distinct df_1 sample name; both sets of labels keep order of appearance.
mat <- local({
  row_labels <- unique(df_2$symbol)
  col_labels <- unique(df_1$Sample_name)
  matrix(
    0,
    nrow = length(row_labels),
    ncol = length(col_labels),
    dimnames = list(row_labels, col_labels)
  )
})

将df_1中的符号(symbol)和样本(Sample_name)制成列联表:

# dplyr is still attached so the session matches the original snippet; the
# call below needs only base R.
library(dplyr)
# Cross-tabulate the distinct symbol/sample pairs of df_1: rows are symbols,
# columns are sample names, and each observed pair counts once.
mat_2 <- table(unique(df_1))

现在我们使用行名从mat_2中获得所需的行

# Symbols present both in the df_1 contingency table and in df_2.
wanted_rows <- intersect(rownames(mat_2), as.character(df_2$symbol))
# Copy those rows into the zero matrix. Columns are selected by name, not by
# position: mat orders its sample columns by first appearance in df_1, while
# mat_2 orders them by factor level (or sorted value), so the two can differ.
# drop = FALSE keeps the right-hand side two-dimensional for a single row.
mat[wanted_rows, ] <- mat_2[wanted_rows, colnames(mat), drop = FALSE]