我有两个数据框df_1和df_2。其中df_1是:
symbol Sample_name
1 MTPAP sample_1
2 MTPAP sample_1
3 MTPAP sample_1
4 TENT2 sample_1
5 KIDINS220 sample_2
6 POLR1A sample_3
7 CCDC138 sample_4
8 CCDC74A sample_5
9 ATF2 sample_6
10 TLR9 sample_7
我想创建一个矩阵,其列名与df_1中“Sample_name”列的值相匹配,而行名与df_2中的“symbol”相匹配。每个“symbol”/“Sample_name”对的取值规则为:如果df_2中的“symbol”值连同该样本名称一起出现在df_1中,则为“1”;如果给定的“symbol”/“Sample_name”组合不存在于df_1中,则为“0”:
HGNC.ID symbol
1 HGNC:25532 MTPAP
2 HGNC:26776 TENT2
3 HGNC:16705 TENT4A
4 HGNC:30758 TENT4B
5 HGNC:26184 TUT1
6 HGNC:28981 TUT4
7 HGNC:25817 TUT7
8 HGNC:17264 POLR1A
9 HGNC:20454 POLR1B
10 HGNC:20194 POLR1C
我用以下方法创建了一个空矩阵:
Sample_1 Sample_2 Sample_3
MTPAP 1 0 0
TENT2 1 0 0
TENT4A 0 0 0
TENT4B 0 0 0
TUT1 0 0 0
TUT4 0 0 0
TUT7 0 0 0
POLR1A 0 0 1
POLR1B 0 0 0
POLR1C 0 0 0
我曾尝试用df_1和df_2中的数据填充它,但到目前为止,我失败了……
有人可以帮我吗?
谢谢。
答案 0 :(得分:1)
一种选择是在“symbol”列上(通过on参数)连接两个数据集,然后用dcast将数据从“长”格式转换为“宽”格式,并将fun.aggregate指定为length:
library(data.table)

# Turn df_2 into a data.table by reference, then copy Sample_name over from
# df_1 with an update join on "symbol"; symbols of df_2 that have no partner
# in df_1 keep NA.
setDT(df_2)
df_2[df_1, on = .(symbol), Sample_name := Sample_name]

# Freeze the symbol order to the order of appearance in df_2 so the reshaped
# result lists the rows in that order.
df_2[, symbol := factor(symbol, levels = unique(symbol))]

# Long -> wide: count the rows of every symbol/sample pair. drop = FALSE keeps
# all factor levels, so symbols without a sample still get an all-zero row.
dcast(
  df_2,
  symbol ~ factor(Sample_name, levels = paste0("sample_", 1:3)),
  fun.aggregate = length,
  drop = FALSE
)
# symbol sample_1 sample_2 sample_3
# 1: MTPAP 1 0 0
# 2: TENT2 1 0 0
# 3: TENT4A 0 0 0
# 4: TENT4B 0 0 0
# 5: TUT1 0 0 0
# 6: TUT4 0 0 0
# 7: TUT7 0 0 0
# 8: POLR1A 0 0 1
# 9: POLR1B 0 0 0
#10: POLR1C 0 0 0
# Example input 1: one row per observed gene symbol / sample pair
# (character columns, row names "1".."10").
df_1 <- data.frame(
  symbol = c(
    "MTPAP", "MTPAP", "MTPAP", "TENT2", "KIDINS220",
    "POLR1A", "CCDC138", "CCDC74A", "ATF2", "TLR9"
  ),
  Sample_name = paste0("sample_", c(rep(1L, 4L), 2:7)),
  row.names = as.character(1:10),
  stringsAsFactors = FALSE
)

# Example input 2: the gene symbols of interest with their HGNC identifiers
# (character columns, row names "1".."10").
df_2 <- data.frame(
  HGNC.ID = paste0(
    "HGNC:",
    c(
      "25532", "26776", "16705", "30758", "26184",
      "28981", "25817", "17264", "20454", "20194"
    )
  ),
  symbol = c(
    "MTPAP", "TENT2", "TENT4A", "TENT4B", "TUT1",
    "TUT4", "TUT7", "POLR1A", "POLR1B", "POLR1C"
  ),
  row.names = as.character(1:10),
  stringsAsFactors = FALSE
)
答案 1 :(得分:1)
先使用merge,然后使用reshape2软件包中的dcast。
# Left-join the distinct symbol/sample pairs of df_1 onto the symbols of df_2
# so that every df_2 symbol is kept. Duplicated df_1 rows (e.g. MTPAP) are
# collapsed first, so a pair can never be counted more than once.
res <- merge(df_2["symbol"], unique(df_1), by = "symbol", all.x = TRUE)

# Pin the factor levels explicitly: rows follow the order of df_2 and the
# columns cover every sample of df_1. merge() sorts by key, and samples that
# match no df_2 symbol would otherwise vanish from the result.
res$symbol <- factor(
  as.character(res$symbol),
  levels = unique(as.character(df_2$symbol))
)
res$Sample_name <- factor(
  as.character(res$Sample_name),
  levels = unique(as.character(df_1$Sample_name))
)

# Cast only the matched rows. Recoding the NA of unmatched symbols to ""
# would create a spurious extra column; with drop = FALSE the unmatched
# symbols instead come back as all-zero rows through their factor level.
matched <- res[!is.na(res$Sample_name), , drop = FALSE]
reshape2::dcast(
  matched,
  symbol ~ Sample_name,
  # presence indicator (0/1) rather than a raw count
  fun.aggregate = function(x) as.integer(length(x) > 0L),
  value.var = "Sample_name",
  drop = FALSE
)
# Expected result (rows in df_2 order, one 0/1 column per df_1 sample):
#    symbol sample_1 sample_2 sample_3 sample_4 sample_5 sample_6 sample_7
# 1   MTPAP        1        0        0        0        0        0        0
# 2   TENT2        1        0        0        0        0        0        0
# 3  TENT4A        0        0        0        0        0        0        0
# 4  TENT4B        0        0        0        0        0        0        0
# 5    TUT1        0        0        0        0        0        0        0
# 6    TUT4        0        0        0        0        0        0        0
# 7    TUT7        0        0        0        0        0        0        0
# 8  POLR1A        0        0        1        0        0        0        0
# 9  POLR1B        0        0        0        0        0        0        0
# 10 POLR1C        0        0        0        0        0        0        0
以下是dput格式的数据。
# Example input 1 with factor columns: one row per observed gene symbol /
# sample pair. Levels are spelled out so the integer codes do not depend on
# the locale's sort order.
df_1 <- data.frame(
  symbol = factor(
    c(
      "MTPAP", "MTPAP", "MTPAP", "TENT2", "KIDINS220",
      "POLR1A", "CCDC138", "CCDC74A", "ATF2", "TLR9"
    ),
    levels = c(
      "ATF2", "CCDC138", "CCDC74A", "KIDINS220",
      "MTPAP", "POLR1A", "TENT2", "TLR9"
    )
  ),
  Sample_name = factor(
    paste0("sample_", c(rep(1L, 4L), 2:7)),
    levels = paste0("sample_", 1:7)
  ),
  row.names = as.character(1:10)
)

# Example input 2 with factor columns: the gene symbols of interest together
# with their HGNC identifiers.
df_2 <- data.frame(
  HGNC.ID = factor(
    c(
      "HGNC:25532", "HGNC:26776", "HGNC:16705", "HGNC:30758", "HGNC:26184",
      "HGNC:28981", "HGNC:25817", "HGNC:17264", "HGNC:20454", "HGNC:20194"
    ),
    levels = c(
      "HGNC:16705", "HGNC:17264", "HGNC:20194", "HGNC:20454", "HGNC:25532",
      "HGNC:25817", "HGNC:26184", "HGNC:26776", "HGNC:28981", "HGNC:30758"
    )
  ),
  symbol = factor(
    c(
      "MTPAP", "TENT2", "TENT4A", "TENT4B", "TUT1",
      "TUT4", "TUT7", "POLR1A", "POLR1B", "POLR1C"
    ),
    levels = c(
      "MTPAP", "POLR1A", "POLR1B", "POLR1C", "TENT2",
      "TENT4A", "TENT4B", "TUT1", "TUT4", "TUT7"
    )
  ),
  row.names = as.character(1:10)
)
答案 2 :(得分:0)
我知道您已经有两个答案,但这就是我的做法:)
数据:
# Example input 1: observed gene symbol / sample pairs, parsed from inline
# text. The header has one field fewer than the data rows, so read.table()
# uses the first column as row names. Strings are read as factors; TRUE is
# spelled out because the alias `T` is an ordinary, reassignable variable.
df_1 <- read.table(text = "symbol Sample_name
1 MTPAP sample_1
2 MTPAP sample_1
3 MTPAP sample_1
4 TENT2 sample_1
5 KIDINS220 sample_2
6 POLR1A sample_3
7 CCDC138 sample_4
8 CCDC74A sample_5
9 ATF2 sample_6
10 TLR9 sample_7", header = TRUE,
stringsAsFactors = TRUE)

# Example input 2: the gene symbols of interest with their HGNC identifiers,
# parsed the same way (first column becomes the row names, strings become
# factors).
df_2 <- read.table(text = " HGNC.ID symbol
1 HGNC:25532 MTPAP
2 HGNC:26776 TENT2
3 HGNC:16705 TENT4A
4 HGNC:30758 TENT4B
5 HGNC:26184 TUT1
6 HGNC:28981 TUT4
7 HGNC:25817 TUT7
8 HGNC:17264 POLR1A
9 HGNC:20454 POLR1B
10 HGNC:20194 POLR1C", header = TRUE,
stringsAsFactors = TRUE)
首先创建一个“空”矩阵(并非真正为空,而是用默认值0填充):
# Zero-filled result matrix: one row per distinct df_2 symbol and one column
# per distinct df_1 sample name, both in order of first appearance.
mat <- local({
  row_labels <- unique(df_2$symbol)
  col_labels <- unique(df_1$Sample_name)
  matrix(
    0,
    nrow = length(row_labels),
    ncol = length(col_labels),
    dimnames = list(row_labels, col_labels)
  )
})
将df_1中去重后的symbol与Sample_name组合制成列联表:
library(dplyr)
# Contingency table of the distinct symbol/sample pairs of df_1: after
# de-duplication every cell is 1 if the pair occurs in df_1 and 0 otherwise.
mat_2 <- table(unique(df_1))
现在我们利用行名从mat_2中取出所需的行,并填入mat:
# Symbols that occur both in the df_1 contingency table and in df_2; only
# these rows can carry a 1 in the final matrix.
wanted_rows <- rownames(mat_2)[rownames(mat_2) %in% df_2$symbol]

# Copy the presence flags into the zero matrix. Columns are matched by name
# rather than by position: the table orders its columns by factor level /
# sort order (and may contain unused levels), whereas mat follows the order
# of first appearance, so a positional assignment could misalign samples.
# drop = FALSE keeps the right-hand side a matrix when one row is selected.
if (length(wanted_rows) > 0L) {
  mat[wanted_rows, colnames(mat)] <-
    mat_2[wanted_rows, colnames(mat), drop = FALSE]
}