这是按样本拆分后的 cn0 数据,列出了每个样本中各条染色体及其对应的频率值:
$gw6.00033
X Sample_Name Chr_No Copy_No Frequence
1 gw6.00033 chr1: cn=0 1
12 gw6.00033 chr2: cn=0 1
16 gw6.00033 chr4: cn=0 1
20 gw6.00033 chr6: cn=0 1
$gw6.0006
X Sample_Name Chr_No Copy_No Frequence
26 gw6.0006 chr1: cn=0 1
29 gw6.0006 chr10 cn=0 3
31 gw6.0006 chr11 cn=0 2
34 gw6.0006 chr13 cn=0 1
37 gw6.0006 chr15 cn=0 1
38 gw6.0006 chr16 cn=0 1
41 gw6.0006 chr2: cn=0 1
47 gw6.0006 chr3: cn=0 1
57 gw6.0006 chr8: cn=0 2
这是我使用的 R 代码:
# Expand every sample to one row per chromosome (1..N), giving chromosomes
# that are absent from the input a frequency of 0, then collect the per-sample
# frequencies into a wide data frame `abc` (one row per sample, one column per
# chromosome).
# NOTE(review): the column names used here (sample_name, Chr_no) differ in case
# from the printed data (Sample_Name, Chr_No) -- confirm against the real cn0.
sp <- split(cn0, cn0$sample_name)
N <- 22
sp <- lapply(sp, function(DF) {
  # Chromosome number of each input row, e.g. "chr10" -> 10, "chr1:" -> 1.
  M <- as.numeric(
    sub("[^[:digit:]]+([[:digit:]]+)[^[:digit:]]*", "\\1", DF$Chr_no)
  )
  # Label prefix (the text before the number), taken from the first row only
  # so the number of generated labels is always exactly N.
  prefix <- sub("[[:digit:]]+[^[:digit:]]*$", "", DF$Chr_no[1])
  Chr_no <- paste0(prefix, seq_len(N))
  # Single-digit chromosomes carry a trailing ":" in the input labels.
  Chr_no <- ifelse(nchar(Chr_no) == 4, paste0(Chr_no, ":"), Chr_no)
  sample_name <- rep(DF$sample_name[1], length(Chr_no))
  res <- data.frame(sample_name, Chr_no)
  res$Frequence <- 0
  # Align by chromosome number, not by row position: the input rows are not
  # necessarily in ascending chromosome order (e.g. chr1:, chr10, chr11, chr2:),
  # so each slot looks up the input row that carries its own number.
  src <- match(seq_len(N), M)
  present <- !is.na(src)
  res$Frequence[present] <- DF$Frequence[src[present]]
  res
})
# One row per sample, one column per chromosome.
abc <- as.data.frame(t(vapply(sp, `[[`, numeric(N), "Frequence")))
names(abc) <- sp[[1]]$Chr_no
它给出如下输出:
$gw6.00033
Sample_Name Chr_No Frequence
1 gw6.00033 chr1: 1
2 gw6.00033 chr2: 1
3 gw6.00033 chr3: 0
4 gw6.00033 chr4: 1
5 gw6.00033 chr5: 0
6 gw6.00033 chr6: 1
7 gw6.00033 chr7: 0
8 gw6.00033 chr8: 0
9 gw6.00033 chr9: 0
$gw6.0006
Sample_Name Chr_No Frequence
1 gw6.0006 chr1: 1
2 gw6.0006 chr2: 3?
3 gw6.0006 chr3: 2?
4 gw6.0006 chr4: 0
5 gw6.0006 chr5: 0
6 gw6.0006 chr6: 0
7 gw6.0006 chr7: 0
8 gw6.0006 chr8: 1?
代码可以运行并给出结果,但生成矩阵时频率错位了。我希望缺失的染色体被补在相应的位置上,并且其频率为 0。对于样本 gw6.00033,得到的数据框是正确的,因为它的染色体恰好按升序排列;但对于样本 gw6.0006,频率被填到了错误的染色体上(例如 chr10 的频率 3 出现在了 chr2: 上)。代码需要怎样修改?
答案 0 :(得分:1)
使用 `tidyr::complete`:
# Example data: one row per (sample, chromosome) pair that was observed.
df1 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "X Sample_Name Chr_No Copy_No Frequence
1 gw6.00033 chr1: cn=0 1
12 gw6.00033 chr2: cn=0 1
16 gw6.00033 chr4: cn=0 1
20 gw6.00033 chr6: cn=0 1
26 gw6.0006 chr1: cn=0 1
29 gw6.0006 chr10 cn=0 3
31 gw6.0006 chr11 cn=0 2
34 gw6.0006 chr13 cn=0 1
37 gw6.0006 chr15 cn=0 1
38 gw6.0006 chr16 cn=0 1
41 gw6.0006 chr2: cn=0 1
47 gw6.0006 chr3: cn=0 1
57 gw6.0006 chr8: cn=0 2"
)
library(dplyr)
library(tidyr)
# Chromosome labels chr1: ... chr9:, chr10 ... chr22 as a factor whose levels
# follow numeric chromosome order (single-digit labels carry a trailing ":").
allChroms <- c(paste0("chr", 1:9, ":"), paste0("chr", 10:22))
allChroms <- factor(allChroms, levels = allChroms)
# Drop the first column (X), then add a row for every sample/chromosome
# combination that is missing, filling Copy_No with "cn=0" and Frequence with 0.
res <- complete(
  df1[-1],
  Sample_Name,
  Chr_No = allChroms,
  fill = list(Copy_No = "cn=0", Frequence = 0)
)
res
# # A tibble: 44 x 4
# Sample_Name Chr_No Copy_No Frequence
# <chr> <chr> <chr> <dbl>
# 1 gw6.00033 chr1: cn=0 1
# 2 gw6.00033 chr2: cn=0 1
# 3 gw6.00033 chr3: cn=0 0
# 4 gw6.00033 chr4: cn=0 1
# 5 gw6.00033 chr5: cn=0 0
# 6 gw6.00033 chr6: cn=0 1
# 7 gw6.00033 chr7: cn=0 0
# 8 gw6.00033 chr8: cn=0 0
# 9 gw6.00033 chr9: cn=0 0
# 10 gw6.00033 chr10 cn=0 0
编辑:检查结果是否正确……
# Spot check a chromosome present in only one sample: the other sample's row
# shows the filled-in frequency of 0.
subset(res, Chr_No == "chr10")
# Sample_Name Chr_No Copy_No Frequence
# 1 gw6.00033 chr10 cn=0 0
# 2 gw6.0006 chr10 cn=0 3
# Spot check a chromosome present in both samples: observed values are kept.
subset(res, Chr_No == "chr1:")
# Sample_Name Chr_No Copy_No Frequence
# 1 gw6.00033 chr1: cn=0 1
# 2 gw6.0006 chr1: cn=0 1