我在优化循环时遇到问题,根据data.frame中的条件累积添加数字。下面是输入data.frame,其中包含接近一百万行的较大数据集中的几行:
inputData <- structure(list(SNP_pos = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L), .Label = c("SNP_1",
"SNP_2", "SNP_3", "SNP_4", "SNP_5", "SNP_6", "SNP_7", "SNP_8",
"SNP_9", "SNP_10", "SNP_11", "SNP_12", "SNP_13", "SNP_14"), class = "factor"),
sample_id = c(8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L), allele1 = structure(c(2L, 2L, 2L, 1L,
2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L,
4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L,
2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L,
3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L,
3L, 1L, 4L, 1L, 2L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"),
sample_id_x = c(8739L, 8739L, 8739L, 8739L, 8739L, 8739L,
8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8832L,
8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L,
8832L, 8832L, 8832L, 8832L, 8888L, 8888L, 8888L, 8888L, 8888L,
8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L,
9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L,
9056L, 9056L, 9056L, 9056L, 9056L, 9058L, 9058L, 9058L, 9058L,
9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L,
9058L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L,
9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9072L, 9072L, 9072L,
9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L,
9072L, 9072L, 9152L, 9152L), allele2 = structure(c(2L, 2L,
1L, 1L, 2L, 2L, 1L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 4L, 4L, 2L, 3L, 3L, 2L, 4L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 4L,
4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L,
2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L), .Label = c("A", "G", "T",
"C"), class = "factor"), snp_diff = c(0, 0, 1, 0, 0, 0, 1,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0), IBS = c(1,
1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 0, 1), IBD = c(1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1)), .Names = c("SNP_pos",
"sample_id", "allele1", "sample_id_x", "allele2", "snp_diff",
"IBS", "IBD"), row.names = c(NA, 100L), class = "data.frame")
以下是预期的输出数据。框架:
outputData <- structure(list(SNP_pos = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L), .Label = c("SNP_1",
"SNP_2", "SNP_3", "SNP_4", "SNP_5", "SNP_6", "SNP_7", "SNP_8",
"SNP_9", "SNP_10", "SNP_11", "SNP_12", "SNP_13", "SNP_14"), class = "factor"),
sample_id = c(8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L,
8685L, 8685L, 8685L), allele1 = structure(c(2L, 2L, 2L, 1L,
2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L,
4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L,
2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L,
3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L,
3L, 1L, 4L, 1L, 2L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"),
sample_id_x = c(8739L, 8739L, 8739L, 8739L, 8739L, 8739L,
8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8832L,
8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L,
8832L, 8832L, 8832L, 8832L, 8888L, 8888L, 8888L, 8888L, 8888L,
8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L,
9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L,
9056L, 9056L, 9056L, 9056L, 9056L, 9058L, 9058L, 9058L, 9058L,
9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L,
9058L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L,
9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9072L, 9072L, 9072L,
9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L,
9072L, 9072L, 9152L, 9152L), allele2 = structure(c(2L, 2L,
1L, 1L, 2L, 2L, 1L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 4L, 4L, 2L, 3L, 3L, 2L, 4L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 4L,
4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L,
2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L), .Label = c("A", "G", "T",
"C"), class = "factor"), snp_diff = c(0, 0, 1, 0, 0, 0, 1,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0), IBS = c(1,
1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 0, 1), IBD = c(NA, NA, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0,
0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 1, 1, 2, 0,
1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 0, 1, 2, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0,
1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 0, 0, 0, 1, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 0, 0, 1, 0, 1)), .Names = c("SNP_pos",
"sample_id", "allele1", "sample_id_x", "allele2", "snp_diff",
"IBS", "IBD"), row.names = c(NA, 100L), class = "data.frame")
以下是我用来生成输出文件的代码:
for (i in 1:nrow(inputData)) {
inputData$IBD<-ifelse(inputData$IBD==0,inputData$IBD<-inputData$IBD,ifelse (inputData$allele1==inputData$allele2&inputData$sample_id_x!=shift(inputData$sample_id_x),inputData$IBD<-inputData$IBD,ifelse (inputData$allele1==inputData$allele2&inputData$sample_id_x==shift(inputData$sample_id_x),inputData$IBD<-shift(inputData$IBD)+1,inputData$IBD<-inputData$IBD)))
}
请帮助优化代码或提出更好的代码......
答案 0 :(得分:0)
#First, create a vector with boolean where sub-conditions of the third condition are met
temp = as.numeric(c(FALSE, sapply(2:nrow(inputData), function(i)
inputData$sample_id_x[i] == inputData$sample_id_x[i-1])) & #1st sub-condition
(inputData$allele1 == inputData$allele2) & #2nd sub-condition
inputData$IBD != 0) #3rd sub-condition
#If the value in 'IBD' is zero, then temp2 = 0, otherwise 1
temp2 = as.numeric(temp + inputData$IBD != 0)
ave(temp2,
#Everytime 'temp' is zero, it starts a new group
cumsum(sapply(1:length(temp), function(x) ifelse(temp[x]==0, 1, 0) )),
FUN = cumsum)