我有一些这种结构的数据:
## Column examples generation
bases <- c("A", "T", "C", "G")
ID <- c(1, 2, 3, 4, 5, 6)
# One row per (SNP, ID) pair: every SNP name is repeated once per ID.
SNP <- rep(c("F1", "F3", "F4"), each = length(ID))
# Both alleles are drawn at random, so the example differs between runs
# unless set.seed() is called beforehand.
Al_1 <- sample(bases, length(SNP), replace = TRUE)
Al_2 <- sample(bases, length(SNP), replace = TRUE)
# "." is only a placeholder; every row gets a real value further down.
tipo <- rep(".", length(SNP))

## Data frame generation:
# Build the data frame directly instead of going through cbind(), which
# would first coerce everything into a character matrix. All columns are
# kept as character (ID included); ID is recycled to one value per row.
ArrDat <- data.frame(
  ID = rep_len(as.character(ID), length(SNP)),
  SNP = SNP,
  Al_1 = Al_1,
  Al_2 = Al_2,
  tipo = tipo,
  stringsAsFactors = FALSE
)
# ID is stored as text, so sort on its numeric value; a plain character
# sort would put "10" before "2". Ties keep their original row order.
OrderArr <- ArrDat[order(as.numeric(ArrDat$ID)), ]

## Column "tipo" values:
# Lookup table: rows are the first allele (Al_1), columns the second
# allele (Al_2). Off-diagonal cells hold the code of the heterozygous
# pair, diagonal cells the "STHG.<base>" code of the homozygous pair.
tipo_lookup <- matrix(
  c(
    "STHG.A", "a",      "b",      "c",
    "d",      "STHG.T", "e",      "f",
    "g",      "h",      "STHG.C", "i",
    "j",      "k",      "l",      "STHG.G"
  ),
  nrow = 4,
  byrow = TRUE,
  dimnames = list(c("A", "T", "C", "G"), c("A", "T", "C", "G"))
)
# Translate every allele pair in one vectorised step. match() yields NA
# for anything that is not A/T/C/G (including missing values), and an NA
# in the index matrix yields an NA code.
tipo_codes <- tipo_lookup[cbind(
  match(OrderArr$Al_1, rownames(tipo_lookup)),
  match(OrderArr$Al_2, colnames(tipo_lookup))
)]
# Pairs that are not in the table get the fallback code "x".
tipo_codes[is.na(tipo_codes)] <- "x"
OrderArr$tipo <- tipo_codes
以下是数据示例:
ID SNP Al_1 Al_2 tipo
1 1 F1 T A d
7 1 F3 C A g
13 1 F4 G C l
2 2 F1 T T STHG.T
8 2 F3 C C STHG.C
14 2 F4 C C STHG.C
我的问题在于 Al_1-Al_2 组合为 A-A、T-T、C-C 或 G-G 的那些行的 OrderArr$tipo 值。这些行的 OrderArr$tipo 值应当与具有相同 OrderArr$SNP 值的其他行的 tipo 值相同,因此上面的示例数据应变为:
ID SNP Al_1 Al_2 tipo
1 1 F1 T A d
7 1 F3 C A g
13 1 F4 G C l
2 2 F1 T T d
8 2 F3 C C g
14 2 F4 C C l
如何在代码中实现此功能?
非常感谢。
答案 0 :(得分:1)
我创建了一个数据框,其中每个SNP只有一种Al_1与Al_2不相同的组合(其余行的两个等位基因相同,tipo记为z)。
ID SNP Al_1 Al_2 combo tipo
1 1 F1 A T AT a
2 1 F4 G G GG z
3 1 D2 C T CT h
4 1 D4 T C TC e
5 1 HY7 A A AA z
6 1 HY66 T G TG f
7 1 XZD1 C A CA g
8 1 XZD33 G A GA j
9 2 F1 A A AA z
10 2 F4 C G CG i
11 2 D2 C C CC z
12 2 D4 T C TC e
13 2 HY7 A A AA z
14 2 HY66 G G GG z
15 2 XZD1 C A CA g
16 2 XZD33 G A GA j
17 3 F1 T T TT z
18 3 F4 C C CC z
19 3 D2 C T CT h
20 3 D4 T C TC e
21 3 HY7 A C AC b
22 3 HY66 G G GG z
23 3 XZD1 A A AA z
24 3 XZD33 A A AA z
25 4 F1 A T AT a
26 4 F4 C G CG i
27 4 D2 C T CT h
28 4 D4 T T TT z
29 4 HY7 C C CC z
30 4 HY66 T T TT z
31 4 XZD1 C A CA g
32 4 XZD33 A A AA z
33 5 F1 T T TT z
34 5 F4 C G CG i
35 5 D2 T T TT z
36 5 D4 T T TT z
37 5 HY7 A A AA z
38 5 HY66 T G TG f
39 5 XZD1 A A AA z
40 5 XZD33 G G GG z
41 6 F1 A T AT a
42 6 F4 G G GG z
43 6 D2 T T TT z
44 6 D4 C C CC z
45 6 HY7 C C CC z
46 6 HY66 T T TT z
47 6 XZD1 C C CC z
48 6 XZD33 G A GA j
我想我有一个可以解决你问题的办法:
# Concatenated allele pair, e.g. "AT" for Al_1 = "A" and Al_2 = "T".
data$combo <- paste0(data$Al_1, data$Al_2)
# Distinct SNP identifiers, in order of first appearance.
snp <- unique(data$SNP)

# Lookup table: rows are the first allele (Al_1), columns the second
# allele (Al_2). Off-diagonal cells hold the code of the heterozygous
# pair; the diagonal holds "z", the marker for a homozygous pair.
tipo_lookup <- matrix(
  c(
    "z", "a", "b", "c",
    "d", "z", "e", "f",
    "g", "h", "z", "i",
    "j", "k", "l", "z"
  ),
  nrow = 4,
  byrow = TRUE,
  dimnames = list(c("A", "T", "C", "G"), c("A", "T", "C", "G"))
)
# Classify every row in one vectorised step. match() yields NA for values
# outside A/T/C/G, and an NA in the index matrix yields an NA code.
new_tipo <- tipo_lookup[cbind(
  match(data$Al_1, rownames(tipo_lookup)),
  match(data$Al_2, colnames(tipo_lookup))
)]
# Any pair of identical alleles counts as homozygous, even when the value
# is not one of A/T/C/G. which() skips comparisons that are NA.
new_tipo[which(data$Al_1 == data$Al_2)] <- "z"

# Create the column explicitly when it is missing, so that it is never
# built by recycling a shorter vector.
if (!"tipo" %in% names(data)) {
  data$tipo <- rep(NA_character_, nrow(data))
}
# Rows that could not be classified (missing or unknown alleles) keep
# whatever value they already had instead of aborting the whole pass.
classified <- !is.na(new_tipo)
data$tipo[classified] <- new_tipo[classified]
# Second pass: within each SNP, homozygous rows (tipo == "z") take over
# the code of that SNP's heterozygous rows.
ord.data <- data

# One data frame per SNP, in sorted SNP order; drop = TRUE skips factor
# levels that have no rows.
snp_groups <- split(ord.data, as.factor(ord.data$SNP), drop = TRUE)

filled_groups <- lapply(snp_groups, function(grp) {
  # Codes already assigned to the heterozygous rows of this SNP, in order
  # of first appearance.
  known <- !is.na(grp$tipo) & grp$tipo != "z"
  hetero_codes <- unique(grp$tipo[known])
  # Only homozygous rows whose allele is a regular base are rewritten.
  to_fill <- !is.na(grp$tipo) & grp$tipo == "z" &
    grp$Al_1 %in% c("A", "T", "C", "G")
  if (!any(to_fill) || length(hetero_codes) == 0L) {
    # Nothing to fill, or no heterozygous row to copy from: leave "z".
    return(grp)
  }
  if (length(hetero_codes) > 1L) {
    # Ambiguous SNP: keep using the first code, but say so explicitly.
    warning(
      "SNP ", grp$SNP[1L], " has several heterozygous codes (",
      paste(hetero_codes, collapse = ", "), "); using the first one",
      call. = FALSE
    )
  }
  grp$tipo[to_fill] <- hetero_codes[1L]
  grp
})

# Bind all groups in a single call rather than growing the result inside
# a loop. unname() stops rbind() from prefixing the row names with the
# SNP name, so the original row names are kept.
ord.data2 <- if (length(filled_groups) > 0L) {
  do.call(rbind, unname(filled_groups))
} else {
  ord.data[0L, , drop = FALSE]
}