列值基于另一列

时间:2017-02-08 07:48:04

标签: r

我有一些这种结构的数据:

## Column examples generation
# The four nucleotide letters that alleles are drawn from.
bases <- c("A", "T", "C", "G")
# Six sample identifiers.
ID <- c(1, 2, 3, 4, 5, 6)
# Each of the three SNP labels is repeated once per ID (18 values in total).
SNP <- rep(c("F1", "F3", "F4"), each = length(ID))
# Random alleles, one per row. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
Al_1 <- sample(bases, length(SNP), replace = TRUE)
Al_2 <- sample(bases, length(SNP), replace = TRUE)
# Placeholder column; the real "tipo" codes are assigned further down.
tipo <- rep(".", length(SNP))

## Data frame generation:
# Build the data frame directly rather than through cbind(), which first
# creates a character matrix. Every column (ID included) is kept as character
# so the result has the same column types as before. ID has length 6 and is
# recycled across the 18 rows.
ArrDat <- data.frame(
  ID = as.character(ID),
  SNP = SNP,
  Al_1 = Al_1,
  Al_2 = Al_2,
  tipo = tipo,
  stringsAsFactors = FALSE
)
# Sort on the numeric value of ID: sorting the text form would put "10"
# before "2" as soon as there are ten or more IDs.
OrderArr <- ArrDat[order(as.numeric(ArrDat$ID)), ]

## Column "tipo" values:
# 4 x 4 lookup table of codes. Rows are selected by Al_1, columns by Al_2.
# The diagonal holds the same-allele pairs (A-A, T-T, C-C, G-G).
allele_levels <- c("A", "T", "C", "G")
tipo_table <- matrix(
  c("STHG.A", "a",      "b",      "c",
    "d",      "STHG.T", "e",      "f",
    "g",      "h",      "STHG.C", "i",
    "j",      "k",      "l",      "STHG.G"),
  nrow = 4, byrow = TRUE,
  dimnames = list(Al_1 = allele_levels, Al_2 = allele_levels)
)

# Translate all rows at once instead of looping with 1:nrow(), which would
# iterate over c(1, 0) on an empty data frame. match() turns each allele into
# its row/column position; an allele that is not one of the four letters
# (or is NA) becomes NA.
pair_index <- cbind(
  match(OrderArr$Al_1, allele_levels),
  match(OrderArr$Al_2, allele_levels)
)
# Indexing a matrix with a two-column index matrix picks one cell per row;
# a row containing NA yields NA.
tipo_values <- tipo_table[pair_index]
# Any pair outside the table gets the fallback code "x".
tipo_values[is.na(tipo_values)] <- "x"
OrderArr$tipo <- tipo_values

以下是数据示例:

   ID SNP Al_1 Al_2   tipo
1   1  F1    T    A      d
7   1  F3    C    A      g
13  1  F4    G    C      l
2   2  F1    T    T STHG.T
8   2  F3    C    C STHG.C
14  2  F4    C    C STHG.C

我的问题在于以下 Al_1-Al_2 组合的 OrderArr$tipo 值:A-A、T-T、C-C 或 G-G。这些组合的 OrderArr$tipo 值应当等于具有相同 OrderArr$SNP 值的其他行的 tipo 值,因此上面展示的数据应为:

   ID SNP Al_1 Al_2   tipo
1   1  F1    T    A      d
7   1  F3    C    A      g
13  1  F4    G    C      l
2   2  F1    T    T      d
8   2  F3    C    C      g
14  2  F4    C    C      l

如何在代码中实现此功能?

非常感谢。

1 个答案:

答案 0 :(得分:1)

我创建了一个数据框,其中每个 SNP 只有一种 Al_1 与 Al_2 不相同的组合(其余行的 Al_1 与 Al_2 相同)。

   ID   SNP Al_1 Al_2 combo tipo
1   1    F1    A    T    AT    a
2   1    F4    G    G    GG    z
3   1    D2    C    T    CT    h
4   1    D4    T    C    TC    e
5   1   HY7    A    A    AA    z
6   1  HY66    T    G    TG    f
7   1  XZD1    C    A    CA    g
8   1 XZD33    G    A    GA    j
9   2    F1    A    A    AA    z
10  2    F4    C    G    CG    i
11  2    D2    C    C    CC    z
12  2    D4    T    C    TC    e
13  2   HY7    A    A    AA    z
14  2  HY66    G    G    GG    z
15  2  XZD1    C    A    CA    g
16  2 XZD33    G    A    GA    j
17  3    F1    T    T    TT    z
18  3    F4    C    C    CC    z
19  3    D2    C    T    CT    h
20  3    D4    T    C    TC    e
21  3   HY7    A    C    AC    b
22  3  HY66    G    G    GG    z
23  3  XZD1    A    A    AA    z
24  3 XZD33    A    A    AA    z
25  4    F1    A    T    AT    a
26  4    F4    C    G    CG    i
27  4    D2    C    T    CT    h
28  4    D4    T    T    TT    z
29  4   HY7    C    C    CC    z
30  4  HY66    T    T    TT    z
31  4  XZD1    C    A    CA    g
32  4 XZD33    A    A    AA    z
33  5    F1    T    T    TT    z
34  5    F4    C    G    CG    i
35  5    D2    T    T    TT    z
36  5    D4    T    T    TT    z
37  5   HY7    A    A    AA    z
38  5  HY66    T    G    TG    f
39  5  XZD1    A    A    AA    z
40  5 XZD33    G    G    GG    z
41  6    F1    A    T    AT    a
42  6    F4    G    G    GG    z
43  6    D2    T    T    TT    z
44  6    D4    C    C    CC    z
45  6   HY7    C    C    CC    z
46  6  HY66    T    T    TT    z
47  6  XZD1    C    C    CC    z
48  6 XZD33    G    A    GA    j

我想我找到了解决你问题的办法:

# Concatenate the two alleles of each row into one string ("A" and "T" -> "AT").
data$combo <- paste0(data$Al_1, data$Al_2)
# Distinct SNP identifiers, in order of first appearance.
snp <- unique(data$SNP)

# Codes for pairs of two different alleles. Rows are selected by Al_1,
# columns by Al_2. The diagonal is NA because equal alleles are handled
# separately below.
allele_levels <- c("A", "T", "C", "G")
het_codes <- matrix(
  c(NA,  "a", "b", "c",
    "d", NA,  "e", "f",
    "g", "h", NA,  "i",
    "j", "k", "l", NA),
  nrow = 4, byrow = TRUE,
  dimnames = list(Al_1 = allele_levels, Al_2 = allele_levels)
)

# Classify every row at once. This replaces a 1:nrow(data) loop, which would
# iterate over c(1, 0) on an empty data frame and stop on the first NA allele.
al_1 <- as.character(data$Al_1)
al_2 <- as.character(data$Al_2)
new_tipo <- het_codes[cbind(match(al_1, allele_levels),
                            match(al_2, allele_levels))]
# Any row whose two alleles are equal gets the marker "z", whatever the letter.
same_allele <- !is.na(al_1) & !is.na(al_2) & al_1 == al_2
new_tipo[same_allele] <- "z"

# Rows whose pair is not recognised keep their current tipo value; when the
# column does not exist yet they are left as NA.
if (!"tipo" %in% names(data)) {
  data$tipo <- rep(NA_character_, nrow(data))
}
recognised <- !is.na(new_tipo)
data$tipo[recognised] <- new_tipo[recognised]

# Keep an untouched copy of the classified data; only ord.data2 receives the
# filled-in values.
ord.data <- data

# Row positions grouped by SNP. split() returns the groups in factor-level
# (sorted) order, the same order the groups are stacked in ord.data2.
rows_by_snp <- split(seq_len(nrow(ord.data)), as.factor(ord.data$SNP))
# Only "z" rows whose Al_1 is one of these letters are filled in.
fillable_alleles <- c("A", "T", "C", "G")

filled_tipo <- ord.data$tipo
for (rows in rows_by_snp) {
  tipo_here <- ord.data$tipo[rows]
  is_z <- !is.na(tipo_here) & tipo_here == "z"
  # Codes carried by the non-"z" rows of this SNP.
  letra <- unique(tipo_here[!is.na(tipo_here) & !is_z])
  if (length(letra) == 0L) {
    # Every row of this SNP is "z": nothing to copy from, so leave them as is
    # (assigning a zero-length value would be an error).
    next
  }
  if (length(letra) > 1L) {
    # More than one candidate code: the first one is used.
    warning("SNP has more than one non-'z' tipo; using the first: ",
            paste(letra, collapse = ", "), call. = FALSE)
  }
  fill <- is_z & ord.data$Al_1[rows] %in% fillable_alleles
  filled_tipo[rows[fill]] <- letra[1]
}

# Assemble the result in one step instead of growing it with rbind() inside
# the loop: rows are grouped by SNP, original order within each group, and
# original row names are kept.
ord.data2 <- ord.data
ord.data2$tipo <- filled_tipo
ord.data2 <- ord.data2[unlist(rows_by_snp, use.names = FALSE), ]