根据某一列的变化交换另外四列的顺序

时间:2015-12-30 08:58:54

标签: r data.table

我需要根据第4栏(等位基因)的变化交换4列(6-7和10-11)的顺序,其中:

A/C should be C/A
G/C should be C/G 
A/T should be T/A
G/A should be A/G 
G/T should be T/G
T/C should be C/T

当列4改变时,必须交换该行上列6:7(Major_Allele_Frequency、Minor_Allele_Frequency)和列10:11(X.HomA、X.HomB)的顺序。 例如:

library(data.table)
# Example input from the question: one row per site with a header row.
# `alleles` is column 4, the allele frequencies are columns 6-7 and
# X.HomA / X.HomB are columns 10-11.
data <- "chr start tag alleles Number_of_Taxa Major_Allele_Frequency Minor_Allele_Frequency Number_Heterozygous X.Heterozygous X.HomA X.HomB
chr1 1 chr1-1 A 23 1 0 0 0.00 100.00 0.00
chr1 2 chr1-2 A/C 23 0.58696 0.41304 9 39.13 35.73 25.14
chr1 3 chr1-3 C/A 23 0.88636 0.11364 3 13.04 77.07 9.88
chr1 4 chr1-4 C/G 23 1 0 0 0.00 100.00 0.00
chr1 5 chr1-5 C/T 23 0.52174 0.47826 18 78.26 11.34 10.40
chr1 6 chr1-6 G 23 1 0 0 0.00 100.00 0.00
chr1 7 chr1-7 G/C 23 0.97727 0.02273 1 4.35 93.48 2.17
chr1 8 chr1-8 T 23 1 0 0 0.00 100.00 0.00
chr1 9 chr1-9 T/C 23 0.88636 0.11364 5 21.74 69.37 8.89
chr1 10 chr1-10 A/G 23 0.5 0.5 6 26.09 36.96 36.96
chr1 11 chr1-11 A/T 23 0.52174 0.47826 12 52.17 24.95 22.87
chr1 12 chr1-12 T/A 23 0.80435 0.19565 9 39.13 48.96 11.91
chr1 13 chr1-13 G/A 23 1 0 0 0.00 100.00 0.00
chr1 14 chr1-14 G/T 23 0.475 0.525  17 73.91 12.39 13.70
chr2 1 chr2-1 T/G 23 0.525 0.475 17 73.91 13.70 12.39
chr2 2 chr2-2 C 23 100 0 0 0 100 0"
# Parse the text block into a data.frame. TRUE is spelled out because `T` is
# an ordinary variable that can be reassigned.
data <- read.table(text = data, header = TRUE)


Expected outcome:
# Desired result for the example input, in the same layout as the input table.
# NOTE(review): row chr1-13 shows the flipped allele pair (A/G) but its
# frequency and X.HomA/X.HomB values are not exchanged, unlike the other
# flipped rows (chr1-2, chr1-7, chr1-9, chr1-11, chr1-14) - confirm whether
# that row is intentional.
expected <- "chr start tag alleles Number_of_Taxa Major_Allele_Frequency Minor_Allele_Frequency Number_Heterozygous X.Heterozygous X.HomA X.HomB
chr1 1 chr1-1 A 23 1 0 0 0.00 100.00 0.00
chr1 2 chr1-2 C/A 23 0.41304 0.58696 9 39.13 25.14 35.73
chr1 3 chr1-3 C/A 23 0.88636 0.11364 3 13.04 77.07 9.88
chr1 4 chr1-4 C/G 23 1 0 0 0.00 100.00 0.00
chr1 5 chr1-5 C/T 23 0.52174 0.47826 18 78.26 11.34 10.40
chr1 6 chr1-6 G 23 1 0 0 0.00 100.00 0.00
chr1 7 chr1-7 C/G 23 0.02273 0.97727 1 4.35 2.17 93.48
chr1 8 chr1-8 T 23 1 0 0 0.00 100.00 0.00
chr1 9 chr1-9 C/T 23 0.11364 0.88636 5 21.74 8.89 69.37
chr1 10 chr1-10 A/G 23 0.5 0.5 6 26.09 36.96 36.96
chr1 11 chr1-11 T/A 23 0.47826 0.52174 12 52.17 22.87 24.95
chr1 12 chr1-12 T/A 23 0.80435 0.19565 9 39.13 48.96 11.91
chr1 13 chr1-13 A/G 23 1 0 0 0.00 100.00 0.00
chr1 14 chr1-14 T/G 23 0.525 0.475 17 73.91 13.70 12.39
chr2 1 chr2-1 T/G 23 0.525 0.475 17 73.91 13.70 12.39
chr2 2 chr2-2 C 23 100 0 0 0 100 0"
# Parse the text block into a data.frame. TRUE is spelled out because `T` is
# an ordinary variable that can be reassigned.
expected <- read.table(text = expected, header = TRUE)

4 个答案:

答案 0 :(得分:3)

由于这些等位基因的字符顺序显然是反的,您可以使用 ?strsplit 帮助页中给出的 strReverse 函数拆分 alleles 列中的值、颠倒字符顺序后再粘贴回去,并用以下代码交换上述各列:

# String reverse function adapted from the examples in `?strsplit`.
# `x` is a character vector; the result is a character vector of the same
# length in which the characters of every element are in reverse order.
# vapply() is used instead of sapply() so the result type is fixed:
# zero-length input gives character(0) rather than an empty list.
strReverse <- function(x) {
  vapply(
    lapply(strsplit(x, NULL), rev),
    paste,
    character(1),
    collapse = ""
  )
}

library(data.table)
# Turn `dat` into a data.table by reference and, only on rows whose `alleles`
# value is one of the six listed pairs, reverse the allele string and exchange
# Major/Minor allele frequencies and X.HomA/X.HomB.
# The replacement list is built from the current column values, so each pair
# of columns trades places instead of one overwriting the other.
setDT(dat)[
  alleles %in% c("A/C", "G/C", "A/T", "G/A", "G/T", "T/C"),
  c(
    "alleles",
    "Major_Allele_Frequency",
    "Minor_Allele_Frequency",
    "X.HomA",
    "X.HomB"
  ) := list(
    strReverse(as.character(alleles)),
    Minor_Allele_Frequency,
    Major_Allele_Frequency,
    X.HomB,
    X.HomA
  )
]

或者,您可以使用stringi包中的stri_reverse函数:

library(stringi)
library(data.table)
# Same update as the strReverse variant, but the allele string is reversed
# with stringi's stri_reverse(): on rows whose `alleles` value is one of the
# six listed pairs, flip the allele string and exchange Major/Minor allele
# frequencies and X.HomA/X.HomB.
setDT(dat)[
  alleles %in% c("A/C", "G/C", "A/T", "G/A", "G/T", "T/C"),
  c(
    "alleles",
    "Major_Allele_Frequency",
    "Minor_Allele_Frequency",
    "X.HomA",
    "X.HomB"
  ) := list(
    stri_reverse(alleles),
    Minor_Allele_Frequency,
    Major_Allele_Frequency,
    X.HomB,
    X.HomA
  )
]

两者都给出了:

> dat
     chr start     tag alleles Number_of_Taxa Major_Allele_Frequency Minor_Allele_Frequency Number_Heterozygous X.Heterozygous X.HomA X.HomB
 1: chr1     1  chr1-1       A             23                1.00000                0.00000                   0           0.00 100.00   0.00
 2: chr1     2  chr1-2     C/A             23                0.41304                0.58696                   9          39.13  25.14  35.73
 3: chr1     3  chr1-3     C/A             23                0.88636                0.11364                   3          13.04  77.07   9.88
 4: chr1     4  chr1-4     C/G             23                1.00000                0.00000                   0           0.00 100.00   0.00
 5: chr1     5  chr1-5     C/T             23                0.52174                0.47826                  18          78.26  11.34  10.40
 6: chr1     6  chr1-6       G             23                1.00000                0.00000                   0           0.00 100.00   0.00
 7: chr1     7  chr1-7     C/G             23                0.02273                0.97727                   1           4.35   2.17  93.48
 8: chr1     8  chr1-8       T             23                1.00000                0.00000                   0           0.00 100.00   0.00
 9: chr1     9  chr1-9     C/T             23                0.11364                0.88636                   5          21.74   8.89  69.37
10: chr1    10 chr1-10     A/G             23                0.50000                0.50000                   6          26.09  36.96  36.96
11: chr1    11 chr1-11     T/A             23                0.47826                0.52174                  12          52.17  22.87  24.95
12: chr1    12 chr1-12     T/A             23                0.80435                0.19565                   9          39.13  48.96  11.91
13: chr1    13 chr1-13     A/G             23                0.00000                1.00000                   0           0.00   0.00 100.00
14: chr1    14 chr1-14     T/G             23                0.52500                0.47500                  17          73.91  13.70  12.39
15: chr2     1  chr2-1     T/G             23                0.52500                0.47500                  17          73.91  13.70  12.39
16: chr2     2  chr2-2       C             23              100.00000                0.00000                   0           0.00 100.00   0.00

PS:最好不要把数据集命名为data,因此我使用dat作为名称

答案 1 :(得分:2)

这很简单:

# Convert the question's data.frame to a data.table so `:=` can update columns.
data <- as.data.table(data)

# Lookup table: each allele pair that must be flipped and its flipped form.
# All six pairs from the question are listed, including T/C -> C/T.
tab <- data.table(
  original = c("A/C", "G/C", "A/T", "G/A", "G/T", "T/C"),
  change = c("C/A", "C/G", "T/A", "A/G", "T/G", "C/T")
)

# One pass per pair: rewrite the allele string and exchange Major/Minor allele
# frequencies and X.HomA/X.HomB on the matching rows.
# seq_len() keeps the loop from running when `tab` has no rows.
for (i in seq_len(nrow(tab))) {
  data[
    alleles == tab[i, original],
    c(
      "alleles",
      "Major_Allele_Frequency",
      "Minor_Allele_Frequency",
      "X.HomA",
      "X.HomB"
    ) := list(
      tab[i, change],
      Minor_Allele_Frequency,
      Major_Allele_Frequency,
      X.HomB,
      X.HomA
    )
  ]
}

答案 2 :(得分:1)

今天给出一个老式的回答。我已将变量名称更改为data1

# Flip the listed allele pairs in `data1` and, on the same rows, exchange
# Major/Minor allele frequencies and X.HomA/X.HomB.
# All six pairs from the question are handled (including T/C -> C/T), and the
# update works on whole columns instead of repeating one branch per pair.
flip_from <- c("A/C", "G/C", "A/T", "G/A", "G/T", "T/C")
flip_to <- c("C/A", "C/G", "T/A", "A/G", "T/G", "C/T")

# Position of each row's allele pair in `flip_from`; NA means the row is left
# untouched.
flip_idx <- match(as.character(data1$alleles), flip_from)
flip_rows <- which(!is.na(flip_idx))

if (length(flip_rows) > 0L) {
  # Edit the alleles as character so that, if the column is a factor, a
  # flipped pair that is not yet a level cannot become NA. A factor column is
  # rebuilt as a factor afterwards (its levels are recomputed).
  alleles_chr <- as.character(data1$alleles)
  alleles_chr[flip_rows] <- flip_to[flip_idx[flip_rows]]
  data1$alleles <- if (is.factor(data1$alleles)) {
    factor(alleles_chr)
  } else {
    alleles_chr
  }

  # Exchange the two frequency columns on the flipped rows.
  temp <- data1$Major_Allele_Frequency[flip_rows]
  data1$Major_Allele_Frequency[flip_rows] <-
    data1$Minor_Allele_Frequency[flip_rows]
  data1$Minor_Allele_Frequency[flip_rows] <- temp

  # Exchange the two homozygous-percentage columns on the flipped rows.
  temp <- data1$X.HomA[flip_rows]
  data1$X.HomA[flip_rows] <- data1$X.HomB[flip_rows]
  data1$X.HomB[flip_rows] <- temp
}

答案 3 :(得分:0)

使用基础R(base R),这里假设等位基因的任何变化都是一次交换,不再做进一步检查:

# Base R approach: any row whose allele string differs between `data` and the
# target vector is treated as a flip, with no further checking.
swapped <- data
alleles <- as.character(expected$alleles) # or other vector, since I think expected won't exist yet

# The comparison below would silently recycle a vector of the wrong length.
stopifnot(length(alleles) == nrow(data))

# Rows whose allele string has to change.
changes <- which(as.character(data$alleles) != alleles)

# This is the swap: columns 6/7 (Major/Minor allele frequency) and 10/11
# (X.HomA/X.HomB) trade places on the changed rows.
swapped[changes, c(6, 7, 10, 11)] <- data[changes, c(7, 6, 11, 10)]

# Assign only the target alleles of the changed rows; the full vector here
# would be longer than the rows being replaced and give wrong values.
swapped$alleles[changes] <- alleles[changes]