我需要根据第4栏(等位基因)的变化交换4列(6-7和10-11)的顺序,其中:
A/C should be C/A
G/C should be C/G
A/T should be T/A
G/A should be A/G
G/T should be T/G
T/C should be C/T
当第4列(alleles)改变时,必须交换该行上第6:7列(Major_Allele_Frequency、Minor_Allele_Frequency)和第10:11列(X.HomA、X.HomB)的顺序。例如:
library(data.table)
# Example input, one row per site. Per the header line, columns 6:7 are the
# major/minor allele frequencies and columns 10:11 are X.HomA/X.HomB.
data <- "chr start tag alleles Number_of_Taxa Major_Allele_Frequency Minor_Allele_Frequency Number_Heterozygous X.Heterozygous X.HomA X.HomB
chr1 1 chr1-1 A 23 1 0 0 0.00 100.00 0.00
chr1 2 chr1-2 A/C 23 0.58696 0.41304 9 39.13 35.73 25.14
chr1 3 chr1-3 C/A 23 0.88636 0.11364 3 13.04 77.07 9.88
chr1 4 chr1-4 C/G 23 1 0 0 0.00 100.00 0.00
chr1 5 chr1-5 C/T 23 0.52174 0.47826 18 78.26 11.34 10.40
chr1 6 chr1-6 G 23 1 0 0 0.00 100.00 0.00
chr1 7 chr1-7 G/C 23 0.97727 0.02273 1 4.35 93.48 2.17
chr1 8 chr1-8 T 23 1 0 0 0.00 100.00 0.00
chr1 9 chr1-9 T/C 23 0.88636 0.11364 5 21.74 69.37 8.89
chr1 10 chr1-10 A/G 23 0.5 0.5 6 26.09 36.96 36.96
chr1 11 chr1-11 A/T 23 0.52174 0.47826 12 52.17 24.95 22.87
chr1 12 chr1-12 T/A 23 0.80435 0.19565 9 39.13 48.96 11.91
chr1 13 chr1-13 G/A 23 1 0 0 0.00 100.00 0.00
chr1 14 chr1-14 G/T 23 0.475 0.525 17 73.91 12.39 13.70
chr2 1 chr2-1 T/G 23 0.525 0.475 17 73.91 13.70 12.39
chr2 2 chr2-2 C 23 100 0 0 0 100 0"
# Parse the text into a data.frame. TRUE is spelled out because the shorthand
# T is an ordinary variable that can be reassigned.
data <- read.table(text = data, header = TRUE)
Expected outcome:
# Desired result: same layout as the input, with the flipped allele codes and
# the corresponding frequency / homozygous columns as the asker wants them.
expected <- "chr start tag alleles Number_of_Taxa Major_Allele_Frequency Minor_Allele_Frequency Number_Heterozygous X.Heterozygous X.HomA X.HomB
chr1 1 chr1-1 A 23 1 0 0 0.00 100.00 0.00
chr1 2 chr1-2 C/A 23 0.41304 0.58696 9 39.13 25.14 35.73
chr1 3 chr1-3 C/A 23 0.88636 0.11364 3 13.04 77.07 9.88
chr1 4 chr1-4 C/G 23 1 0 0 0.00 100.00 0.00
chr1 5 chr1-5 C/T 23 0.52174 0.47826 18 78.26 11.34 10.40
chr1 6 chr1-6 G 23 1 0 0 0.00 100.00 0.00
chr1 7 chr1-7 C/G 23 0.02273 0.97727 1 4.35 2.17 93.48
chr1 8 chr1-8 T 23 1 0 0 0.00 100.00 0.00
chr1 9 chr1-9 C/T 23 0.11364 0.88636 5 21.74 8.89 69.37
chr1 10 chr1-10 A/G 23 0.5 0.5 6 26.09 36.96 36.96
chr1 11 chr1-11 T/A 23 0.47826 0.52174 12 52.17 22.87 24.95
chr1 12 chr1-12 T/A 23 0.80435 0.19565 9 39.13 48.96 11.91
chr1 13 chr1-13 A/G 23 1 0 0 0.00 100.00 0.00
chr1 14 chr1-14 T/G 23 0.525 0.475 17 73.91 13.70 12.39
chr2 1 chr2-1 T/G 23 0.525 0.475 17 73.91 13.70 12.39
chr2 2 chr2-2 C 23 100 0 0 0 100 0"
# Parse the text into a data.frame. TRUE is spelled out because the shorthand
# T is an ordinary variable that can be reassigned.
expected <- read.table(text = expected, header = TRUE)
答案 0 :(得分:3)
由于这些等位基因编码的字符顺序是反的,您可以把alleles列中的值拆分、颠倒字符顺序后再粘贴回去(使用?strsplit帮助页中给出的strReverse函数),并用以下代码交换上述各列:
# Reverse the characters of each element of a character vector
# (adapted from the example in '?strsplit').
#
# x: a character vector.
# Returns a character vector of the same length as x; missing values stay NA.
strReverse <- function(x) {
  # vapply() always yields a character vector, even for zero-length input,
  # whereas sapply() would return an empty list in that case.
  reversed <- vapply(
    lapply(strsplit(x, NULL), rev),
    paste,
    character(1),
    collapse = ""
  )
  # paste() turns a missing value into the literal string "NA"; undo that.
  reversed[is.na(x)] <- NA_character_
  reversed
}
library(data.table)
# For rows whose allele code is written in the wrong order, reverse the code
# and exchange the major/minor frequency columns and the HomA/HomB columns.
# The update is done by reference on `dat`.
setDT(dat)[
  alleles %in% c("A/C","G/C","A/T","G/A","G/T","T/C"),
  c("alleles",
    "Major_Allele_Frequency", "Minor_Allele_Frequency",
    "X.HomA", "X.HomB") := list(
    strReverse(as.character(alleles)),
    Minor_Allele_Frequency,
    Major_Allele_Frequency,
    X.HomB,
    X.HomA
  )
]
或者,您可以使用stringi包中的stri_reverse函数:
library(stringi)
library(data.table)
# Same in-place update as the strReverse variant, but the allele code is
# reversed with stringi's stri_reverse().
setDT(dat)[
  alleles %in% c("A/C","G/C","A/T","G/A","G/T","T/C"),
  c("alleles", "Major_Allele_Frequency", "Minor_Allele_Frequency", "X.HomA", "X.HomB") :=
    list(
      stri_reverse(alleles),
      Minor_Allele_Frequency, Major_Allele_Frequency,
      X.HomB, X.HomA
    )
]
两者都给出了:
> dat
chr start tag alleles Number_of_Taxa Major_Allele_Frequency Minor_Allele_Frequency Number_Heterozygous X.Heterozygous X.HomA X.HomB
1: chr1 1 chr1-1 A 23 1.00000 0.00000 0 0.00 100.00 0.00
2: chr1 2 chr1-2 C/A 23 0.41304 0.58696 9 39.13 25.14 35.73
3: chr1 3 chr1-3 C/A 23 0.88636 0.11364 3 13.04 77.07 9.88
4: chr1 4 chr1-4 C/G 23 1.00000 0.00000 0 0.00 100.00 0.00
5: chr1 5 chr1-5 C/T 23 0.52174 0.47826 18 78.26 11.34 10.40
6: chr1 6 chr1-6 G 23 1.00000 0.00000 0 0.00 100.00 0.00
7: chr1 7 chr1-7 C/G 23 0.02273 0.97727 1 4.35 2.17 93.48
8: chr1 8 chr1-8 T 23 1.00000 0.00000 0 0.00 100.00 0.00
9: chr1 9 chr1-9 C/T 23 0.11364 0.88636 5 21.74 8.89 69.37
10: chr1 10 chr1-10 A/G 23 0.50000 0.50000 6 26.09 36.96 36.96
11: chr1 11 chr1-11 T/A 23 0.47826 0.52174 12 52.17 22.87 24.95
12: chr1 12 chr1-12 T/A 23 0.80435 0.19565 9 39.13 48.96 11.91
13: chr1 13 chr1-13 A/G 23 0.00000 1.00000 0 0.00 0.00 100.00
14: chr1 14 chr1-14 T/G 23 0.52500 0.47500 17 73.91 13.70 12.39
15: chr2 1 chr2-1 T/G 23 0.52500 0.47500 17 73.91 13.70 12.39
16: chr2 2 chr2-2 C 23 100.00000 0.00000 0 0.00 100.00 0.00
PS:最好不要把数据集命名为data(这会与R自带的data()函数重名),因此我使用dat作为名称。
答案 1 :(得分:2)
这很简单:
data <- as.data.table(data)
# Lookup table: allele code as found -> the same code with the order flipped.
# All six pairs are listed, including "T/C" -> "C/T".
tab <- data.table(
  original = c("A/C", "G/C", "A/T", "G/A", "G/T", "T/C"),
  change = c("C/A", "C/G", "T/A", "A/G", "T/G", "C/T")
)
# seq_len() yields an empty sequence for an empty table (1:0 would not).
for (i in seq_len(nrow(tab))) {
  # Rewrite the allele code and exchange the two column pairs for the rows
  # matching the i-th original code; `:=` updates `data` by reference.
  data[
    alleles == tab[i, original],
    c("alleles", "Major_Allele_Frequency", "Minor_Allele_Frequency",
      "X.HomA", "X.HomB") :=
      list(tab[i, change], Minor_Allele_Frequency, Major_Allele_Frequency,
           X.HomB, X.HomA)
  ]
}
答案 2 :(得分:1)
这里给出一个老式的回答。我已将变量名称更改为data1。
# Allele codes written in the wrong order, mapped to their flipped form.
# All six pairs are covered, including "T/C" -> "C/T".
swap_map <- c(
  "A/C" = "C/A",
  "G/C" = "C/G",
  "A/T" = "T/A",
  "G/A" = "A/G",
  "G/T" = "T/G",
  "T/C" = "C/T"
)

# Rows to fix. %in% never returns NA, so missing allele codes are skipped,
# and a zero-row data1 simply gives an empty index.
allele_codes <- as.character(data1$alleles)
flip_rows <- which(allele_codes %in% names(swap_map))

# Replace each code with its flipped form.
data1$alleles[flip_rows] <- unname(swap_map[allele_codes[flip_rows]])

# Exchange major/minor allele frequencies on those rows.
held <- data1$Major_Allele_Frequency[flip_rows]
data1$Major_Allele_Frequency[flip_rows] <- data1$Minor_Allele_Frequency[flip_rows]
data1$Minor_Allele_Frequency[flip_rows] <- held

# Exchange X.HomA / X.HomB on those rows.
held <- data1$X.HomA[flip_rows]
data1$X.HomA[flip_rows] <- data1$X.HomB[flip_rows]
data1$X.HomB[flip_rows] <- held
答案 3 :(得分:0)
使用base R;这里我假设等位基因的任何变化都是一次交换,不再做进一步检查:
# Work on a copy so the original data stays untouched.
swapped <- data
alleles <- as.character(expected$alleles) # or other vector, since I think expected won't exist yet
# Rows whose allele code differs from the target code.
changes <- which(as.character(data$alleles) != alleles)
# The swap: columns 6<->7 and 10<->11 are exchanged on the changed rows.
swapped[changes, c(6, 7, 10, 11)] <- data[changes, c(7, 6, 11, 10)]
# Assign only the target codes of the changed rows; the full-length vector
# would be truncated and put the wrong codes into those rows.
swapped$alleles[changes] <- alleles[changes]