我有一个这样的VCF文件:
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1
scaffold1 12173 . G A 15.6 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,4:4:12:114
scaffold2 219 . C T 20 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 0/1:0,5:4:12:118
scaffold3 5896 . G C 18 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:125
scaffold4 254496 . G T 18.8 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:104
...
我想根据这样的列表文件替换CHROM和POS:
##OldChr Start End NChr Start End
scaffold1 1 23736781 ChrN1 1 23736781
scaffold2 1 22030373 ChrN1 23736982 45767354
scaffold3 1 20070608 ChrN1 45767555 65838162
scaffold4 1 11585491 ChrN2 1 11585491
...
并把POS字段换算成新CHROM坐标范围内的位置,得到这样的输出:
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1
ChrN1 12173 . G A 15.6 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,4:4:12:114
ChrN1 23737201 . C T 20 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 0/1:0,5:4:12:118
ChrN1 45773451 . G C 18 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:125
ChrN2 254496 . G T 18.8 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:104
...
你可以帮我解决这个问题吗?
答案 0 :(得分:2)
希望这有帮助!
library(dplyr)
# Look up each variant's scaffold in the mapping table, shift POS onto the
# new chromosome, then swap the old coordinate columns for the new ones.
df1 %>%
  # Brings in Start, End, NChr, Start.1 and End.1 from the df2 row whose
  # OldChr equals the variant's CHROM.
  left_join(df2, by = c(CHROM = "OldChr")) %>%
  mutate(
    # POS is kept when the old and new start are equal; otherwise the new
    # start (Start.1) is added to it.
    POS_new = ifelse(Start != Start.1, POS + Start.1, POS)
  ) %>%
  # Remove the old coordinates and the helper columns taken from df2.
  select(-c(CHROM, POS, Start, End, Start.1, End.1)) %>%
  # NChr and POS_new become the final CHROM and POS columns.
  rename(CHROM = NChr, POS = POS_new)
输出是:
ID REF ALT QUAL FILTER INFO FORMAT Sample1 CHROM POS
1 . G A 15.6 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,4:4:12:114 ChrN1 12173
2 . C T 20.0 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 0/1:0,5:4:12:118 ChrN1 23737201
3 . G C 18.0 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:125 ChrN1 45773451
4 . G T 18.8 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:104 ChrN2 254496
示例数据:
# Example VCF records: four variants, one per scaffold, with the standard
# VCF columns plus a single sample column.
df1 <- data.frame(
  CHROM = paste0("scaffold", 1:4),
  POS = c(12173L, 219L, 5896L, 254496L),
  ID = rep(".", 4),
  REF = c("G", "C", "G", "G"),
  ALT = c("A", "T", "C", "T"),
  QUAL = c(15.6, 20, 18, 18.8),
  FILTER = rep("PASSED", 4),
  INFO = rep("AC=1;AF=0.500;AN=2;...", 4),
  FORMAT = rep("GT:AD:DP:GQ:PL", 4),
  Sample1 = c(
    "1/1:0,4:4:12:114",
    "0/1:0,5:4:12:118",
    "1/1:0,5:4:12:125",
    "1/1:0,5:4:12:104"
  ),
  # Keep text columns as character vectors on R versions before 4.0 as well.
  stringsAsFactors = FALSE
)
# Example mapping table: for each old scaffold, its own coordinate range
# (Start, End) and the target chromosome with the range it occupies there
# (NChr, Start.1, End.1).
df2 <- data.frame(
  OldChr = paste0("scaffold", 1:4),
  Start = rep(1L, 4),
  End = c(23736781L, 22030373L, 20070608L, 11585491L),
  NChr = c("ChrN1", "ChrN1", "ChrN1", "ChrN2"),
  Start.1 = c(1L, 23736982L, 45767555L, 1L),
  End.1 = c(23736781L, 45767354L, 65838162L, 11585491L),
  # Keep text columns as character vectors on R versions before 4.0 as well.
  stringsAsFactors = FALSE
)
答案 1 :(得分:1)
您可以使用以下awk命令来实现此目的:
$ more file.vcf mapping
::::::::::::::
file.vcf
::::::::::::::
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1
scaffold1 12173 . G A 15.6 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,4:4:12:114
scaffold2 219 . C T 20 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 0/1:0,5:4:12:118
scaffold3 5896 . G C 18 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:125
scaffold4 254496 . G T 18.8 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:104
::::::::::::::
mapping
::::::::::::::
##OldChr Start End NChr Start End
scaffold1 1 23736781 ChrN1 1 23736781
scaffold2 1 22030373 ChrN1 23736982 45767354
scaffold3 1 20070608 ChrN1 45767555 65838162
scaffold4 1 11585491 ChrN2 1 11585491
$ awk '
    # First file (mapping): remember the new chromosome ($4) and the new
    # start ($5), keyed by the old scaffold name ($1). Lines starting with
    # "#" (the header) are ignored. Keying by name instead of by line number
    # lets the VCF hold any number of variants per scaffold, in any order.
    NR == FNR && !/^#/ { chr[$1] = $4; start[$1] = $5 }
    NR == FNR { next }
    # Second file (VCF): rewrite data lines whose scaffold is in the mapping.
    # POS is shifted by the new start only when that start is greater than 1;
    # $2 is updated before $1 because the lookup still needs the old name.
    !/^#/ && ($1 in chr) { $2 = (start[$1] > 1) ? start[$1] + $2 : $2; $1 = chr[$1] }
    # Print every VCF line, header lines included.
    { print }
' mapping file.vcf | column -t
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1
ChrN1 12173 . G A 15.6 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,4:4:12:114
ChrN1 23737201 . C T 20 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 0/1:0,5:4:12:118
ChrN1 45773451 . G C 18 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:125
ChrN2 254496 . G T 18.8 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:104