如何根据txt映射文件替换VCF文件中的CHROM和POS

时间:2018-02-07 05:38:43

标签: r awk sed

我有一个这样的VCF文件:

#CHROM      POS     ID      REF ALT QUAL    FILTER  INFO                     FORMAT         Sample1
scaffold1   12173   .       G   A   15.6    PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 1/1:0,4:4:12:114
scaffold2   219     .       C   T   20      PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 0/1:0,5:4:12:118
scaffold3   5896    .       G   C   18      PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 1/1:0,5:4:12:125
scaffold4   254496  .       G   T   18.8    PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 1/1:0,5:4:12:104
...

我想根据这样的列表文件替换CHROM和POS:

##OldChr    Start   End         NChr    Start       End
scaffold1   1       23736781    ChrN1   1           23736781
scaffold2   1       22030373    ChrN1   23736982    45767354
scaffold3   1       20070608    ChrN1   45767555    65838162
scaffold4   1       11585491    ChrN2   1           11585491
...

并把POS换算成新CHROM坐标范围内的位置,得到这样的输出:

#CHROM      POS     ID      REF ALT QUAL    FILTER  INFO                     FORMAT         Sample1
ChrN1      12173    .       G   A   15.6    PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 1/1:0,4:4:12:114
ChrN1      23737201 .       C   T   20      PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 0/1:0,5:4:12:118
ChrN1      45773451 .       G   C   18      PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 1/1:0,5:4:12:125
ChrN2      254496   .       G   T   18.8    PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 1/1:0,5:4:12:104
...

你可以帮我解决这个问题吗?

2 个答案:

答案 0 :(得分:2)

希望这有帮助!

library(dplyr)

# Map every variant onto the new assembly: look up its scaffold in the
# mapping table, compute the new position, then drop the old coordinates.
df1 %>%
  left_join(df2, by = c("CHROM" = "OldChr")) %>%
  # When the new start differs from the old start, the new start is added to
  # POS; otherwise POS is kept unchanged.
  mutate(POS_new = ifelse(Start != Start.1, POS + Start.1, POS)) %>%
  # Discard the original coordinates and the mapping helper columns.
  select(-c(CHROM, POS, Start, End, Start.1, End.1)) %>%
  # The mapped columns take over the standard names; they remain the last
  # two columns of the result.
  rename(CHROM = NChr, POS = POS_new)

输出是:

  ID REF ALT QUAL FILTER                   INFO         FORMAT          Sample1 CHROM      POS
1  .   G   A 15.6 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,4:4:12:114 ChrN1    12173
2  .   C   T 20.0 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 0/1:0,5:4:12:118 ChrN1 23737201
3  .   G   C 18.0 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:125 ChrN1 45773451
4  .   G   T 18.8 PASSED AC=1;AF=0.500;AN=2;... GT:AD:DP:GQ:PL 1/1:0,5:4:12:104 ChrN2   254496

示例数据:

# Example VCF records: one variant per scaffold, with the columns of the
# VCF shown in the question plus a single sample column.
df1 <- data.frame(
  CHROM = c("scaffold1", "scaffold2", "scaffold3", "scaffold4"),
  POS = c(12173L, 219L, 5896L, 254496L),
  ID = rep(".", 4),
  REF = c("G", "C", "G", "G"),
  ALT = c("A", "T", "C", "T"),
  QUAL = c(15.6, 20, 18, 18.8),
  FILTER = rep("PASSED", 4),
  INFO = rep("AC=1;AF=0.500;AN=2;...", 4),
  FORMAT = rep("GT:AD:DP:GQ:PL", 4),
  Sample1 = c(
    "1/1:0,4:4:12:114", "0/1:0,5:4:12:118",
    "1/1:0,5:4:12:125", "1/1:0,5:4:12:104"
  ),
  stringsAsFactors = FALSE
)

# Scaffold-to-chromosome mapping: Start/End are the range on the old
# scaffold, Start.1/End.1 the matching range on the new chromosome NChr.
df2 <- data.frame(
  OldChr = c("scaffold1", "scaffold2", "scaffold3", "scaffold4"),
  Start = rep(1L, 4),
  End = c(23736781L, 22030373L, 20070608L, 11585491L),
  NChr = c("ChrN1", "ChrN1", "ChrN1", "ChrN2"),
  Start.1 = c(1L, 23736982L, 45767555L, 1L),
  End.1 = c(23736781L, 45767354L, 65838162L, 11585491L),
  stringsAsFactors = FALSE
)

答案 1 :(得分:1)

您可以使用以下awk命令来实现此目的:

$ more file.vcf mapping 
::::::::::::::
file.vcf
::::::::::::::
#CHROM      POS     ID      REF ALT QUAL    FILTER  INFO                     FORMAT         Sample1
scaffold1   12173   .       G   A   15.6    PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 1/1:0,4:4:12:114
scaffold2   219     .       C   T   20      PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 0/1:0,5:4:12:118
scaffold3   5896    .       G   C   18      PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 1/1:0,5:4:12:125
scaffold4   254496  .       G   T   18.8    PASSED  AC=1;AF=0.500;AN=2;...   GT:AD:DP:GQ:PL 1/1:0,5:4:12:104
::::::::::::::
mapping
::::::::::::::
##OldChr    Start   End         NChr    Start       End
scaffold1   1       23736781    ChrN1   1           23736781
scaffold2   1       22030373    ChrN1   23736982    45767354
scaffold3   1       20070608    ChrN1   45767555    65838162
scaffold4   1       11585491    ChrN2   1           11585491

$ awk '
    # Mapping file (first argument): index the new chromosome name and the
    # new start by the OLD scaffold name, so the lookup does not depend on
    # line order or on having exactly one VCF record per scaffold.
    # "#" header lines and short/blank lines are skipped.
    NR == FNR {
        if ($0 !~ /^#/ && NF >= 5) { new_chr[$1] = $4; new_start[$1] = $5 }
        next
    }
    # VCF records whose scaffold is in the mapping: add the new start to POS
    # (only when that start is greater than 1), then rename CHROM.
    # POS must be updated before $1 is overwritten, since $1 is the key.
    !/^#/ && ($1 in new_chr) {
        if (new_start[$1] > 1) $2 += new_start[$1]
        $1 = new_chr[$1]
    }
    # Print every VCF line; "#" header lines and records on unmapped
    # scaffolds pass through unchanged.
    1
' mapping file.vcf | column -t
#CHROM  POS       ID  REF  ALT  QUAL  FILTER  INFO                    FORMAT          Sample1
ChrN1   12173     .   G    A    15.6  PASSED  AC=1;AF=0.500;AN=2;...  GT:AD:DP:GQ:PL  1/1:0,4:4:12:114
ChrN1   23737201  .   C    T    20    PASSED  AC=1;AF=0.500;AN=2;...  GT:AD:DP:GQ:PL  0/1:0,5:4:12:118
ChrN1   45773451  .   G    C    18    PASSED  AC=1;AF=0.500;AN=2;...  GT:AD:DP:GQ:PL  1/1:0,5:4:12:125
ChrN2   254496    .   G    T    18.8  PASSED  AC=1;AF=0.500;AN=2;...  GT:AD:DP:GQ:PL  1/1:0,5:4:12:104