用于从次要等位基因频率(MAF)计算Hudson的Fst的R脚本出错

时间:2015-11-20 20:22:45

标签: r hudson fst

我有一个R脚本,用于使用Hudson估算器从次要等位基因频率计算Fst。当我运行脚本时,我在第74行收到以下错误消息(参见下面的代码):

Error in if (p < 2e-16) output <- list() : 
missing value where TRUE/FALSE needed

由于我是R和编程的新手,我不确切地知道问题出在哪里或者应该放置TRUE / FALSE。欢迎任何帮助或建议!

# Code for calculating Hudson's Fst from minor allele frequencies:
# input data frame pop1 is a N x 4 matrix
# where N is the number of SNPs
# row names correspond to the SNP name
# MAF represent the minor allele frequency
# NCHROBS represent the number of chromosome observed (2 x sample size)
# A1 common allele
# A2 variant allele
# example

"""
> head(pop1a,5)
          A1_maj A2_min MAF NCHROBS
rs3094315     G  A   0.18590    156
rs3131972     A  G   0.18350    158
rs3115860     C  A   0.13160    152
rs12562034    A  G   0.09615    156
rs12124819    G  A   0.20950    148
rs2980300     A  G   0.13290    158
"""

# similarly for pop2a:

Hudson.Fst <- function(pop1a,pop2a,call.rate = 0.95,top.number = 10){
  # remove the SNPs that are not in common between the 2 populations
  snp.to.keep <- intersect(row.names(pop1a),row.names(pop2a))
  if (length(snp.to.keep) == 0){print("Error: no SNP in common");return(NULL)}
  pop1a.k <- pop1a[snp.to.keep,]
  pop2a.k <- pop2a[snp.to.keep,]

  # change the reference allele if is not concordant between the 2 populations
  if (sum(pop1a.k$A1_maj == pop2a.k$A1_maj) != length(snp.to.keep)){
    idx <- which(pop1a.k$A1_maj != pop2a.k$A1_maj)
    idx.rev <- which(pop1a.k$A1_maj != pop2a.k$A1_maj & pop1a.k$A1_maj ==     pop2a.k$A2_min)
    idx.rm  <- which(pop1a.k$A1_maj  != pop2a.k$A1_maj & pop1a.k$A1_maj  != pop2a.k$A2_min)
    if(length(idx.rev) > 0){
      provv <- pop1a.k$A1_maj[idx.rev]
      pop1a.k$A1_maj[idx.rev] <- pop1a.k$A2_min[idx.rev]
      pop1a.k$A2_min[idx.rev] <- provv
      pop1a.k$MAF[idx.rev] <- 1 - pop1a.k$MAF[idx.rev]
    }
    if(length(idx.rm) > 0){      
      pop1a.k <- pop1a.k[-idx.rm,]
      pop2a.k <- pop2a.k[-idx.rm,]}}
  # remove SNPs with low call rate in one or both populations
  N1 <- pop1a.k$NCHROBS
  N2 <- pop1a.k$NCHROBS
  idx.rm.pop1a <- which(N1 < max(N1)*call.rate)
  idx.rm.pop2a <- which(N2 < max(N2)*call.rate)
  idx.rm.all <- union(idx.rm.pop1a,idx.rm.pop2a)
  pop1a.k <- pop1a.k[-idx.rm.all,]
  pop2a.k <- pop2a.k[-idx.rm.all,]
  # compute Hudson SNP_Fst and global Fst estimators
  p1 <- pop1a.k$MAF
  p2 <- pop2a.k$MAF
  n1 <- pop1a.k$NCHROBS
  n2 <- pop2a.k$NCHROBS
  fst.N <- (p1 - p2)^2 - p1*(1-p1)/(n1-1) - p2*(1-p2)/(n2-1)
  fst.D <- p1*(1-p2) + p2*(1-p1)
  Fst.v <- fst.N/fst.D
  names(Fst.v) <- row.names(pop1a.k[-idx.rm.all,])
  Fst.o <- Fst.v[order(Fst.v,decreasing=TRUE)]
  mu1 <- mean(fst.N)
  mu2 <- mean(fst.D)
  se1 <- sd(fst.N)/sqrt(length(fst.N))
  se2 <- sd(fst.D)/sqrt(length(fst.D))
  F.global <- mu1/mu2
  se.F <- sqrt(se1^2+se2^2)
  F_L95 <- F.global - 1.96*se.F
  F_U95 <- F.global + 1.96*se.F   
  Z <- F.global/se.F
  p <- 2*(1 - pnorm(Z))
  if(p < 2e-16) {p <- "less than 2e-16"}
# Error in if (p < 2e-16) { : missing value where TRUE/FALSE needed
  output[[1]] <- c(F.global,F_L95,F_U95,p)
  names(output[[1]]) <- c("Hudson.Fst","L.95%.CI","U.95%.CI","p.val")
  output[[2]] <- data.frame(Fst.o[1:top.number])
  names(output[[2]]) <- c("Hudson.Fst")
  return(output)}

# Run Fst for all columns

data1 <- Hudson.Fst(pop1a,pop2a,call.rate = 0.95, top.number = 10)

0 个答案:

没有答案