
时间:2015-08-20 16:12:10

标签: r aggregate apply




Model Factor1 Factor2 stats1 stats2
M1    0.0001  0.2     -1.0   0.9
M1    0.0001  0.2     -1.3   0.5
M1    0.0002  0.3     -1.9   0.2
M2    0.0001  0.2     -2.0   0.2
M2    0.0001  0.2     -2.0   0.2
M2    0.0002  0.3     -2.1   0.4
M3    0.0001  0.2      9.9   0.4
M3    0.0001  0.2      8.3   0.4
M3    0.0002  0.3      8.0   0.4
M4    0.0001  0.2      3.0   0.1
M4    0.0001  0.2      3.5   0.3
M4    0.0002  0.3      3.2   0.3



 # Replaces infinite values (-Inf / Inf) in a numeric vector with 0.
 #
 # Used to neutralise log(0) = -Inf before the logs are multiplied by
 # probabilities. Finite values, NA and NaN are returned unchanged.
 #
 # log.num: numeric vector (typically the output of log()).
 # Returns a numeric vector of the same length as log.num.
 test.logs <- function(log.num) {
   log.num[is.infinite(log.num)] <- 0
   log.num
 }



# Kullback-Leibler style sum between two discrete probability vectors:
# sum(x.p * (log(x.p) - log(y.p))).
#
# x.p, y.p: probability vectors for the x and y distributions, evaluated on
#           the same bins (same length, same order).
# Returns a single numeric value.
#
# Infinite logs (log(0) = -Inf) are replaced by 0 via test.logs(), so a bin
# with zero probability does not produce NaN / Inf in the sum.
kl.dist <- function(x.p, y.p) {
  log.x <- test.logs(log(x.p))
  log.y <- test.logs(log(y.p))

  sum(x.p * (log.x - log.y))
}



# Jensen-Shannon measure between two discrete distributions, computed as the
# average of kl.dist() from each distribution to their mixture.
#
# x.p, y.p: probability vectors of the two distributions on the same bins.
# M.p:      probability vector of the mixture; the callers in this file
#           build it as the average of the two density curves times the
#           bin width.
# Returns a single numeric value.
js.dist <- function(x.p, y.p, M.p) {
  0.5 * kl.dist(x.p, M.p) + 0.5 * kl.dist(y.p, M.p)
}




# Kernel density estimates of the two samples, both evaluated on the same
# 512-point grid between `min` and `max`.
# NOTE(review): `x`, `y`, `min` and `max` are not defined in this snippet;
# `min` / `max` are presumably numeric bounds that shadow the base
# functions of the same name -- confirm before running.
x.d <- density(x, n=512, from=min, to=max)
y.d <- density(y, n=512, from=min, to=max)
# point-wise average of the two density curves (the "middle" distribution)
M.d <- (x.d$y + y.d$y)/2

# spacing between two neighbouring grid points, used as the bin width
w <- x.d$x[2] - x.d$x[1]

# approximate probability of a value falling in the n-th bin
x.p <- x.d$y * w # (density height) * (bin width)
y.p <- y.d$y * w
M.p <- M.d * w



# Computes, for every combination of Factor1 and Factor2 values, the
# Jensen-Shannon measure on `stats1` between models M1 and M2 and between
# models M3 and M4.
#
# data.df: data frame with columns Model ("M1".."M4"), Factor1, Factor2 and
#          stats1.
# Returns a data frame with one row per (Factor1, Factor2) combination and
# the columns Factor1, Factor2, js.M1.M2 and js.M3.M4. A distance is NA when
# either model has fewer than two observations for that combination, because
# density() cannot estimate a bandwidth from fewer than two points.
#
# Requires js.dist() to be defined.
density_js.dist <- function(data.df) {
  # The density grid is shared by all subsets so the resulting probability
  # vectors are comparable bin by bin; hence the range of the whole column.
  stats1.min <- min(data.df$stats1)
  stats1.max <- max(data.df$stats1)

  # JS measure between two samples, both evaluated on the common grid.
  pair.js <- function(a, b) {
    if (length(a) < 2 || length(b) < 2) {
      return(NA_real_)
    }
    a.d <- density(a, n = 512, from = stats1.min, to = stats1.max)
    b.d <- density(b, n = 512, from = stats1.min, to = stats1.max)

    # density for the middle distribution between the two models
    middle.d <- (a.d$y + b.d$y) / 2

    # bin width of the evaluation grid
    w <- a.d$x[2] - a.d$x[1]

    # probability per bin = (density height) * (bin width)
    js.dist(a.d$y * w, b.d$y * w, middle.d * w)
  }

  # one result row per combination of the unique factor values
  result <- expand.grid(
    Factor1 = unique(data.df$Factor1),  # mutation rate
    Factor2 = unique(data.df$Factor2),  # rate of new copies
    KEEP.OUT.ATTRS = FALSE
  )
  result$js.M1.M2 <- NA_real_
  result$js.M3.M4 <- NA_real_

  for (i in seq_len(nrow(result))) {
    # compare against the data frame columns, not the unique-value vectors
    in.combo <- which(
      data.df$Factor1 == result$Factor1[i] &
        data.df$Factor2 == result$Factor2[i]
    )
    new.df <- data.df[in.combo, , drop = FALSE]

    # stats1 values of a single model within the current combination
    stat.of <- function(model) new.df$stats1[new.df$Model == model]

    result$js.M1.M2[i] <- pair.js(stat.of("M1"), stat.of("M2"))
    result$js.M3.M4[i] <- pair.js(stat.of("M3"), stat.of("M4"))
  }

  result
}


我曾尝试使用 apply 或 aggregate,但无法弄清楚如何把每个统计量的 min 和 max 作为参数传入,以便生成密度曲线。请注意,这里的 min 和 max 是针对因子与模型的所有组合(即整列数据)计算的,而不是针对每个子集分别计算的。也就是说,为了使各分布之间具有可比性,我不能按因子和模型对每个子集单独计算最小值和最大值。

我的数据实际上要复杂得多。我有 10 个不同的统计量,我想针对每个因子组合计算两个分布之间的 JS.dist。需要比较的两组分布分别是 M1 与 M2、M3 与 M4。上面的代码可以运行,但照这种写法我需要写 700 多行代码,我觉得这种做法并不高明。


1 个答案:

答案 0 :(得分:1)


# Jensen-Shannon measure between models M1/M2 and M3/M4 for every statistic
# column and every combination of Factor1 and Factor2 in `data.df`.
#
# Requires `data.df` (columns Model, Factor1, Factor2, stats1, stats2, ...)
# and js.dist() to be defined. The result `L` is a list with two entries per
# factor combination, in loop order: the M1-vs-M2 distances followed by the
# M3-vs-M4 distances. Each entry is a numeric vector with one value per
# statistic column (NA where a model has fewer than two observations, since
# density() needs at least two points).

# statistic columns, located by name instead of by position
stat.cols <- grep("^stats[0-9]+$", names(data.df), value = TRUE)

# gets the unique values for mutation rate
factor1 <- unique(data.df$Factor1)
# gets the unique values for rate of new copies
factor2 <- unique(data.df$Factor2)

# minimum and maximum of each statistic over the WHOLE data set, so that all
# densities of one statistic are evaluated on the same grid
vector.min <- lapply(data.df[stat.cols], min)
vector.max <- lapply(data.df[stat.cols], max)

# JS measure between two model subsets, one value per statistic column
pair.js <- function(a.df, b.df) {
  vapply(stat.cols, function(col) {
    a <- a.df[[col]]
    b <- b.df[[col]]
    if (length(a) < 2 || length(b) < 2) {
      return(NA_real_)
    }
    a.d <- density(a, n = 512, from = vector.min[[col]], to = vector.max[[col]])
    b.d <- density(b, n = 512, from = vector.min[[col]], to = vector.max[[col]])

    # density for the middle distribution between the two models
    middle.d <- (a.d$y + b.d$y) / 2

    # bin width of the evaluation grid
    w <- a.d$x[2] - a.d$x[1]

    # probability per bin = (density height) * (bin width)
    js.dist(a.d$y * w, b.d$y * w, middle.d * w)
  }, numeric(1))
}

# preallocate: two entries (M1 vs M2, M3 vs M4) per factor combination
L <- vector("list", 2 * length(factor1) * length(factor2))
k <- 0L

for (f1 in factor1) {
  for (f2 in factor2) {
    # compare against the data frame columns, not the unique-value vectors
    new.df <- subset(data.df, Factor1 == f1 & Factor2 == f2)

    # subsetting data. One data frame for each of the four models
    MM.df <- subset(new.df, Model == "M1")
    TM.df <- subset(new.df, Model == "M2")
    MI.df <- subset(new.df, Model == "M3")
    TI.df <- subset(new.df, Model == "M4")

    L[[k + 1L]] <- pair.js(MM.df, TM.df)
    L[[k + 2L]] <- pair.js(MI.df, TI.df)
    names(L)[k + 1:2] <- sprintf(
      "Factor1=%s, Factor2=%s, %s", f1, f2, c("M1 vs M2", "M3 vs M4")
    )
    k <- k + 2L
  }
}