使用tidyverse / dplyr使用观察值的直方图重新分配未观察到的值

时间:2017-12-01 10:03:03

标签: r dplyr tidyverse

在野外电镀过程中对鳗进行取样。大多数是测量的(批次= S), 有些不是(批次= L)。 我想重新分配L批次中丢失的鳗鱼,使用最近的10毫米值和 在单个“S”测量中观察到的尺寸结构。

eel <- structure(list(op = c(529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 529L, 
            529L, 529L, 529L, 529L, 529L, 529L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 545L, 
            545L, 545L), size = c(101L, 103L, 110L, 112L, 115L, 119L, 120L, 
            121L, 121L, 121L, 123L, 127L, 128L, 129L, 135L, 140L, 146L, 147L, 
            147L, 148L, 150L, 152L, 152L, 155L, 159L, 160L, 164L, 164L, 164L, 
            175L, 180L, 184L, 190L, 192L, 193L, 213L, 216L, 227L, 233L, 235L, 
            240L, 253L, 256L, 278L, 287L, 289L, 303L, 307L, 312L, 323L, 80L, 
            82L, 92L, 93L, 100L, 112L, 114L, 120L, 121L, 122L, 128L, 131L, 
            147L, 149L, 151L, 156L, 159L, 161L, 164L, 165L, 167L, 168L, 172L, 
            195L, 222L, 228L, 242L, 257L, 265L, 265L, 275L, 290L, 294L, 294L, 
            307L, 310L, 315L, 330L, 374L, 80L, 143L, 176L, 165L, 141L, 139L, 
            93L, 138L, 129L, 143L, 139L, 126L, 84L, 126L, 119L, 129L, 111L, 
            112L, 426L, 188L, 186L, 293L, 235L, 188L, 173L, 177L, 176L, 165L, 
            165L, 166L, 141L, 231L, 168L, 167L, 186L, 168L, 161L, 187L, 129L, 
            155L, 84L), batch = c("S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "L", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", "S", 
            "S", "S", "S", "S", "L"), number = c(0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 133L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
            0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 67L)), .Names = c("op", 
        "size", "batch", "number"), row.names = 4:133, class = "data.frame")

我尝试使用直方图进行tidyverse,我使用我的大小结构提取数据 以下函数(将确保所有观察值都在中断范围内)。我希望每10毫米放置一个新尺寸。

fn<-function(x) hist(x,
      breaks=seq(min(plyr::round_any(x, 10,f=floor)),plyr::round_any(max(x),10,f=ceiling),by=10),
      plot=FALSE)

然后我应用以下代码

hist <- eel%>% 
    filter(batch=='S') %>%
    select (size,op) %>% 
    group_by(op)  %>%
    by_slice(~fn(.x$size))  

我在.out列中有breakscounts的直方图,并使用我想要的那些 在我的数据框中创建新行。任何帮助将不胜感激。

1 个答案:

答案 0 :(得分:0)

我找到了一种方法,可能不是最好的,我使用browser参数来弄清楚细节。其中一个难点是重新分配的数字需要是整数,当根据每个类大小的百分比舍入数字时,会增加或丢失一些计数。所以我不得不将一些鳗鱼随机重新分配到大小结构中。为了加工容差,参数sample(1:nrow(df),rr)不起作用,我不得不围绕rr。请注意,我尝试使用函数mapmap2并且没有管理它,所以任何其他更简单的方法都会非常受欢迎。

group_sample <-    
    eel%>%     
    filter(batch=='L')%>%
    select (op,number)


individual_sample <- 
    eel%>% 
    filter(batch=='S') %>%
    select (size,op)%>%
    group_by(op)  %>%   
    by_slice(~fn(.x$size)) %>%
    rename(hist=.out)

reassigned_sample<- inner_join(individual_sample,group_sample,by=c("op"))  %>%
    by_row(..f=function(this_row){
          #browser()
          # frequencies
          vec <-this_row["hist"][[1]][[1]]$counts/sum(this_row["hist"][[1]][[1]]$counts)*pull(this_row["number"])
          # numbers are rounded, but there is a problem with sum
          roundvec <- round(vec)
          sumvec <- sum(vec)
          sumroundvec <- sum(roundvec)
          # difference between rounded numbers and numbers
          rr <- sumroundvec-sumvec
          # creation du jeu de données ressemblant au tableau de départ (moins id première colonne)
          df <- data.frame("op"=pull(this_row["op"]),
              "size"=this_row["hist"][[1]][[1]]$mids-5,
              "batch"="SL",
              "number"=roundvec
             )  
          # remove lines with 0 number
          df<-df[df$number>0,]
          if (rr >0) {
            # randomly removing eels from some samples
            # round(rr) necessary otherwise might not be exact integer
            sss <- sample(1:nrow(df),round(rr))
            df[sss,"number"]<-df[sss,"number"]-1
            # randomly adding eels for some samples
          } else if (rr <0){            
            sss<-sample(1:nrow(df),round(-rr))
            df[sss,"number"]<-df[sss,"number"]+1
          } else {
            # do nothing
          }
          stopifnot(round(sum(df$number))==round(sumvec))
          return(df)          
        }) %>%
    rename(table=.out)

bind_rows(reassigned_sample$table)


op size batch number
1  529   80    SL      3
2  529   90    SL      4
3  529  100    SL      4