有没有一种更简单的方法,在计算均值的同时剔除负值?

时间:2018-10-24 16:27:36

标签: r dplyr

我想做的是计算三个值的平均值,不包括任何负值。也许有一种更简单的方法?

#repro eg.
# Example data: one concentration column and three replicate columns, 12 rows.
# The class vector is set by hand and includes "rowwise_df" and "tbl_df".
df1 <- structure(
  list(
    concentration = c(
      0, 0.0867, 0.13, 0.195, 0.293, 0.439,
      0.658, 0.988, 1.481, 2.222, 3.333, 5
    ),
    Replicate = c(
      1.44558642857143, 1.15371058441558, 1.02689350649351,
      0.868325194805193, 0.677496493506493, 0.526922597402598,
      0.371443376623376, 0.252155129870129, 0.183662272727273,
      0.122282922077922, 0.0892741558441554, 0.0637236363636363
    ),
    Replicate.1 = c(
      1.41649441558442, 1.11617954545455, 1.00826512987013,
      0.851684350649351, 0.677447077922078, 0.523192987012987,
      0.368280584415585, 0.262413311688312, 0.175215584415585,
      0.129054415584416, 0.092797987012987, 0.0627326623376624
    ),
    Replicate.2 = c(
      1.35938512987013, 1.21117383116883, 1.01522181818182,
      0.891895324675324, 0.695687207792208, 0.518078831168831,
      0.361077272727272, 0.25113487012987, 0.167685064935065,
      0.121838701298701, 0.0813138961038961, 0.0731186363636365
    )
  ),
  class = c("rowwise_df", "tbl_df", "tbl", "data.frame"),
  .Names = c("concentration", "Replicate", "Replicate.1", "Replicate.2"),
  row.names = c(NA, 12L)
)
#' Count negative replicates per row and average the non-negative ones
#'
#' Requires dplyr to be attached (uses `%>%`, `rename()`, `rowwise()` and
#' `mutate()`).
#'
#' @param df1 A data frame with columns `Replicate`, `Replicate.1` and
#'   `Replicate.2`.
#' @return `df1` as a row-wise data frame with the replicate columns renamed
#'   to `Replicate.1`, `Replicate.2`, `Replicate.3` and two added columns:
#'   `tnegcount` (number of negative replicates in the row) and `averagev`
#'   (mean of the non-negative replicates; `NaN` when all are negative).
docv <- function(df1) {
  # Mean of the non-negative entries among one row's replicate values.
  # Zero is kept in every case, so rows with and without negatives are
  # treated consistently.
  mean_non_negative <- function(...) {
    values <- c(...)
    mean(values[values >= 0])
  }

  df1 %>%
    rename(
      Replicate.1 = Replicate,
      Replicate.2 = Replicate.1,
      Replicate.3 = Replicate.2
    ) %>%
    # Force row-by-row evaluation instead of relying on the caller having
    # passed a row-wise data frame; otherwise sum()/mean() would collapse
    # whole columns into a single value.
    rowwise() %>%
    mutate(
      tnegcount = sum(c(Replicate.1, Replicate.2, Replicate.3) < 0),
      averagev = mean_non_negative(Replicate.1, Replicate.2, Replicate.3)
    )
}

# Run docv() on the example data frame df1 built above.
docv(df1)

3 个答案:

答案 0 :(得分:0)

我通过三种方式更改您的数据:

  • 我要删除其上的rowwise类;
  • 我要引入一些负值来测试您的问题;以及
  • 我要添加一个 rowid 字段,以便将汇总数据合并回主表。(联接之后可以毫无问题地删除此列。如果您的真实数据具有内置的“id”字段,请随意使用它。)

我之所以选择“重塑/连接/重塑”的方法,是为了应对数据不恰好是 3 列的情况。另外,我在这里没有处理重命名,在管道之后再执行此操作很容易。

# Per-row mean of the strictly positive replicate values, joined back to df1.
# Expects df1 to carry the `rowid` column added in the data section below.
# Steps: reshape the replicate columns to long form, drop values <= 0,
# average per rowid, then left-join so every original row is kept; a row
# with no positive replicate has no summary row and therefore gets mu = NA.
# pivot_longer() replaces the superseded gather() (needs tidyr >= 1.0.0).
df1 %>%
  tidyr::pivot_longer(
    cols = -c(rowid, concentration),
    names_to = "repl",
    values_to = "v"
  ) %>%
  filter(v > 0) %>%
  group_by(rowid) %>%
  summarize(mu = mean(v, na.rm = TRUE)) %>%
  left_join(df1, ., by = "rowid")
# # A tibble: 12 x 6
#    concentration Replicate Replicate.1 Replicate.2 rowid      mu
#            <dbl>     <dbl>       <dbl>       <dbl> <int>   <dbl>
#  1        0         1.45       -1.42        1.36       1  1.40  
#  2        0.0867    1.15       -1.12       -1.21       2  1.15  
#  3        0.13      1.03        1.01        1.02       3  1.02  
#  4        0.195     0.868       0.852       0.892      4  0.871 
#  5        0.293     0.677       0.677       0.696      5  0.684 
#  6        0.439     0.527       0.523       0.518      6  0.523 
#  7        0.658     0.371       0.368       0.361      7  0.367 
#  8        0.988     0.252       0.262       0.251      8  0.255 
#  9        1.48      0.184       0.175       0.168      9  0.176 
# 10        2.22      0.122       0.129       0.122     10  0.124 
# 11        3.33      0.0893      0.0928      0.0813    11  0.0878
# 12        5        -0.0637     -0.0627     -0.0731    12 NA     

数据:

# Same layout as the question's data, with some replicate values made
# negative so the filtering can be exercised.
df1 <- structure(
  list(
    concentration = c(
      0, 0.0867, 0.13, 0.195, 0.293, 0.439,
      0.658, 0.988, 1.481, 2.222, 3.333, 5
    ),
    Replicate = c(
      1.44558642857143, 1.15371058441558, 1.02689350649351,
      0.868325194805193, 0.677496493506493, 0.526922597402598,
      0.371443376623376, 0.252155129870129, 0.183662272727273,
      0.122282922077922, 0.0892741558441554, -0.0637236363636363
    ),
    Replicate.1 = c(
      -1.41649441558442, -1.11617954545455, 1.00826512987013,
      0.851684350649351, 0.677447077922078, 0.523192987012987,
      0.368280584415585, 0.262413311688312, 0.175215584415585,
      0.129054415584416, 0.092797987012987, -0.0627326623376624
    ),
    Replicate.2 = c(
      1.35938512987013, -1.21117383116883, 1.01522181818182,
      0.891895324675324, 0.695687207792208, 0.518078831168831,
      0.361077272727272, 0.25113487012987, 0.167685064935065,
      0.121838701298701, 0.0813138961038961, -0.0731186363636365
    )
  ),
  class = c("rowwise_df", "tbl_df", "tbl", "data.frame"),
  .Names = c("concentration", "Replicate", "Replicate.1", "Replicate.2"),
  row.names = c(NA, 12L)
)


# Drop the row-wise class, then number the rows so that per-row summaries
# can be joined back on `rowid`.
df1 <- mutate(ungroup(df1), rowid = row_number())

答案 1 :(得分:0)

您可以使用基础 R(base R):

# Example data with a mix of positive and negative replicate values.
df1 <- structure(
  list(
    concentration = c(
      0, 0.0867, 0.13, 0.195, 0.293, 0.439,
      0.658, 0.988, 1.481, 2.222, 3.333, 5
    ),
    Replicate = c(
      -0.4689826737158, -0.25575220072642, 0.145706726703793,
      0.816415579989552, -0.596636137925088, 0.796779369935393,
      0.889350537210703, 0.321595584973693, 0.258228087797761,
      -0.876427459064871, -0.588050850201398, -0.646886494942009
    ),
    Replicate.1 = c(
      0.374045693315566, -0.231792563572526, 0.539682839997113,
      -0.00460151582956314, 0.435237016528845, 0.983812189660966,
      -0.239929641131312, 0.554890442639589, 0.869410462211818,
      -0.575714957434684, 0.303347532171756, -0.748889808077365
    ),
    Replicate.2 = c(
      -0.465558662544936, -0.227771814912558, -0.973219333682209,
      -0.235224085859954, 0.73938169144094, -0.319302006624639,
      -0.0358397690579295, 0.199131650850177, -0.0129173859022558,
      -0.627564797177911, 0.654746637213975, 0.336933476384729
    )
  ),
  .Names = c("concentration", "Replicate", "Replicate.1", "Replicate.2"),
  row.names = c(NA, 12L),
  class = c("rowwise_df", "tbl_df", "tbl", "data.frame")
)

# Row-wise mean of the strictly positive values in columns 2 to 4.
# A row without any positive value yields NaN (the mean of an empty vector).
df1$averageV <- apply(
  df1[, 2:4],
  MARGIN = 1,
  FUN = function(replicates) {
    is_positive <- replicates > 0
    mean(replicates[is_positive])
  }
)

这将产生以下结果:

   concentration  Replicate  Replicate.1 Replicate.2  averageV
1         0.0000 -0.4689827  0.374045693 -0.46555866 0.3740457
2         0.0867 -0.2557522 -0.231792564 -0.22777181       NaN
3         0.1300  0.1457067  0.539682840 -0.97321933 0.3426948
4         0.1950  0.8164156 -0.004601516 -0.23522409 0.8164156
5         0.2930 -0.5966361  0.435237017  0.73938169 0.5873094
6         0.4390  0.7967794  0.983812190 -0.31930201 0.8902958
7         0.6580  0.8893505 -0.239929641 -0.03583977 0.8893505
8         0.9880  0.3215956  0.554890443  0.19913165 0.3585392
9         1.4810  0.2582281  0.869410462 -0.01291739 0.5638193
10        2.2220 -0.8764275 -0.575714957 -0.62756480       NaN
11        3.3330 -0.5880509  0.303347532  0.65474664 0.4790471
12        5.0000 -0.6468865 -0.748889808  0.33693348 0.3369335

答案 2 :(得分:0)

这是一个稍微复杂一些的版本,但它更加通用:

  1. 您可以有多个参考列。
  2. 您可以有任意数量的列来取平均值。
  3. 您可以指定要添加的新列的名称。
  4. 您可以选择重命名用于取平均值的列。

数据:

# Example data without negative values.
# NOTE: `.Names` below replaces the names written inside list(), so the
# resulting columns are concentration, Replicate, Replicate.1, Replicate.2.
df1 <- structure(
  list(
    concentration = c(
      0, 0.0867, 0.13, 0.195, 0.293, 0.439,
      0.658, 0.988, 1.481, 2.222, 3.333, 5
    ),
    Replicate.1 = c(
      1.44558642857143, 1.15371058441558, 1.02689350649351,
      0.868325194805193, 0.677496493506493, 0.526922597402598,
      0.371443376623376, 0.252155129870129, 0.183662272727273,
      0.122282922077922, 0.0892741558441554, 0.0637236363636363
    ),
    Replicate.2 = c(
      1.41649441558442, 1.11617954545455, 1.00826512987013,
      0.851684350649351, 0.677447077922078, 0.523192987012987,
      0.368280584415585, 0.262413311688312, 0.175215584415585,
      0.129054415584416, 0.092797987012987, 0.0627326623376624
    ),
    Replicate.3 = c(
      1.35938512987013, 1.21117383116883, 1.01522181818182,
      0.891895324675324, 0.695687207792208, 0.518078831168831,
      0.361077272727272, 0.25113487012987, 0.167685064935065,
      0.121838701298701, 0.0813138961038961, 0.0731186363636365
    )
  ),
  class = c("rowwise_df", "tbl_df", "tbl", "data.frame"),
  .Names = c("concentration", "Replicate", "Replicate.1", "Replicate.2"),
  row.names = c(NA, 12L)
)

### Append three rows that contain negative replicate values
extraRows_v <- matrix(
  c(
    5.1, -1,  5, 10,
    5.5, -3, -5, -2,
    6,    4,  3, -8
  ),
  nrow = 3,
  byrow = TRUE,
  dimnames = list(NULL, colnames(df1))
)
df2 <- rbind(df1, extraRows_v)

### Prepend a second, non-numeric reference column (one label per row)
random_v <- rep(c("A", "B", "C"), times = 5)
df3 <- cbind(Random = random_v, df2)

函数:

#' Calculate row means of the non-negative values
#'
#' For every row of `data`, counts the negative values found in the
#' calculation columns and averages the remaining non-negative ones.
#'
#' @param data data.frame, matrix, etc. Table of values.
#' @param refCol_v character vector - name(s) of reference/metadata column(s)
#'   that are not used in taking the mean. Names are matched exactly against
#'   the column names (not as a regular expression).
#' @param calcCol_v vector (character or numeric) -
#'   character: name(s) of the column(s) used in taking the mean;
#'   numeric: column indices of the column(s) used in taking the mean.
#'   The default `NA` (or `NULL`) uses all columns not in `refCol_v`.
#' @param negCountName_v character - name of the added column that tallies
#'   the number of negative values in each row (`NA` is not counted).
#' @param meanName_v character - name of the added column that holds the mean
#'   of all non-negative values in each row (`NaN` when there are none).
#' @param rename_v logical - rename the calc columns: a trailing ".<n>" is
#'   bumped to ".<n+1>", and names without a numeric suffix get ".1".
#' @return data.frame with the columns of `data` (types preserved) followed
#'   by the two extra columns named by `negCountName_v` and `meanName_v`.
#' @export
meanPosOnly <- function(data, refCol_v, calcCol_v = NA,
                        negCountName_v = "tnegcount",
                        meanName_v = "averagev", rename_v = TRUE) {
  ## Coerce one column to double; numeric input is never sent through text,
  ## because a number -> string -> number round trip drops digits
  toNumber <- function(col_v) {
    if (is.numeric(col_v)) {
      as.numeric(col_v)
    } else {
      as.numeric(as.character(col_v))
    }
  }

  ## "name" -> "name.1", "name.<n>" -> "name.<n+1>"
  bumpSuffix <- function(names_v) {
    hasNum_v <- grepl("\\.[0-9]+$", names_v)
    new_v <- paste0(names_v, ".1")
    stem_v <- sub("\\.[0-9]+$", "", names_v[hasNum_v])
    next_v <- as.integer(sub("^.*\\.", "", names_v[hasNum_v])) + 1L
    new_v[hasNum_v] <- paste0(stem_v, ".", next_v)
    new_v
  }

  ## Work on a plain data.frame so every column keeps its own type
  out_df <- as.data.frame(data, stringsAsFactors = FALSE)
  colNames_v <- colnames(out_df)

  ## Get column indices
  if (is.null(calcCol_v) || is.na(calcCol_v[1])) {
    cols_v <- which(!(colNames_v %in% refCol_v))
  } else if (is.character(calcCol_v)) {
    cols_v <- which(colNames_v %in% calcCol_v)
  } else {
    cols_v <- calcCol_v
  }

  ## Numeric matrix holding only the calc columns (one row per data row)
  calc_m <- matrix(
    as.numeric(unlist(lapply(out_df[cols_v], toNumber), use.names = FALSE)),
    nrow = nrow(out_df),
    ncol = length(cols_v)
  )

  ## Tally negatives, then blank everything that must not enter the mean
  isNeg_m <- !is.na(calc_m) & calc_m < 0
  isKept_m <- !is.na(calc_m) & calc_m >= 0
  negCount_v <- rowSums(isNeg_m)
  calc_m[!isKept_m] <- NA
  ## rowMeans() gives NaN for rows without any kept value, like mean(numeric(0))
  mean_v <- rowMeans(calc_m, na.rm = TRUE)

  ## Append the two result columns under the requested names
  extra_df <- data.frame(negCount_v, mean_v)
  colnames(extra_df) <- c(negCountName_v, meanName_v)
  out_df <- cbind(out_df, extra_df)

  ## Fix calc names
  if (rename_v) {
    colnames(out_df)[cols_v] <- bumpSuffix(colnames(out_df)[cols_v])
  }

  out_df
} # meanPosOnly

用法:

### Default: every non-reference column enters the mean
meanPosOnly(df1, refCol_v = "concentration")
meanPosOnly(df2, refCol_v = "concentration")

### Restrict the mean to two columns selected by name
meanPosOnly(
  df1,
  refCol_v = "concentration",
  calcCol_v = c("Replicate.1", "Replicate.2")
)
meanPosOnly(
  df2,
  refCol_v = "concentration",
  calcCol_v = c("Replicate.1", "Replicate.2")
)

### Keep the original column names
meanPosOnly(df2, refCol_v = "concentration", rename_v = FALSE)

### Two columns selected by index, original names kept
meanPosOnly(
  df2,
  refCol_v = "concentration",
  calcCol_v = c(3, 4),
  rename_v = FALSE
)

### More than one reference column
meanPosOnly(df3, refCol_v = c("concentration", "Random"))