我想做的是计算三个值的平均值,不包括任何负值。也许有一种更简单的方法?
# Reproducible example data: 12 rows holding a `concentration` column and
# three replicate measurement columns (Replicate, Replicate.1, Replicate.2).
# The object is tagged with the "rowwise_df" class, so dplyr verbs applied to
# it are expected to operate one row at a time.
# All replicate values in this sample are positive.
df1 <- structure(list(concentration = c(0, 0.0867, 0.13, 0.195, 0.293,
0.439, 0.658, 0.988, 1.481, 2.222, 3.333, 5), Replicate = c(1.44558642857143,
1.15371058441558, 1.02689350649351, 0.868325194805193, 0.677496493506493,
0.526922597402598, 0.371443376623376, 0.252155129870129, 0.183662272727273,
0.122282922077922, 0.0892741558441554, 0.0637236363636363), Replicate.1 = c(1.41649441558442,
1.11617954545455, 1.00826512987013, 0.851684350649351, 0.677447077922078,
0.523192987012987, 0.368280584415585, 0.262413311688312, 0.175215584415585,
0.129054415584416, 0.092797987012987, 0.0627326623376624), Replicate.2 = c(1.35938512987013,
1.21117383116883, 1.01522181818182, 0.891895324675324, 0.695687207792208,
0.518078831168831, 0.361077272727272, 0.25113487012987, 0.167685064935065,
0.121838701298701, 0.0813138961038961, 0.0731186363636365)), class = c("rowwise_df",
"tbl_df", "tbl", "data.frame"), .Names = c("concentration", "Replicate",
"Replicate.1", "Replicate.2"), row.names = c(NA, 12L))
# Row-wise mean of three replicate columns, ignoring negative readings.
#
# Arguments:
#   df1  Data frame (or tibble) with columns `Replicate`, `Replicate.1` and
#        `Replicate.2`.
#
# Returns a row-wise tibble in which the replicate columns are renamed to
# `Replicate.1`, `Replicate.2`, `Replicate.3` and two columns are added:
#   tnegcount  number of negative replicate values in the row
#   averagev   mean of the non-negative replicate values in the row
#              (NaN when every replicate is negative, NA when a replicate is NA)
#
# The computation is made explicitly row-wise, so the result no longer depends
# on the caller having passed a `rowwise_df`. Only negative values are dropped:
# zeros always take part in the mean, whether or not the row also contains a
# negative value.
docv <- function(df1) {
  df1 %>%
    rename(
      Replicate.1 = Replicate,
      Replicate.2 = Replicate.1,
      Replicate.3 = Replicate.2
    ) %>%
    # Without this, sum()/mean() below would aggregate over the whole column.
    rowwise() %>%
    mutate(tnegcount = sum(c(Replicate.1, Replicate.2, Replicate.3) < 0)) %>%
    mutate(averagev = {
      vals <- c(Replicate.1, Replicate.2, Replicate.3)
      # With no negatives this is the plain mean, so one expression covers
      # both cases of the original case_when().
      mean(vals[vals >= 0])
    })
}
docv(df1)
答案 0 :(得分:0)
我对您的数据做了三处更改:
1. 加入了一些负值(原始示例中没有负值);
2. 去掉了 rowwise 类;
3. 增加了一个 rowid 字段,以便把汇总结果连接回主表的各列。(连接之后可以毫无问题地删除此列。如果您的真实数据已有内置的“id”字段,请直接使用它。)

我选择重塑/连接/重塑方法的原因是,它同样适用于重复列不恰好是 3 列的情况。另外,我在这里没有处理重命名,在管道之后再执行这一步很容易。
# Attach to every row of df1 the mean (`mu`) of its strictly positive
# replicate values. The replicate columns are stacked into long form, the
# non-positive readings are dropped, and the remainder is averaged per rowid.
# Rows without any positive reading are missing from the summary, so the left
# join leaves their `mu` as NA.
left_join(
  df1,
  df1 %>%
    tidyr::gather(key = repl, value = v, -rowid, -concentration) %>%
    filter(v > 0) %>%
    group_by(rowid) %>%
    summarise(mu = mean(v, na.rm = TRUE)),
  by = "rowid"
)
# # A tibble: 12 x 6
# concentration Replicate Replicate.1 Replicate.2 rowid mu
# <dbl> <dbl> <dbl> <dbl> <int> <dbl>
# 1 0 1.45 -1.42 1.36 1 1.40
# 2 0.0867 1.15 -1.12 -1.21 2 1.15
# 3 0.13 1.03 1.01 1.02 3 1.02
# 4 0.195 0.868 0.852 0.892 4 0.871
# 5 0.293 0.677 0.677 0.696 5 0.684
# 6 0.439 0.527 0.523 0.518 6 0.523
# 7 0.658 0.371 0.368 0.361 7 0.367
# 8 0.988 0.252 0.262 0.251 8 0.255
# 9 1.48 0.184 0.175 0.168 9 0.176
# 10 2.22 0.122 0.129 0.122 10 0.124
# 11 3.33 0.0893 0.0928 0.0813 11 0.0878
# 12 5 -0.0637 -0.0627 -0.0731 12 NA
数据:
# Sample data for this answer: the same 12 concentrations and three replicate
# columns as in the question, but with several replicate values made negative
# (rows 1, 2 and 12) so that the filtering has something to remove.
# The object still carries the "rowwise_df" class at this point.
df1 <- structure(list(concentration = c(0, 0.0867, 0.13, 0.195, 0.293, 0.439, 0.658, 0.988, 1.481, 2.222, 3.333, 5),
Replicate = c(1.44558642857143, 1.15371058441558, 1.02689350649351, 0.868325194805193, 0.677496493506493, 0.526922597402598, 0.371443376623376, 0.252155129870129, 0.183662272727273, 0.122282922077922, 0.0892741558441554, -0.0637236363636363),
Replicate.1 = c(-1.41649441558442, -1.11617954545455, 1.00826512987013, 0.851684350649351, 0.677447077922078, 0.523192987012987, 0.368280584415585, 0.262413311688312, 0.175215584415585, 0.129054415584416, 0.092797987012987, -0.0627326623376624),
Replicate.2 = c(1.35938512987013, -1.21117383116883, 1.01522181818182, 0.891895324675324, 0.695687207792208, 0.518078831168831, 0.361077272727272, 0.25113487012987, 0.167685064935065, 0.121838701298701, 0.0813138961038961, -0.0731186363636365)),
class = c("rowwise_df", "tbl_df", "tbl", "data.frame"),
.Names = c("concentration", "Replicate", "Replicate.1", "Replicate.2"),
row.names = c(NA, 12L))
# Drop the row-wise grouping, then number the rows so that per-row summaries
# can be joined back on `rowid`.
df1 <- df1 %>%
  ungroup() %>%
  mutate(rowid = row_number())
答案 1 :(得分:0)
您可以使用基础 R(base R):
# Sample data for this answer: the 12 concentrations from the question paired
# with three replicate columns whose values lie between -1 and 1, so every
# row mixes positive and negative readings (rows 2 and 10 are all negative).
df1 = structure(list(concentration = c(0, 0.0867, 0.13, 0.195, 0.293, 0.439, 0.658, 0.988, 1.481, 2.222, 3.333, 5),
Replicate = c(-0.4689826737158, -0.25575220072642, 0.145706726703793, 0.816415579989552, -0.596636137925088, 0.796779369935393, 0.889350537210703, 0.321595584973693, 0.258228087797761, -0.876427459064871, -0.588050850201398, -0.646886494942009),
Replicate.1 = c(0.374045693315566, -0.231792563572526, 0.539682839997113, -0.00460151582956314, 0.435237016528845, 0.983812189660966, -0.239929641131312, 0.554890442639589, 0.869410462211818, -0.575714957434684, 0.303347532171756, -0.748889808077365),
Replicate.2 = c(-0.465558662544936, -0.227771814912558, -0.973219333682209, -0.235224085859954, 0.73938169144094, -0.319302006624639, -0.0358397690579295, 0.199131650850177, -0.0129173859022558, -0.627564797177911, 0.654746637213975, 0.336933476384729)),
.Names = c("concentration", "Replicate", "Replicate.1", "Replicate.2"),
row.names = c(NA, 12L),
class = c("rowwise_df", "tbl_df", "tbl", "data.frame"))
# Row-wise mean of the strictly positive entries in columns 2 to 4.
# A row without any positive entry yields NaN.
df1$averageV <- apply(df1[, 2:4], 1, function(row_v) {
  positive_v <- row_v[row_v > 0]
  mean(positive_v)
})
这将产生以下结果:
concentration Replicate Replicate.1 Replicate.2 averageV
1 0.0000 -0.4689827 0.374045693 -0.46555866 0.3740457
2 0.0867 -0.2557522 -0.231792564 -0.22777181 NaN
3 0.1300 0.1457067 0.539682840 -0.97321933 0.3426948
4 0.1950 0.8164156 -0.004601516 -0.23522409 0.8164156
5 0.2930 -0.5966361 0.435237017 0.73938169 0.5873094
6 0.4390 0.7967794 0.983812190 -0.31930201 0.8902958
7 0.6580 0.8893505 -0.239929641 -0.03583977 0.8893505
8 0.9880 0.3215956 0.554890443 0.19913165 0.3585392
9 1.4810 0.2582281 0.869410462 -0.01291739 0.5638193
10 2.2220 -0.8764275 -0.575714957 -0.62756480 NaN
11 3.3330 -0.5880509 0.303347532 0.65474664 0.4790471
12 5.0000 -0.6468865 -0.748889808 0.33693348 0.3369335
答案 2 :(得分:0)
这是一个稍微复杂一些的版本,但提供了一些进一步的概括:
数据:
# Sample data for this answer: same values as in the question (all positive).
# NOTE: the `.Names` attribute passed to structure() sets the final column
# names, so the columns end up as concentration, Replicate, Replicate.1 and
# Replicate.2 -- not the Replicate.1 / .2 / .3 labels written inside list().
df1 <- structure(list(concentration = c(0, 0.0867, 0.13, 0.195, 0.293, 0.439, 0.658, 0.988, 1.481, 2.222, 3.333, 5),
Replicate.1 = c(1.44558642857143, 1.15371058441558, 1.02689350649351, 0.868325194805193,
0.677496493506493, 0.526922597402598, 0.371443376623376, 0.252155129870129,
0.183662272727273, 0.122282922077922, 0.0892741558441554, 0.0637236363636363),
Replicate.2 = c(1.41649441558442, 1.11617954545455, 1.00826512987013, 0.851684350649351,
0.677447077922078, 0.523192987012987, 0.368280584415585, 0.262413311688312,
0.175215584415585, 0.129054415584416, 0.092797987012987, 0.0627326623376624),
Replicate.3 = c(1.35938512987013, 1.21117383116883, 1.01522181818182, 0.891895324675324,
0.695687207792208, 0.518078831168831, 0.361077272727272, 0.25113487012987,
0.167685064935065, 0.121838701298701, 0.0813138961038961, 0.0731186363636365)),
class = c("rowwise_df", "tbl_df", "tbl", "data.frame"),
.Names = c("concentration", "Replicate", "Replicate.1", "Replicate.2"), row.names = c(NA, 12L))
### Append three extra rows that contain negative replicate values
extraRows_v <- matrix(
  c(5.1, -1, 5, 10,
    5.5, -3, -5, -2,
    6, 4, 3, -8),
  nrow = 3,
  byrow = TRUE,
  dimnames = list(NULL, colnames(df1))
)
df2 <- rbind(df1, extraRows_v)
### Prepend a non-numeric reference column (one label per row of df2)
random_v <- rep(c("A", "B", "C"), times = 5)
df3 <- cbind(Random = random_v, df2)
函数:
#' Calculate row-wise means of non-negative values
#'
#' @description For every row of a table, count the negative values in the
#'   calculation columns and average the remaining non-negative ones.
#'   `NA` values are neither counted as negative nor included in the mean.
#' @param data data.frame, tibble or matrix. Table of values.
#' @param refCol_v character vector - name(s) of reference/metadata column(s)
#'   that are not used in the mean. Names are matched exactly. They are only
#'   consulted when `calcCol_v` is `NA`.
#' @param calcCol_v vector (character or numeric).
#'   character - name(s) of the column(s) used in the mean;
#'   numeric - indices of the column(s) used in the mean.
#'   Default is `NA`, which uses every column not listed in `refCol_v`.
#' @param negCountName_v character - name of the added column holding the
#'   number of negative values in each row.
#' @param meanName_v character - name of the added column holding the mean of
#'   the non-negative values in each row (`NaN` when a row has none).
#' @param rename_v logical - rename the calculation columns: a trailing
#'   ".<number>" is increased by one, otherwise ".1" is appended.
#' @return A plain data.frame with the columns of `data` (types preserved)
#'   followed by the two added columns.
#' @export
meanPosOnly <- function(data, refCol_v, calcCol_v = NA, negCountName_v = "tnegcount", meanName_v = "averagev", rename_v = TRUE) {
  ## Work on a plain data.frame so every column keeps its own type
  out_df <- as.data.frame(data, stringsAsFactors = FALSE)
  inNames_v <- colnames(out_df)

  ## Get column indices
  if (is.null(calcCol_v) || is.na(calcCol_v[1])) {
    # Exact name matching: a regex here would also drop e.g. "x1" for "x"
    cols_v <- which(!(inNames_v %in% refCol_v))
  } else if (is.character(calcCol_v)) {
    cols_v <- which(inNames_v %in% calcCol_v)
  } else {
    cols_v <- calcCol_v
  } # fi

  ## Pull the calculation columns into a numeric matrix.
  ## Numeric columns are used as-is; going through apply() on the whole table
  ## would pass them through format() and lose precision as soon as the table
  ## also holds a character column.
  toNumeric <- function(col_v) {
    if (is.numeric(col_v)) as.numeric(col_v) else as.numeric(as.character(col_v))
  }
  calc_l <- lapply(out_df[cols_v], toNumeric)
  calc_m <- matrix(as.numeric(unlist(calc_l, use.names = FALSE)),
                   nrow = nrow(out_df), ncol = length(calc_l))

  ## Get result
  negCount_v <- rowSums(!is.na(calc_m) & calc_m < 0)
  mean_v <- vapply(seq_len(nrow(calc_m)), function(i) {
    row_v <- calc_m[i, ]
    mean(row_v[!is.na(row_v) & row_v >= 0])
  }, numeric(1))

  ## Append the two result columns after the existing ones and name them
  nIn_v <- ncol(out_df)
  out_df[[nIn_v + 1L]] <- negCount_v
  out_df[[nIn_v + 2L]] <- mean_v
  colnames(out_df)[nIn_v + 1:2] <- c(negCountName_v, meanName_v)

  ## Fix calc names
  if (rename_v) {
    bumpSuffix <- function(name_v) {
      parts_v <- regmatches(name_v, regexec("^(.*)\\.([0-9]+)$", name_v))[[1]]
      if (length(parts_v) == 3L) {
        paste0(parts_v[2], ".", as.numeric(parts_v[3]) + 1)
      } else {
        paste0(name_v, ".1")
      }
    }
    colnames(out_df)[cols_v] <- vapply(colnames(out_df)[cols_v], bumpSuffix,
                                       character(1), USE.NAMES = FALSE)
  } # fi

  ## Return
  out_df
} # meanPosOnly
用法:
### Default call: every column except the reference column is averaged
meanPosOnly(df1, refCol_v = "concentration")
meanPosOnly(df2, refCol_v = "concentration")
### Restrict the calculation to two named columns
meanPosOnly(
  df1,
  refCol_v = "concentration",
  calcCol_v = c("Replicate.1", "Replicate.2")
)
meanPosOnly(
  df2,
  refCol_v = "concentration",
  calcCol_v = c("Replicate.1", "Replicate.2")
)
### Keep the original column names
meanPosOnly(df2, refCol_v = "concentration", rename_v = FALSE)
### Two columns selected by index, original column names kept
meanPosOnly(
  df2,
  refCol_v = "concentration",
  calcCol_v = c(3, 4),
  rename_v = FALSE
)
### Table with an additional, non-numeric reference column
meanPosOnly(df3, refCol_v = c("concentration", "Random"))