假设我有:
# Reproducible example data: 10 rows x 6 columns of standard-normal draws.
set.seed(42)
d <- data.frame(replicate(6, rnorm(10)))
# Group label for each of the 6 columns of d.
col_labels <- c("a", "a", "b", "b", "c", "c")
# Group label for each of the 10 rows of d.
row_labels <- c(1, 1, 1, 2, 2, 3, 3, 4, 4, 4)
我现在想要计算与row_labels
和col_labels
的每个组合相对应的d
子集的平均值,即:
# Example for one combination: rows labelled 1, columns labelled "a".
s <- subset(d, row_labels == 1, select = col_labels == "a")
# Average over every cell of the subset (as.matrix flattens the columns).
s_mean <- mean(as.matrix(s))
最后,我想要一个数据帧,其中行对应row_labels
,列对应col_labels
,并且值为子集的平均值。如果没有大量的for循环,我该怎么做?
答案 0 :(得分:2)
尝试:
set.seed(42)
d <- data.frame(replicate(6, rnorm(10)))
# One row per (row label, column label) combination.
indx <- expand.grid(unique(row_labels), unique(col_labels))
# Iterate over row indices instead of apply(indx, 1, ...): apply() first
# converts the data frame to a character matrix, and that conversion
# format()s the numeric column to a common width, so labels such as 1 and 10
# would become " 1" and "10" and silently stop matching row_labels.
val1 <- vapply(seq_len(nrow(indx)), function(i) {
  cell <- subset(d, row_labels == indx[i, 1],
                 select = col_labels == as.character(indx[i, 2]))
  # Mean over every cell of the selected sub-block.
  mean(as.matrix(cell))
}, numeric(1))
val1
#[1] 0.56200717 0.15625521 0.43927374 -0.31929307 -0.01074557 0.75147423
#[7] -0.79730155 -0.86200887 0.28615306 -0.34058148 0.14431610 -0.15834522
或者
# Mean of all cells of d whose row label equals x and column label equals y.
# Reads d, row_labels and col_labels from the calling environment.
fun1 <- function(x, y) {
  cell <- subset(d, row_labels == x, select = col_labels == y)
  mean(as.matrix(cell))
}
# Apply fun1 to each (row label, column label) pair listed in indx.
mapply(fun1, indx[, 1], indx[, 2])
#[1] 0.56200717 0.15625521 0.43927374 -0.31929307 -0.01074557 0.75147423
#[7] -0.79730155 -0.86200887 0.28615306 -0.34058148 0.14431610 -0.15834522
或使用outer
# outer() hands FUN whole vectors of label pairs; Vectorize() makes the
# scalar fun1 accept them. Rows follow row labels, columns follow column labels.
outer(
  X = unique(row_labels),
  Y = unique(col_labels),
  FUN = Vectorize(fun1)
)
# [,1] [,2] [,3]
#[1,] 0.5620072 -0.01074557 0.2861531
#[2,] 0.1562552 0.75147423 -0.3405815
#[3,] 0.4392737 -0.79730155 0.1443161
#[4,] -0.3192931 -0.86200887 -0.1583452
用cbind合并indx
和val1
# Attach each mean to its (row label, column label) combination.
res <- cbind(indx, val1)
head(res, n = 3)
#Var1 Var2 val1
#1 1 a 0.5620072
#2 2 a 0.1562552
#3 3 a 0.4392737
# Spot-check the first two entries against a direct computation.
mean(as.matrix(subset(d, row_labels == 1, select = col_labels == "a")))
#[1] 0.5620072
mean(as.matrix(subset(d, row_labels == 2, select = col_labels == "a")))
#[1] 0.1562552
您也可以更改格式
# Same matrix of means as the outer() call, now labelled on both axes.
res1 <- outer(unique(row_labels), unique(col_labels), Vectorize(fun1))
rownames(res1) <- unique(row_labels)
colnames(res1) <- unique(col_labels)
res1
# a b c
#1 0.5620072 -0.01074557 0.2861531
#2 0.1562552 0.75147423 -0.3405815
#3 0.4392737 -0.79730155 0.1443161
#4 -0.3192931 -0.86200887 -0.1583452
或者您可以使用reshape2
library(reshape2)
# Cast the long table res to a matrix: Var1 (row labels) down the rows,
# Var2 (column labels) across the columns, filled with val1.
acast(data = res, formula = Var1 ~ Var2, value.var = "val1")
# a b c
#1 0.5620072 -0.01074557 0.2861531
#2 0.1562552 0.75147423 -0.3405815
#3 0.4392737 -0.79730155 0.1443161
#4 -0.3192931 -0.86200887 -0.1583452
答案 1 :(得分:2)
这是另一种选择:
# Split d column-wise into one data frame per column label.
cols_by_label <- split.default(d, col_labels)
# Within each column group, average all cells for every row label.
res <- lapply(cols_by_label, function(block) {
  by(block, list(row_labels), function(x) mean(unlist(x)))
})
# Stack the per-column-label results: one row per column label.
do.call(rbind, res)
# 1 2 3 4
# a 0.56201 0.1563 0.4393 -0.3193
# b -0.01075 0.7515 -0.7973 -0.8620
# c 0.28615 -0.3406 0.1443 -0.1583
答案 2 :(得分:1)
您需要将数据更改为长格式。您应该考虑为什么以这种格式导入数据,以及更好的清理方式。
首先,设置列名
# Use the group labels as d's column names (duplicates are kept as given).
names(d) <- col_labels
其次,你不能有重复的rownames,所以你不能简单地做rownames(d) <- row_labels。
相反,我们将不得不以另一种方式将它们分开。你可以用split(d, row_labels)。
现在我们要把它全部变成长格式。包reshape2中的melt函数通常用于此。
# library() stops with an error if reshape2 is missing; require() would only
# return FALSE and let the melt() call fail later with a less clear message.
library(reshape2)
# Split d into one data frame per row label, then melt the resulting list
# into long format (the aggregate step reads the group from dMelt$L1).
dMelt <- melt(split(d, row_labels))
现在看看dMelt。有没有理由你不能以这种方式组织数据?
要查找子集均值,请使用函数aggregate()
# Mean of value for every (variable, L1) pair of the long-format data.
aggregate(
  x = dMelt$value,
  by = list(dMelt$variable, dMelt$L1),
  FUN = mean
)
答案 3 :(得分:1)
这是使用data.table
的选项。它应该非常快并且没有任何循环
library(data.table)
library(reshape2)
set.seed(42)
# Join a lookup table (column name -> column label x) with the long-format
# data on `variable`, then average `value` within each (row, x) group.
# NOTE(review): setDT(d)[, row := row_labels] modifies d by reference (adds a
# `row` column). The first argument calls colnames(d), so this appears to rely
# on that argument being evaluated before the modification — confirm before
# reordering or re-running on the same d.
merge(
setkey(data.table(variable=colnames(d),x=col_labels),variable),
setkey(melt(setDT(d)[,row:=row_labels,],id.vars="row"),variable))[
,mean(value),c("row","x")]
row x V1
1: 1 a 0.56200717
2: 2 a 0.15625521
3: 3 a 0.43927374
4: 4 a -0.31929307
5: 1 b -0.01074557
6: 2 b 0.75147423
7: 3 b -0.79730155
8: 4 b -0.86200887
9: 1 c 0.28615306
10: 2 c -0.34058148
11: 3 c 0.14431610
12: 4 c -0.15834522
这个想法是: