我正在尝试在带有正则表达式的某些列上使用ddply,但我无法使用它。我在下面准备了一个小例子。有没有办法在几个变量上使用ddply,或者我只是错过了手册中的内容?
# Example data: two "low" columns, two "high" columns and a grouping column N.
df <- data.frame(low_1=rnorm(5),low_2=rnorm(5),high_1=rnorm(5),high_2=rnorm(5),N=c(1,2,3,4,5))
# Attempt that does not work: "low.." and "high.." are plain character strings,
# so mean() is applied to a string literal rather than to the matching columns.
ddply(df,.(N), summarise, low=mean("low.."), high=mean("high.."))
答案 0 :(得分:1)
您可以使用 colwise 在多列上计算相同的统计信息,例如:
# For each level of N, apply mean() column by column.
ddply(.data = df, .variables = .(N), .fun = colwise(mean))
N low_1 low_2 high_1 high_2
1 1 -1.3105923 -0.5507862 0.6304232 -0.04553457
2 2 -0.1586676 0.6820199 -0.8220206 0.93301381
3 3 0.4434761 0.4337073 -1.2988521 0.84412693
4 4 0.2522467 -0.1393690 0.2361361 1.64288051
5 5 0.4118032 0.4358705 -0.3529169 0.98916518
要在列名上使用正则表达式,您可以执行以下操作:
1. 使用带正则表达式的 grep() 来标识您感兴趣的所有列。
2. 确定分组列(此处为 N)的位置。
3. 对数据的子集调用 ddply,其中子集仅包含步骤1和2中标识的列。试试这个:
# Positions of the columns whose names match the regular expression "low".
idx <- grep("low", names(df))
# Position of the grouping column N.
idk <- which(names(df) == "N")
# Summarise only the selected columns. drop = FALSE keeps the subset a
# data frame even when the selection ends up being a single column.
ddply(df[, c(idx, idk), drop = FALSE], .(N), colwise(mean))
N low_1 low_2
1 1 -1.3105923 -0.5507862
2 2 -0.1586676 0.6820199
3 3 0.4434761 0.4337073
4 4 0.2522467 -0.1393690
5 5 0.4118032 0.4358705
答案 1 :(得分:0)
就目前而言,您需要为您正在计算的每个统计信息传递不同的参数。
# One summary expression per output column, evaluated within each level of N.
ddply(
  .data = df,
  .variables = .(N),
  .fun = summarise,
  low_1 = mean(low_1),
  low_2 = mean(low_2),
  high_1 = mean(high_1),
  high_2 = mean(high_2)
)
计算这个的惯用方法是在计算统计数据之前将数据重新整形为长格式。
library(plyr)
library(reshape2)
library(stringr)
# Reshape to long format, keeping N as the identifier: the original column
# names end up in `variable` and their values in `value`.
df_long <- melt(df, id.vars = "N")
# Split each column name into its prefix (low/high) and its numeric suffix.
# `+` accepts multi-digit suffixes such as "low_10" (a single [[:digit:]]
# would capture only the first digit), and the anchors require the whole
# name to match.
matches <- str_match(
  df_long$variable,
  "^(low|high)_([[:digit:]]+)$"
)
# Column 2 of `matches` is the first capture group, column 3 the second.
df_long <- within(
  df_long,
  {
    height <- matches[, 2]
    group <- as.integer(matches[, 3])
  }
)
# Mean of `value` for every combination of N, height and group.
ddply(
  df_long,
  .(N, height, group),
  summarize,
  mean_value = mean(value)
)
如果您愿意,可以使用 mutate 代替 within,并且可以用现代的 dplyr 语法替换对 ddply 的调用。
# dplyr equivalent of the ddply() call. The verbs are namespace-qualified
# because plyr is attached in this example and also provides summarize();
# an unqualified call would use whichever package was attached last.
df_long %>%
  dplyr::group_by(N, height, group) %>%
  dplyr::summarize(mean_value = mean(value)) %>%
  # Drop the remaining grouping so later steps see a plain table.
  dplyr::ungroup()
答案 2 :(得分:0)
您可以执行以下操作:
# Average all "low_*" columns and all "high_*" columns within each level of N.
# grep(..., value = TRUE) returns the matching column names; get() then looks
# each name up so that mean() sees the values rather than the names.
# unlist(lapply()) always yields a flat vector, whatever the group size.
ddply(
  df, .(N), summarise,
  low = mean(unlist(lapply(
    grep("^low_", colnames(df), value = TRUE),
    function(x) get(x)
  ))),
  high = mean(unlist(lapply(
    grep("^high_", colnames(df), value = TRUE),
    function(x) get(x)
  )))
)
给出了这个输出:
N low high
1 1 0.94613752 1.47197645
2 2 -0.68887596 -0.05779876
3 3 -0.28589753 -0.55694341
4 4 -0.01378869 0.28204629
5 5 -0.08681600 0.88544497
数据:
> dput(df)
structure(list(low_1 = c(0.885675347945903, -1.30343272566325, -2.44201300062675, -1.27709377574332, -0.794159839824383),
low_2 = c(1.00659968581264,-0.0743191876393787, 1.87021794472605, 1.24951638739919, 0.620527846366092),
high_1 = c(0.630374573470948, 0.169009703225843, -0.573629421621814, 0.340752780334754, 0.417022085050569),
high_2 = c(2.31357832822303,-0.284607218026423, -0.540257400090053, 0.223339795927736, 1.35386785598766),
N = c(1, 2, 3, 4, 5)),
.Names = c("low_1", "low_2", "high_1", "high_2", "N"),
row.names = c(NA, -5L), class = "data.frame")
答案 3 :(得分:0)
这是dplyr和tidyr的一种方法,我认为会产生所需的输出:
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(dplyr) # if not yet installed, first run: install.packages("dplyr")
library(tidyr) # if not yet installed, first run: install.packages("tidyr")

# Reshape the data to long format: one row per (N, original column).
gather(df, group, val, -N) %>%
  # Strip the trailing "_<digits>" so low_x / high_x become "low" / "high".
  # The pattern no longer starts with a dangling "*" quantifier.
  mutate(group = sub("_\\d+$", "", group)) %>%
  # Group the data by N and by group (low/high).
  group_by(N, group) %>%
  # Apply the mean within each group.
  summarise(val = mean(val)) %>%
  # Remove the grouping before reshaping.
  ungroup() %>%
  # Reshape to wide format so that low and high are separate columns.
  spread(group, val)
#Source: local data frame [5 x 3]
#
# N high low
#1 1 0.29702057 0.15541153
#2 2 -1.02057669 1.09399446
#3 3 0.20745563 0.11582517
#4 4 -0.05573833 -0.22570064
#5 5 0.61697307 -0.06831203
它适用于任意数量的low_X和high_X列。
注意:请确保在 plyr 之后加载 dplyr,以避免函数名称冲突。
# Reproducible example data: fix the RNG seed, then draw the four value
# columns in the same order as before and attach the grouping column N.
set.seed(4711)
df <- data.frame(
  low_1 = rnorm(5),
  low_2 = rnorm(5),
  high_1 = rnorm(5),
  high_2 = rnorm(5),
  N = c(1, 2, 3, 4, 5)
)