我试图弄清楚如何计算每个变量的每个值的平均值,中位数和标准差。以下是一些数据(感谢@Barranka以易于复制的格式提供数据):
df <- data.frame(
gama=c(10, 1, 1, 1, 1, 1, 10, 0.1, 10),
theta=c(1, 1, 1, 1, 0.65, 1, 0.65, 1, 1),
detectl=c(3, 5, 1, 1, 5, 3, 5, 5, 1),
NSMOOTH=c(10, 5, 20, 20, 5, 20, 10, 10, 40),
NREF=c(50, 80, 80, 50, 80, 50, 10, 100, 30),
NOBS=c(10, 40, 40, 20, 20, 20, 10, 40, 10),
sma=c(15, 15, 15, 15, 15, 15, 15, 15, 15),
lma=c(33, 33, 33, 33, 33, 33, 33, 33, 33),
PosTrades=c(11, 7, 6, 3, 9, 3, 6, 6, 5),
NegTrades=c(2, 2, 1, 0, 1, 0, 1, 5, 1),
Acc=c(0.846154, 0.777778, 0.857143, 1, 0.9, 1, 0.857143, 0.545455, 0.833333),
AvgWin=c(0.0451529, 0.0676022, 0.0673241, 0.13204, 0.0412913, 0.126522, 0.0630061, 0.0689745, 0.0748437),
AvgLoss=c(-0.0194498, -0.0083954, -0.0174653, NaN, -0.00264179, NaN, -0.0161558, -0.013903, -0.0278908), Return=c(1.54942, 1.54916, 1.44823, 1.44716, 1.42789, 1.42581, 1.40993, 1.38605, 1.38401)
)
为了以后将其保存到csv中,我必须将其设置为应该像这样的数据框
Table for gama
Value Average Median Standard Deviation
10 (Avg of 10) (median of 10) (Stdev of 10)
1 (Avg of 1) (median of 1) (Stdev of 1)
0.1 (Avg of 0.1) (median of 0.1) (Stdev of 0.1)
Table for theta
Value Average Median Standard Deviation
1 (Avg of 10) (median of 10) (Stdev of 10)
0.65 (Avg of 0.65) (median of 0.65) (Stdev of 0.65)
Table for detectionsLimit
Value Average Median Standard Deviation
3 (Avg of 3) (median of 3) (Stdev of 3)
5 (Avg of 5) (median of 5) (Stdev of 5)
...
要用作ID的列是:
ids <- c("gama", "theta","detectl", "NSMOOTH", "NREF", "NOBS", "sma", "lma")
应在以下列中计算摘要统计信息:
vals <- c("PosTrades", "NegTrades", "Acc", "AvgWin", "AvgLoss", "Return")
我尝试过使用data.table
包/函数,但我无法弄清楚如何使用data.table
开发一种方法,而无需逐个重命名值;此外,在采用这种方法时,我的代码变得非常复杂。
答案 0 :(得分:2)
巧妙使用melt()
和tapply()
可以为您提供帮助。我做了以下假设:
gama, theta, ..., negTrades
)为了重现性,请输入:
# Your example data
df <- data.frame(
gama=c(10, 1, 1, 1, 1, 1, 10, 0.1, 10),
theta=c(1, 1, 1, 1, 0.65, 1, 0.65, 1, 1),
detectl=c(3, 5, 1, 1, 5, 3, 5, 5, 1),
NSMOOTH=c(10, 5, 20, 20, 5, 20, 10, 10, 40),
NREF=c(50, 80, 80, 50, 80, 50, 10, 100, 30),
NOBS=c(10, 40, 40, 20, 20, 20, 10, 40, 10),
sma=c(15, 15, 15, 15, 15, 15, 15, 15, 15),
lma=c(33, 33, 33, 33, 33, 33, 33, 33, 33),
PosTrades=c(11, 7, 6, 3, 9, 3, 6, 6, 5),
NegTrades=c(2, 2, 1, 0, 1, 0, 1, 5, 1),
Acc=c(0.846154, 0.777778, 0.857143, 1, 0.9, 1, 0.857143, 0.545455, 0.833333),
AvgWin=c(0.0451529, 0.0676022, 0.0673241, 0.13204, 0.0412913, 0.126522, 0.0630061, 0.0689745, 0.0748437),
AvgLoss=c(-0.0194498, -0.0083954, -0.0174653, NaN, -0.00264179, NaN, -0.0161558, -0.013903, -0.0278908), Return=c(1.54942, 1.54916, 1.44823, 1.44716, 1.42789, 1.42581, 1.40993, 1.38605, 1.38401)
)
这是我提出的解决方案:
library(reshape)
md <- melt(df, id=colnames(df)[1:10]) # This will create one row for each
# 'id' combination, and will store
# the rest of the column headers
# in the `variable` column, and
# each value corresponding to the
# variable. Like this:
head(md)
## gama theta detectl NSMOOTH NREF NOBS sma lma PosTrades NegTrades variable value
## 1 10 1.00 3 10 50 10 15 33 11 2 Acc 0.846154
## 2 1 1.00 5 5 80 40 15 33 7 2 ## Acc 0.777778
## 3 1 1.00 1 20 80 40 15 33 6 1 ## Acc 0.857143
## 4 1 1.00 1 20 50 20 15 33 3 0 ## Acc 1.000000
## 5 1 0.65 5 5 80 20 15 33 9 1 ## Acc 0.900000
## 6 1 1.00 3 20 50 20 15 33 3 0 ## Acc 1.000000
results <- list() # Prepare the results list
for(i in unique(md$variable)) { # For each variable you have...
results[[i]] <- list() # ... create a new list to hold the 'summary'
tmp_data <- subset(md, variable==i) # Filter the data you'll use
for(j in colnames(tmp_data)[1:10]) { # For each variable, use tapply()
# to get what you need, and
# store it into a data frame
# inside the results
results[[i]][[j]] <- as.data.frame(
t(
rbind(
tapply(tmp_data$value, tmp_data[,j], mean),
tapply(tmp_data$value, tmp_data[,j], median),
tapply(tmp_data$value, tmp_data[,j], sd))
)
)
colnames(results[[i]][[j]]) <- c('average', 'median', 'sd')
}
rm(tmp_data) # You'll no longer need this
}
现在怎样?查看results
的摘要:
summary(results)
## Length Class Mode
## Acc 10 -none- list
## AvgWin 10 -none- list
## AvgLoss 10 -none- list
## Return 10 -none- list
您有每个变量的列表。现在,如果您查看任何results
&#34;子列表&#34;的摘要,您会看到:
summary(results$Acc)
## Length Class Mode
## gama 3 data.frame list
## theta 3 data.frame list
## detectl 3 data.frame list
## NSMOOTH 3 data.frame list
## NREF 3 data.frame list
## NOBS 3 data.frame list
## sma 3 data.frame list
## lma 3 data.frame list
## PosTrades 3 data.frame list
## NegTrades 3 data.frame list
查看查看results$Acc$gama
列表时会发生什么:
results$Acc$gama
## average median sd
## 0.1 0.5454550 0.545455 NA
## 1 0.9069842 0.900000 0.09556548
## 10 0.8455433 0.846154 0.01191674
因此,对于每个变量和每个&#34; id&#34;列,您有所需的数据摘要。
希望这有帮助。
答案 1 :(得分:1)
我的方法涉及data.table
。
编辑:我试图提交问题的编辑,但我采取了一些自由,所以它可能会被拒绝。我假设哪些列将用作“id”列(其值为子集数据的列),哪些列应为“measure”列(其值用于计算汇总统计信息的列)。请参阅此处了解这些名称:
ids <- c("gama", "theta","detectl", "NSMOOTH", "NREF", "NOBS", "sma", "lma")
vals <- c("PosTrades", "NegTrades", "Acc", "AvgWin", "AvgLoss", "Return")
# Convert to data.table
df <- data.table(df)
# Helper function to convert a string to a call
# useful in a data.table j
s2c <- function (x, type = "list"){
as.call(lapply(c(type, x), as.symbol))
}
# Function to computer the desired summary stats
smry <- function(x) list(Average=mean(x, na.rm=T), Median=median(x, na.rm=T), StandardDeviation=sd(x, na.rm=T))
# Define some names to use later
ids <- c("gama", "theta","detectl", "NSMOOTH", "NREF", "NOBS", "sma", "lma")
vals <- c("PosTrades", "NegTrades", "Acc", "AvgWin", "AvgLoss", "Return")
usenames <- paste(rep(c("Average","Median","StdDev"),each=length(vals)), vals,sep="_")
data.table
# Compute the summary statistics
df2 <- df[,j={
for(i in 1:length(ids)){ # loop through each id
t.id <- ids[i]
t.out <- .SD[,j={
t.vals <- .SD[,eval(s2c(vals))] # this line returns a data.table with each vals as a column
sapply(t.vals, smry) # apply summary statistics
},by=t.id] # this by= loops through each value of the current id (t.id)
setnames(t.out, c("id.val", usenames)) # fix the names of the data.table to be returned for this i
t.out <- cbind(id=t.id, t.out) # add a column indicating the variable name (t.id)
if(i==1){big.out <- t.out}else{big.out <- rbind(big.out, t.out)} # accumulate the output data.table
}
big.out
}]
df2 <- data.table:::melt.data.table(df2, id.vars=c("id","id.val")) # melt into "long" format
df2[,c("val","metric"):=list(gsub(".*_","",variable),gsub("_.*","",variable))] # splice names to create id's
df2[,variable:=NULL] # delete old column that had the names we just split up
df2 <- data.table:::dcast.data.table(df2, id+id.val+val~metric) # go a bit wider, so stats in diff columns
# reshape2:::acast(df2, id+id.val~metric~val) # maybe replace the above line with this
id id.val val Average Median StdDev
1: NOBS 10 Acc 3.214550 0.01191674 0.006052701
2: NOBS 10 AvgLoss 1.000000 0.06300610 1.409930000
3: NOBS 10 AvgWin 1.333333 0.06100090 1.447786667
4: NOBS 10 NegTrades 6.000000 0.84615400 -0.019449800
5: NOBS 10 PosTrades 7.333333 0.84554333 -0.021165467
---
128: theta 1 AvgLoss 1.000000 0.06897450 1.447160000
129: theta 1 AvgWin 1.571429 0.08320849 1.455691429
130: theta 1 NegTrades 6.000000 0.84615400 -0.017465300
131: theta 1 PosTrades 5.857143 0.83712329 -0.017420860
132: theta 1 Return 1.718249 0.03285638 0.068957635