我有名为 data2、data3、data4、data7、data11、data12 和 data13 的数据集。对于每个数据集,我需要计算错误率:error_rate_Vk = (Vk.actual - Vk.predicted) / Vk.actual,其中 k 是数据集的编号(2、3、4……)。也就是说,参与计算的列会随着每个数据集所预测的列而变化。
例如data2看起来像:
V2.actual predicted V3 V4 V5 V6 V7 V11 V12 V13
639 1294.704556 53817 13 1 5 39316 13 104 104
2767 2724.884429 5131 3 1 5 39311 2 22 22
673 683.8030988 11332 7 2 5 39315 14 140 80
3100 2556.14175 442 8 3 5 39317 1 0 6
3015 2115.371589 8143 3 1 5 39323 1 6 6
V2 V3.predicted V3.actual V4 V5 V6 V7 V11 V12 V13
4338 3410.386101 1516 4 1 5 39315 3 18 18
726 2654.803413 442 8 3 5 39317 2 0 12
730 762.412623 12617 16 5 5 39314 2 0 7
755 1653.438693 6722 1 1 5 39322 2 12 12
673 701.7884088 15572 10 3 5 39315 8 0 16
等等其他数据集:
我可以用下面的代码对每个数据集分别应用同一个函数,但不知道如何为每个数据集动态地计算对应的错误率:
Error_Rate=lapply(mget(paste0("data",c(2:4,7,11:13))), transform,
Error_Rate= ?
有人可以帮忙吗?提前谢谢。
答案 0 :(得分:0)
我创建了一个可能的解决方案。因为我不知道数据集是以数据帧还是矩阵的形式提供的,所以我提供了两种可能性的解决方案。
首先是矩阵解决方案:
#### Example with Matrix #####
# Plain feature columns shared by the toy datasets
V2 <- c(5, 8, 6, 2, 3, 9)
V3 <- c(8, 8, 1, 15, 48, 58)
V4 <- c(7, 8, 9, 4, 5, 6)

# Predicted/actual pair for the V2 target (used by data2)
V2.predicted <- c(5.5, 6.4, 8, 9, 3, 4)
V2.actual <- c(8, 8, 1, 15, 48, 58)

# Predicted/actual pair for the V3 target (used by data3)
V3.predicted <- c(4, 8, 6, 55, 2, 3)
V3.actual <- c(5, 8, 6, 2, 3, 9)

# Column-bind the vectors into numeric matrices; cbind() takes the column
# names from the vector names
data2 <- cbind(V2.predicted, V2.actual, V3, V4)
data3 <- cbind(V2, V3.predicted, V3.actual, V4)

# Print the structure of both matrices
invisible(lapply(list(data2, data3), str))
#' Relative prediction error for a dataset's own predicted column (matrix)
#'
#' The dataset number k is parsed from the dataset name ("data3" -> 3) and
#' selects the columns `Vk.predicted` and `Vk.actual`. The error rate is
#' computed for all rows at once as (actual - predicted) / actual.
#'
#' @param data Matrix (a data frame also works) that contains the columns
#'   `Vk.predicted` and `Vk.actual`.
#' @param name Optional dataset name such as "data3". When omitted, the
#'   expression passed as `data` is used as the name; this only works for a
#'   direct call with a plain variable, not inside lapply()/ldply(), where
#'   the expression is something like `X[[i]]`.
#' @param verbose If TRUE, print the structure of the input, the resolved
#'   name and the structure of the result.
#' @return Numeric vector with one error rate per row, named by row name or,
#'   when the input has no row names, by row index. Rows whose actual value
#'   is 0 yield Inf, -Inf or NaN.
fun_calc_error <- function(data, name, verbose = FALSE) {
  # Resolve the dataset name: explicit argument first, otherwise the
  # expression the caller wrote for `data`
  if (missing(name)) {
    dataname <- deparse(substitute(data))
  } else {
    dataname <- name
  }

  if (verbose) {
    str(data)
    cat("dataname: ", dataname, "\n")
  }

  # Parse the dataset number; anything that is not "data<number>" ends up NA
  df_num <- NA_real_
  if (is.character(dataname) && length(dataname) == 1L) {
    df_num <- suppressWarnings(as.numeric(gsub("data", "", dataname)))
  }
  if (is.na(df_num)) {
    stop(
      "cannot derive the dataset number from name '",
      paste(dataname, collapse = " "),
      "'; supply it explicitly, e.g. name = \"data3\"",
      call. = FALSE
    )
  }

  # Column names of the predicted/actual pair belonging to this dataset
  col_pred <- paste0("V", df_num, ".predicted")
  col_act <- paste0("V", df_num, ".actual")
  absent <- setdiff(c(col_pred, col_act), colnames(data))
  if (length(absent) > 0) {
    stop(
      "column(s) not found in data: ", paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  # Extract each column on its own so a one-row input is handled as well
  predicted <- data[, col_pred]
  actual <- data[, col_act]

  # Vectorised over rows; signed as (actual - predicted) / actual
  error_rate <- as.numeric((actual - predicted) / actual)

  # Label the results by row name, falling back to the row index
  row_labels <- rownames(data)
  if (is.null(row_labels)) {
    row_labels <- seq_len(nrow(data))
  }
  names(error_rate) <- row_labels

  if (verbose) {
    cat("\n str Error rate: ", "\n")
    str(error_rate)
  }

  error_rate
}
# Try the function on a single matrix; no name is passed, so the function
# derives it from the variable name of the argument
fun_calc_error(data = data3)
然后是数据帧解决方案:
#### Example with dataframes #####
# Plain feature columns shared by the toy datasets
V2 <- c(5, 8, 6, 2, 3, 9)
V3 <- c(8, 8, 1, 15, 48, 58)
V4 <- c(7, 8, 9, 4, 5, 6)

# Predicted/actual pair for the V2 target (used by data2)
V2.predicted <- c(5.5, 6.4, 8, 9, 3, 4)
V2.actual <- c(8, 8, 1, 15, 48, 58)

# Predicted/actual pair for the V3 target (used by data3)
V3.predicted <- c(4, 8, 6, 55, 2, 3)
V3.actual <- c(5, 8, 6, 2, 3, 9)

# Combine the vectors column-wise into data frames (not matrices); the
# column names are taken from the vector names
data2 <- data.frame(
  V2.predicted, V2.actual, V3, V4,
  check.names = FALSE, stringsAsFactors = FALSE
)
data3 <- data.frame(
  V2, V3.predicted, V3.actual, V4,
  check.names = FALSE, stringsAsFactors = FALSE
)

# Print the structure of both data frames
invisible(lapply(list(data2, data3), str))
#' Relative prediction error for a dataset's own predicted column (data frame)
#'
#' The dataset number k is parsed from the dataset name ("data3" -> 3) and
#' selects the columns `Vk.predicted` and `Vk.actual`. The error rate is
#' computed for all rows at once as (actual - predicted) / actual, using
#' base R only.
#'
#' @param data Data frame that contains the columns `Vk.predicted` and
#'   `Vk.actual`.
#' @param name Optional dataset name such as "data3". When omitted, the
#'   expression passed as `data` is used as the name; this only works for a
#'   direct call with a plain variable, not inside lapply()/ldply(), where
#'   the expression is something like `X[[i]]`.
#' @param verbose If TRUE, print the structure of the input, the resolved
#'   name and the structure of the result.
#' @return Data frame with the single column `error_rate`, one row per input
#'   row. Rows whose actual value is 0 yield Inf, -Inf or NaN.
fun_calc_error_df <- function(data, name, verbose = FALSE) {
  # Resolve the dataset name: explicit argument first, otherwise the
  # expression the caller wrote for `data`
  if (missing(name)) {
    dataname <- deparse(substitute(data))
  } else {
    dataname <- name
  }

  if (verbose) {
    str(data)
    cat("dataname: ", dataname, "\n")
  }

  if (!is.data.frame(data)) {
    stop("`data` must be a data frame", call. = FALSE)
  }

  # Parse the dataset number; anything that is not "data<number>" ends up NA
  df_num <- NA_real_
  if (is.character(dataname) && length(dataname) == 1L) {
    df_num <- suppressWarnings(as.numeric(gsub("data", "", dataname)))
  }
  if (is.na(df_num)) {
    stop(
      "cannot derive the dataset number from name '",
      paste(dataname, collapse = " "),
      "'; supply it explicitly, e.g. name = \"data3\"",
      call. = FALSE
    )
  }

  # Column names of the predicted/actual pair belonging to this dataset
  col_pred <- paste0("V", df_num, ".predicted")
  col_act <- paste0("V", df_num, ".actual")
  absent <- setdiff(c(col_pred, col_act), names(data))
  if (length(absent) > 0) {
    stop(
      "column(s) not found in data: ", paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  predicted <- data[[col_pred]]
  actual <- data[[col_act]]

  # Vectorised over rows; signed as (actual - predicted) / actual
  error_rate <- data.frame(error_rate = (actual - predicted) / actual)

  if (verbose) {
    cat("\n str Error rate: ", "\n")
    str(error_rate)
  }

  error_rate
}
# Try the function on a single data frame, passing the dataset name explicitly
fun_calc_error_df(data = data3, name = "data3")
在单个数据帧/矩阵上调用这些函数时,即使不提供数据帧/矩阵的名称也能正常工作,因为可以用下面的语句提取对象的名称:
dataname <- deparse(substitute(data))
但如果把数据帧列表传给 lapply 或 ldply,以便同时对多个数据帧应用该函数,就会出现问题:它们通过 X[[i]] 访问列表元素,因此函数无法得到数据帧的名称。
为了解决这个问题,我在下面的代码中使用了循环。也许你能找到更好的办法;希望这段代码对你有所帮助。
##### Possible solution for more than one dataframe ####
# Named list of data frames: each name ("data2", "data3", ...) is passed to
# fun_calc_error_df() as the `name` argument
df.list <- list(data2 = data2, data3 = data3)
nameslist <- names(df.list)

# Collect one result piece per dataset in a preallocated list instead of
# growing a data frame with rbind() on every iteration
err_rate_parts <- vector("list", length(df.list))

for (i in seq_along(df.list)) {
  name <- nameslist[i]
  data <- as.data.frame(df.list[[i]], stringsAsFactors = FALSE)

  # Error rates of this dataset (one-column data frame `error_rate`)
  error_rate <- fun_calc_error_df(data, name)

  # Tag every row with the name of the dataset it came from
  err_rate_parts[[i]] <- cbind.data.frame(
    dataframe = rep.int(name, nrow(data)),
    error_rate,
    stringsAsFactors = FALSE
  )
}

# Bind all pieces in a single step; an empty list gives an empty data frame
if (length(err_rate_parts) > 0) {
  df_error_rate <- do.call(rbind, err_rate_parts)
} else {
  df_error_rate <- data.frame()
}