我有一个数据框列表(由初始数据框的排列顺序生成),我希望使用group_by_at()
和mutate()
来应用复杂的微积分。它适用于单个数据帧,但使用for
循环失败,因为mutate
需要数据帧的名称和我的一些微积分。所以我想,好吧,让我们创建一个不同数据帧的列表,这些数据帧都具有相同的名称,并循环遍历最初的名称序列。不幸的是,这个技巧不起作用,我得到以下信息:
Error: object of type 'closure' is not subsettable.
以下是显示我所有步骤的自包含示例。我认为问题来自mutate
。那么,我怎样才能强制for loop
使用mutate
?
data <- read.table(text = 'obs gender ageclass weight year subdata income
1 F 1 10 yearA sub1 1000
2 M 2 25 yearA sub1 1200
3 M 2 5 yearB sub2 1400
4 M 1 11 yearB sub1 1350',
header = TRUE)
library(dplyr)
library(GiniWegNeg)
dataA <- select(data, gender, ageclass)
dataB <- select(data, -gender, -ageclass)
rm(data)
# Generate permutation of indexes based on the number of column in dataA
library(combinat)
index <- permn(ncol(dataA))
# Attach dataA to the previous list of index
res <- lapply(index, function(x) dataA[x])
# name my list keeping track of permutation order in dataframe name
names(res) <- unlist(lapply(res,function(x) sprintf('data%s',paste0(toupper(substr(colnames(x),1,1)),collapse = ''))))
# Create a list containing the name of each data.frame name
NameList <- unlist(lapply(res,function(x) sprintf('data%s',paste0(toupper(substr(colnames(x),1,1)),collapse = ''))))
# Define as N the number of columns/permutation/dataframes
N <- length(res)
# Merge res and dataB for all permutation of dataframes
res <- lapply(res,function(x) cbind(x,dataB))
# Change the name of res so that all data frames are named data
names(res) <- rep("data", N)
# APPLY FOR LOOP TO ALL DATAFRAMES
for (j in NameList){
runCalc <- function(data, y){
data <- data %>%
group_by_at(1) %>%
mutate(Income_1 = weighted.mean(income, weight))
data <- data %>%
group_by_at(2) %>%
mutate(Income_2 = weighted.mean(income, weight))
gini <- c(Gini_RSV(data$Income_1, data$weight), Gini_RSV(data$Income_2,data$weight))
Gini <- data.frame(gini)
colnames(Gini) <- c("Income_1","Income_2")
rownames(Gini) <- c(paste0("Gini_", y))
return(Gini)
}
runOtherCalc <- function(df, y){
Contrib <- (1/5) * df$Income_1 + df$Income_2
Contrib <- data.frame(Contrib)
colnames(Contrib) <- c("myresult")
rownames(Contrib) <- c(paste0("Contrib_", y)
return(Contrib)
}
# Run runCalc over dataframe data by year
df1_List <- lapply(unique(data$year), function(i) {
byperiod <- subset(data, year == i)
runCalc(byperiod, i)
})
# runCalc returns df which then passes to runOtherCalc, again by year
df1_OtherList <- lapply(unique(data$year), function(i)
byperiod <- subset(data, year == i)
df <- runCalc(byperiod, i)
runOtherCalc(df, i)
})
# Run runCalc over dataframe data by subdata
df2_List <- lapply(unique(data$subdata), function(i) {
byperiod <- subset(data, subdata == i)
runCalc(bysubdata, i)
})
# runCalc returns df which then passes to runOtherCalc, again by subdata
df2_OtherList <- lapply(unique(data$subdata), function(i)
bysubdata <- subset(data, subdata == i)
df <- runCalc(bysubdata, i)
runOtherCalc(df, i)
})
# Return all results in separate frames, then append by row in 2 frames
Gini_df1 <- do.call(rbind, df1_List)
Contrib_df1 <- do.call(rbind,df1_OtherList)
Gini_df2 <- do.call(rbind, df1_List)
Contrib_df2 <- do.call(rbind,df1_OtherList)
Gini <- rbind(Gini_df1, Gini_df2)
Contrib <- rbind(Contrib_df1, Contrib_df2)
}
答案 0 :(得分:0)
不可否认,您在下面收到的R错误有点神秘,但通常意味着您正在对不存在的对象运行操作。
错误:'closure'类型的对象不是子集。
具体来说,它伴随着lapply
调用,因为数据未在全球任何地方定义(仅在 runCalc 方法中),如上所述,您将其删除rm(data)
。
dfList <- lapply(unique(data$year), function(i) {
byperiod <- subset(data, year == i)
runCalc(byperiod, i)
})
通过lapply...unique...subset
的使用方式可以替换为未充分利用的分组基础R函数by()
。
从您的文字和代码中收集,我相信您打算在列表的每个数据框上运行年分组, res 。然后考虑两个by
调用,包含在一个更大的函数中,该函数接收一个数据帧 df 作为参数。然后在列表的所有项目上运行lapply
以返回嵌套数据帧对的新列表。
# SECONDARY FUNCTIONS
runCalc <- function(data) {
data <- data %>%
group_by_at(1) %>%
mutate(Income_1 = weighted.mean(income, weight))
data <- data %>%
group_by_at(2) %>%
mutate(Income_2 = weighted.mean(income, weight))
Gini <- data.frame(
year = data$year[[1]],
Income_1 = unname(Gini_RSV(data$Income_1, data$weight)),
Income_2 = unname(Gini_RSV(data$Income_2, data$weight)),
row.names = paste0("Gini_", data$year[[1]])
)
return(Gini)
}
runOtherCalc <- function(df){
Contrib <- data.frame(
myresult = (1/5) * df$Income_1 + df$Income_2,
row.names = paste0("Contrib_", df$year[[1]])
)
return(Contrib)
}
# PRIMARY FUNCTION
runDfOperations <- function(df) {
gList <- by(df, df$year, runCalc)
gTmp <- do.call(rbind, gList)
cList <- by(gTmp, gTmp$year, runOtherCalc)
cTmp <- do.call(rbind, cList)
gtmp$year <- NULL
return(list(gTmp, cTmp))
}
# RETURNS NESTED LIST OF TWO DFs FOR EACH ORIGINAL DF
new_res <- lapply(res, runDfOperations)
# SEPARATE LISTS IF NEEDED (EQUAL LENGTH)
Gini <- lapply(new_res, "[[", 1)
Contrib <- lapply(new_res, "[[", 2)