使用group_by和mutate()强制使用for循环

时间:2018-03-22 11:12:07

标签: r list for-loop dataframe mutate

我有一个数据框列表(由初始数据框的排列顺序生成),我希望使用group_by_at()mutate()来应用复杂的微积分。它适用于单个数据帧,但使用for循环失败,因为mutate需要数据帧的名称和我的一些微积分。所以我想,好吧,让我们创建一个不同数据帧的列表,这些数据帧都具有相同的名称,并循环遍历最初的名称序列。不幸的是,这个技巧不起作用,我得到以下信息:

Error: object of type 'closure' is not subsettable. 

以下是显示我所有步骤的自包含示例。我认为问题来自mutate。那么,我怎样才能强制for loop使用mutate

data <- read.table(text = 'obs  gender   ageclass    weight   year   subdata   income    
                        1     F         1         10     yearA     sub1   1000   
                        2     M         2         25     yearA     sub1   1200   
                        3     M         2          5     yearB     sub2   1400   
                        4     M         1         11     yearB     sub1   1350',
 header = TRUE)  


library(dplyr)
library(GiniWegNeg)

dataA <- select(data, gender, ageclass)
dataB <- select(data, -gender, -ageclass)
rm(data)

# Generate permutation of indexes based on the number of column in dataA
library(combinat)
index <- permn(ncol(dataA))

# Attach dataA to the previous list of index           
res <- lapply(index, function(x) dataA[x])

# name my list keeping track of permutation order in dataframe name
names(res) <- unlist(lapply(res,function(x) sprintf('data%s',paste0(toupper(substr(colnames(x),1,1)),collapse = ''))))

# Create a list containing the name of each data.frame name
NameList <- unlist(lapply(res,function(x) sprintf('data%s',paste0(toupper(substr(colnames(x),1,1)),collapse = ''))))

# Define as N the number of columns/permutation/dataframes
N <- length(res)

# Merge res and dataB for all permutation of dataframes
res <- lapply(res,function(x) cbind(x,dataB))

# Change the name of res so that all data frames are named data
names(res) <- rep("data", N)


# APPLY FOR LOOP TO ALL DATAFRAMES

for (j in NameList){

runCalc <- function(data, y){ 

  data <- data %>% 
    group_by_at(1) %>% 
    mutate(Income_1 = weighted.mean(income, weight))
  data <- data %>% 
    group_by_at(2) %>% 
    mutate(Income_2 = weighted.mean(income, weight))      

  gini <- c(Gini_RSV(data$Income_1, data$weight), Gini_RSV(data$Income_2,data$weight))

  Gini <- data.frame(gini)
  colnames(Gini) <- c("Income_1","Income_2")
  rownames(Gini) <- c(paste0("Gini_", y))

  return(Gini)
}

runOtherCalc <- function(df, y){
  Contrib <- (1/5) * df$Income_1 + df$Income_2
  Contrib <- data.frame(Contrib)
  colnames(Contrib) <- c("myresult")
  rownames(Contrib) <- c(paste0("Contrib_", y)

  return(Contrib)
}

# Run runCalc over dataframe data by year

df1_List <- lapply(unique(data$year), function(i) {      
  byperiod <- subset(data, year == i)
  runCalc(byperiod, i)      
})

# runCalc returns df which then passes to runOtherCalc, again by year

df1_OtherList <- lapply(unique(data$year), function(i)     
  byperiod <- subset(data, year == i)
  df <- runCalc(byperiod, i) 
  runOtherCalc(df, i)      
})

# Run runCalc over dataframe data by subdata

df2_List <- lapply(unique(data$subdata), function(i) {      
  byperiod <- subset(data, subdata == i)
  runCalc(bysubdata, i)      
})

# runCalc returns df which then passes to runOtherCalc, again by subdata

df2_OtherList <- lapply(unique(data$subdata), function(i)     
  bysubdata <- subset(data, subdata == i)
  df <- runCalc(bysubdata, i) 
  runOtherCalc(df, i)      
})


# Return all results in separate frames, then append by row in 2 frames

Gini_df1 <- do.call(rbind, df1_List)
Contrib_df1 <- do.call(rbind,df1_OtherList)
Gini_df2 <- do.call(rbind, df1_List)
Contrib_df2 <- do.call(rbind,df1_OtherList)

Gini <- rbind(Gini_df1, Gini_df2)
Contrib <- rbind(Contrib_df1, Contrib_df2)


}

1 个答案:

答案 0 :(得分:0)

不可否认,您在下面收到的R错误有点神秘,但通常意味着您正在对不存在的对象运行操作。

  

错误:'closure'类型的对象不是子集。

具体来说,它伴随着lapply调用,因为数据未在全球任何地方定义(仅在 runCalc 方法中),如上所述,您将其删除rm(data)

dfList <- lapply(unique(data$year), function(i) {      
  byperiod <- subset(data, year == i)
  runCalc(byperiod, i)      
})

通过lapply...unique...subset的使用方式可以替换为未充分利用的分组基础R函数by()

从您的文字和代码中收集,我相信您打算在列表的每个数据框上运行分组, res 。然后考虑两个by调用,包含在一个更大的函数中,该函数接收一个数据帧 df 作为参数。然后在列表的所有项目上运行lapply以返回嵌套数据帧对的新列表。

# SECONDARY FUNCTIONS
runCalc <- function(data) {                                    
  data <- data %>% 
    group_by_at(1) %>% 
    mutate(Income_1 = weighted.mean(income, weight))
  data <- data %>% 
    group_by_at(2) %>% 
    mutate(Income_2 = weighted.mean(income, weight))      

  Gini <- data.frame(
              year = data$year[[1]],
              Income_1 = unname(Gini_RSV(data$Income_1, data$weight)), 
              Income_2 = unname(Gini_RSV(data$Income_2, data$weight)),
              row.names = paste0("Gini_", data$year[[1]])
          )

  return(Gini)
}

runOtherCalc <- function(df){
  Contrib <- data.frame(
                 myresult = (1/5) * df$Income_1 + df$Income_2,
                 row.names = paste0("Contrib_", df$year[[1]])
             )
  return(Contrib)
}

# PRIMARY FUNCTION
runDfOperations <- function(df) {   
  gList <- by(df, df$year, runCalc)     
  gTmp <- do.call(rbind, gList)

  cList <- by(gTmp, gTmp$year, runOtherCalc)
  cTmp <- do.call(rbind, cList)

  gtmp$year <- NULL
  return(list(gTmp, cTmp))
}

# RETURNS NESTED LIST OF TWO DFs FOR EACH ORIGINAL DF
new_res <- lapply(res, runDfOperations)

# SEPARATE LISTS IF NEEDED (EQUAL LENGTH)
Gini <- lapply(new_res, "[[", 1)
Contrib <- lapply(new_res, "[[", 2)