Question

我有几个数据框，想遍历它们，并删除其中 NA 比例超过 90% 的列和行。我也尝试过 lapply，但没能让它正常工作……

我目前的代码是：

# Read the two yearly CSV files and gather the resulting data frames in one
# list. Both calls use the same `path` variable as written here -- presumably a
# placeholder for each year's own file location; confirm before running.
data_a_2007 <- read.csv(file = path)
data_a_2008 <- read.csv(file = path)
datasets_a <- list(
  data_a_2007,
  data_a_2008
)

# Drop sparsely populated columns and rows from one data frame.
#
# A column is removed when its number of non-NA values is below
# floor(nrow(dataset) * min_fraction); a row is removed when its number of
# non-NA values is below floor(ncol(dataset) * min_fraction). Both counts are
# taken on the input as given, before anything is removed.
#
# dataset:      a data frame.
# min_fraction: minimum share of non-NA values a row/column needs in order to
#               be kept (0.1 means: drop what is more than ~90% NA).
# Returns the filtered data frame.
drop_sparse <- function(dataset, min_fraction = 0.1) {
  # find the thresholds (minimum number of non-NA values required)
  threshold_columns <- floor(nrow(dataset) * min_fraction)
  threshold_rows <- floor(ncol(dataset) * min_fraction)

  # find the columns and rows that hold enough values to be kept
  keep_columns <- colSums(!is.na(dataset)) >= threshold_columns
  keep_rows <- rowSums(!is.na(dataset)) >= threshold_rows

  # Remove everything else in one step; drop = FALSE keeps the result a data
  # frame even when only a single column survives.
  dataset[keep_rows, keep_columns, drop = FALSE]
}

# `for (dataset in datasets_a)` only modifies a copy of each list element, so
# the cleaned data frames must be stored back into the list explicitly.
datasets_a <- lapply(datasets_a, drop_sparse)

Answer 1

保留 NA 比例低于某个阈值（例如 50%）的行/列：

# Sample data: a 200 x 5 numeric matrix with half of its cells set to NA,
# split into a list of two 100-row data frames.
set.seed(1)
mat <- matrix(runif(1000), ncol = 5)
mat[sample(seq_along(mat), length(mat) * 0.5)] <- NA
l <- split(as.data.frame(mat), gl(2, 100))

# NA threshold: a row/column is kept only if its share of NAs is below 50%.
NAthres <- 0.5

# Keep columns with number of NAs below NA threshold.
# colSums(is.na(df)) counts the NAs per column directly instead of going
# through apply(); drop = FALSE keeps the result a data frame even when only
# one column passes the filter.
l2 <- lapply(l, function(df) {
  na_per_col <- colSums(is.na(df))
  df[, na_per_col < (nrow(df) * NAthres), drop = FALSE]
})


# Keep rows with number of NAs below NA threshold.
# This filters the original list `l`, independently of the column filter
# above; drop = FALSE keeps one-column data frames from collapsing to vectors.
l3 <- lapply(l, function(df) {
  na_per_row <- rowSums(is.na(df))
  df[na_per_row < (ncol(df) * NAthres), , drop = FALSE]
})

迭代数据框列表并删除每个数据框中的列和行

1 个答案: