我有几个数据框,想遍历它们,并删除其中 NA 占比超过 90% 的列和行。我也尝试过 lapply,但没能让它正常工作……
我目前的代码是:
# Load the yearly data sets and collect them in a named list so that each
# cleaned result can be looked up by name afterwards.
# NOTE(review): both reads use the same `path`; presumably each year has its
# own file -- confirm.
data_a_2007 <- read.csv(path)
data_a_2008 <- read.csv(path)
datasets_a <- list(data_a_2007 = data_a_2007, data_a_2008 = data_a_2008)

# Drop the columns and rows of a data frame that hold too few non-NA values.
#
# dataset:      a data frame.
# min_fraction: a column is kept when its non-NA count is at least
#               floor(nrow(dataset) * min_fraction); a row is kept when its
#               non-NA count is at least floor(ncol(dataset) * min_fraction).
#
# Both counts are taken on the unmodified input, so removing columns does not
# change which rows are considered sparse.
# Returns the filtered data frame.
drop_sparse <- function(dataset, min_fraction = 0.1) {
  min_per_column <- floor(nrow(dataset) * min_fraction)
  min_per_row <- floor(ncol(dataset) * min_fraction)

  present <- !is.na(dataset)
  keep_columns <- colSums(present) >= min_per_column
  keep_rows <- rowSums(present) >= min_per_row

  # drop = FALSE keeps the result a data frame even if one column survives.
  dataset[keep_rows, keep_columns, drop = FALSE]
}

# A `for (dataset in datasets_a)` loop only modifies a copy of each element,
# so the cleaned data frames have to be assigned back to the list.
datasets_a <- lapply(datasets_a, drop_sparse)
答案 0 :(得分:0)
保留 NA 占比低于某个阈值(例如 50%)的行/列:
# Sample data: a 200 x 5 numeric matrix with half of its cells set to NA,
# split into a list of two 100-row data frames.
set.seed(1)
mat <- matrix(runif(1000), ncol = 5)
mat[sample(seq_along(mat), length(mat) * 0.5)] <- NA
l <- split(as.data.frame(mat), gl(2, 100))

# NA threshold: a row/column is kept only when its share of NAs is below this.
NAthres <- 0.5

# Keep columns whose number of NAs is below the NA threshold.
l2 <- lapply(l, function(df) {
  na_per_column <- colSums(is.na(df))
  # drop = FALSE keeps a data frame even when a single column survives.
  df[, na_per_column < (nrow(df) * NAthres), drop = FALSE]
})

# Keep rows whose number of NAs is below the NA threshold.
# This filters the original list `l`, not the column-filtered `l2`.
l3 <- lapply(l, function(df) {
  na_per_row <- rowSums(is.na(df))
  df[na_per_row < (ncol(df) * NAthres), , drop = FALSE]
})