在 R 的循环中标记箱线图的异常值

时间:2020-07-15 02:25:48

标签: r

我一直在尝试修改“labeling outliers in a boxplot”中的代码,使其能够在循环中对数据框(df)的每一列运行。

# Original attempt from the question, kept verbatim because the error
# messages quoted right after it refer to this exact code.
ens_id=names(mtcars)
for(i in 1:length(ens_id)){
 
 # NOTE(review): ens_id[i] is a column *name* (a character string), so
 # as.numeric(ens_id[i]) does not fetch the column; it coerces the name itself,
 # which is what triggers the "NAs introduced by coercion" warning and the
 # quantile() error reported below.
 dat <- test %>% tibble::rownames_to_column(var="outlier") %>% group_by(cond) %>% mutate(is_outlier=ifelse(is_outlier(as.numeric(ens_id[i])),as.numeric(ens_id[i]), as.numeric(NA)))
 # Blank the row label wherever the value was not flagged as an outlier.
 dat$outlier[which(is.na(dat$is_outlier))] <- as.numeric(NA)
 
 # NOTE(review): `test`, `gene_id` and `is_outlier()` are not defined in this
 # snippet; presumably they are created elsewhere. Confirm before running.
 p=ggplot(dat, aes_string(y=ens_id[i], x="cond",fill="cond")) + geom_boxplot()  + ylab(gene_id[i])+ geom_text(aes(label=outlier),na.rm=TRUE,nudge_x=0.15)
 # One PNG per column, named after the column.
 ggsave(p, file=paste0("/media/chi/Figures/HVOLvsCDCS/",ens_id[i],".png"), width = 14, height = 10, units = "cm")
 }

quantile.default(x, 0.25) 中的错误:如果 'na.rm' 为 FALSE,则不允许出现缺失值和 NaN。另外:警告消息:在 is_outlier(as.numeric(ens_id[i])) 中:强制转换时引入了 NA

我之所以使用 as.numeric(ens_id[i]),是为了绕过下面这个错误:

(1 - h) * qs[i] 中的错误:二元运算符的参数不是数值

1 个答案:

答案 0 :(得分:1)

问题在于字符串(即以字符串形式给出的列名)不会被求值。一种做法是直接把字符串传给 across,另一种做法是先将其转换为符号(symbol)再用 !! 求值。由于前者更容易实现,这里先展示前者:

library(dplyr) # 1.0.0
library(stringr)

# For every column name in `ens_id`, add an `is_outlier` column that keeps the
# column's value where `is_outlier()` flags it (computed within each `cond`
# group) and is NA everywhere else, then blank the row label of every
# non-outlier row.
# `test`, `ens_id` and `is_outlier()` are expected to exist already.
for (i in seq_along(ens_id)) {
  dat <- test %>%
    tibble::rownames_to_column(var = "outlier") %>%
    group_by(cond) %>%
    # all_of() states explicitly that ens_id[i] is a column name held in an
    # external character vector. Only one column is selected, so the result
    # can be named "is_outlier" directly; no "{col}_" prefix has to be
    # stripped with a regex afterwards.
    mutate(across(
      all_of(ens_id[i]),
      ~ replace(., !is_outlier(.), NA),
      .names = "is_outlier"
    )) %>%
    # For dplyr < 1.0.0 the equivalent is:
    # mutate_at(vars(ens_id[i]), list(is_outlier = ~ replace(., !is_outlier(.), NA))) %>%
    ungroup() # do not leak the grouping into later steps

  # `outlier` holds the former row names (character), so use a character NA.
  dat$outlier[is.na(dat$is_outlier)] <- NA_character_
  print(head(dat))
}

或者如上所述,第二种做法是先用 sym 将字符串转换为符号,再用 !! 求值:

# Same task as the across() version, but the column name is turned into a
# symbol once per iteration and injected with `!!`.
for (i in seq_along(ens_id)) {
  col_sym <- sym(ens_id[i])

  labelled <- tibble::rownames_to_column(test, var = "outlier")
  by_cond <- group_by(labelled, cond)

  # Keep the value only where is_outlier() flags it; everything else -> NA.
  dat <- mutate(
    by_cond,
    is_outlier = replace(!!col_sym, !is_outlier(!!col_sym), NA)
  )

  # Rows that were not flagged lose their label.
  not_flagged <- is.na(dat$is_outlier)
  dat$outlier[not_flagged] <- NA_real_
  print(head(dat))
}

使用可重现的示例:

library(ggplot2)

ens_id <- c("mpg", "wt")
test <- mtcars
# Plant one extreme value per column so that each plot has an obvious outlier.
test$mpg[10] <- 9800
test$wt[22] <- 4895

# Build one labelled boxplot per column name in `ens_id`.
# `is_outlier()` must already be defined, and dplyr must be attached for
# `%>%`, `group_by()`, `mutate()` and `across()`.
# lapply() is used instead of a for loop so that every plot keeps its own
# `col`: aes() evaluates its arguments lazily, and with a loop index all
# stored plots would end up drawing the column of the last iteration.
plist <- lapply(ens_id, function(col) {
  dat <- test %>%
    tibble::rownames_to_column(var = "outlier") %>%
    group_by(gear) %>%
    # Keep the value only where is_outlier() flags it within its gear group.
    mutate(across(
      all_of(col),
      ~ replace(., !is_outlier(.), NA),
      .names = "is_outlier"
    )) %>%
    ungroup()

  # `outlier` holds the former row names (character); blank every row that
  # was not flagged so that geom_text() only labels the outliers.
  dat$outlier[is.na(dat$is_outlier)] <- NA_character_

  # `.data[[col]]` replaces the deprecated aes_string().
  ggplot(dat, aes(x = gear, y = .data[[col]], group = gear)) +
    geom_boxplot() +
    ylab(col) +
    geom_text(aes(label = outlier), na.rm = TRUE, nudge_x = 0.15)
})

plist[[1]]
plist[[2]]