在R中用循环将多个变量的异常值替换为第95百分位数

时间:2016-09-30 03:56:03

标签: r loops

我的数据集中有一些异常值。感兴趣的变量命名为 j_q3_1, j_q3_2, ..., j_q3_14 和 j_q4_1, j_q4_2, ..., j_q4_14。我想将大于第95百分位数的条目替换为第95百分位数。我想知道能否写一个循环,同时遍历问题编号(q3到q4)和下划线之后的最后一个数字(1到14)。任何建议都将不胜感激。

示例数据(仅限_2和q3和q4):

    # Toy data set: five rows identified by hhid, with columns for questions
    # q3, q4 and q5 of items 1 and 2.
    test <- data.frame(
      hhid   = 1:5,
      j_q3_1 = c(1000, 1500, 2000, 5000, 10000),
      j_q4_1 = c(500, 100, 200, 10000, 200),
      j_q5_1 = c(200, 300, 400, 203, 100),
      j_q3_2 = c(300, 10000, 200, 300, 200),
      j_q4_2 = c(100, 200, 320, 120, 302),
      j_q5_2 = c(10000, 120, 1222, 300, 2333)
    )

此代码适用于每个变量:

    # Cap each variable at its own 95th percentile: compute the cutoff for the
    # column, then overwrite every larger entry with that cutoff.
    quantiles <- quantile(test$j_q3_1, c(0.95))
    test$j_q3_1[test$j_q3_1 > quantiles[1]] <- quantiles[1]

    quantiles <- quantile(test$j_q4_1, c(0.95))
    test$j_q4_1[test$j_q4_1 > quantiles[1]] <- quantiles[1]

    quantiles <- quantile(test$j_q3_2, c(0.95))
    test$j_q3_2[test$j_q3_2 > quantiles[1]] <- quantiles[1]

    quantiles <- quantile(test$j_q4_2, c(0.95))
    # The row mask must come from the column being modified (j_q4_2); a mask
    # built from another column would select the wrong rows.
    test$j_q4_2[test$j_q4_2 > quantiles[1]] <- quantiles[1]

现在我尝试把这段代码用于整个数据集,并把阈值改为第99百分位数。但它不能正常工作:某个变量的第99百分位数是9260,而25000这个异常值却保持不变。此外,它把平均值从1606变成了1813,并且NA的数量从2825个减少到了2801个。

  # Attempt to cap every column j_q<i>_<j> (i in 3..6, j in 1..14) at its
  # 99th percentile.
  for (i in 3:6){
    for (j in 1:14){
    # Name of the column handled in this iteration, e.g. "j_q3_1".
    cname <- paste0("j_q", i, "_", j)
    # 99th percentile of the column, ignoring missing values.
    quantiles <- quantile(test[, cname], c(0.99), na.rm = TRUE)
    # NOTE(review): the row mask below is computed on the non-NA values only,
    # so when the column contains NA it is shorter than nrow(test) and its TRUE
    # positions no longer line up with the original rows. This is likely why
    # outliers are left unchanged while other cells (including NA ones) get
    # overwritten. A full-length mask such as
    # `!is.na(test[, cname]) & test[, cname] > quantiles[1]` avoids this.
    test[test[!is.na(test[, cname]), cname] > quantiles[1], cname] <- quantiles[1]
  }
}

4 个答案:

答案 0 :(得分:0)

你可以这样做:

# Column for question i, item j: cap it at its 99th percentile.
# This form is for columns without missing values.
cname <- paste0("j_q", i, "_", j)
quantiles <- quantile(test[, cname], probs = 0.99)
above_cap <- test[, cname] > quantiles[1]
test[above_cap, cname] <- quantiles[1]

如果你有NA值:

# NA-safe variant: missing values are ignored when computing the 99th
# percentile and are never selected for replacement.
cname <- paste0("j_q", i, "_", j)
quantiles <- quantile(test[, cname], probs = 0.99, na.rm = TRUE)
values <- test[, cname]
above_cap <- !is.na(values) & values > quantiles[1]
test[above_cap, cname] <- quantiles[1]

答案 1 :(得分:0)

这可能是函数式编程的一个很好的选择:

#' Cap the values of a numeric vector at one of its upper quantiles
#'
#' @param vec A numeric vector.
#' @param prob Single probability of the quantile used as the cap
#'   (default 0.95, i.e. the 95th percentile).
#' @param na.rm Ignore missing values when computing the quantile? Missing
#'   entries of `vec` are always returned unchanged.
#' @return `vec` with every value above the quantile replaced by the quantile.
quantOut <- function(vec, prob = 0.95, na.rm = TRUE) {
  stopifnot(length(prob) == 1L)
  cap <- quantile(vec, probs = prob, na.rm = na.rm, names = FALSE)
  # Exclude NA from the mask so missing entries are left untouched.
  too_large <- !is.na(vec) & vec > cap
  vec[too_large] <- cap
  vec
}

现在quantOut包装您为一个特定列完成的过程。它可以在任何必要的上下文中使用。

# Select the q3/q4 columns by name pattern rather than by a positional range:
# `j_q3_1:j_q4_14` would also pick up any other column located between those
# two (e.g. interleaved j_q5_* columns) and fails if j_q4_14 does not exist.
test %>% mutate_at(vars(matches("^j_q[34]_[0-9]+$")), quantOut)

答案 2 :(得分:0)

#' Upper limit of a numeric vector (its 95th percentile by default)
#'
#' @param x A numeric vector.
#' @param prob Probability of the quantile to return (default 0.95).
#' @param na.rm Ignore missing values when computing the quantile?
#' @return The requested quantile of `x`, as returned by `quantile()`.
fn_FindUpperLimit <- function(x, prob = 0.95, na.rm = TRUE) {
  quantile(x, probs = prob, na.rm = na.rm)
}

#' Replace values above the upper limit with the limit itself
#'
#' @param x A numeric vector.
#' @param prob Probability of the quantile used as the limit (default 0.95).
#' @param na.rm Ignore missing values when computing the quantile? Missing
#'   entries of `x` stay `NA` in the result.
#' @return A vector like `x` in which every value above the limit is replaced
#'   by the limit.
fn_ReplaceUpperExtreme <- function(x, prob = 0.95, na.rm = TRUE) {
  # Compute the limit once instead of once per branch of ifelse().
  upper <- fn_FindUpperLimit(x, prob = prob, na.rm = na.rm)
  ifelse(x > upper, upper, x)
}

# Upper limit of each selected column. Every row of a column receives the same
# value, so distinct() collapses the result to a single row.
# mutate_all() replaces the deprecated mutate_each(funs(...), everything()).
mtcars %>%
  select(disp, hp, drat, wt, qsec) %>%
  mutate_all(fn_FindUpperLimit) %>%
  distinct()
#  disp     hp   drat      wt    qsec
#1  449 253.55 4.3145 5.29275 20.1045

# Cap each selected column at its own upper limit.
# mutate_all() replaces the deprecated mutate_each(funs(...), everything()).
mtcars %>%
  select(disp, hp, drat, wt, qsec) %>%
  mutate_all(fn_ReplaceUpperExtreme)


#    disp     hp   drat      wt    qsec
#1  160.0 110.00 3.9000 2.62000 16.4600
#2  160.0 110.00 3.9000 2.87500 17.0200
#3  108.0  93.00 3.8500 2.32000 18.6100
#4  258.0 110.00 3.0800 3.21500 19.4400
#5  360.0 175.00 3.1500 3.44000 17.0200
#6  225.0 105.00 2.7600 3.46000 20.1045
#7  360.0 245.00 3.2100 3.57000 15.8400
#8  146.7  62.00 3.6900 3.19000 20.0000
#9  140.8  95.00 3.9200 3.15000 20.1045
#10 167.6 123.00 3.9200 3.44000 18.3000
#11 167.6 123.00 3.9200 3.44000 18.9000
#12 275.8 180.00 3.0700 4.07000 17.4000
#13 275.8 180.00 3.0700 3.73000 17.6000
#14 275.8 180.00 3.0700 3.78000 18.0000
#15 449.0 205.00 2.9300 5.25000 17.9800
#16 449.0 215.00 3.0000 5.29275 17.8200
#17 440.0 230.00 3.2300 5.29275 17.4200
#18  78.7  66.00 4.0800 2.20000 19.4700
#19  75.7  52.00 4.3145 1.61500 18.5200
#20  71.1  65.00 4.2200 1.83500 19.9000
#21 120.1  97.00 3.7000 2.46500 20.0100
#22 318.0 150.00 2.7600 3.52000 16.8700
#23 304.0 150.00 3.1500 3.43500 17.3000
#24 350.0 245.00 3.7300 3.84000 15.4100
#25 400.0 175.00 3.0800 3.84500 17.0500
#26  79.0  66.00 4.0800 1.93500 18.9000
#27 120.3  91.00 4.3145 2.14000 16.7000
#28  95.1 113.00 3.7700 1.51300 16.9000
#29 351.0 253.55 4.2200 3.17000 14.5000
#30 145.0 175.00 3.6200 2.77000 15.5000
#31 301.0 253.55 3.5400 3.57000 14.6000
#32 121.0 109.00 4.1100 2.78000 18.6000

答案 3 :(得分:0)

简单的基础R

# 95th percentile of j_q3_1, stored under a name that does not mask
# stats::quantile().
cap_q3_1 <- quantile(test$j_q3_1, 0.95)
# NOTE(review): the 10000 threshold is hard-coded and this single cap is
# written into every cell of `test` that is >= 10000, whatever its column;
# it is not a per-column 95th-percentile cap.
test[test >= 10000] <- cap_q3_1

hhid j_q3_1 j_q4_1 j_q5_1 j_q3_2 j_q4_2 j_q5_2
1    1   1000    500    200    300    100   9000
2    2   1500    100    300   9000    200    120
3    3   2000    200    400    200    320   1222
4    4   5000   9000    203    300    120    300
5    5   9000    200    100    200    302   2333