我的数据集中有一些异常值。感兴趣的变量命名为j_q3_1, j_q3_2, ..., j_q3_14和j_q4_1, j_q4_2, ..., j_q4_14。我想将大于第95百分位数的值替换为第95百分位数。我想知道能否写一个循环,同时遍历问题编号(q3到q4)和下划线后面的数字(1到14)。任何建议都将不胜感激。
示例数据(仅包含后缀_1、_2以及q3至q5的列):
# Sample data: one row per `hhid`, with value columns named
# j_q<question>_<item> (questions 3-5, items 1-2).
test <- data.frame(
  hhid   = 1:5,
  j_q3_1 = c(1000, 1500, 2000, 5000, 10000),
  j_q4_1 = c(500, 100, 200, 10000, 200),
  j_q5_1 = c(200, 300, 400, 203, 100),
  j_q3_2 = c(300, 10000, 200, 300, 200),
  j_q4_2 = c(100, 200, 320, 120, 302),
  j_q5_2 = c(10000, 120, 1222, 300, 2333)
)
此代码适用于每个变量:
# Cap each variable at its own 95th percentile, one column at a time:
# compute the column's quantile, then overwrite every larger value with it.
quantiles <- quantile(test$j_q3_1, c(0.95))
test$j_q3_1[test$j_q3_1 > quantiles[1]] <- quantiles[1]
quantiles <- quantile(test$j_q4_1, c(0.95))
test$j_q4_1[test$j_q4_1 > quantiles[1]] <- quantiles[1]
quantiles <- quantile(test$j_q3_2, c(0.95))
test$j_q3_2[test$j_q3_2 > quantiles[1]] <- quantiles[1]
quantiles <- quantile(test$j_q4_2, c(0.95))
# The mask has to come from j_q4_2 itself; building it from j_q3_2 would
# overwrite rows selected by a different column's values.
test$j_q4_2[test$j_q4_2 > quantiles[1]] <- quantiles[1]
现在我尝试将代码用于整个数据集,并将阈值改为第99百分位数。该变量的第99百分位数是9260,但25000这样的异常值却保持不变,所以代码没有正常工作。它还把平均值从1606变成了1813,并把NA的数量从2825减少到了2801。
# Attempt from the question: for every column j_q<i>_<j> (i = 3..6,
# j = 1..14), compute the 99th percentile with NA ignored and overwrite
# larger values with it.
for (i in 3:6){
for (j in 1:14){
# Column name for this question/item pair, e.g. "j_q3_1".
cname <- paste0("j_q", i, "_", j)
quantiles <- quantile(test[, cname], c(0.99), na.rm = TRUE)
# NOTE(review): the comparison runs on the NA-filtered values only, so the
# logical row index has one entry per non-NA value rather than one per row of
# `test`; whenever the column contains NA it no longer lines up with the rows
# it is meant to select.
test[test[!is.na(test[, cname]), cname] > quantiles[1], cname] <- quantiles[1]
}
}
答案 0 :(得分:0)
你可以这样做:
# Loop body for a column without missing values: cap column j_q<i>_<j> at
# its 99th percentile. `i` and `j` come from the enclosing loops.
cname <- paste0("j_q", i, "_", j)
cap <- quantile(test[, cname], probs = 0.99)
above_cap <- test[, cname] > cap
test[above_cap, cname] <- cap
如果你有NA值:
# Loop body for a column that may contain NA: cap column j_q<i>_<j> at its
# 99th percentile. `i` and `j` come from the enclosing loops.
cname <- paste0("j_q", i, "_", j)
col_values <- test[, cname]
cap <- quantile(col_values, probs = 0.99, na.rm = TRUE)
# The mask keeps one entry per row; NA rows are explicitly excluded so the
# row index never contains NA.
above_cap <- !is.na(col_values) & col_values > cap
test[above_cap, cname] <- cap
答案 1 :(得分:0)
这可能是函数式编程的一个很好的选择:
# Cap the values of a numeric vector at one of its own upper quantiles.
#
# Arguments:
#   vec    Numeric vector; may contain NA.
#   probs  Single probability in [0, 1] selecting the cap. Defaults to 0.95,
#          i.e. the 95th percentile.
#   na.rm  If TRUE (default), NA entries are ignored when computing the
#          quantile. If FALSE, `quantile()` raises an error on NA input.
#
# Returns:
#   `vec` with every value above the cap replaced by the cap. NA entries are
#   returned unchanged.
quantOut <- function(vec, probs = 0.95, na.rm = TRUE) {
  cap <- quantile(vec, probs = probs, na.rm = na.rm, names = FALSE)
  # `which()` drops NA comparisons, so missing values are never selected.
  vec[which(vec > cap)] <- cap
  vec
}
现在quantOut封装了您对单个列所做的处理,可以在任何需要的上下文中使用:
# Apply quantOut to the q3/q4 columns only. Selecting by name pattern does
# not depend on column order, whereas a positional range such as
# j_q3_1:j_q4_14 also picks up any other columns located between the two.
# The result is returned, not stored; assign it back to `test` to keep it.
test %>% mutate_at(vars(matches("^j_q[34]_[0-9]+$")), quantOut)
答案 2 :(得分:0)
# Return the 95th percentile of `x`, as computed by `quantile()`.
fn_FindUpperLimit <- function(x) {
  quantile(x, probs = 0.95)
}
# Replace every element of `x` that exceeds the 95th percentile of `x` with
# that percentile; all other elements are returned as they are.
fn_ReplaceUpperExtreme <- function(x) {
  upper <- quantile(x, 0.95)
  ifelse(x > upper, upper, x)
}
# Show the 95th percentile of each selected column: every row is overwritten
# with its column's limit, so distinct() collapses the result to one row.
# mutate_all() replaces the deprecated mutate_each(funs(...), everything()).
mtcars %>%
  select(disp, hp, drat, wt, qsec) %>%
  mutate_all(fn_FindUpperLimit) %>%
  distinct()
# disp hp drat wt qsec
#1 449 253.55 4.3145 5.29275 20.1045
# Cap each selected column at its own 95th percentile.
# mutate_all() replaces the deprecated mutate_each(funs(...), everything()).
mtcars %>%
  select(disp, hp, drat, wt, qsec) %>%
  mutate_all(fn_ReplaceUpperExtreme)
# disp hp drat wt qsec
#1 160.0 110.00 3.9000 2.62000 16.4600
#2 160.0 110.00 3.9000 2.87500 17.0200
#3 108.0 93.00 3.8500 2.32000 18.6100
#4 258.0 110.00 3.0800 3.21500 19.4400
#5 360.0 175.00 3.1500 3.44000 17.0200
#6 225.0 105.00 2.7600 3.46000 20.1045
#7 360.0 245.00 3.2100 3.57000 15.8400
#8 146.7 62.00 3.6900 3.19000 20.0000
#9 140.8 95.00 3.9200 3.15000 20.1045
#10 167.6 123.00 3.9200 3.44000 18.3000
#11 167.6 123.00 3.9200 3.44000 18.9000
#12 275.8 180.00 3.0700 4.07000 17.4000
#13 275.8 180.00 3.0700 3.73000 17.6000
#14 275.8 180.00 3.0700 3.78000 18.0000
#15 449.0 205.00 2.9300 5.25000 17.9800
#16 449.0 215.00 3.0000 5.29275 17.8200
#17 440.0 230.00 3.2300 5.29275 17.4200
#18 78.7 66.00 4.0800 2.20000 19.4700
#19 75.7 52.00 4.3145 1.61500 18.5200
#20 71.1 65.00 4.2200 1.83500 19.9000
#21 120.1 97.00 3.7000 2.46500 20.0100
#22 318.0 150.00 2.7600 3.52000 16.8700
#23 304.0 150.00 3.1500 3.43500 17.3000
#24 350.0 245.00 3.7300 3.84000 15.4100
#25 400.0 175.00 3.0800 3.84500 17.0500
#26 79.0 66.00 4.0800 1.93500 18.9000
#27 120.3 91.00 4.3145 2.14000 16.7000
#28 95.1 113.00 3.7700 1.51300 16.9000
#29 351.0 253.55 4.2200 3.17000 14.5000
#30 145.0 175.00 3.6200 2.77000 15.5000
#31 301.0 253.55 3.5400 3.57000 14.6000
#32 121.0 109.00 4.1100 2.78000 18.6000
答案 3 :(得分:0)
简单的基础R
# Cap every q3/q4 column at its own 95th percentile. Each column gets its own
# limit, so there is no hard-coded threshold and no single percentile applied
# to the whole data frame; hhid and the q5 columns are left untouched.
cap_cols <- grep("^j_q[34]_[0-9]+$", names(test), value = TRUE)
test[cap_cols] <- lapply(test[cap_cols], function(x) {
  cap <- quantile(x, 0.95, na.rm = TRUE, names = FALSE)
  # Exclude NA explicitly so missing values are never overwritten.
  x[!is.na(x) & x > cap] <- cap
  x
})
test
#   hhid j_q3_1 j_q4_1 j_q5_1 j_q3_2 j_q4_2 j_q5_2
# 1    1   1000    500    200    300  100.0  10000
# 2    2   1500    100    300   8060  200.0    120
# 3    3   2000    200    400    200  316.4   1222
# 4    4   5000   8100    203    300  120.0    300
# 5    5   9000    200    100    200  302.0   2333