如果在一组题项中有超过20%的变量缺失,我该如何创建一个过滤变量?例如,如果 mssi1_1:mssi1_4 中有超过20%的变量为 NA 值,则将其过滤掉。
可重现的数据集:
df2 <- structure(list(uci = c("10001h", "10476h", "10484h", "10580h",
"14280h", "2313n", "2778n", "3063n", "3579h", "3699h", "4090h",
"4393h", "4412h", "4528h", "4582h", "4683h", "4735h", "4736h",
"4745h", "4750h", "4756h", "4770h", "4771h", "4832h", "4872h",
"517n", "6292h", "6309h", "6481h", "6601h", "6704h", "6948h",
"7020h", "7030h", "7071h", "7160h", "7188h", "7235h", "7266h",
"7348h", "7746h", "7810h", "8082h", "8119h", "8334h", "8345h",
"8462h", "8486h", "8518h", "8578h", "8761h", "8799h", "8939h",
"9046h", "9191h", "9194h", "9222h", "9273h", "9293h", "9448h",
"9486h", "9757h", "9894h", "10268h", "10431h", "10498h", "10572h",
"10622h", "10652h", "10660h", "14457h", "2420n", "2966n", "3006n",
"3766h", "4219h", "4256h", "4366h", "4367h", "4534h", "4538h",
"4543h", "4569h", "4570h", "4757h", "4769h", "4806h", "4843h",
"4955h", "4958h", "50n", "601h", "603n", "6315h", "6340h", "6348h",
"6358h", "6369h", "6379h", "6395h"), ID = c(1, 5, 6, 13, 20,
28, 32, 36, 44, 48, 55, 69, 72, 80, 92, 107, 114, 115, 116, 117,
118, 124, 125, 131, 135, 154, 158, 160, 179, 185, 193, 214, 218,
220, 223, 232, 236, 240, 242, 248, 285, 288, 308, 313, 330, 332,
341, 345, 350, 354, 369, 372, 379, 389, 403, 404, 405, 412, 413,
421, 425, 445, 456, 2, 3, 7, 11, 14, 17, 18, 23, 30, 34, 35,
50, 59, 61, 66, 67, 83, 85, 87, 90, 91, 119, 123, 127, 133, 148,
149, 153, 156, 157, 162, 165, 166, 167, 169, 170, 173), Class = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), age = c(14, 17,
14, 14, 15, 14, 16, 20, 12, 16, 12, 15, 15, 12, 16, 17, 14, 14,
13, 13, 14, 14, 23, 12, 15, 15, 14, 13, 17, 22, 15, 17, 22, 14,
15, 15, 23, 15, 17, 12, 24, 15, 13, 13, 14, 17, 13, 21, 14, 14,
15, 13, 21, 14, 21, 15, 15, 14, 16, 13, 12, 12, 12, 14, 17, 16,
16, 15, 15, 13, 14, 20, 24, 15, 15, 14, 17, 14, 16, 15, 15, 17,
14, 15, 13, 19, 19, 14, 16, 16, 22, 21, 23, 19, 15, 15, 14, 14,
15, 24), sex = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
1), bhsMean = c(0.47, 0.3, 0.16, 0.15, 0.35, 0.06, 0.25, 0.35,
0.15, 0.35, 0.3, 0, 0.3, 0.38, 0.3, 0.1, 0.2, 0.1, 0.25, 0.2,
0.3, 0.4, 0.3, 0.4, 0.2, 0.2, 0.05, 0.1, 0.35, 0.1, 0, 0.25,
0.2, 0.25, 0.05, 0.35, 0.3, 0.25, 0.2, 0.27, 0.35, 0.15, 0.25,
0.1, 0.2, 0.25, 0.05, 0.1, 0.45, 0.3, 0.36, 0.3, 0.44, 0.15,
0.2, 0.11, 0.25, 0.2, 0.05, 0.45, 0, 0.4, 0.25, 0.6, 0.6, 0.55,
0.71, 0.67, 0.5, 0.5, 0.55, 0.68, 0.55, 0.4, 0.68, 0.5, 0.6,
0.53, 0.6, 0.65, 0.53, 0.65, 0.65, 0.65, 0.6, 0.55, 0.5, 0.55,
0.6, 0.75, 0.65, 0.45, 0.5, 0.5, 0.65, 0.45, 0.6, 0.65, 0.65,
0.45), tbMean = c(2.56, 3.89, 2.67, 2.33, 4.89, 1.44, 2.44, 2.44,
NA, NA, NA, NA, NA, 3.44, 1.22, 3.11, 4, 4.11, 3, 2, 2.78, 2.67,
3.44, 3.33, 3.33, 3.78, 3.89, 2.11, 4.56, 4, 1, 3.22, 3.33, 2.89,
1.44, 3.11, 2.67, 3.33, 3.44, 1.33, 2.78, 2.67, 3.33, 2, 2.44,
3.89, 2.44, 3.78, 3.67, 3.56, 3.56, 3.78, 1.78, 2.11, 3.33, 3.11,
2.67, 2.44, 3.56, 1.67, NA, 2.67, 4.44, 4.89, 4.56, 3.89, 4.44,
4.11, 3.67, 3.44, 4.44, 5, 3.78, 4.78, NA, NA, NA, NA, NA, 3.44,
4, 4.56, 4.11, 4, 3.78, 5.11, 3.56, 2.89, 3.11, 3.11, 4.33, 3.56,
5.11, 3.33, 4.11, 4.44, 4.67, 4, 4.56, 4.67), pbMean = c(2, 3.67,
4, 4.5, 2.17, 1, 3.5, 2.33, NA, NA, NA, NA, NA, 1.5, 3.67, 3,
3.5, 2.5, 2.17, 2, 1, 3.67, 2.33, 1.67, 2, 2, 3.17, 2.17, 1,
3.83, 1, 2.33, 2.67, 3, 1, 3.33, 2, 3, 1.83, 1.17, 1, 2, 2.33,
2.17, 2.17, 2.83, 2.67, 2.67, 1, 2.17, 1.67, 3.33, 1.33, 2.17,
2.17, 1.17, 2.33, 1.83, 2.17, 1, NA, 1.5, 1.2, 3.17, 4.67, 1.33,
2.83, 2.67, 2, 4.33, 3, 3, 5, 3.33, NA, NA, NA, NA, NA, 4.5,
1.5, 4, 5.17, 3.33, 3.33, 3.67, 4.5, 2, 3.17, 3.67, 4.83, 4.33,
3.67, 3.83, 5.17, 3, 2.33, 2.33, 4, 1.33), acssMean = c(2.29,
1.86, 1.14, 2, 1.14, NA, 2, 3.29, NA, NA, NA, NA, NA, NA, 1.57,
2.33, 3.43, 0.14, 1.43, 1.57, 2.29, 1.29, 0.29, 1.43, 0.57, 0.43,
2.29, NA, 2.57, 1.71, 2.43, 1.43, 2.71, 2.29, 2.29, 1.86, 0.86,
3.71, 1.57, NA, 2.29, 1, 2.71, 2, 0, 1.43, 2.71, NA, NA, NA,
1.86, NA, 1.83, 2, 3.43, 0, 3.43, 0.86, NA, NA, NA, 2.14, NA,
3.43, 4, 3.14, 3.29, 2.83, 1.71, 1.86, 2.14, 1.33, 1.71, 1.57,
NA, NA, NA, NA, NA, 2.71, 1.29, 3.57, 2.29, 0.14, 1.71, 0.14,
2.86, 2.71, 1.43, 1.71, 0.86, 2.33, 2.43, 1.71, 2.57, 1.14, 3.43,
2.86, 3.57, 1.86), mssi1_1 = c(NA, 0, 0, 1, 1, 0, 2, 2, 0, 0,
0, 0, 0, NA, 0, 1, 0, 0, 0, 0, 0, 1, NA, 0, 0, 1, 1, 1, 2, 1,
0, 0, NA, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 1, 1, 2, 1,
1, 1, NA, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 1, 1, 2,
0, 1, 1, 1, 0, 0, 0, 1, 1), mssi1_2 = c(NA, 1, 0, 1, 1, 0, 1,
2, 1, 1, 0, 0, 2, NA, 1, 1, 0, 0, 0, 0, 0, 1, NA, 0, 0, 2, 1,
0, 2, 1, 0, 1, NA, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 2, 1,
1, 1, 0, 1, 1, NA, 0, 0, 0, 0, 2, 1, 1, 1, 2, 0, 1, 0, 1, 1,
0, 0, 2, 0, 1, 1, 1, 0, 0, 0, 1, 1), mssi1_3 = c(NA, 0, 0, 0,
0, 0, 2, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0,
0, 0, 0, 0, 2, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), mssi1_4 = c(NA,
0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0,
2, NA, 0, 0, 0, 0, 0, 1, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, NA, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), mssi1_5 = c(NA,
NA, NA, NA, NA, NA, 2, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 1, NA, NA, NA, 1, NA, NA, 1, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, 0, NA, NA,
NA, 0, NA, NA, 1, 2, NA, NA, NA, NA, NA, NA, NA, 3, NA, NA, NA,
2, NA, NA, NA, NA, NA, NA, NA, 2, NA, 1, NA, 2, NA, NA, NA, NA,
NA), mssi1_6 = c(NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA,
0, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, 1, NA, NA,
1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA,
NA, NA, NA, NA, NA, NA, 1, NA, NA, 1, 0, NA, NA, NA, NA, NA,
NA, NA, 3, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, 2, NA,
1, NA, 0, NA, NA, NA, NA, NA), mssi1_7 = c(NA, NA, NA, NA, NA,
NA, 2, 1, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA,
2, NA, NA, NA, 2, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 2, 0, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, 2, NA, NA,
2, 1, NA, NA, NA, NA, NA, NA, NA, 2, NA, NA, NA, 1, NA, NA, NA,
NA, NA, NA, NA, 2, NA, 1, NA, 2, NA, NA, NA, NA, NA), mssi1_8 = c(NA,
NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, 0, NA, NA, NA, NA,
NA, NA, NA, NA, 0, NA, NA, NA, 1, NA, NA, 2, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 1, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA,
NA, 3, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA,
1, NA, NA, NA, NA, NA, NA, NA, 0, NA, 2, NA, 1, NA, NA, NA, NA,
NA), mssi1_9 = c(NA, NA, NA, NA, NA, NA, 2, 2, NA, NA, NA, NA,
2, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, 1, NA, NA,
2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA,
NA, NA, NA, NA, NA, NA, 3, NA, NA, 1, 1, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, 2, NA,
1, NA, 1, NA, NA, NA, NA, NA), mssi1_10 = c(NA, NA, NA, NA, NA,
NA, 2, 0, NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA,
0, NA, NA, NA, 1, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0, 0, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA,
1, 2, NA, NA, NA, NA, NA, NA, NA, 2, NA, NA, NA, 2, NA, NA, NA,
NA, NA, NA, NA, 0, NA, NA, NA, 1, NA, NA, NA, NA, NA), mssi1_11 = c(NA,
NA, NA, NA, NA, NA, 3, 1, NA, NA, NA, NA, 2, NA, NA, NA, NA,
NA, NA, NA, NA, 0, NA, NA, NA, 0, NA, NA, 2, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA,
NA, 0, NA, NA, 1, 2, NA, NA, NA, NA, NA, NA, NA, 2, NA, NA, NA,
0, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, 1, NA, NA, NA,
NA, NA), mssi1_12 = c(NA, NA, NA, NA, NA, NA, 1, 2, NA, NA, NA,
NA, 3, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, 0, NA,
NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0,
NA, NA, NA, NA, NA, NA, NA, 2, NA, NA, 1, 1, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, 1,
NA, NA, NA, 0, NA, NA, NA, NA, NA), mssi1_13 = c(NA, NA, NA,
NA, NA, NA, 1, 3, NA, NA, NA, NA, 3, NA, NA, NA, NA, NA, NA,
NA, NA, 0, NA, NA, NA, 1, NA, NA, 2, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 1, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, 3,
NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1,
NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, 0, NA, NA, NA, NA,
NA), mssi1_14 = c(NA, NA, NA, NA, NA, NA, 1, 0, NA, NA, NA, NA,
0, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, 1, NA, NA,
1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA,
NA, NA, NA, NA, NA, NA, 1, NA, NA, 0, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, 1, NA,
0, NA, 0, NA, NA, NA, NA, NA), mssi1_15 = c(NA, NA, NA, NA, NA,
NA, 0, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA,
1, NA, NA, NA, 0, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0, 0, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA,
0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA,
NA, NA, NA, NA, NA, 0, NA, 0, NA, 0, NA, NA, NA, NA, NA), mssi1_16 = c(NA,
NA, NA, NA, NA, NA, 0, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA,
NA, NA, NA, NA, 0, NA, NA, NA, 0, NA, NA, 0, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA,
NA, 0, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0, NA, NA, NA, NA, NA, NA, NA, 0, NA, 0, NA, 0, NA, NA, NA,
NA, NA), mssi1_17 = c(NA, NA, NA, NA, NA, NA, 0, 0, NA, NA, NA,
NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, 3, NA,
NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0,
NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, 0, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, 0,
NA, 0, NA, 0, NA, NA, NA, NA, NA), mssi1_18 = c(NA, NA, NA, NA,
NA, NA, 0, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA,
NA, 0, NA, NA, NA, 0, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, 0, NA,
NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, 0, NA,
NA, NA, NA, NA, NA, NA, 0, NA, 0, NA, 0, NA, NA, NA, NA, NA)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
我希望这个方法能够应用于任意数量的变量,但我不知道从哪里开始。我是否应该先为各组变量定义范围,例如:
# Column range for the first group of items (mssi1_1 through mssi1_4).
# NOTE(review): vars() is presumably dplyr::vars(), which requires dplyr to be
# loaded before these lines run - confirm.
case1 <- vars(mssi1_1:mssi1_4)
# Column range for the second group of items (mssi1_5 through mssi1_18).
case2 <- vars(mssi1_5:mssi1_18)
同样,我真的不知道从哪里开始。如果有任何表述不清之处,我先在此致歉。如果您需要更多信息,请告诉我。
非常感谢任何帮助,先谢谢大家。
答案 0 :(得分:2)
希望这有帮助!
library(dplyr)
# Add missing_perc: for every row, the percentage (0-100) of the items
# mssi1_1 through mssi1_4 that are NA.
df2 %>%
  mutate(
    # `.` is the data frame piped in by %>%; the inner pipeline turns the four
    # item columns into a logical NA matrix and averages it row by row.
    missing_perc = select(., mssi1_1:mssi1_4) %>%
      is.na() %>%
      rowMeans() * 100
  )
输出为:
uci ID Class age sex bhsMean tbMean pbMean acssMean mssi1_1 mssi1_2 mssi1_3 mssi1_4 missing_perc
1 10001h 1.00 1.00 14.0 0 0.470 2.56 2.00 2.29 NA NA NA NA 100
2 10476h 5.00 1.00 17.0 0 0.300 3.89 3.67 1.86 NA NA 0 0 50.0
3 10484h 6.00 1.00 14.0 0 0.160 2.67 4.00 1.14 0 0 0 0 0
4 10580h 13.0 1.00 14.0 0 0.150 2.33 4.50 2.00 1.00 1.00 0 0 0
5 14280h 20.0 1.00 15.0 0 0.350 4.89 2.17 1.14 1.00 1.00 0 0 0
6 2313n 28.0 1.00 14.0 0 0.0600 1.44 1.00 NA 0 0 0 0 0
示例数据:
# Six-row sample tibble holding the key/summary columns plus the items
# mssi1_1 through mssi1_4 only. Row 1 is NA on all four items and row 2 on two
# of them, so the per-row NA share over these items is 100% and 50%.
# Note: this assignment replaces any df2 that already exists in the session.
df2 <- structure(list(uci = c("10001h", "10476h", "10484h", "10580h",
"14280h", "2313n"), ID = c(1, 5, 6, 13, 20, 28), Class = c(1,
1, 1, 1, 1, 1), age = c(14, 17, 14, 14, 15, 14), sex = c(0, 0,
0, 0, 0, 0), bhsMean = c(0.47, 0.3, 0.16, 0.15, 0.35, 0.06),
tbMean = c(2.56, 3.89, 2.67, 2.33, 4.89, 1.44), pbMean = c(2,
3.67, 4, 4.5, 2.17, 1), acssMean = c(2.29, 1.86, 1.14, 2,
1.14, NA), mssi1_1 = c(NA, NA, 0, 1, 1, 0), mssi1_2 = c(NA,
NA, 0, 1, 1, 0), mssi1_3 = c(NA, 0, 0, 0, 0, 0), mssi1_4 = c(NA,
0, 0, 0, 0, 0)), .Names = c("uci", "ID", "Class", "age",
"sex", "bhsMean", "tbMean", "pbMean", "acssMean", "mssi1_1",
"mssi1_2", "mssi1_3", "mssi1_4"), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
答案 1 :(得分:1)
我喜欢使用 tidyverse 的方法。
使用您的数据集:
# library() stops with an error if the package is missing; require() would
# only return FALSE and let the script fail later with a confusing message.
library(tidyverse)
# Per-variable missingness: keep only the variables whose share of NA values
# is at most 20%. The result stays in long format, one row per
# uci / ID / variable combination.
# Note: gather() is superseded by pivot_longer() in tidyr >= 1.0; it is kept
# here so the snippet also runs on older tidyr versions.
df2 %>%
  # Make it tidy (long format). uci and ID are assumed to be the row keys.
  gather(variable, value, -uci, -ID) %>%
  # Compute the statistics below separately for each original column.
  group_by(variable) %>%
  # Missing / Valid: number of NA and non-NA values of the variable.
  # percentMissing: proportion (0-1) of NA values of the variable.
  mutate(Missing = sum(is.na(value)),
         Valid = sum(!is.na(value)),
         percentMissing = Missing / (Missing + Valid)) %>%
  # Drop the grouping so later steps do not silently operate per variable.
  ungroup() %>%
  # Filter OUT variables with more than 20% missing. A variable with exactly
  # 20% missing is not "more than 20%", so it is kept: <= rather than <.
  filter(percentMissing <= 0.2)
如果您愿意,可以使用 spread 转换回宽格式。
下面是同一脚本,新增了 select(用于删除新建的变量)和 spread 两个步骤:
# Per-variable missingness filter that returns the data in wide format:
# columns with more than 20% NA values are removed, all others are kept.
# Note: gather()/spread() are superseded by pivot_longer()/pivot_wider() in
# tidyr >= 1.0; they are kept here so the snippet runs on older tidyr versions.
df2 %>%
  # Make it tidy (long format). uci and ID are assumed to be the row keys.
  gather(variable, value, -uci, -ID) %>%
  # Compute the statistics below separately for each original column.
  group_by(variable) %>%
  # Missing / Valid: number of NA and non-NA values of the variable.
  # percentMissing: proportion (0-1) of NA values of the variable.
  mutate(Missing = sum(is.na(value)),
         Valid = sum(!is.na(value)),
         percentMissing = Missing / (Missing + Valid)) %>%
  # Drop the grouping before reshaping so select()/spread() work on plain rows.
  ungroup() %>%
  # Filter OUT variables with more than 20% missing. A variable with exactly
  # 20% missing is not "more than 20%", so it is kept: <= rather than <.
  filter(percentMissing <= 0.2) %>%
  # Remove the helper columns; otherwise they would act as extra key columns
  # when spreading back to wide format.
  select(-Missing, -Valid, -percentMissing) %>%
  # Back to wide format: one column per remaining variable.
  spread(variable, value)