我需要做一件很简单的事情:遍历数据集的列,为每一列创建对应的百分位数版本。我尝试了 dplyr 和 data.table,但似乎都不能满足我的需求。特别是,在创建列的百分位数版本时,我需要排除 NA 值。
下面是可复现的示例:
# First sample column: 90 observations, four of them missing (NA).
values <- c(
  19, 6, 27, 63, 50, 59, 97, 89, NA, 9,
  31, 58, 83, 2, 1, 31, 3, 1, 27, 40,
  32, 42, 99, NA, 12, 16, 23, 98, 44, 25,
  13, 70, 64, NA, 37, 75, 73, 59, 21, 3,
  76, 43, 6, 96, 55, 48, 70, 90, 18, 58,
  22, 19, 26, 49, 59, 94, 31, 45, 20, 8,
  26, 56, 7, 11, 98, 50, 41, 38, 86, 0,
  37, NA, 40, 7, 88, 38, 41, 41, 19, 34,
  21, 64, 87, 22, 54, 39, 75, 72, 91, 78
)
# Second sample column: 90 observations, eleven of them missing (NA).
values2 <- c(
  98, 60, 9, 98, NA, 88, NA, 54, 92, 90,
  NA, 83, 92, 65, 44, NA, 98, 40, 26, 40,
  54, 56, 15, 90, 15, 63, 57, NA, 85, 69,
  73, 43, 24, 27, 82, 75, 29, 98, 29, 5,
  91, 88, 28, 12, 53, NA, 2, 42, 86, 2,
  78, 20, 50, 73, 77, NA, 4, 39, 90, NA,
  29, 14, 98, 88, 77, 79, 30, 9, 74, 93,
  NA, 16, 27, 16, 18, 40, NA, 2, 66, 71,
  82, 10, 62, 84, 25, NA, 15, 12, 85, 50
)
# Group label per row: the first 28 rows are group 1, the remaining 62 are
# group 2 (90 rows in total).
groups <- rep(c(1, 2), times = c(28, 62))
# Assemble the example data set: one grouping column and two value columns,
# all numeric.
df <- data.frame(groups = groups, values = values, values2 = values2)
library(dplyr)
# dplyr attempt 1: add a per-group `<column>_percentile` for each value column.
#
# The loop variable holds the column NAME (a string), so the column has to be
# looked up with `.data[[...]]`; ranking the string itself yields NaN for every
# row. `percent_rank()` already leaves NA inputs as NA and excludes them from
# the denominator, so no `na.omit()` is needed (it would also shorten the
# vector, which `mutate()` cannot accept).
for (col_name in c("values", "values2")) {
  df <- df %>%
    group_by(groups) %>%
    mutate(
      !!sym(paste0(col_name, "_percentile")) := percent_rank(.data[[col_name]])
    ) %>%
    # Drop the grouping so later steps do not silently operate per group.
    ungroup()
}
# dplyr attempt 2: per-group rank divided by the number of non-missing values.
#
# The loop variable is a column NAME, so the data is fetched via `.data[[...]]`
# (ranking the string itself always gives 1 / 1 = 1). `na.last = "keep"` makes
# NA inputs produce NA ranks, and the denominator counts only non-NA values so
# missing observations do not deflate the percentiles.
for (col_name in c("values", "values2")) {
  df <- df %>%
    group_by(groups) %>%
    mutate(
      !!sym(paste0(col_name, "_percentile")) :=
        rank(.data[[col_name]], na.last = "keep") /
          sum(!is.na(.data[[col_name]]))
    ) %>%
    # Drop the grouping so later steps do not silently operate per group.
    ungroup()
}
library(data.table)
# data.table attempt 1: per-group rank / count, added by reference.
df <- as.data.table(df)
for (col_name in c("values", "values2")) {
  new_col <- paste0(col_name, "_percentile")
  # `rank()` defaults to na.last = TRUE, which would rank NA values last and
  # give them a percentile; "keep" returns NA for them instead. The denominator
  # counts only non-missing values in the group rather than the full length.
  df[
    ,
    (new_col) := rank(get(col_name), na.last = "keep") /
      sum(!is.na(get(col_name))),
    by = groups
  ]
}
# data.table attempt 2: compute the percentile only on rows where the source
# column is not missing.
for (col_name in c("values", "values2")) {
  new_col <- paste0(col_name, "_percentile")
  # Reset the target first so rows with a missing source value end up NA
  # instead of keeping whatever an earlier attempt wrote there.
  df[, (new_col) := NA_real_]
  # The row filter must test the COLUMN (`get(col_name)`), not the name string:
  # `!is.na("values")` is always TRUE and would filter nothing. With NA rows
  # excluded, `.N` is the number of non-missing values in each group.
  df[
    !is.na(get(col_name)),
    (new_col) := rank(get(col_name)) / .N,
    by = groups
  ]
}
答案 0 :(得分:1)
一个选项是 mutate_at。按 groups 分组后,使用 mutate_at 遍历列名以 "values" 开头的列(starts_with("values")),并用 replace 将其中不是 NA 的值替换为这些非 NA 元素的 percent_rank。dplyr 的写法如下:
library(dplyr)
# For every column whose name starts with "values", add a per-group
# `<column>_percentile` column: non-missing entries are replaced by their
# percent rank among the non-missing entries, missing entries stay NA.
df %>%
  group_by(groups) %>%
  mutate_at(
    vars(starts_with("values")),
    list(percentile = function(x) {
      observed <- !is.na(x)
      replace(x, observed, percent_rank(x[observed]))
    })
  )
也可以用 data.table 实现同样的思路。
答案 1 :(得分:0)
我的 tidyverse 答案与 @akrun 的结构相同:使用 mutate_at 添加多个列,使用 starts_with 选择列。通过一个最小的例子,有几点需要指出:
- percent_rank 函数在计算时已经忽略了 NA,因此您无需做额外的工作将其从计算中滤除。
- percent_rank 在此处可能返回 NaN 值,因为它是对 min_rank 进行缩放得到的。直接使用 mutate_at 时,似乎可以避免该问题。(尚不清楚在您的情况下应分配什么值。)
- percent_rank 的值不是 1.0。