我在csv文件中有以下数据:
Date Model Color Value Samples
6/19/2017 Gold Blue 0.5 500
6/19/2017 Gold Red 0.0 449
6/19/2017 Silver Blue 0.75 1320
6/19/2017 Silver Blue 1.5 103
6/19/2017 Gold Red 0.7 891
6/19/2017 Gold Blue 0.41 18103
6/19/2017 Copper Blue 0.83 564
6/19/2017 Silver Pink 1.17 173
6/19/2017 Platinum Brown 0.43 793
6/19/2017 Platinum Red 0.71 1763
6/19/2017 Gold Orange 1.92 503
我使用fread
函数创建data.table:
library(dplyr)
library(data.table)
# Load the CSV into a data.table: comma-separated, first row is the header,
# and fill = TRUE lets fread cope with rows that have fewer fields.
df <- fread(
  "test_data.csv",
  sep = ",",
  header = TRUE,
  fill = TRUE
)
然后按Model
对数据进行子集化,如下所示:
# Keep only the Gold rows whose Value is strictly positive.
df_subset <- subset(df, Model == "Gold" & Value > 0)
然后,我根据Color
变量创建一些百分位数,如下所示:
# Per-Color summary of the subset: total sample count plus the requested
# percentiles of Value. The column is named `Value` in the data (there is no
# `AvgValue` column), so the quantiles must be taken over `Value`.
df_subset[, .(Samples = sum(Samples),
              '50th' = quantile(Value, probs = c(0.50)),
              '99th' = quantile(Value, probs = c(0.99)),
              '99.9th' = quantile(Value, probs = c(0.999)),
              '99.99th' = quantile(Value, probs = c(0.9999))),
          by = Color]
其中给出了以下输出:
Color Samples 50th 99th 99.9th 99.99th
1: Blue 18603 0.455 0.4991 0.49991 0.499991
2: Red 1340 0.975 1.2445 1.24945 1.249945
3: Orange 503 1.920 1.9200 1.92000 1.920000
我尝试遍历 Model 值的列表,并输出每个 Model 值对应的百分位数。
我尝试了以下代码(失败):
# Collect the distinct Model values to iterate over.
models <- unique(df$Model)
for (model in models){
# NOTE(review): `df$model` looks up a column literally named "model"; it does
# not use the loop variable `model`, and no filtering by model happens here.
# The data only has a column named "Model", so this presumably evaluates to
# NULL; `[` is then no longer data.table's method and `.()` is evaluated as an
# ordinary function call, which matches the reported
# 'could not find function "."' error.
df$model[, .(Samples = sum(Samples),
'50th' = quantile(Value, probs = c(0.50)),
'99th' = quantile(Value, probs = c(0.99)),
'99.9th' = quantile(Value, probs = c(0.999)),
'99.99th' = quantile(Value, probs = c(0.9999))),
by = Color]
}
错误消息是:
Error in .(Samples = sum(Samples), `50th` = quantile(Value, probs = c(0.5)), : could not find function "."
答案 0 :(得分:2)
这可能会解决您的问题
library(dplyr)
# Drop the first column (Date), keep rows with a positive Value, and compute
# the 50th/99th/99.9th/99.99th percentiles of Value per Model/Color group.
df[, -1] %>%
  filter(Value > 0) %>%
  group_by(Model, Color) %>%
  do(
    data.frame(
      t(quantile(.$Value, probs = c(0.50, 0.99, 0.999, 0.9999)))
    )
  )
关于您在评论中提出的如何把样本总和拼接到结果中的问题:可以使用 aggregate;
我没有使用 dplyr::summarise,是因为那样就必须在应用 do 之后另起一条新的管道,这样做没有意义。
# Keep only rows with a positive Value. Both the quantiles and the sample
# totals below are computed from this same subset, so the Samples column
# describes exactly the rows the quantiles were taken over.
df_pos <- df %>% filter(Value > 0)

data.frame(
  # Percentiles of Value for each Model/Color group.
  df_pos %>%
    select(-Date) %>%
    group_by(Model, Color) %>%
    do(data.frame(t(quantile(.$Value, probs = c(0.50, 0.99, 0.999, 0.9999))))),
  # Total Samples per group. The columns are glued together by position, so
  # this relies on aggregate() returning groups ordered by Model, then Color,
  # i.e. the same order group_by(Model, Color) produces.
  aggregate(Samples ~ Color + Model, df_pos, sum)["Samples"]
)
# Model Color X50. X99. X99.9. X99.99. Samples
# 1 Copper Blue 0.830 0.8300 0.83000 0.830000 564
# 2 Gold Blue 0.455 0.4991 0.49991 0.499991 18603
# 3 Gold Orange 1.920 1.9200 1.92000 1.920000 503
# 4 Gold Red 0.700 0.7000 0.70000 0.700000 891
# 5 Platinum Brown 0.430 0.4300 0.43000 0.430000 793
# 6 Platinum Red 0.710 0.7100 0.71000 0.710000 1763
# 7 Silver Blue 1.125 1.4925 1.49925 1.499925 1423
# 8 Silver Pink 1.170 1.1700 1.17000 1.170000 173
**数据:**
# Reproducible copy of the example data: 11 rows, with Date/Model/Color stored
# as factors, Value as double and Samples as integer.
df <- data.frame(
  Date = factor(rep("6/19/2017", 11L)),
  Model = factor(
    c("Gold", "Gold", "Silver", "Silver", "Gold", "Gold", "Copper",
      "Silver", "Platinum", "Platinum", "Gold"),
    levels = c("Copper", "Gold", "Platinum", "Silver")
  ),
  Color = factor(
    c("Blue", "Red", "Blue", "Blue", "Red", "Blue", "Blue",
      "Pink", "Brown", "Red", "Orange"),
    levels = c("Blue", "Brown", "Orange", "Pink", "Red")
  ),
  Value = c(0.5, 0, 0.75, 1.5, 0.7, 0.41, 0.83, 1.17, 0.43, 0.71, 1.92),
  Samples = c(500L, 449L, 1320L, 103L, 891L, 18103L, 564L, 173L, 793L,
              1763L, 503L)
)
答案 1 :(得分:2)
使用您的定义,您可以尝试以下方法:
library(data.table)
# Read the data and make sure Value is numeric before filtering on it and
# taking quantiles.
df <- fread("~/theData.csv")
df$Value <- as.numeric(df$Value)

# Evaluate the distinct models once instead of on every loop iteration.
models <- unique(df$Model)

# Build one summary table per model (rows with Value > 0 only), grouped by
# Color, and tag each table with the model it belongs to.
per_model <- lapply(models, function(current_model) {
  stats <- df[Model == current_model & Value > 0,
              .(Samples = sum(Samples),
                `50th` = quantile(Value, probs = c(0.50)),
                `99th` = quantile(Value, probs = c(0.99)),
                `99.9th` = quantile(Value, probs = c(0.999)),
                `99.99th` = quantile(Value, probs = c(0.9999))),
              by = Color]
  stats[, model := current_model]
  stats
})

# Bind all per-model tables in a single step rather than growing `result`
# with rbind() inside a loop, which copies the accumulated rows every time.
result <- rbindlist(per_model)
rm(per_model)
答案 2 :(得分:2)
fread 创建的是 data.table 对象而不是数据框,因此我建议坚持使用 data.table 语法,而不是将其与 dplyr 混用。
这里不需要 for 循环:在 by 参数中传入由两个变量组成的列表,一行代码即可同时按模型和颜色分组计算:
# One grouped aggregation over rows with a positive Value: total Samples and
# the requested percentiles of Value for every Model/Color combination.
qs <- df[
  Value > 0,
  .(
    Samples = sum(Samples),
    `50th` = quantile(Value, probs = 0.50),
    `99th` = quantile(Value, probs = 0.99),
    `99.9th` = quantile(Value, probs = 0.999),
    `99.99th` = quantile(Value, probs = 0.9999)
  ),
  by = .(Model, Color)
]
# Key (and thereby sort) the result by Model.
setkeyv(qs, "Model")
# Model Color Samples 50th 99th 99.9th 99.99th
# 1: Copper Blue 564 0.830 0.8300 0.83000 0.830000
# 2: Gold Blue 18603 0.455 0.4991 0.49991 0.499991
# 3: Gold Red 891 0.700 0.7000 0.70000 0.700000
# 4: Gold Orange 503 1.920 1.9200 1.92000 1.920000
# 5: Platinum Brown 793 0.430 0.4300 0.43000 0.430000
# 6: Platinum Red 1763 0.710 0.7100 0.71000 0.710000
# 7: Silver Blue 1423 1.125 1.4925 1.49925 1.499925
# 8: Silver Pink 173 1.170 1.1700 1.17000 1.170000