Question

我在csv文件中有以下数据：

Date        Model      Color    Value   Samples
6/19/2017   Gold       Blue     0.5     500
6/19/2017   Gold       Red      0.0     449
6/19/2017   Silver     Blue     0.75    1320
6/19/2017   Silver     Blue     1.5     103
6/19/2017   Gold       Red      0.7     891
6/19/2017   Gold       Blue     0.41    18103
6/19/2017   Copper     Blue     0.83    564
6/19/2017   Silver     Pink     1.17    173
6/19/2017   Platinum   Brown    0.43    793
6/19/2017   Platinum   Red      0.71    1763
6/19/2017   Gold       Orange   1.92    503

我使用fread函数创建data.table：

library(dplyr)
library(data.table)

# Read the comma-separated file into a data.table; the first row holds the
# column names and short rows are padded rather than rejected.
df <- fread(
  "test_data.csv",
  sep = ",",
  header = TRUE,
  fill = TRUE
)

然后按Model对数据进行子集化，如下所示：

# Keep only the Gold rows whose Value is strictly positive.
df_subset <- subset(df, Model == "Gold" & Value > 0)

然后，我根据Color变量创建一些百分位数，如下所示：

# Per-colour summary of the subset: total Samples plus selected quantiles of
# the `Value` column (the data shown has no `AvgValue` column).
df_subset[, .(Samples   = sum(Samples),
              `50th`    = quantile(Value, probs = 0.50),
              `99th`    = quantile(Value, probs = 0.99),
              `99.9th`  = quantile(Value, probs = 0.999),
              `99.99th` = quantile(Value, probs = 0.9999)),
          by = Color]

其中给出了以下输出：

    Color Samples  50th   99th  99.9th  99.99th
1:   Blue   18603 0.455 0.4991 0.49991 0.499991
2:    Red    1340 0.975 1.2445 1.24945 1.249945
3: Orange     503 1.920 1.9200 1.92000 1.920000

我尝试遍历Model值列表并输出每个Model值的关联百分位值。

我尝试了以下（失败）：

# Distinct model names; the loop below visits each one in turn.
models <- unique(df$Model)

for (model in models){

  # NOTE(review): `df$model` looks up a column literally named "model"; it does
  # not use the loop variable, and the data shown only has a column "Model".
  # Presumably the lookup yields NULL, so `[` is no longer data.table's method
  # and `.()` is evaluated as an ordinary function call -- consistent with the
  # 'could not find function "."' error reported for this snippet.
  df$model[, .(Samples = sum(Samples),
                '50th'    = quantile(Value, probs = c(0.50)),
                '99th'    = quantile(Value, probs = c(0.99)),
                '99.9th'  = quantile(Value, probs = c(0.999)), 
                '99.99th' = quantile(Value, probs = c(0.9999))),
            by = Color]
}

错误消息是：

Error in .(Samples = sum(Samples), `50th` = quantile(Value, probs = c(0.5)),  :  could not find function "."

Answer 1

这可能会解决您的问题

library(dplyr)

# Drop the first column, keep rows with a positive Value, then compute the
# requested quantiles of Value for every Model/Color pair (one row per pair).
df[, -1] %>%
  filter(Value > 0) %>%
  group_by(Model, Color) %>%
  do(data.frame(t(quantile(.$Value, probs = c(0.50, 0.99, 0.999, 0.9999)))))

关于您在评论中的问题，关于如何连接样本总和：您可以使用aggregate;我没有使用dplyr::summarise的原因是我需要在应用do之后开始一个新的管道，这是没有意义的。

# Quantiles and sample totals must describe the same rows, so filter once and
# feed the filtered data to both the quantile step and aggregate(). Summing
# Samples over the unfiltered data would count rows (Value == 0) that the
# quantiles ignore.
positive <- df %>%
  filter(Value > 0) %>%
  select(-Date)

data.frame(
  positive %>%
    group_by(Model, Color) %>%
    do(data.frame(t(quantile(.$Value, probs = c(0.50, 0.99, 0.999, 0.9999))))),
  # aggregate() lists Color within Model, the same order as the grouped result
  # above, so the Samples column lines up with it row by row.
  aggregate(Samples ~ Color + Model, positive, sum)["Samples"]
)

#      Model  Color  X50.   X99.  X99.9.  X99.99. Samples
# 1   Copper   Blue 0.830 0.8300 0.83000 0.830000     564
# 2     Gold   Blue 0.455 0.4991 0.49991 0.499991   18603
# 3     Gold Orange 1.920 1.9200 1.92000 1.920000     503
# 4     Gold    Red 0.700 0.7000 0.70000 0.700000     891
# 5 Platinum  Brown 0.430 0.4300 0.43000 0.430000     793
# 6 Platinum    Red 0.710 0.7100 0.71000 0.710000    1763
# 7   Silver   Blue 1.125 1.4925 1.49925 1.499925    1423
# 8   Silver   Pink 1.170 1.1700 1.17000 1.170000     173

数据：

# Reproducible copy of the question's data: Date, Model and Color are factors
# with explicitly listed levels, Value is double and Samples is integer.
df <- data.frame(
  Date = factor(rep("6/19/2017", 11L), levels = "6/19/2017"),
  Model = factor(
    c("Gold", "Gold", "Silver", "Silver", "Gold", "Gold",
      "Copper", "Silver", "Platinum", "Platinum", "Gold"),
    levels = c("Copper", "Gold", "Platinum", "Silver")
  ),
  Color = factor(
    c("Blue", "Red", "Blue", "Blue", "Red", "Blue",
      "Blue", "Pink", "Brown", "Red", "Orange"),
    levels = c("Blue", "Brown", "Orange", "Pink", "Red")
  ),
  Value = c(0.5, 0, 0.75, 1.5, 0.7, 0.41, 0.83, 1.17, 0.43, 0.71, 1.92),
  Samples = c(500L, 449L, 1320L, 103L, 891L, 18103L,
              564L, 173L, 793L, 1763L, 503L)
)

Answer 2

使用您的定义，您可以尝试以下方法：

library(data.table)
df<-fread("~/theData.csv")
df$Value<-as.numeric(df$Value)
# Summarise each model separately, collecting the per-model tables in a
# preallocated list and stacking them once at the end. Growing `result` with
# rbind() on every iteration re-copies everything accumulated so far.
models <- unique(df$Model)
pieces <- vector("list", length(models))
for (i in seq_along(models)) {
  # Rows of the current model with a positive Value, summarised per Color.
  temp <- df[Model == models[i] & Value > 0,
             .(Samples   = sum(Samples),
               `50th`    = quantile(Value, probs = 0.50),
               `99th`    = quantile(Value, probs = 0.99),
               `99.9th`  = quantile(Value, probs = 0.999),
               `99.99th` = quantile(Value, probs = 0.9999)),
             by = Color]
  # Tag the summary rows with the model they belong to.
  temp$model <- models[i]
  pieces[[i]] <- temp
}
result <- rbindlist(pieces)
rm(temp, pieces)

Answer 3

fread创建一个data.table对象而不是数据框，因此我建议坚持使用data.table语法，而不是将其与dplyr混合使用。不需要for循环，我们可以在by参数中使用两个变量的列表来循环一行代码中的模型和颜色：

# One grouped aggregation over the rows with a positive Value: sample totals
# and Value quantiles for every Model/Color combination.
qs <- df[
  Value > 0,
  .(
    Samples   = sum(Samples),
    `50th`    = quantile(Value, probs = 0.50),
    `99th`    = quantile(Value, probs = 0.99),
    `99.9th`  = quantile(Value, probs = 0.999),
    `99.99th` = quantile(Value, probs = 0.9999)
  ),
  by = .(Model, Color)
]
# Key the result by Model, which also sorts it by that column.
setkey(qs, "Model")

#       Model  Color Samples  50th   99th  99.9th  99.99th
# 1:   Copper   Blue     564 0.830 0.8300 0.83000 0.830000
# 2:     Gold   Blue   18603 0.455 0.4991 0.49991 0.499991
# 3:     Gold    Red     891 0.700 0.7000 0.70000 0.700000
# 4:     Gold Orange     503 1.920 1.9200 1.92000 1.920000
# 5: Platinum  Brown     793 0.430 0.4300 0.43000 0.430000
# 6: Platinum    Red    1763 0.710 0.7100 0.71000 0.710000
# 7:   Silver   Blue    1423 1.125 1.4925 1.49925 1.499925
# 8:   Silver   Pink     173 1.170 1.1700 1.17000 1.170000

对于R中具有子集的循环

3 个答案: