在数据框中:
> df
Version.ID Relevant.product Proportion
1000 OS 0.05095541
1000 C 0.75159236
1000 R 0.19745223
1000 Other 0.00000000
1000 C 0.75159236
1000 C 0.75159236
1000 C 0.75159236
1000 C 0.75159236
2000 O 1.00000000
3000 En 0.93498526
3000 En 0.93498526
3000 En 0.93498526
3000 R 0.06501474
3000 En 0.93498526
3000 En 0.93498526
3000 Other 0.00000000
3000 En 0.93498526
我想获得每个Version.ID最大比例的产品名称:
Version.ID Relevant.product
1000 C
2000 O
3000 En
由于
答案 0 :(得分:2)
试用data.table
库
library(data.table)
setDT(df)[, Relevant.product[which.max(Proportion)], by = Version.ID]
# Version.ID V1
# 1: 1000 C
# 2: 2000 O
# 3: 3000 En
如果您只想要与max(Proportion)
对应的第一个“Relevant.product”,则上述解决方案非常好。如果你有兴趣返回所有这些,这是一种方式:
require(data.table) ## 1.9.2
idx = setDT(df)[, .I[Proportion == max(Proportion)], by=Version.ID]$V1
ans = unique(df[idx], by=c("Version.ID", "Relevant.product"))
答案 1 :(得分:1)
另一种解决方案(不使用任何外部包,但不太优雅):
x[row.names(x) %in% sapply(split(x, x$Version.ID),
function(df) row.names(df[which.max(df$Proportion),])),]
## Version.ID Relevant.product Proportion
## 2 1000 C 0.7515924
## 9 2000 O 1.0000000
## 10 3000 En 0.9349853
事实上,正如大卫所说,这个解决方案也慢了。对于10000行和10个类,我们有:
x <- data.frame(Version.ID=as.factor(sample(1:10, replace=TRUE, 10000)),
Relevant.product=sample(LETTERS[1:5], replace=TRUE, 10000),
Proportion=runif(10000))
library(data.table)
library(microbenchmark)
microbenchmark(
{data.table(x)[, Relevant.product[which.max(Proportion)], by = Version.ID]},
{x[row.names(x) %in% sapply(split(x, x$Version.ID),
function(df) row.names(df[which.max(df$Proportion),])),]})
## Unit: milliseconds
## expr min lq median uq max neval
## [data.table] 3.802304 4.046833 4.124973 4.262634 80.18705 100
## [split] 11.171008 11.364131 11.502188 11.679067 14.51869 100
但知道其他选择很好:)
编辑:以下是100000行的结果:
## Unit: milliseconds
## min lq median uq max neval
## [data.table] 9.350692 13.88461 18.33646 68.44882 95.78928 100
## [split] 89.726972 106.39916 124.10599 169.41667 237.70003 100
和1000000行:
## Unit: milliseconds
## min lq median uq max neval
## [data.table] 76.58919 117.7388 155.9511 210.2772 362.0843 100
## [split] 963.87984 1190.5079 1395.7724 1602.5480 3417.5468 100
另一方面,对于100000行和1000个类,我们得到:
## Unit: milliseconds
## min lq median uq max neval
## [data.table] 39.55042 46.22971 48.59297 50.02435 133.3646 100
## [split] 844.62629 900.54373 916.15211 966.89630 1055.5050 100
答案 2 :(得分:0)
除data.table
外,请不要忘记dplyr
:
library(microbenchmark)
microbenchmark(
dt = dt[, .SD$Relevant.product[which.max(Proportion)], by = Version.ID],
dplyr = unique(df %.%
group_by(Version.ID) %.%
filter(Proportion == max(Proportion)) %.%
select(Version.ID, Relevant.product)
),
times = 1000
)
# Unit: milliseconds
# expr min lq median uq max neval
# dt 2.164455 2.274471 2.311025 2.390110 10.868671 1000
# dplyr 1.758137 1.846008 1.871316 1.916657 6.448726 1000
初始化:
library(data.table)
library(dplyr)
df <- read.table(text="Version.ID Relevant.product Proportion
1000 OS 0.05095541
1000 C 0.75159236
1000 R 0.19745223
1000 Other 0.00000000
1000 C 0.75159236
1000 C 0.75159236
1000 C 0.75159236
1000 C 0.75159236
2000 O 1.00000000
3000 En 0.93498526
3000 En 0.93498526
3000 En 0.93498526
3000 R 0.06501474
3000 En 0.93498526
3000 En 0.93498526
3000 Other 0.00000000
3000 En 0.93498526", header=T)
dt <- data.table(df)
编辑@DavidArenburg:
除了示例数据很小并且差异似乎很小的事实之外,我不知道这是否是一个有效的基准:
microbenchmark(
dt = { data.table(df)[, .SD$Relevant.product[which.max(Proportion)], by = Version.ID]
df <- as.data.frame(df)
},
dplyr = { unique(df %.%
group_by(Version.ID) %.%
filter(Proportion == max(Proportion)) %.%
select(Version.ID, Relevant.product)
)
df <- as.data.frame(df)
},
setdt = { setDT(df)[, Relevant.product[which.max(Proportion)], by = Version.ID]
df <- as.data.frame(df)
},
times = 1000
)
# Unit: milliseconds
# expr min lq median uq max neval
# dt 3.258985 3.445448 3.494130 3.580771 8.991382 1000
# dplyr 1.840736 1.937044 1.955497 1.992579 10.654265 1000
# setdt 2.879731 3.046159 3.091678 3.179549 100.604628 1000