我有以下数据框:
> data6
verb_object SESSION_ID
1: BA31C1CC63E5043483FAE25F085E25E5 INSERT 41595370
2: BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE 41595371
3: 26D695C8CA82CAFFDF985201F3AA44D7 UPDATE 41595282
4: 26D695C8CA82CAFFDF985201F3AA44D7 UPDATE 41595282
5: 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE 41595373
6: 6D944D54C54ED75D487288FE1505BB59 INSERT 41595368
我有一个转换表:
> u1
items newitem
1 BA31C1CC63E5043483FAE25F085E25E5 INSERT OV1
2 BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE OV2
3 26D695C8CA82CAFFDF985201F3AA44D7 UPDATE OV3
4 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE OV4
5 6D944D54C54ED75D487288FE1505BB59 INSERT OV5
我想在转换表(u1)中添加一列,表示每个项目在原始数据框(data6)中出现的频率。在这种情况下,我期望得到的u1是:
> u1
items newitem freq
1 BA31C1CC63E5043483FAE25F085E25E5 INSERT OV1 0.1667
2 BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE OV2 0.1667
3 26D695C8CA82CAFFDF985201F3AA44D7 UPDATE OV3 0.3333
4 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE OV4 0.1667
5 6D944D54C54ED75D487288FE1505BB59 INSERT OV5 0.1667
以下是两种解决方案的基准:
> microbenchmark(
+ setDT(u100)[setDT(data100)[, .(freq = .N/nrow(data100)), by = verb_object], on=c("items"="verb_object")],
+
+ merge(u100, xtabs(~ verb_object, data100)/length(data100$verb_object) ,
+ by.x = "items", by.y = "verb_object", all.x = TRUE, sort = FALSE),
+ times=1000
+ )
Unit: milliseconds
expr
setDT(u100)[setDT(data100)[, .(freq = .N/nrow(data100)), by = verb_object], on = c(items = "verb_object")]
merge(u100, xtabs(~verb_object, data100)/length(data100$verb_object), by.x = "items", by.y = "verb_object", all.x = TRUE, sort = FALSE)
min lq mean median uq max neval cld
1.269799 1.394808 1.586311 1.439762 1.493543 66.58702 1000 a
1.842091 2.030118 2.634712 2.099499 2.182838 67.77471 1000 b
答案 0 :(得分:2)
使用基础R(base包):
# Relative frequency of every distinct verb_object value in data6:
# counts from xtabs() divided by the total number of observations.
item_freq <- xtabs(~ verb_object, data6) / length(data6$verb_object)

# Left-join the frequencies onto the translation table. merge() coerces the
# table to a data frame with columns `verb_object` and `Freq`; all.x = TRUE
# keeps every row of u1, and sort = FALSE avoids re-sorting by the key.
merge(
  u1, item_freq,
  by.x = "items", by.y = "verb_object",
  all.x = TRUE, sort = FALSE
)
输出:
items newitem Freq
1 BA31C1CC63E5043483FAE25F085E25E5 INSERT OV1 0.1666667
2 BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE OV2 0.1666667
3 26D695C8CA82CAFFDF985201F3AA44D7 UPDATE OV3 0.3333333
4 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE OV4 0.1666667
5 6D944D54C54ED75D487288FE1505BB59 INSERT OV5 0.1666667
数据:
# Example observations: six rows, the UPDATE item occurs twice.
data6 <- data.frame(
  verb_object = c(
    "BA31C1CC63E5043483FAE25F085E25E5 INSERT",
    "BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE",
    "26D695C8CA82CAFFDF985201F3AA44D7 UPDATE",
    "26D695C8CA82CAFFDF985201F3AA44D7 UPDATE",
    "2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE",
    "6D944D54C54ED75D487288FE1505BB59 INSERT"
  ),
  SESSION_ID = c(
    41595370L, 41595371L, 41595282L,
    41595282L, 41595373L, 41595368L
  ),
  stringsAsFactors = FALSE
)

# Translation table: one row per distinct item, mapped to a short label.
# `items` stays character; `newitem` is a factor with levels OV1..OV5.
u1 <- data.frame(
  items = c(
    "BA31C1CC63E5043483FAE25F085E25E5 INSERT",
    "BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE",
    "26D695C8CA82CAFFDF985201F3AA44D7 UPDATE",
    "2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE",
    "6D944D54C54ED75D487288FE1505BB59 INSERT"
  ),
  newitem = factor(c("OV1", "OV2", "OV3", "OV4", "OV5")),
  stringsAsFactors = FALSE
)
答案 1 :(得分:1)
使用data.table:
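library(data.table)
setDT(u1)[setDT(data6)[, .(freq = .N/nrow(data6)), by = verb_object], on=c("items"="verb_object")]
给出:
items newitem freq
1: BA31C1CC63E5043483FAE25F085E25E5 INSERT OV1 0.1666667
2: BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE OV2 0.1666667
3: 26D695C8CA82CAFFDF985201F3AA44D7 UPDATE OV3 0.3333333
4: 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE OV4 0.1666667
5: 6D944D54C54ED75D487288FE1505BB59 INSERT OV5 0.1666667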