我的dataset具有以下结构:
Classes 'data.table' and 'data.frame': 214175 obs. of 12 variables:
$ app_id : int 35949 49453 49970 50913 5| __truncated__ ...
$ customer_id : int 35948 49452 49452 50912 5| __truncated__ ...
$ sent_at : POSIXct, format: "2012-07-01 23:33:32.262" "2012-08-19 09:56:37.533" "2012-08-25 02:09:41.296" "2012-08-26 09:54:35.345" ...
$ ip_address_id : int 102298 96092 96091 67963 | __truncated__ ...
$ device_id : int NA NA NA NA NA NA NA NA NA NA ...
$ person_id : int 138622 9551 9551 28228 14| __truncated__ ...
$ passport_id : int 115828 148527 148527 1464| __truncated__ ...
$ email_id : int 19097 2685 2685 163914 69| __truncated__ ...
$ mobile_phone_id : int 104954 157463 157463 1032| __truncated__ ...
$ work_phone_id : int 68337 309192 309192 11972| __truncated__ ...
$ contact_phone_id: int NA NA NA NA NA NA NA NA NA NA ...
$ rejected : logi FALSE FALSE TRUE TRUE TRUE FALSE ...
您能帮我把下面这个带有子查询的 SQL 脚本转换为 data.table 表达式吗?
-- For every application row `a`, compute seven counts over the other rows of
-- am_data that were sent strictly earlier and belong to a different customer.
SELECT app_id,
-- rule_1: distinct customer_id values among earlier rows with the same device_id.
(SELECT count(DISTINCT customer_id)
FROM am_data
WHERE device_id = a.device_id
AND sent_at < a.sent_at
AND customer_id != a.customer_id) AS rule_1,
-- rule_2: as rule_1, but only rows with rejected = 1 are counted.
(SELECT count(DISTINCT customer_id)
FROM am_data
WHERE device_id = a.device_id
AND sent_at < a.sent_at
AND customer_id != a.customer_id
AND rejected = 1) AS rule_2,
-- rule_3: as rule_1, but counts distinct person_id instead of customer_id.
(SELECT count(DISTINCT person_id)
FROM am_data
WHERE device_id = a.device_id
AND sent_at < a.sent_at
AND customer_id != a.customer_id) AS rule_3,
-- rule_4: distinct rejected customers with the same ip_address_id whose row
-- falls in the 14 days before a.sent_at (lower bound inclusive, upper exclusive).
-- NOTE(review): datetime() returns a text timestamp; confirm sent_at is stored
-- in a format that compares correctly against it.
(SELECT count(DISTINCT customer_id)
FROM am_data
WHERE ip_address_id = a.ip_address_id
AND sent_at < a.sent_at
AND sent_at >= datetime(a.sent_at, '-14 days')
AND customer_id != a.customer_id
AND rejected = 1) AS rule_4,
-- rule_5: distinct rejected customers whose contact_phone_id equals this
-- row's mobile_phone_id.
(SELECT count(DISTINCT customer_id)
FROM am_data
WHERE contact_phone_id = a.mobile_phone_id
AND sent_at < a.sent_at
AND customer_id != a.customer_id
AND rejected = 1) AS rule_5,
-- rule_6: distinct rejected customers whose work_phone_id equals this
-- row's mobile_phone_id.
(SELECT count(DISTINCT customer_id)
FROM am_data
WHERE work_phone_id = a.mobile_phone_id
AND sent_at < a.sent_at
AND customer_id != a.customer_id
AND rejected = 1) AS rule_6,
-- rule_7: distinct customers among earlier rows with the same person_id but
-- a different passport_id.
(SELECT count(DISTINCT customer_id)
FROM am_data
WHERE person_id = a.person_id
AND passport_id != a.passport_id
AND sent_at < a.sent_at
AND customer_id != a.customer_id) AS rule_7
FROM am_data AS a;
我对Rule_1的解决方案:
# Rule_1 attempt: running count of distinct customers per device.
cols <- c("device_id", "customer_id", "app_id", "sent_at", "rejected")
# Keep only rows with a known device and the columns listed in `cols`.
tmp <- am_data[!is.na(device_id), cols, with = FALSE]
# Order by send time so the cumulative count below runs oldest to newest.
setorderv(tmp, "sent_at")
tmp[, rule_1 := {
  # Number of distinct customer_id values seen so far within this device.
  n_seen <- cumsum(!duplicated(customer_id))
  # Subtract one from the running distinct count; a device with a single
  # row gets 0.
  if (.N > 1L) n_seen - 1L else 0L
}, by = "device_id"]
要获得用于与 data.table 解决方案进行比较的预期结果,请执行以下代码:
## ---- Load packages ----
library(data.table)
library(sqldf)

## ---- Load data ----
am_data <- fread("https://gitlab.com/artemklevtsov/test-dt/raw/master/data/am_data.csv")
# Keep a single row per app_id.
am_data <- unique(am_data, by = "app_id")
# Parse the ISO-style timestamp (with fractional seconds) into POSIXct.
am_data[, sent_at := as.POSIXct(sent_at, format = "%Y-%m-%dT%H:%M:%OS")]
setorderv(am_data, "sent_at")

## ---- SQL query ----
sql_query <- readLines("https://gitlab.com/artemklevtsov/test-dt/raw/master/sql/query.sql")
# Every non-logical column gets its own index before the query runs.
idx_cols <- names(am_data)[!vapply(am_data, is.logical, logical(1L))]
# The CREATE INDEX statements are passed ahead of the query in one sqldf() call.
res <- sqldf(c(
  sprintf("CREATE INDEX %s_idx ON am_data(%s)", idx_cols, idx_cols),
  paste(sql_query, collapse = "\n")
))
注意:我将接受任何包含与正确结果匹配的解决方案的答案。
链接:
答案 0 :(得分:4)
最终编辑:我将 uniqueN 替换为 length(unique()),这样运行速度更快。另外,我之前对规则 7 的编辑中有一个笔误。我使用 unique(am_data) 删除了重复项,这似乎解决了除 Rule_4 以外的所有问题。
> res_2[, lapply(.SD, sum), .SDcols = 2:8]
rule_1 rule_2 rule_3 rule_4 rule_5 rule_6 rule_7
1: 17167 10448 17165 2 606 16040 17072
> res[, lapply(.SD,sum), .SDcols = 2:8]
rule_1 rule_2 rule_3 rule_4 rule_5 rule_6 rule_7
1: 17167 10448 17165 0 606 16040 17072
# Remove rows that are duplicated across all columns.
am_data <- unique(am_data)

# Rules 1-3: set up the self-join on device_id ----------------------------
# Only rows with a known device_id take part, on both sides of the join.
am_data2 <- am_data[!is.na(device_id)]
# `a` mirrors the SQL alias: the same rows with every column prefixed "a."
# so both sides remain distinguishable after the join.
a <- setnames(copy(am_data2), paste0("a.", names(am_data2)))

# Rules 1-3: join and aggregate -------------------------------------------
# Pair each row of `a` with the strictly earlier rows on the same device,
# then keep only the pairs that belong to different customers.
self_join <- am_data2[
  a,
  on = .(device_id = a.device_id, sent_at < a.sent_at),
  allow.cartesian = TRUE,
  nomatch = 0L
][customer_id != a.customer_id]

# One row per a.app_id; the count lands in the default column V1.
rule_1 <- self_join[, length(unique(customer_id)), by = a.app_id]
rule_2 <- self_join[rejected == 1, length(unique(customer_id)), by = a.app_id]
rule_3 <- self_join[, length(unique(person_id)), by = a.app_id]
# Prepare for Rule 4 ------------------------------------------------------
# Only rows with a known ip_address_id take part, on both sides of the join.
am_data2 <- copy(am_data)[!is.na(ip_address_id)]
# `a` mirrors the SQL alias: the same rows with every column prefixed "a.".
a <- copy(am_data2)
setnames(a, paste0("a.", names(a)))
# Lower bound of the look-back window. Arithmetic on POSIXct is in seconds,
# so a bare `a.sent_at - 14` would reach back only 14 seconds; the offset is
# given as a difftime to get the 14 days the SQL rule asks for.
a[, a.sent_at_range := a.sent_at - as.difftime(14, units = "days")]
# Make Rule 4 happen ------------------------------------------------------
# Pair each row of `a` with rejected rows on the same IP address that were
# sent in [a.sent_at - 14 days, a.sent_at), then keep the pairs that belong
# to different customers.
self_join <- am_data2[rejected == 1][
  a,
  on = .(ip_address_id = a.ip_address_id,
         sent_at < a.sent_at,
         sent_at >= a.sent_at_range),
  allow.cartesian = TRUE,
  nomatch = 0L
][customer_id != a.customer_id]
# One row per a.app_id; the count lands in the default column V1.
rule_4 <- self_join[, length(unique(customer_id)), by = a.app_id]
# Rule 5: set up the phone-number join ------------------------------------
# Left side: rows that have a contact phone on file.
am_data2 <- am_data[!is.na(contact_phone_id)]
# Right side: rows that have a mobile phone, with columns prefixed "a."
# to mirror the SQL alias.
a <- am_data[!is.na(mobile_phone_id)]
setnames(a, paste0("a.", names(a)))

# Rule 5: join and aggregate ----------------------------------------------
# Match each row's mobile phone against the contact phone of strictly
# earlier, rejected rows, then keep the pairs from different customers.
self_join <- am_data2[rejected == 1][
  a,
  on = .(contact_phone_id = a.mobile_phone_id, sent_at < a.sent_at),
  allow.cartesian = TRUE,
  nomatch = 0L
][customer_id != a.customer_id]
# One row per a.app_id; the count lands in the default column V1.
rule_5 <- self_join[, length(unique(customer_id)), by = a.app_id]
# Rule 6: set up the phone-number join ------------------------------------
# Left side: rows that have a work phone on file.
am_data2 <- am_data[!is.na(work_phone_id)]
# Right side: rows that have a mobile phone, with columns prefixed "a."
# to mirror the SQL alias.
a <- am_data[!is.na(mobile_phone_id)]
setnames(a, paste0("a.", names(a)))

# Rule 6: join and aggregate ----------------------------------------------
# Match each row's mobile phone against the work phone of strictly
# earlier, rejected rows, then keep the pairs from different customers.
self_join <- am_data2[rejected == 1][
  a,
  on = .(work_phone_id = a.mobile_phone_id, sent_at < a.sent_at),
  allow.cartesian = TRUE,
  nomatch = 0L
][customer_id != a.customer_id]
# One row per a.app_id; the count lands in the default column V1.
rule_6 <- self_join[, length(unique(customer_id)), by = a.app_id]
# Rule 7: set up the self-join on person_id -------------------------------
# Only rows with a known person_id take part, on both sides of the join.
am_data2 <- am_data[!is.na(person_id)]
# `a` mirrors the SQL alias: the same rows with every column prefixed "a.".
a <- setnames(copy(am_data2), paste0("a.", names(am_data2)))

# Rule 7: join and aggregate ----------------------------------------------
# Unmatched rows of `a` are kept by the join (nomatch = NA) and come through
# with NA on the left-hand columns; the filter that follows discards them
# because its comparisons evaluate to NA for those rows.
self_join <- am_data2[
  a,
  on = .(person_id = a.person_id, sent_at < a.sent_at),
  allow.cartesian = TRUE,
  nomatch = NA
][customer_id != a.customer_id & passport_id != a.passport_id]
# One row per a.app_id; the count lands in the default column V1.
rule_7 <- self_join[, length(unique(customer_id)), by = a.app_id]
# Combine and cast the rules we made --------------------------------------
# Stack the seven per-rule tables (columns a.app_id and V1) and spread them
# into one row per a.app_id with one column per rule; a rule with no hit for
# an app is filled with 0.
rule_names <- paste0("rule_", 1:7)
# Naming the list makes rbindlist() write the rule name into `rule`, so the
# cast columns are labelled by name instead of being renamed by position.
rule_tables <- setNames(
  list(rule_1, rule_2, rule_3, rule_4, rule_5, rule_6, rule_7),
  rule_names
)
stacked <- rbindlist(rule_tables, idcol = "rule")
res_2 <- dcast(stacked, a.app_id ~ rule, value.var = "V1", fill = 0L)
# A rule that matched nothing contributes no rows and therefore no column;
# add it back as all zeros so the result always has the seven rule columns.
absent <- setdiff(rule_names, names(res_2))
if (length(absent) > 0L) {
  res_2[, (absent) := 0L]
}
setcolorder(res_2, c("a.app_id", rule_names))
结果
> res_2
a.app_id rule_1 rule_2 rule_3 rule_4 rule_5 rule_6 rule_7
1: 89033 0 0 0 0 0 1 0
2: 95775 0 0 0 0 0 1 0
3: 96542 0 0 0 0 0 1 0
4: 106447 0 0 0 0 0 1 0
5: 113040 0 0 0 0 0 1 0
---
21925: 34904219 1 1 1 0 0 1 0
21926: 34904725 1 1 1 0 0 0 1
21927: 34904750 1 0 1 0 0 1 1
21928: 34904921 1 0 1 0 0 0 1
21929: 34905033 0 0 0 0 0 1 1
> res[order(a.app_id) & (rule_1 > 0 | rule_2 > 0 | rule_3 > 0 |
rule_4 > 0 | rule_5 >0 | rule_6 > 0 | rule_7 > 0)]
a.app_id rule_1 rule_2 rule_3 rule_4 rule_5 rule_6 rule_7
1: 89033 0 0 0 0 0 1 0
2: 95775 0 0 0 0 0 1 0
3: 96542 0 0 0 0 0 1 0
4: 106447 0 0 0 0 0 1 0
5: 113040 0 0 0 0 0 1 0
---
22403: 34904219 1 1 1 0 0 1 1
22404: 34904725 1 1 1 0 0 0 1
22405: 34904750 1 0 1 0 0 1 1
22406: 34904921 1 0 1 0 0 0 1
22407: 34905033 0 0 0 0 0 1 1
原始:由于它是由设备锁定的,因此可能会有所帮助。
这是与 Rule_1 的 SQL 等效的 data.table 写法。我检查了前 5 个和后 5 个结果,它们是匹配的。
# Earlier draft for Rule_1: self-join of the device-level subset.
# Relies on `cols` being defined beforehand (the column vector used in the
# question's Rule_1 attempt) -- TODO confirm it is in scope when this runs.
tmp2 <- am_data[!is.na(device_id), ..cols]
# Join tmp2 to itself on equal device_id with a non-equi condition on
# sent_at; in `on`, the left side of each condition names the outer table's
# column and the right side the inner table's column.
tmp2[tmp2,
on = .(device_id = device_id,
sent_at > sent_at),
allow.cartesian = TRUE
# Keep pairs whose customers differ, plus rows where customer_id is NA
# (nomatch is left at its default, so unmatched rows are not dropped by the
# join itself).
][customer_id != i.customer_id | is.na(customer_id),
# NOTE(review): this returns a row count (.N) per device_id, not a distinct
# customer count per app_id as in the SQL -- confirm this is intended.
.N,
keyby = device_id]