我有这个data.table,我想在链中生成所有可能的'前缀'。
user_id v_chain nr_v root_v_chain last_v
616905 3 -> 8 -> 16 -> 17 -> 25 5 3 -> 8 -> 16 -> 17 25
这应该成为
user_id v_chain nr_v root_v_chain last_v
616905 3 -> 8 5 3 -> 8 -> 16 -> 17 25
616905 3 -> 8 -> 16 5 3 -> 8 -> 16 -> 17 25
616905 3 -> 8 -> 16 -> 17 5 3 -> 8 -> 16 -> 17 25
616905 3 -> 8 -> 16 -> 17 -> 25 5 3 -> 8 -> 16 -> 17 25
如何遍历整个链并生成新行?
数据
# Example input: a single user whose chain is stored as one " -> "-delimited
# string. The single quotes keep each chain in one field even though it
# contains spaces. Each element of `text` is read as one input line.
dd <- read.table(
  text = c(
    "user_id v_chain nr_v root_v_chain last_v",
    "616905 '3 -> 8 -> 16 -> 17 -> 25' 5 '3 -> 8 -> 16 -> 17' 25"
  ),
  header = TRUE,
  stringsAsFactors = FALSE
)
答案 0 :(得分:3)
另一种 data.table 方法,使用自定义函数:
library(data.table)
# Build the cumulative prefixes of a delimited chain.
#
# x:   character vector of chains whose elements are separated by `sep`.
# sep: literal separator (matched with fixed = TRUE, not as a regex); it is
#      also used to glue the prefixes back together.
#
# Returns a list: the k-th entry is the first k + 1 elements of the chain
# pasted together with `sep`. The one-element prefix is dropped, so a chain
# with a single element yields an empty list.
f <- function(x, sep = " -> ") {
  # tstrsplit() (data.table) returns one list entry per chain position.
  pieces <- tstrsplit(x, sep, fixed = TRUE)
  glue_next <- function(...) paste(..., sep = sep)
  # accumulate = TRUE keeps every intermediate result of the left fold.
  prefixes <- Reduce(glue_next, pieces, accumulate = TRUE)
  prefixes[-1L]
}
# One output row per prefix: every column except v_chain is used as a
# grouping key, so it is carried through unchanged next to the new v_chain.
setDT(dt)[
  ,
  .(v_chain = f(v_chain)),
  by = setdiff(names(dt), "v_chain")
]
# user_id nr_v root_v_chain last_v v_chain
#1: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8
#2: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16
#3: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17
#4: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17 -> 25
或者您可以使用加入:
# Join variant: compute the prefixes per user first ...
tmp <- dt[, .(v_chain = f(v_chain)), by = "user_id"]
# ... then look up the remaining columns (everything except the original
# v_chain) for each user_id and attach them to the prefix rows.
dt <- tmp[dt[, setdiff(names(dt), "v_chain"), with = FALSE], on = "user_id"]
请注意,此方法无需修改即可适用于多个用户。如果您的数据是:
> dt
# user_id v_chain nr_v root_v_chain last_v
#1: 1 3 -> 8 -> 16 -> 17 -> 25 5 3 -> 8 -> 16 -> 17 -> 25 25
#2: 2 1 -> 5 -> 3 -> 4 -> 2 5 1 -> 5 -> 3 -> 4 -> 2 25
然后它会产生:
# Same expansion on the multi-user table: grouping by every column other
# than v_chain keeps each user's prefixes next to that user's other values.
dt[
  ,
  .(v_chain = f(v_chain)),
  by = setdiff(names(dt), "v_chain")
]
# user_id nr_v root_v_chain last_v v_chain
#1: 1 5 3 -> 8 -> 16 -> 17 -> 25 25 3 -> 8
#2: 1 5 3 -> 8 -> 16 -> 17 -> 25 25 3 -> 8 -> 16
#3: 1 5 3 -> 8 -> 16 -> 17 -> 25 25 3 -> 8 -> 16 -> 17
#4: 1 5 3 -> 8 -> 16 -> 17 -> 25 25 3 -> 8 -> 16 -> 17 -> 25
#5: 2 5 1 -> 5 -> 3 -> 4 -> 2 25 1 -> 5
#6: 2 5 1 -> 5 -> 3 -> 4 -> 2 25 1 -> 5 -> 3
#7: 2 5 1 -> 5 -> 3 -> 4 -> 2 25 1 -> 5 -> 3 -> 4
#8: 2 5 1 -> 5 -> 3 -> 4 -> 2 25 1 -> 5 -> 3 -> 4 -> 2
答案 1 :(得分:1)
我认为以下内容应该有效,但您必须调整用户ID:
# Collect every chain in one character vector; with real data use
# olddf$v_chain.
temp <- c("3 -> 8 -> 16 -> 17 -> 25", "2 -> 6 -> 10 -> 12 -> 20")
# Name each chain with its user ID; with real data use olddf$user_id.
names(temp) <- c("user1", "user2")
# For each chain, build every prefix that holds at least two elements.
# seq_len() keeps a one-element chain from producing the descending index
# 2:1 (and with it an NA-padded prefix); such a chain simply yields no rows.
tempList <- lapply(
  strsplit(temp, split = " -> ", fixed = TRUE),
  function(parts) {
    vapply(
      seq_len(max(length(parts) - 1L, 0L)) + 1L,
      function(k) paste(parts[seq_len(k)], collapse = " -> "),
      character(1)
    )
  }
)
# This snippet no longer calls melt(); the load is retained so that code
# relying on data.table being attached keeps working.
library(data.table)
# Stack the prefixes in long format directly. Unlike
# do.call(data.frame, ...) followed by melt(), this works when the chains
# have different lengths and keeps the user IDs exactly as given (no
# check.names mangling of numeric IDs). user_id is a character column.
tempdf <- data.frame(
  user_id = rep(names(tempList), lengths(tempList)),
  v_chain = as.character(unlist(tempList, use.names = FALSE)),
  stringsAsFactors = FALSE
)
现在,只需将此data.frame合并到原始文件:
# Attach the original columns to every prefix row. `by` takes the key
# column's name as a string; a bare user_id would be looked up as a variable
# and fail with "object 'user_id' not found".
finaldf <- merge(olddf, tempdf, by = "user_id")
答案 2 :(得分:0)
您可以使用以下代码获取所有前缀
# Tokenise the chain on single spaces: chain elements land on odd positions
# and the "->" arrows on even positions.
# NOTE(review): unlist() concatenates the tokens of all rows, so this is
# only meaningful when dd holds a single chain - process one row at a time
# otherwise.
c_chain_split <- unlist(strsplit(dd$v_chain, split = " ", fixed = TRUE))
# A prefix ends on an odd token position; start at 3 to skip the
# one-element prefix. Filtering seq_along() (instead of seq(3, n, 2)) yields
# an empty result, not an error, for chains with fewer than three tokens.
prefix_ends <- seq_along(c_chain_split)
prefix_ends <- prefix_ends[prefix_ends >= 3L & prefix_ends %% 2L == 1L]
# Keep the prefixes: a bare paste() inside a for loop is neither printed nor
# stored, so the results have to be collected explicitly.
prefixes <- vapply(
  prefix_ends,
  function(i) paste(c_chain_split[seq_len(i)], collapse = " "),
  character(1)
)
prefixes
然后将 c() 和 rbind() 组合在一起构建行。
答案 3 :(得分:0)
也许这会有所帮助:
# Split one chain into its elements and rebuild every prefix that holds at
# least two of them.
s <- "3 -> 8 -> 16 -> 17 -> 25"
x <- strsplit(s, " -> ", fixed = TRUE)[[1]]
# Prefix lengths 2..length(x). seq_len() gives an empty vector for a chain
# with fewer than two elements, where 2:length(x) would count down instead.
n <- seq_len(max(length(x) - 1L, 0L)) + 1L
# vapply() guarantees a character vector even when n is empty.
vapply(n, function(i) paste(x[seq_len(i)], collapse = " -> "), character(1))
产生所需列中的内容:
[1] "3 -> 8" "3 -> 8 -> 16" "3 -> 8 -> 16 -> 17"
[4] "3 -> 8 -> 16 -> 17 -> 25"
答案 4 :(得分:0)
三个阶段的 data.table 解决方案:
library(data.table)
# Stage 1: split each chain into its elements (v_chain becomes a list
#          column). Splitting on the full " -> " separator keeps stray
#          spaces out of the elements, so no prefix ends in a blank.
# Stage 2: rebuild, per row, every prefix holding two or more elements.
#          seq_len() avoids the descending 2:1 index for one-element chains.
# Stage 3: unnest to one row per prefix, grouped by the remaining columns.
# Every `[` opens on the line that closes the previous `]`: a line that
# starts with `[` after a complete expression is a parse error in R.
# Note that setDT() converts dd by reference and v_chain is overwritten.
setDT(dd)[
  , v_chain := list(strsplit(v_chain, " -> ", fixed = TRUE))
][
  , v_chain := list(lapply(v_chain, function(ele) {
    vapply(
      seq_len(max(length(ele) - 1L, 0L)) + 1L,
      function(i) paste(ele[seq_len(i)], collapse = " -> "),
      character(1)
    )
  }))
][
  , .(v_chain = as.character(unlist(v_chain))),
  by = .(user_id, nr_v, root_v_chain, last_v)
]
user_id nr_v root_v_chain last_v v_chain
1: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8
2: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16
3: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17
4: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17 -> 25
您还可以使用 dplyr 和 tidyr 包:
library(dplyr); library(tidyr);
# Split each chain into its elements, rebuild every prefix holding two or
# more elements, then unnest to one row per prefix.
# Splitting on the full " -> " separator (and pasting with it) keeps stray
# spaces out of the elements, so intermediate prefixes carry no trailing
# blank. seq_len() avoids the descending 2:1 index for one-element chains,
# and vapply() always returns a character vector.
dd %>%
  mutate(v_chain = strsplit(v_chain, " -> ", fixed = TRUE)) %>%
  mutate(v_chain = lapply(v_chain, function(ele) {
    vapply(
      seq_len(max(length(ele) - 1L, 0L)) + 1L,
      function(i) paste(ele[seq_len(i)], collapse = " -> "),
      character(1)
    )
  })) %>%
  unnest(v_chain)
Source: local data frame [4 x 5]
user_id nr_v root_v_chain last_v v_chain
(int) (int) (chr) (int) (chr)
1 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8
2 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16
3 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17
4 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17 -> 25
答案 5 :(得分:0)
使用regex和data.table:
# Single-row example table; both chain columns are " -> "-delimited strings.
dt <- data.table(
  user_id      = 616905,
  v_chain      = "3 -> 8 -> 16 -> 17 -> 25",
  nr_v         = 5,
  root_v_chain = "3 -> 8 -> 16 -> 17",
  last_v       = 25
)
# Expand dt to one row per prefix by locating, inside v_chain, the character
# position at which each prefix ends, and then cutting the string there.
dt <- merge(
dt,
# The regex matches every run of digits that directly follows "->" plus one
# whitespace character (lookbehind, hence perl = TRUE), i.e. every chain
# element except the first. gregexpr() reports each match's start position
# and, in the "match.length" attribute, its length, so
# start + length - 1 is the position of the match's last character.
# The result is left unnamed and is referred to as V1 further down.
# Merging on user_id then pairs the original row with every end position;
# this presumes user_id identifies a row uniquely - verify for real data.
dt[, lapply(gregexpr('(?<=->\\s)\\d+', v_chain, perl = TRUE), function(x) Reduce("+", list(x, attr(x, "match.length"), - 1))), by = .(user_id)],
by = "user_id"
)
# Cut each chain at its end position to obtain the prefix, and drop the
# temporary V1 column in the same by-reference assignment.
dt[, `:=` (v_chain = substr(v_chain, 1, V1), V1 = NULL)]
dt
user_id v_chain nr_v root_v_chain last_v
1: 616905 3 -> 8 5 3 -> 8 -> 16 -> 17 25
2: 616905 3 -> 8 -> 16 5 3 -> 8 -> 16 -> 17 25
3: 616905 3 -> 8 -> 16 -> 17 5 3 -> 8 -> 16 -> 17 25
4: 616905 3 -> 8 -> 16 -> 17 -> 25 5 3 -> 8 -> 16 -> 17 25