按拆分后的字符串将一行扩展为多行

时间:2016-05-24 14:06:22

标签: r

我有这个data.table,我想在链中生成所有可能的'前缀'。

user_id         v_chain                  nr_v       root_v_chain           last_v
616905      3 -> 8 -> 16 -> 17 -> 25        5     3 -> 8 -> 16 -> 17        25

这应该成为

user_id         v_chain                  nr_v       root_v_chain           last_v
616905      3 -> 8                          5     3 -> 8 -> 16 -> 17        25
616905      3 -> 8 -> 16                    5     3 -> 8 -> 16 -> 17        25
616905      3 -> 8 -> 16 -> 17              5     3 -> 8 -> 16 -> 17        25
616905      3 -> 8 -> 16 -> 17 -> 25        5     3 -> 8 -> 16 -> 17        25

如何遍历整个链并生成新行?

数据

# Example input: a single user whose v_chain holds five nodes. The chains are
# wrapped in single quotes so that read.table() keeps each one as one field.
dd <- read.table(
  text = "user_id         v_chain                  nr_v       root_v_chain           last_v
616905      '3 -> 8 -> 16 -> 17 -> 25'        5     '3 -> 8 -> 16 -> 17'        25",
  header = TRUE,
  stringsAsFactors = FALSE
)

6 个答案:

答案 0 :(得分:3)

另一种data.table方法,使用自定义函数:

library(data.table)
# Build the cumulative prefixes of a chain string.
#
# x:   chain string such as "3 -> 8 -> 16".
#      NOTE(review): the example only passes one chain per call; behaviour for
#      several chains of different lengths in one call is not shown - confirm.
# sep: literal separator between nodes (matched with fixed = TRUE).
#
# Returns the accumulated prefixes with the first (single-node) one dropped,
# e.g. "3 -> 8", "3 -> 8 -> 16".
f <- function(x, sep = " -> ") {
  nodes <- tstrsplit(x, sep, fixed = TRUE)
  append_node <- function(prefix, node) paste(prefix, node, sep = sep)
  prefixes <- Reduce(append_node, nodes, accumulate = TRUE)
  # The first accumulated value is the lone first node, which is not wanted.
  prefixes[-1L]
}

# Group by every column except v_chain so those values are repeated next to
# each prefix that f() returns for the group's chain.
setDT(dt)[
  ,
  .(v_chain = f(v_chain)),
  by = setdiff(names(dt), "v_chain")
]
#   user_id nr_v       root_v_chain last_v                  v_chain
#1:  616905    5 3 -> 8 -> 16 -> 17     25                   3 -> 8
#2:  616905    5 3 -> 8 -> 16 -> 17     25             3 -> 8 -> 16
#3:  616905    5 3 -> 8 -> 16 -> 17     25       3 -> 8 -> 16 -> 17
#4:  616905    5 3 -> 8 -> 16 -> 17     25 3 -> 8 -> 16 -> 17 -> 25

或者您可以使用加入:

# Compute the prefixes per user first, then join the remaining columns
# (everything except the original v_chain) back on user_id.
tmp <- dt[, .(v_chain = f(v_chain)), by = "user_id"]
dt <- tmp[dt[, !"v_chain", with = FALSE], on = "user_id"]

请注意,这种方法无需修改即可适用于更多用户。例如,如果您的数据是:
> dt
#   user_id                  v_chain nr_v             root_v_chain last_v
#1:       1 3 -> 8 -> 16 -> 17 -> 25    5 3 -> 8 -> 16 -> 17 -> 25     25
#2:       2    1 -> 5 -> 3 -> 4 -> 2    5    1 -> 5 -> 3 -> 4 -> 2     25

然后它会产生:

# One output row per prefix; all columns other than v_chain act as the group
# key and are carried along unchanged.
dt[
  ,
  .(v_chain = f(v_chain)),
  by = setdiff(names(dt), "v_chain")
]
#   user_id nr_v             root_v_chain last_v                  v_chain
#1:       1    5 3 -> 8 -> 16 -> 17 -> 25     25                   3 -> 8
#2:       1    5 3 -> 8 -> 16 -> 17 -> 25     25             3 -> 8 -> 16
#3:       1    5 3 -> 8 -> 16 -> 17 -> 25     25       3 -> 8 -> 16 -> 17
#4:       1    5 3 -> 8 -> 16 -> 17 -> 25     25 3 -> 8 -> 16 -> 17 -> 25
#5:       2    5    1 -> 5 -> 3 -> 4 -> 2     25                   1 -> 5
#6:       2    5    1 -> 5 -> 3 -> 4 -> 2     25              1 -> 5 -> 3
#7:       2    5    1 -> 5 -> 3 -> 4 -> 2     25         1 -> 5 -> 3 -> 4
#8:       2    5    1 -> 5 -> 3 -> 4 -> 2     25    1 -> 5 -> 3 -> 4 -> 2

答案 1 :(得分:1)

我认为以下内容应该有效,但您必须调整用户ID:

# create a vector of all of the V-chains, you would use olddf$v_chain
temp <- c("3 -> 8 -> 16 -> 17 -> 25", "2 -> 6 -> 10 -> 12 -> 20")
# name it with user IDs
names(temp) <- c("user1", "user2") # you would use olddf$user_id

# Return every prefix (two or more nodes) of one already-split chain.
#
# nodes: character vector of the chain's nodes, in order.
# sep:   separator used to glue the nodes back together.
#
# Returns a character vector; it is empty for chains with fewer than two
# nodes.
prefixes_from_nodes <- function(nodes, sep = " -> ") {
  # seq_along(nodes)[-1L] is empty for a one-node chain, whereas
  # 2:length(nodes) would count down (2, 1) and produce bogus prefixes.
  vapply(
    seq_along(nodes)[-1L],
    function(j) paste(nodes[seq_len(j)], collapse = sep),
    character(1)
  )
}

# get the chains (strsplit keeps the user-ID names of temp)
tempList <- lapply(
  strsplit(temp, split = " -> ", fixed = TRUE),
  prefixes_from_nodes
)

# Build the long-format data.frame directly: one row per (user, prefix).
# Going through a wide data.frame first would fail as soon as two users have
# chains with a different number of nodes.
tempdf <- data.frame(
  user_id = rep(names(tempList), lengths(tempList)),
  v_chain = as.character(unlist(tempList, use.names = FALSE)),
  stringsAsFactors = FALSE
)

现在,只需将此data.frame与原始data.frame合并:

# Join the prefixes back onto the original rows. `by` must be the column name
# as a string; a bare user_id would be looked up as a variable and fail.
finaldf <- merge(olddf, tempdf, by = "user_id")

答案 2 :(得分:0)

您可以使用以下代码获取所有前缀

# Split the chain into space-separated tokens: "3", "->", "8", "->", ...
# NOTE(review): unlist() concatenates the tokens of all rows, so this assumes
# dd holds a single chain - confirm before using it on more rows.
c_chain_split <- unlist(strsplit(dd$v_chain, split = " "))

# Nodes sit at odd token positions, so every prefix of two or more nodes ends
# at an odd index >= 3. Filtering seq_along() also copes with fewer than three
# tokens, where seq(3, length(...), 2) would raise an error.
prefix_ends <- seq_along(c_chain_split)
prefix_ends <- prefix_ends[prefix_ends >= 3L & prefix_ends %% 2L == 1L]

# Keep the prefixes: a bare paste() inside a for loop discards its result.
prefixes <- vapply(
  prefix_ends,
  function(i) paste(c_chain_split[seq_len(i)], collapse = " "),
  character(1)
)

然后使用 c() 和 rbind() 将这些前缀组合起来构建新行。

答案 3 :(得分:0)

也许这会有所帮助:

s <- "3 -> 8 -> 16 -> 17 -> 25"

# Return all prefixes (two or more nodes) of a single chain string.
#
# chain: one chain string, e.g. "3 -> 8 -> 16".
# sep:   literal separator between nodes.
#
# Returns a character vector; it is empty when the chain has fewer than two
# nodes.
chain_prefixes <- function(chain, sep = " -> ") {
  x <- strsplit(chain, sep, fixed = TRUE)[[1]]
  # seq_along(x)[-1L] is empty for a one-node chain; 2:length(x) would count
  # down (2, 1) and yield bogus results such as "3 -> NA".
  n <- seq_along(x)[-1L]
  vapply(n, function(i) paste(x[seq_len(i)], collapse = sep), character(1))
}

chain_prefixes(s)

产生所需列中的内容:

[1] "3 -> 8"                   "3 -> 8 -> 16"             "3 -> 8 -> 16 -> 17"      
[4] "3 -> 8 -> 16 -> 17 -> 25"

答案 4 :(得分:0)

三个阶段data.table解决方案:

library(data.table)
# Stage 1: split each chain into its nodes (list column, updated by reference).
# Stage 2: rebuild every prefix of two or more nodes for each chain.
# Stage 3: unnest to one row per prefix, repeating the other columns.
#
# The chain is split and re-joined on the full " -> " separator so the
# prefixes carry no stray trailing space. Each `[` has to follow the previous
# `]` on the same line: a line that starts with `[` is a parse error in R.
setDT(dd)[, v_chain := list(strsplit(v_chain, " -> ", fixed = TRUE))][
  , v_chain := list(lapply(v_chain, function(ele) {
    # seq_along(ele)[-1L] stays empty for one-node chains (2:1 would not).
    vapply(
      seq_along(ele)[-1L],
      function(i) paste(ele[seq_len(i)], collapse = " -> "),
      character(1)
    )
  }))][
  , .(v_chain = unlist(v_chain)), .(user_id, nr_v, root_v_chain, last_v)]
   user_id nr_v       root_v_chain last_v                  v_chain
1:  616905    5 3 -> 8 -> 16 -> 17     25                  3 -> 8 
2:  616905    5 3 -> 8 -> 16 -> 17     25            3 -> 8 -> 16 
3:  616905    5 3 -> 8 -> 16 -> 17     25      3 -> 8 -> 16 -> 17 
4:  616905    5 3 -> 8 -> 16 -> 17     25 3 -> 8 -> 16 -> 17 -> 25

您还可以使用dplyr和tidyr包:

library(dplyr); library(tidyr);
# Split each chain into nodes, rebuild every prefix of two or more nodes, then
# unnest to one row per prefix. Splitting and re-joining on the full " -> "
# separator avoids the trailing space left behind by splitting on "->".
dd %>%
  mutate(v_chain = strsplit(v_chain, " -> ", fixed = TRUE)) %>%
  mutate(v_chain = lapply(v_chain, function(ele) {
    # vapply() always returns character; seq_along(ele)[-1L] is empty for a
    # one-node chain, where 2:length(ele) would count down instead.
    vapply(
      seq_along(ele)[-1L],
      function(i) paste(ele[seq_len(i)], collapse = " -> "),
      character(1)
    )
  })) %>%
  unnest(v_chain)
Source: local data frame [4 x 5]

  user_id  nr_v       root_v_chain last_v                  v_chain
    (int) (int)              (chr)  (int)                    (chr)
1  616905     5 3 -> 8 -> 16 -> 17     25                  3 -> 8 
2  616905     5 3 -> 8 -> 16 -> 17     25            3 -> 8 -> 16 
3  616905     5 3 -> 8 -> 16 -> 17     25      3 -> 8 -> 16 -> 17 
4  616905     5 3 -> 8 -> 16 -> 17     25 3 -> 8 -> 16 -> 17 -> 25

答案 5 :(得分:0)

使用regex和data.table:

# Example input: one user with a single five-node chain.
dt <- data.table(
  user_id = 616905,
  v_chain = "3 -> 8 -> 16 -> 17 -> 25",
  nr_v = 5,
  root_v_chain = "3 -> 8 -> 16 -> 17",
  last_v = 25
)

# For each user, locate every number that follows "-> " and compute the
# position of its last character (match start + match length - 1). Each such
# position is where one sub-chain ends.
prefix_ends <- dt[
  ,
  lapply(
    gregexpr("(?<=->\\s)\\d+", v_chain, perl = TRUE),
    function(m) m + attr(m, "match.length") - 1
  ),
  by = "user_id"
]

# Merge the end positions back, assuming "user_id" is the unique key; this
# yields one row per sub-chain with its end position in the temp column V1.
dt <- merge(dt, prefix_ends, by = "user_id")

# Cut each chain at its end position and drop the temp column by reference.
dt[, `:=`(v_chain = substr(v_chain, 1, V1), V1 = NULL)]
dt
   user_id                  v_chain nr_v       root_v_chain last_v
1:  616905                   3 -> 8    5 3 -> 8 -> 16 -> 17     25
2:  616905             3 -> 8 -> 16    5 3 -> 8 -> 16 -> 17     25
3:  616905       3 -> 8 -> 16 -> 17    5 3 -> 8 -> 16 -> 17     25
4:  616905 3 -> 8 -> 16 -> 17 -> 25    5 3 -> 8 -> 16 -> 17     25