我有一个包含以下信息的专栏:
/**
 * Logs the pathname of the request that is about to be routed.
 *
 * @param {*} pathname - Value appended to the log message.
 */
function route(pathname) {
  var message = "About to route a request for " + pathname;
  console.log(message);
}

exports.route = route;
因此,任何一种ID类型在一行中出现的次数都不是固定的。
它们全部由制表符分隔。
我正在寻找一种方法来获取每一行的ID作为单独的列,并在1 x=abc1000000\ty=pqr2000000\tz=olk78fgzu_zuii8999_ikooo
2 x=oljhh88999\ty=lop9876666
3 x=frdt876544\ty=ztr6u76532\ty=uzrt899963\tz=wertttts_765342_ioooosww\tz=tzuuuee_66554422_88uuiiid
中找到tstrsplit
,但无法弄清楚如何将它与多个拆分参数一起使用。有什么想法吗?
编辑:预期格式为:
data.table
请注意,ID的名称中不包含“id”,所以我相应地更新了这个例子。同一个ID可能在每一行中出现多次。上面给出的格式只是为了使问题更清晰而举的一个例子。实际上,例如,X类型的ID在一行中可以出现20次。这样,X的列数就是整个数据集的所有行中该类型ID的最大出现次数。数据量非常大,大约有3000万行。
答案 0 :(得分:6)
新答案:
对于更新的示例,您可以按如下方式解决问题:
# Tag every input row with its row number so values can be traced back later.
dt[, rn := .I]
# Long format: one record per tab-separated "id=value" field.
dt2 <- dt[, .(V1 = unlist(tstrsplit(V1, "\t"))), by = rn]
# Separate each field into its id and its value.
dt2[, c("id", "value") := tstrsplit(V1, "=")]
# Number repeated ids within a row; the number becomes the column suffix.
dt2[, idn := seq_len(.N), by = .(rn, id)]
# Wide format: one column per id/occurrence pair.
dcast(dt2, rn ~ id + idn, value.var = "value", sep = "")
导致:
rn x1 y1 y2 z1 z2 1: 1 abc1000000 pqr2000000 NA olk78fgzu_zuii8999_ikooo NA 2: 2 oljhh88999 lop9876666 NA NA NA 3: 3 frdt876544 ztr6u76532 uzrt899963 wertttts_765342_ioooosww tzuuuee_66554422_88uuiiid
要获得准确的输出(因此也包括x2
列),您可以执行以下操作:
# Cross join of every row number, id and occurrence index, so that
# combinations absent from dt2 (such as x2) still yield an all-NA column.
all_combos <- with(dt2, CJ(rn = rn, id = id, idn = idn, unique = TRUE))
dcast(dt2[all_combos, on = .(rn, id, idn)],
      rn ~ id + idn, value.var = "value", sep = "")
导致:
rn x1 x2 y1 y2 z1 z2 1: 1 abc1000000 NA pqr2000000 NA olk78fgzu_zuii8999_ikooo NA 2: 2 oljhh88999 NA lop9876666 NA NA NA 3: 3 frdt876544 NA ztr6u76532 uzrt899963 wertttts_765342_ioooosww tzuuuee_66554422_88uuiiid
使用过的数据:
dt <- fread('"x=abc1000000\ty=pqr2000000\tz=olk78fgzu_zuii8999_ikooo"
"x=oljhh88999\ty=lop9876666"
"x=frdt876544\ty=ztr6u76532\ty=uzrt899963\tz=wertttts_765342_ioooosww\tz=tzuuuee_66554422_88uuiiid"',
header=FALSE)
回答原始问题:
如果您想使用tstrsplit
,可以按照以下方式进行操作:
# Tag every input row with its row number (added by reference).
dt[, rn := .I]
# One record per tab-separated field, keyed by the originating row.
fields <- dt[, .(V1 = unlist(tstrsplit(V1, "\t"))), by = rn]
# Keep only the id, i.e. the part in front of "=".
fields[, .(rn, id = gsub("([a-z0-9]+)(=.*$)", "\\1", V1))]
导致:
rn id 1: 1 xid1 2: 1 yid2 3: 1 zid3 4: 2 xid4 5: 2 yid5 6: 3 xid6 7: 3 yid7 8: 3 yid8 9: 3 zid9
导致宽格式输出的替代方案:
# Wide result: split each row into one column per tab-separated field,
# then reduce every field to the id in front of "=".
dt[, tstrsplit(V1, "\t")][
  , lapply(.SD, function(field) gsub("([a-z0-9]+)(=.*$)", "\\1", field))
]
导致:
V1 V2 V3 V4 1: xid1 yid2 zid3 NA 2: xid4 yid5 NA NA 3: xid6 yid7 yid8 zid9
如果你想将所有id提取为@UweBlock,你也可以这样做(虽然比UweBlock的方法稍微简单一点):
# Collect every id of the form <letter>id<digit> per row, wherever it occurs.
l <- regmatches(dt$V1, gregexpr("([a-z]{1}id[0-9]{1})", dt$V1))
# Turn each row's matches into a one-row table with columns v1, v2, ...
l <- lapply(l, function(ids) {
  row <- as.data.frame.list(ids)
  names(row) <- paste0("v", seq_along(row))
  as.data.table(row)
})
# Stack the rows; fill = TRUE lets rows with fewer ids be combined.
rbindlist(l, fill = TRUE)
导致:
v1 v2 v3 v4 v5 1: xid1 yid2 zid3 NA NA 2: xid4 yid5 NA NA NA 3: xid6 yid7 yid8 zid8 zid9
使用过的数据:
dt <- fread('"xid1=abc1000000\tyid2=pqr2000000\tzid3=olk78fgzu_zuii8999_ikooo"
"xid4=oljhh88999\tyid5=lop9876666"
"xid6=frdt876544\tyid7=ztr6u76532\tyid8=uzrt899963tzid8=wertttts_765342_ioooosww\tzid9=tzuuuee_66554422_88uuiiid"',header=FALSE)
答案 1 :(得分:5)
您没有指定输出应该是什么样子。为了赶在akrun之前给出答案,这里先给出一个列表,其中每个元素代表您的一行数据。
在此解决方案中,先按制表符拆分每一行,然后查找形如 [xyz]id[整数] 的模式。
x <- c("xid1=abc1000000\tyid2=pqr2000000\tzid3=olk78fgzu_zuii8999_ikooo",
       "xid4=oljhh88999\tyid5=lop9876666",
       "xid6=frdt876544\tyid7=ztr6u76532\tyid8=uzrt899963tzid8=wertttts_765342_ioooosww\tzid9=tzuuuee_66554422_88uuiiid")

# For every input line: split it at tabs and keep only the id that precedes
# "=" in each field. Each line's ids stay wrapped in the list that strsplit()
# returns, so the result is a list of one-element lists.
res <- lapply(x, function(line) {
  fields <- strsplit(line, "\t")
  lapply(fields, function(field) {
    gsub(pattern = "(^[[:alpha:]]id\\d+)(=.*$)", replacement = "\\1", x = field)
  })
})
# Make sure the result carries no element names.
res <- unname(res)
res
[[1]]
[[1]][[1]]
[1] "xid1" "yid2" "zid3"
[[2]]
[[2]][[1]]
[1] "xid4" "yid5"
[[3]]
[[3]][[1]]
[1] "xid6" "yid7" "yid8" "zid9"
如果省略simplify = FALSE
但未取消对结果的命名,则可以
$`xid1=abc1000000\tyid2=pqr2000000\tzid3=olk78fgzu_zuii8999_ikooo`
[,1]
[1,] "xid1"
[2,] "yid2"
[3,] "zid3"
$`xid4=oljhh88999\tyid5=lop9876666`
[,1]
[1,] "xid4"
[2,] "yid5"
$`xid6=frdt876544\tyid7=ztr6u76532\tyid8=uzrt899963tzid8=wertttts_765342_ioooosww\tzid9=tzuuuee_66554422_88uuiiid`
[,1]
[1,] "xid6"
[2,] "yid7"
[3,] "yid8"
[4,] "zid9"
如果您不关心每个元素的来源,您可以
# Flatten the ids of all lines into a single character vector.
rapply(as.list(x), f = function(line) {
  fields <- strsplit(line, "\t")
  # Keep only the id in front of "=" in every tab-separated field.
  lapply(fields, function(field) {
    gsub(pattern = "(^[[:alpha:]]id\\d+)(=.*$)", replacement = "\\1", x = field)
  })
})
[1] "xid1" "yid2" "zid3" "xid4" "yid5" "xid6" "yid7" "yid8" "zid9"
但即便如此,也可以使用第一种解决方案(计算每个列表中元素的数量)来重建。
答案 2 :(得分:3)
OP现在已经指定了预期结果,并且还更新了样本数据集。所以,答案必须相应更新。
以下方法与Jaap's answer一样使用data.table,但有以下不同:
- tstrsplit() 仅调用一次以拆分所有列;
- melt() 用于融合多个列,产生两个值列,一个用于ID,一个用于有效载荷值;
- dcast() 用于相应地重命名列。
下面的代码应该适用于OP所要求的每行任意数量的id-value对:
library(data.table) # CRAN version 1.10.4 used

# One pass over the raw strings: cut at every "=" and every tab, which
# yields alternating id / value columns.
splitted <- DT[, tstrsplit(V1, "=|\t")]
# Recycled logical masks pick the odd (id) and even (value) column names.
cols_odd <- names(splitted)[c(TRUE, FALSE)]
cols_even <- names(splitted)[c(FALSE, TRUE)]
# The row number is added by reference so each id/value pair keeps its origin.
splitted[, rn := .I]
# Melt the id columns and the value columns in parallel; drop NA entries.
molten <- melt(
  splitted,
  measure.vars = list(cols_odd, cols_even),
  value.name = c("id", "value"),
  na.rm = TRUE
)
# Within each row, number the occurrences of every id consecutively.
molten[, var.count := rowid(id), by = rn]
# Long to wide: column names are the id followed by its per-row occurrence
# count; rn is dropped afterwards because it is no longer needed.
dcast(molten, rn ~ id + var.count, sep = "")[, rn := NULL][]
结果
x1 y1 y2 z1 z2
1: abc1000000 pqr2000000 NA olk78fgzu_zuii8999_ikooo NA
2: oljhh88999 lop9876666 NA NA NA
3: frdt876544 ztr6u76532 uzrt899963 wertttts_765342_ioooosww tzuuuee_66554422_88uuiiid
与上述结果相反,OP预期的结果包括空x2
列。如果这是OP的真正意图,则需要在最终dcast()
之前添加缺少的行。这是通过加入data.table
来实现的,其中包含每行的行号,ID和id计数的所有可能组合,如下所示:
# Add the missing (row, id, occurrence) combinations before the final dcast()
# so that all-NA columns such as x2 show up in the result.
# The CJ() columns are named explicitly and joined by those names: relying on
# the default V1/V2/V3 names fails on data.table releases where CJ()
# auto-names its inputs.
dcast(molten[CJ(rn = rn, id = id, var.count = var.count, unique = TRUE),
             on = .(rn, id, var.count)],
      rn ~ id + var.count, value.var = "value", sep = "")[, rn := NULL][]
产生
x1 x2 y1 y2 z1 z2
1: abc1000000 NA pqr2000000 NA olk78fgzu_zuii8999_ikooo NA
2: oljhh88999 NA lop9876666 NA NA NA
3: frdt876544 NA ztr6u76532 uzrt899963 wertttts_765342_ioooosww tzuuuee_66554422_88uuiiid
library(data.table)
DT <- fread('"x=abc1000000\ty=pqr2000000\tz=olk78fgzu_zuii8999_ikooo"
"x=oljhh88999\ty=lop9876666"
"x=frdt876544\ty=ztr6u76532\ty=uzrt899963\tz=wertttts_765342_ioooosww\tz=tzuuuee_66554422_88uuiiid"',
header=FALSE)
由于问题的主题是使用正则表达式将列拆分为单独的列而OP明确要求提供关于如何使用 tstrsplit
的提示多个拆分参数所有答案都集中在拆分列上。
但是,如果OP只感兴趣获取每行的ID作为单独的列 ,则可以使用简单的解决方案:
stringr::str_extract_all(DT$V1, "\\wid\\d", simplify = TRUE)
# [,1] [,2] [,3] [,4] [,5]
#[1,] "xid1" "yid2" "zid3" "" ""
#[2,] "xid4" "yid5" "" "" ""
#[3,] "xid6" "yid7" "yid8" "zid8" "zid9"
或在data.table
上下文中:
DT[, data.table(stringr::str_extract_all(V1, "\\wid\\d", simplify = TRUE))]
# V1 V2 V3 V4 V5
#1: xid1 yid2 zid3
#2: xid4 yid5
#3: xid6 yid7 yid8 zid8 zid9
请注意,在第三行中找到的ID zid8
在所有其他答案中都缺失。其他答案正在寻找将字符串拆分为\t
(制表符)。可能是给定数据集的第三行中缺少\
,因此该部分为tzid8=
而不是\tzid8=
。
如问题中所述(包括潜在错误)。
DT <- fread('"xid1=abc1000000\tyid2=pqr2000000\tzid3=olk78fgzu_zuii8999_ikooo"
"xid4=oljhh88999\tyid5=lop9876666"
"xid6=frdt876544\tyid7=ztr6u76532\tyid8=uzrt899963tzid8=wertttts_765342_ioooosww\tzid9=tzuuuee_66554422_88uuiiid"',
header=FALSE)
答案 3 :(得分:2)
对于更新的示例和所需的结果:
library(tidyverse)

# tibble() is used instead of data_frame(), which is deprecated.
df <- tibble(x = c("x=abc1000000\ty=pqr2000000\tz=olk78fgzu_zuii8999_ikooo",
                   "x=oljhh88999\ty=lop9876666",
                   "x=frdt876544\ty=ztr6u76532\ty=uzrt899963\tz=wertttts_765342_ioooosww\tz=tzuuuee_66554422_88uuiiid"))

df_tidy <- df %>%
  # remember which input row each value came from
  mutate(row = row_number()) %>%
  # one record per tab-separated "var=val" field
  separate_rows(x, sep = '\t') %>%
  separate(x, c('var', 'val'), sep = '=') %>%
  # number repeated variables within a row
  group_by(row, var) %>%
  mutate(n = row_number()) %>%
  # build the wide column names (x1, y1, y2, ...) and reshape to wide
  unite(var, var, n, sep = '') %>%
  spread(var, val)

df_tidy
#> # A tibble: 3 x 6
#> # Groups: row [3]
#> row x1 y1 y2 z1
#> * <int> <chr> <chr> <chr> <chr>
#> 1 1 abc1000000 pqr2000000 <NA> olk78fgzu_zuii8999_ikooo
#> 2 2 oljhh88999 lop9876666 <NA> <NA>
#> 3 3 frdt876544 ztr6u76532 uzrt899963 wertttts_765342_ioooosww
#> # ... with 1 more variables: z2 <chr>
对于原始示例,如果要以整洁的形式捕获所有数据,
library(tidyverse)

# tibble() is used instead of data_frame(), which is deprecated.
df <- tibble(x = c("xid1=abc1000000\tyid2=pqr2000000\tzid3=olk78fgzu_zuii8999_ikooo",
                   "xid4=oljhh88999\tyid5=lop9876666",
                   "xid6=frdt876544\tyid7=ztr6u76532\tyid8=uzrt899963tzid8=wertttts_765342_ioooosww\tzid9=tzuuuee_66554422_88uuiiid"))

df_tidy <- df %>%
  # remember which input row each value came from
  mutate(row_id = row_number()) %>%
  # one record per tab-separated field
  separate_rows(x, sep = '\t') %>%
  # split at the first separator only; extra = 'merge' keeps the remainder
  # of the field together in `value`
  separate(x, c('id', 'value'), extra = 'merge')

df_tidy
#> # A tibble: 9 x 3
#> row_id id value
#> * <int> <chr> <chr>
#> 1 1 xid1 abc1000000
#> 2 1 yid2 pqr2000000
#> 3 1 zid3 olk78fgzu_zuii8999_ikooo
#> 4 2 xid4 oljhh88999
#> 5 2 yid5 lop9876666
#> 6 3 xid6 frdt876544
#> 7 3 yid7 ztr6u76532
#> 8 3 yid8 uzrt899963tzid8=wertttts_765342_ioooosww
#> 9 3 zid9 tzuuuee_66554422_88uuiiid
如果想按 = 或 _ 进一步拆分,可以再添加几次 separate_rows 调用。
答案 4 :(得分:0)
我们可以使用ls |xargs sed -i 's/google_ad_channel = "1013717472xx"/google_ad_channel = "new_code"/g'
tidyverse
如果我们需要宽幅
library(tidyverse)

# Long table of ids: one record per tab-separated field of every element of x.
# tibble() is used instead of data_frame(), which is deprecated.
xl <- x %>%
  tibble(id = .) %>%
  # keep the position of each input string as column `rn`
  rownames_to_column(., 'rn') %>%
  separate_rows(id, sep = '\t') %>%
  # keep only the alphanumeric run directly in front of "="
  mutate(id = str_extract(id, "[[:alnum:]]+(?=\\=)"))

xl
# A tibble: 9 x 2
# rn id
# <chr> <chr>
#1 1 xid1
#2 1 yid2
#3 1 zid3
#4 2 xid4
#5 2 yid5
#6 3 xid6
#7 3 yid7
#8 3 yid8
#9 3 zid9
xl %>%
group_by(rn) %>%
mutate(Seq = paste0("V", row_number())) %>%
spread(Seq, id)
# A tibble: 3 x 5
# Groups: rn [3]
# rn V1 V2 V3 V4
#* <chr> <chr> <chr> <chr> <chr>
#1 1 xid1 yid2 zid3 <NA>
#2 2 xid4 yid5 <NA> <NA>
#3 3 xid6 yid7 yid8 zid9