Question

我有一个字符向量，想从中提取一些字符串。我可以用循环实现，但想知道能否在不使用循环的情况下完成同样的操作。下面附上示例向量和我的代码。

# Sample input: each element is a comma-separated string.
egVec <- c(
  "a - (2),bewc",
  "c,d,e",
  "efd, ejw, qdn",
  "we3, asw - 23"
)

我想提取每个字符串中第一个逗号之前的部分（即按逗号拆分后的第一个元素），使得所需的输出为：

Vec1
  [1] "a - (2)" "c" "efd" "we3"

我使用for循环的代码：

# Collect the text before the first comma of every element of egVec.
# The result is preallocated to the input's length (rather than seeded with
# as.character(0), which is the string "0"), and seq_along() is used so that
# an empty egVec produces an empty result instead of iterating over c(1, 0).
Vec1 <- character(length(egVec))
for (i in seq_along(egVec)) {
  # strsplit() returns a list; unlist() flattens the single element's pieces.
  SplitVec <- unlist(strsplit(egVec[i], ","))
  # Keep only the first piece (NA if the element had nothing to split).
  Vec1[i] <- SplitVec[1]
}

Answer 1

library(purrr)
library(stringi)

# Sample input: each element is a comma-separated string. Every approach below
# returns the part of each element that precedes its first comma.
egVec <- c("a - (2),bewc", "c,d,e","efd, ejw, qdn", "we3, asw - 23")

# Split on "," and take piece 1 of every list element; vapply() checks that
# each result matches the character(1) template.
strsplit(egVec, ",") %>%
  vapply(`[`, character(1), 1)                     # type-safe base R
## [1] "a - (2)" "c"       "efd"     "we3"

# Same idea with sapply(), which does not check the type of the results.
strsplit(egVec, ",") %>%
  sapply(`[`, 1)                                   # non-type-safe base R
## [1] "a - (2)" "c"       "efd"     "we3"

# purrr: map_chr(1) extracts element 1 of each piece list as a character vector.
strsplit(egVec, ",") %>%
  map_chr(1)                                       # type-safe tidyverse
## [1] "a - (2)" "c"       "efd"     "we3"

# Split into at most 2 pieces, simplify to a matrix, and keep its first column.
stri_split_fixed(egVec, ",", 2, simplify=TRUE)[,1] # stringi one-liner splitting strings
## [1] "a - (2)" "c"       "efd"     "we3"

# Delete everything from the first comma to the end of the string.
gsub(",.*$", "", egVec)                            # base R one-liner string replacing
## [1] "a - (2)" "c"       "efd"     "we3"

# Same regex deletion done with stringi.
stri_replace_first_regex(egVec, ",.*$", "")        # stringi one-liner string replacing
## [1] "a - (2)" "c"       "efd"     "we3"

基准：

library(microbenchmark)
library(ggplot2)

# Time each extraction approach on egVec. Every expression is named so the
# printed summary labels it by approach; the timings are stored in `mb`.
mb <- microbenchmark(
  vapply = strsplit(egVec, ",") %>% vapply(`[`, character(1), 1),
  sapply = strsplit(egVec, ",") %>% sapply(`[`, 1),
  map_chr = strsplit(egVec, ",") %>% map_chr(1),
  stri_split = stri_split_fixed(egVec, ",", 2, simplify = TRUE)[, 1],
  gsub = gsub(",.*$", "", egVec),
  stri_replace = stri_replace_first_regex(egVec, ",.*$", "")
)

# Print the timing summary; the lines starting with "##" are the recorded output.
mb
## Unit: microseconds
##          expr     min       lq      mean   median       uq      max neval cld
##        vapply 109.657 140.6025 169.51454 159.9715 181.4645 1102.825   100   b
##        sapply 125.206 147.8225 176.49470 172.4420 196.8730  396.046   100   b
##       map_chr 123.767 145.7385 179.12090 177.9535 198.2710  325.098   100   b
##    stri_split   6.626  12.7120  15.60843  14.6755  17.6315   68.299   100  a 
##          gsub  13.912  20.5335  24.99184  23.8180  28.1800   45.563   100  a 
##  stri_replace  17.532  25.8590  30.81416  28.9465  31.0715  170.869   100  a

# Plot the benchmark timings (relies on ggplot2, loaded above).
autoplot(mb)

（这并不是最理想的基准测试方式，但我猜 stri_split… 会是最快的。）

我也习惯性地使用gsub()而忘了用sub()。sub()的基准测试结果与gsub()几乎相同，但用sub()来与stri_replace_first_regex()比较会更公平，因为两者都只替换第一个匹配。

从向量中提取字符串而不使用循环

1 个答案: