生成数据

Question

我的观察数据超过1000万，字符串变量为dat_text。我试图给每个观察指标变量（IV）。

当字符串dat_text 包含时，如果包含pat_text AND 中的任何字符串的任何表单，则IV将为1排除包含ex_text中任何字符串的任何形式（请参阅编辑的含义）。我试图在R中实现这个。

dat_text <- c("dbhgfadgdfgc", "sdfdsfsdgdfxgfydz", "fqdfsbfdjhdhts","dbhgfghfadgdfgc", "sdfdghsfsdgdfxgfydz", "fqdfsbfdjhfghdhts", "fdsafgdjfx", "dfdoslfspd")
ex_text <- c("fgh", "opl")
pat_text <- c("abc", "xyz", "jbq")

我创建了一个内置循环的函数：

myfunction <- function(pat_text, ex_text, dat_text){
# =========PART 1: pat_text========================
logic_tem <- list()
for(i in 1:length(pat_text)){# for each phrase in "pat_text"
  temp <- list()
  for(t in 1:nchar(pat_text[i])){# for each character in the phrase
    temp[[t]] <- grepl(substring(pat_text[i], t, t), dat_text) 
  }
  # Use "AND" to connect multiple logic vectors
  temp <- do.call(cbind, temp)
  logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}

logic_tem <- do.call(cbind, logic_tem)
logic_pattext <- rowSums(logic_tem) > 0

# =========PART 2: ex_text========================
logic_tem <- list()
for(i in 1:length(ex_text)){# for each phrase in "ex_text"
  temp <- list()
  for(t in 1:nchar(ex_text[i])){# for each character in the phrase
    temp[[t]] <- grepl(substring(ex_text[i], t, t), dat_text) 
  }
  temp <- do.call(cbind, temp)
  logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
# Use "OR" to connect multiple logic vectors
logic_tem <- do.call(cbind, logic_tem)
logic_extext <- rowSums(logic_tem) > 0

# =========PART 3: combine the two parts=========
return(logic_pattext & !logic_extext)
}

此功能运作良好：

> myfunction(pat_text, ex_text, dat_text)
[1] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE

但是当我在原始数据上实现它时，结果却相当缓慢且效率低下。我问是否有人能提供任何提高R中此功能性能的线索？感谢

编辑我不清楚状态＆＃34;任何字符串的排列＆＃34;。通过那个短语，我实际上意味着观察包含该字符串中的所有字母。如果pat_text为fgh，则以下短语均符合要求：

"fgh", "00000f00000g00000h", "00000g00000h00000f", "000000h00000f00000g"

实际上，上述模式中的0可以是任何字符或数字。我意识到这个问题实际上在这个版本之后成了一个完全不同的问题。当我写这篇文章时，我真的没有意识到这是一个完全不同的问题。我很抱歉。

Answer 1

以下是函数的矢量化方法：

myfunction <- function(pat_text, ex_text, dat_text){

    sep_pat_text = strsplit(pat_text,"")
    result = lapply(sep_pat_text, FUN  = function(k){
        testLetter = lapply(k, grepl, x = dat_text)
        resultLetter = do.call(cbind, testLetter)
        apply(resultLetter, 1, all)
    })
    include = apply(do.call(cbind, result), 1, any)


    sep_ex_text = strsplit(ex_text,"")
    result = lapply(sep_ex_text, FUN  = function(k){
        testLetter = lapply(k, grepl, x = dat_text)
        resultLetter = do.call(cbind, testLetter)
        apply(resultLetter, 1, all)
    })
    dontInclude = apply(do.call(cbind, result), 1, any)

    return(!dontInclude & include)
}

将此功能应用于输入样本结果：

> myfunction(pat_text, ex_text, dat_text)
[1] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE

我无法保证这会表现得更好，但请试一试并评论您的结果。

编辑：通过将重复的代码组合到内部函数中，可以简化此功能。

myfunction <- function(pat_text, ex_text, dat_text){
    testLetters = function(text, pattern){
        sep_pat = strsplit(pattern, "")
        result = lapply(sep_pat, FUN  = function(k){
            testLetter = lapply(k, grepl, x = text)
            resultLetter = do.call(cbind, testLetter)
            apply(resultLetter, 1, all)
        })
        return(apply(do.call(cbind, result), 1, any))
    }    

    include = testLetters(dat_text, pat_text)
    dontInclude = testLetters(dat_text, ex_text)

    return(!dontInclude & include)
}

Answer 2

@R。 Schifini我运行了一个基准测试代码来测试哪个函数运行得更快，这是我的结果。您可以尝试运行相同的代码来确认代码的可复制性。

生成数据

dat_text <- stringi::stri_rand_strings(10^6, 5)
ex_text <- c("fgh", "opl")
pat_text <- c("abc", "xyz", "jbq")

功能1：原始海报提供的功能

myfunction1 <- function(pat_text, ex_text, dat_text){
# =========PART 1: pat_text========================
logic_tem <- list()
for(i in 1:length(pat_text)){# for each phrase in "pat_text"
  temp <- list()
  for(t in 1:nchar(pat_text[i])){# for each character in the phrase
    temp[[t]] <- grepl(substring(pat_text[i], t, t), dat_text) 
  }
  # Use "AND" to connect multiple logic vectors
  temp <- do.call(cbind, temp)
  logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}

logic_tem <- do.call(cbind, logic_tem)
logic_pattext <- rowSums(logic_tem) > 0

# =========PART 2: ex_text========================
logic_tem <- list()
for(i in 1:length(ex_text)){# for each phrase in "ex_text"
  temp <- list()
  for(t in 1:nchar(ex_text[i])){# for each character in the phrase
    temp[[t]] <- grepl(substring(ex_text[i], t, t), dat_text) 
  }
  temp <- do.call(cbind, temp)
  logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
# Use "OR" to connect multiple logic vectors
logic_tem <- do.call(cbind, logic_tem)
logic_extext <- rowSums(logic_tem) > 0

# =========PART 3: combine the two parts=========
return(logic_pattext & !logic_extext)
}

功能2：@R的第一个功能。 Schifini

myfunction2 <- function(pat_text, ex_text, dat_text){
sep_pat_text = strsplit(pat_text,"")
result = lapply(sep_pat_text, FUN = function(k){
testLetter = lapply(k, grepl, x = dat_text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
include = apply(do.call(cbind, result), 1, any)
sep_ex_text = strsplit(ex_text,"")
result = lapply(sep_ex_text, FUN = function(k){
testLetter = lapply(k, grepl, x = dat_text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
dontInclude = apply(do.call(cbind, result), 1, any)
return(!dontInclude & include)
}

功能3：@R的第二个功能。 Schifini

myfunction3 <- function(pat_text, ex_text, dat_text){
testLetters = function(text, pattern){
sep_pat = strsplit(pattern, "")
result = lapply(sep_pat, FUN = function(k){
testLetter = lapply(k, grepl, x = text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
return(apply(do.call(cbind, result), 1, any))
}
include = testLetters(dat_text, pat_text)
dontInclude = testLetters(dat_text, ex_text)
return(!dontInclude & include)
}

基准

microbenchmark::microbenchmark(
myfunction1(pat_text, ex_text, dat_text),
myfunction2(pat_text, ex_text, dat_text),
myfunction3(pat_text, ex_text, dat_text))

>## Unit: seconds
>## expr min lq mean median uq max neval
>## myfunction1(pat_text, ex_text, dat_text) 3.284922 3.443022 3.605378 3.594186 3.698748 4.041584 100
>## myfunction2(pat_text, ex_text, dat_text) 12.134576 13.457712 13.802636 13.710624 14.765376 16.084844 100
>## myfunction3(pat_text, ex_text, dat_text) 12.136296 13.522227 13.812180 13.719780 14.662117 17.126667 100

Answer 3

我建议您使用一些tidyverse工具，特别是stringr::str_detect，正则表达式和dplyr。没有原始数据，我不知道会有多快，但我认为最有可能。它将类似于以下内容。您可以组合部件以缩短部件，但我认为这是最具可读性的。我也在诠释＆＃34;任何字符串的任何排列＆＃34;例如，dat_text中的单个字符串不能包含"fgh"中的ex_text，但可以包含"hgf"，"fgh"必须完全包含library(tidyverse) # Make dat_text a column in a data frame dat_txt_tbl <- tibble(dat_text) # Make regular expressions ex_rgx <- str_c(ex_text, sep = "|") pat_rgx <- str_c(pat_text, sep = "|") dat_txt_tbl %>% mutate(inc_pat_txt = str_detect(dat_text, pat_rgx)) %>% mutate(inc_ex_txt = str_detect(ex_text, ex_rgx)) %>% mutate(IV = inc_pat_txt & (!inc_ex_txt))一个要计数的字符串。

{{1}}

IV希望是你想要的。没有一个例子，它很难调试。

Answer 4

根据问题的新定义进行更新。不确定这是否会更快，但您可以从这种方法中挑选一些技术：

library(stringr)

pat_text <- c("abc", "xyz", "jbq")
ex_text <- c("fgh", "opl")
dat_text <- c("dbhgfadgdfgc", "sdfdsfsdgdfxgfydz", "fqdfsbfdjhdhts","dbhgfghfadgdfgc", "sdfdghsfsdgdfxgfydz", "fqdfsbfdjhfghdhts", "fdsafgdjfx", "dfdoslfspd")

pat_chars <- strsplit(pat_text, "")
ex_chars <- strsplit(ex_text, "")

mat_pat <- lapply(dat_text, function(x) sapply(pat_chars, str_detect, string = x))
mat_ex <- lapply(dat_text, function(x) sapply(ex_chars, str_detect, string = x))

match_pat <- apply(sapply(mat_pat, function(x) apply(x, 2, all)), 2, any)
match_ex <- apply(sapply(mat_ex, function(x) apply(x, 2, all)), 2, any)

result <- match_pat & !match_ex
result
# [1] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE

<小时/> 回答原始（未编辑）问题的旧代码

注意：保留以下代码，因为它说明了一些概念，以防有人偶然发现这个问题。

所以我不确定您当前的方法是否正常工作，因为我找不到pat_text的任何匹配项？ FALSE 和 anything始终为FALSE。无论如何，我认为这就是你想要的。可能会被优化和清理 - 关键是所有的排列。

library(tidyverse)
library(combinat)
library(stringr)

get_permutations <- function(text) {
  strsplit(text, "") %>%
    map(permn) %>%
    data.frame(check.names = FALSE) %>%
    map(paste0, collapse = "") %>%
    unname %>% 
    unlist
}

pat_perm <- get_permutations(pat_text)
pat_perm
# [1] "abc" "acb" "cab" "cba" "bca" "bac" "xyz" "xzy" "zxy" "zyx" "yzx" "yxz" "jbq" "jqb" "qjb" "qbj"
# [17] "bqj" "bjq"

ex_perm <- get_permutations(ex_text)
ex_perm
# [1] "fgh" "fhg" "hfg" "hgf" "ghf" "gfh" "opl" "olp" "lop" "lpo" "plo" "pol"

match_pat_perm <- str_detect(dat_text, paste0(pat_perm, collapse = "|"))
match_ex_perm <- str_detect(dat_text, paste0(ex_perm, collapse = "|"))

result <- match_pat_perm & !match_ex_perm
result

# Included to show how you might get the locations.
str_locate_all(dat_text, paste0(pat_perm, collapse = "|")) 
str_locate_all(dat_text, paste0(ex_perm, collapse = "|"))

注意：您的问题表明结果应该是整数，但是您的示例另有说明;无论如何，你总是可以做as.integer(result)。

r

4 个答案:

生成数据

功能1：原始海报提供的功能

功能2：@R的第一个功能。 Schifini

功能3：@R的第二个功能。 Schifini

基准