使用R

时间:2017-01-06 14:41:52

标签: arrays r string

我试图找到一种快速方法,在R中的数组里搜索特定字符串,有点像Boggle游戏,只不过你事先已经知道要找的单词。

您可以按以下方向移动字符串的下一个字母:向上,向下,向右或向左

比如说一个简单的例子,你有一个表格数组:

> G    
A, Q, A, Q, Q,  
A, Q, P, Q, Q,   
Q, Q, P, L, Q,   
Q, Q, Q, E, Q

并且您想用字符串APPLE对G调用某个函数:如果此数组中存在APPLE,函数返回TRUE;如果不存在,则返回FALSE。

是否存在可以做到这一点的预制功能或包,或者有一种聪明的方法来做到这一点,我在处理R中的字符串方面相对较新,而我正在努力看到一种方式。

任何帮助非常感谢。感谢。

4 个答案:

答案 0 :(得分:2)

这将首先检查你的单词中是否存在数组中不存在的任何字符,然后检查数组中的字符数是否足以满足单词中的重复字母

# Target word split into single letters (kept as the one-element list that
# strsplit() returns) and the board flattened into a vector of letters.
word <- strsplit("APPLE", split = "")
pool <- c(
  "A", "Q", "A", "Q", "Q",
  "A", "Q", "P", "Q", "Q",
  "Q", "Q", "P", "L", "Q",
  "Q", "Q", "Q", "E", "Q"
)

# Letter frequencies of the pool and of the word.
t.pool <- table(pool)
t.word <- table(word)

# Check 1: every distinct letter of the word occurs somewhere in the pool.
all(names(t.word) %in% names(t.pool))
# Check 2: the pool holds at least as many copies of each letter as the word
# needs (no count difference is negative).
all(t.pool[names(t.word)] - t.word >= 0)

最后两个表达式都会输出TRUE,表明word中的所有字母都存在于pool中,且word中每个字母的出现次数不大于它在pool中的出现次数。

写成函数形式:找到则返回TRUE,否则返回FALSE。

#' Check whether a pool of letters contains enough letters to spell a word
#'
#' This is a letter-count test only: it does not look at where the letters
#' sit relative to each other, so it cannot tell whether they are adjacent.
#'
#' @param word A single character string.
#' @param pool A vector of single letters.
#' @return `TRUE` if every letter of `word` occurs in `pool` at least as many
#'   times as it occurs in `word`, otherwise `FALSE`. An empty word yields
#'   `TRUE`.
word.find <- function(word, pool) {
  needed <- table(strsplit(word, "")[[1]])
  available <- table(pool)
  letters_needed <- names(needed)

  # An empty word needs no letters; answering here avoids min()/all() on an
  # empty vector.
  if (length(letters_needed) == 0L) {
    return(TRUE)
  }

  # `&&` short-circuits, so the count comparison only runs once every letter
  # is known to exist in the pool (no NA counts can appear).
  all(letters_needed %in% names(available)) &&
    all(as.vector(available[letters_needed]) >= as.vector(needed))
}

word.find("APPLE", pool)
[1] TRUE

word.find("APPLES", pool)
[1] FALSE

word.find("APPLEE", pool)
[1] FALSE

答案 1 :(得分:2)

此功能仅使用基础R

功能

// NOTE(review): this fragment looks like Scala written against a stream-processing
// API and appears unrelated to the surrounding R answer — confirm it belongs here.
// It connects ipStream with routeStream, keys both inputs by the constant 0, and
// passes the connected streams through MyRichCoFlatMapFunction.
ipStream
  .connect(routeStream)
  .keyBy(_ => 0, _ => 0)
  .flatMap(new MyRichCoFlatMapFunction) // with ValueState[RoutingTable]

**USAGE**

#' Find straight-line occurrences of a word in a letter matrix
#'
#' Each row and each column is collapsed into a string and searched for the
#' word (reading right / down) and for the reversed word (reading left / up).
#' Only straight lines are searched; paths that turn are not found.
#'
#' @param matrix_array A matrix of single characters.
#' @param word_to_search A single, non-missing character string. It is matched
#'   literally, not as a regular expression.
#' @return A data frame with character columns `ROW`, `COLUMN` and
#'   `DIRECTION`, one line per row/column and direction that contains a match.
#'   For `"RIGHT"`/`"LEFT"` hits, `ROW` is the row index and `COLUMN` holds the
#'   start offset(s) of the matched text within the row string, joined by
#'   `", "`. For `"DOWN"`/`"UP"` hits, `COLUMN` is the column index and `ROW`
#'   holds the start offset(s) within the column string. A data frame with
#'   zero rows is returned when nothing is found.
search_string <- function(matrix_array, word_to_search) {
  stopifnot(
    is.character(word_to_search),
    length(word_to_search) == 1L,
    !is.na(word_to_search)
  )

  empty_result <- data.frame(
    ROW = character(0), COLUMN = character(0), DIRECTION = character(0),
    stringsAsFactors = FALSE
  )
  if (!nzchar(word_to_search)) {
    return(empty_result)
  }

  reversed_word <- paste(rev(strsplit(word_to_search, NULL)[[1]]), collapse = "")

  # Start offsets of the literal `pattern` in `text`, formatted "a, b, ...";
  # NULL when there is no occurrence. fixed = TRUE keeps characters such as
  # "." or "*" in the word from being interpreted as regex syntax.
  locate <- function(pattern, text) {
    starts <- gregexpr(pattern, text, fixed = TRUE)[[1]]
    if (starts[1] == -1L) NULL else paste(starts, collapse = ", ")
  }

  # rbind() ignores NULL arguments, so each element below is either NULL or a
  # character matrix with one line per direction that matched.
  row_hits <- lapply(seq_len(nrow(matrix_array)), function(i) {
    text <- paste(matrix_array[i, ], collapse = "")
    rightward <- locate(word_to_search, text)
    leftward <- locate(reversed_word, text)
    rbind(
      if (!is.null(rightward)) c(i, rightward, "RIGHT"),
      if (!is.null(leftward)) c(i, leftward, "LEFT")
    )
  })

  col_hits <- lapply(seq_len(ncol(matrix_array)), function(j) {
    text <- paste(matrix_array[, j], collapse = "")
    downward <- locate(word_to_search, text)
    upward <- locate(reversed_word, text)
    rbind(
      if (!is.null(downward)) c(downward, j, "DOWN"),
      if (!is.null(upward)) c(upward, j, "UP")
    )
  })

  found <- do.call(rbind, c(row_hits, col_hits))
  if (is.null(found)) {
    return(empty_result)
  }

  position <- as.data.frame(found, stringsAsFactors = FALSE)
  colnames(position) <- c("ROW", "COLUMN", "DIRECTION")
  rownames(position) <- NULL
  position
}

答案 2 :(得分:1)

添加另一种方法:

# 4 x 5 letter board; the letters are listed column by column.
board <- matrix(
  c("A", "A", "Q", "Q",
    "Q", "Q", "Q", "Q",
    "A", "P", "P", "Q",
    "Q", "Q", "L", "E",
    "Q", "Q", "Q", "Q"),
  nrow = 4L, ncol = 5L
)

# Word to look for on the board.
word <- "APPLE"

我们从以下代码开始:
# For every letter of the word, the (row, col) positions on the board holding it.
letters_of_word <- strsplit(word, NULL)[[1]]
matches <- lapply(letters_of_word, function(ch) which(board == ch, arr.ind = TRUE))

这是一个简单的 - 可能不可避免的 - 搜索“板”的索引,匹配单词的每个字母。它是一个包含行/列索引的“列表”,如:

#[[1]]
#     row col
#[1,]   1   1
#[2,]   2   1
#[3,]   1   3
#
#[[2]]
#     row col
#[1,]   2   3
#[2,]   3   3
#
##.....

有了这个,我们需要逐步找出每个元素中的索引是否在下一个元素中具有邻居(即右/左/上/下单元格)。例如。我们需要这样的东西:

# Adjacency table between the board positions of the 1st and the 2nd letter,
# converted to a plain matrix for printing.
# NOTE(review): find_neighbours is only defined inside ff() further down, so
# this call presumably needs a top-level copy of it — confirm.
as.matrix(find_neighbours(matches[[1]], matches[[2]], dim(board)))
#      [,1]  [,2]
#[1,] FALSE FALSE
#[2,] FALSE FALSE
#[3,]  TRUE FALSE

这告诉我们,matches[[1]]的第3行是matches[[2]]的第1行的邻居,即[1, 3]和[2, 3]确实是相邻的单元格。对于“matches”中每一对连续的元素,我们都需要这样做:

# For each pair of consecutive letters, collect the index pairs
# (row in the first letter's matches, row in the next letter's matches)
# whose board positions are neighbours.
are_neighs <- Map(
  function(from, to) which(find_neighbours(from, to, dim(board)), arr.ind = TRUE),
  head(matches, -1L),
  tail(matches, -1L)
)
are_neighs
#[[1]]
#     [,1] [,2]
#[1,]    3    1
#
#[[2]]
#     [,1] [,2]
#[1,]    2    1
#[2,]    1    2
#
#[[3]]
#     [,1] [,2]
#[1,]    2    1
#
#[[4]]
#     [,1] [,2]
#[1,]    1    1

现在我们有了成对的(“i”和“i + 1”)邻居匹配,接下来需要把它们连成完整的链。对于此示例,我们希望得到一个像c(1, 2, 1, 1)这样的向量,它表示are_neighs[[1]]的第1行与are_neighs[[2]]的第2行相连,后者与are_neighs[[3]]的第1行相连,而后者又与are_neighs[[4]]的第1行相连。这看起来像是一个“igraph”问题,但我对它并不熟悉(希望有人有更好的想法),所以这里用一种朴素的方法来获得这些链接:

# One row per starting link in are_neighs[[1]]; column k holds the row of
# are_neighs[[k]] used by that chain (NA once the chain cannot be continued).
row_connections <- matrix(NA_integer_, nrow(are_neighs[[1]]), length(are_neighs))
row_connections[, 1] <- seq_len(nrow(are_neighs[[1]]))

# `cur` tracks, per chain, the index of the match reached so far.
cur <- are_neighs[[1]][, 2]

# seq_len() yields an empty sequence when are_neighs has a single element
# (two-letter word), whereas 1:(n - 1) would iterate over c(1, 0).
for (i in seq_len(length(are_neighs) - 1L)) {
    # match() keeps only the first continuation of each chain.
    im <- match(cur, are_neighs[[i + 1L]][, 1])
    cur <- are_neighs[[i + 1L]][, 2][im]
    row_connections[, i + 1L] <- im
}

# Keep only the chains that reach the last letter.
row_connections <- row_connections[complete.cases(row_connections), , drop = FALSE]

返回:

row_connections
#     [,1] [,2] [,3] [,4]
#[1,]    1    2    1    1

现在有了这个向量,我们可以从“are_neighs”中提取相应的链:

# From each element of are_neighs, pick the row selected by the first chain
# stored in row_connections.
Map(function(x, i) x[i, ], are_neighs, row_connections[1, ])
#[[1]]
#[1] 3 1
#
#[[2]]
#[1] 1 2
#
#[[3]]
#[1] 2 1
#
#[[4]]
#[1] 1 1

可用于从“匹配”中提取适当的索引行/列链:

# Turn every complete chain into a matrix of board positions, one line per
# letter of the word.
ans <- vector("list", nrow(row_connections))

# seq_len() makes the loop a no-op when no complete chain exists, where
# 1:nrow(...) would iterate over c(1, 0) and index out of bounds.
for (chain in seq_len(nrow(row_connections))) {
    # Index pairs (row in matches[[k]], row in matches[[k + 1]]) of this chain.
    connect <- Map(function(pairs, k) pairs[k, ], are_neighs, row_connections[chain, ])
    # Row of `matches` to use for each letter: first element of the first pair,
    # then the second element of every pair.
    match_rows <- c(connect[[1]][1], vapply(connect, function(p) p[2], numeric(1)))
    ans[[chain]] <- do.call(rbind, Map(function(m, k) m[k, ], matches, match_rows))
}
ans
#[[1]]
#     row col
#[1,]   1   3
#[2,]   2   3
#[3,]   3   3
#[4,]   3   4
#[5,]   4   4

将所有内容包装在函数中(find_neighbours在内部定义):

library(Matrix)
#' Find paths on a letter board that spell a word
#'
#' Consecutive letters of the word must sit in cells that are neighbours
#' horizontally or vertically (no diagonals). Chains are built with `match()`,
#' which keeps only the first continuation of each partial chain, so several
#' paths can be returned but not necessarily every possible one. Nothing
#' tracks visited cells, so a path may use the same cell more than once.
#'
#' Adjacency is computed with base R only.
#'
#' @param word A single character string.
#' @param board A character matrix with one letter per cell.
#' @return A list with one element per path found. Each element is an integer
#'   matrix with columns `row` and `col` and one line per letter of `word`.
#'   An empty list is returned when the word cannot be spelled (including an
#'   empty word or a letter that is absent from the board).
ff <- function(word, board) {
  board <- as.matrix(board)
  chars <- strsplit(word, NULL)[[1]]

  # Board positions of every letter of the word, as (row, col) matrices.
  matches <- lapply(chars, function(ch) which(board == ch, arr.ind = TRUE))

  # No letters at all, or a letter that never occurs: nothing can be spelled.
  if (length(matches) == 0L ||
      any(vapply(matches, nrow, integer(1)) == 0L)) {
    return(list())
  }

  # A one-letter word has no links to chain; each occurrence is a path.
  if (length(matches) == 1L) {
    only <- matches[[1]]
    return(lapply(seq_len(nrow(only)), function(r) rbind(only[r, ])))
  }

  # Logical matrix [i, j]: is position i of `x` a direct neighbour of
  # position j of `y`? Two cells are neighbours when their Manhattan
  # distance is exactly 1.
  is_adjacent <- function(x, y) {
    abs(outer(x[, 1], y[, 1], "-")) + abs(outer(x[, 2], y[, 2], "-")) == 1L
  }

  # For each pair of consecutive letters: index pairs (row in x, row in y)
  # of neighbouring positions.
  are_neighs <- Map(
    function(x, y) which(is_adjacent(x, y), arr.ind = TRUE),
    matches[-length(matches)],
    matches[-1]
  )
  if (any(vapply(are_neighs, nrow, integer(1)) == 0L)) {
    return(list())
  }

  # Column k of row_connections holds the row of are_neighs[[k]] used by the
  # chain; NA marks a chain that could not be continued.
  n_links <- length(are_neighs)
  row_connections <- matrix(NA_integer_, nrow(are_neighs[[1]]), n_links)
  row_connections[, 1] <- seq_len(nrow(are_neighs[[1]]))
  cur <- are_neighs[[1]][, 2]
  for (i in seq_len(n_links - 1L)) {
    im <- match(cur, are_neighs[[i + 1L]][, 1])
    cur <- are_neighs[[i + 1L]][, 2][im]
    row_connections[, i + 1L] <- im
  }
  row_connections <- row_connections[complete.cases(row_connections), , drop = FALSE]

  # Translate each complete chain into the board positions it visits.
  lapply(seq_len(nrow(row_connections)), function(r) {
    link_rows <- row_connections[r, ]
    first_idx <- are_neighs[[1]][link_rows[1], 1]
    next_idx <- vapply(
      seq_len(n_links),
      function(k) are_neighs[[k]][link_rows[k], 2],
      integer(1)
    )
    do.call(rbind, Map(function(m, k) m[k, ], matches, c(first_idx, next_idx)))
  })
}

我们可以尝试一下:

ff("APPLE", board)
#[[1]]
#     row col
#[1,]   1   3
#[2,]   2   3
#[3,]   3   3
#[4,]   3   4
#[5,]   4   4

当存在多个匹配时:

ff("AQQP", board)
#[[1]]
#     row col
#[1,]   1   1
#[2,]   1   2
#[3,]   2   2
#[4,]   2   3
#
#[[2]]
#     row col
#[1,]   1   3
#[2,]   1   2
#[3,]   2   2
#[4,]   2   3
#
#[[3]]
#     row col
#[1,]   1   3
#[2,]   1   4
#[3,]   2   4
#[4,]   2   3

虽然它可以灵活地返回多个匹配,但并不会返回所有可能的匹配。简而言之,这是因为在构建邻居链时使用了match(它只返回第一个匹配项);可以改用线性搜索,但目前看来这会显著增加代码的复杂度。

答案 3 :(得分:0)

我写了下面的内容,它运行良好,速度快,并且可以翻译成任何其他语言。

给定图G和一个字典,它遍历字典,对每个待检查的单词先测试G中是否有字母与该单词的第一个字母相同。接下来,它检查由上一步为TRUE的位置的索引加上delta得到的各个邻居中,是否有与该单词第二个字母相同的。如此继续,直到单词结束。

如果在任何时候发现这不是TRUE,则函数结束并返回FALSE。此外,如果您按字母组合的“稀有性”对字典进行排序,则该功能将更快地运行。

#' Check that every word of a dictionary can be traced on a letter grid
#'
#' A word can be traced when its letters can be read off a sequence of cells
#' in which each cell is the upper, lower, left or right neighbour of the
#' previous one. Only the set of cells reachable after each letter is
#' tracked, so a trace may use the same cell more than once.
#'
#' @param dictionary Character vector of words to look for (no `NA`s).
#' @param G Character matrix with one letter per cell.
#' @return `TRUE` if every word in `dictionary` can be traced on `G`
#'   (including when `dictionary` is empty), `FALSE` as soon as one word
#'   cannot. An empty string is treated as trivially present.
dict_check <- function(dictionary, G) {
  stopifnot(!anyNA(dictionary))

  # Cells that have at least one direct neighbour in `frontier`.
  # The frontier is embedded in a FALSE-padded matrix so the four shifted
  # views never run off the edge of the grid.
  spread_to_neighbours <- function(frontier) {
    rows <- seq_len(nrow(frontier)) + 1L
    cols <- seq_len(ncol(frontier)) + 1L
    padded <- matrix(FALSE, nrow(frontier) + 2L, ncol(frontier) + 2L)
    padded[rows, cols] <- frontier
    padded[rows - 1L, cols, drop = FALSE] |
      padded[rows + 1L, cols, drop = FALSE] |
      padded[rows, cols - 1L, drop = FALSE] |
      padded[rows, cols + 1L, drop = FALSE]
  }

  for (word in dictionary) {
    chars <- strsplit(word, "")[[1]]
    if (length(chars) == 0L) {
      next
    }

    # Cells holding the first letter.
    frontier <- G == chars[1]
    if (!any(frontier)) {
      return(FALSE)
    }

    # For each further letter, build a fresh frontier: cells that hold the
    # letter and touch the previous frontier. Building it fresh (instead of
    # editing the previous one in place) keeps one cell's update from wiping
    # out a neighbour that another cell has just reached.
    for (ch in chars[-1]) {
      frontier <- (G == ch) & spread_to_neighbours(frontier)
      if (!any(frontier)) {
        return(FALSE)
      }
    }
  }

  TRUE
}