匹配仅包含特定字符和特定长度的所有单词

时间:2019-01-18 02:58:44

标签: r regex stringr

我正在尝试创建一个小脚本,以帮助我在这个填字游戏应用中作弊,这样我就可以击败一直把我打得落花流水的妻子。

该应用程序提供了几个字符,然后您应该将它们放入填字游戏中。我想要一种快速的方法来减少可能出现的单词。这是我目前的尝试:

library(tidyverse)

# Fetch the dwyl/english-words list; read_lines() yields one element per line.
dat <- read_lines(
  file = "https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"
)

# Pull out every run of 2 to 6 characters drawn from "ilrfle" that sits
# between two word boundaries, drop the entries that produced no match, and
# flatten what is left into one character vector.  Note that the character
# class only restricts WHICH letters may appear, not how often each is used,
# and that `\\b` also matches inside an entry wherever a non-word character
# (e.g. a hyphen) occurs, so fragments of longer entries are returned too.
unlist(compact(str_extract_all(dat, "\\b[ilrfle]{2,6}\\b")))
#>    [1] "el"     "el"     "fi"     "life"   "free"   "fe"     "rifle" 
#>    [8] "fire"   "reef"   "fire"   "le"     "relief" "relief" "le"    
#>   [15] "fere"   "fell"   "le"     "fell"   "er"     "free"   "fire"  
#>   [22] "fire"   "free"   "free"   "fire"   "fire"   "fire"   "fire"  
#>   [29] "reef"   "life"   "free"   "eel"    "free"   "file"   "fire"  
#>   [36] "refer"  "eel"    "free"   "fire"   "free"   "re"     "reef"  
#>   [43] "file"   "free"   "ee"     "eel"    "eel"    "eel"    "eeler" 
#>   [50] "eel"    "eelier" "eel"    "eel"    "eel"    "eel"    "eel"   
#>   [57] "eer"    "er"     "eerie"  "eerier" "ef"     "eff"    "effeir"
#>   [64] "efl"    "eir"    "el"     "el"     "elf"    "elf"    "elf"   
#>   [71] "elf"    "elf"    "elf"    "elf"    "ell"    "lil"    "ell"   
#>   [78] "ell"    "ell"    "lil"    "er"     "erer"   "erf"    "erl"   
#>   [85] "err"    "free"   "free"   "free"   "free"   "free"   "fee"   
#>   [92] "fee"    "feel"   "feeler" "feere"  "feerie" "fee"    "fee"   
#>   [99] "fee"    "fee"    "feff"   "fei"    "feif"   "feirie" "fele"  
#>  [106] "fell"   "fell"   "fell"   "fell"   "feel"   "fer"    "fer"   
#>  [113] "fer"    "fere"   "ferfel" "ferie"  "ferlie" "ferr"   "ferri" 
#>  [120] "ff"     "fie"    "fief"   "fie"    "fie"    "fiel"   "fieri" 
#>  [127] "fifer"  "fife"   "fifie"  "fil"    "file"   "file"   "file"  
#>  [134] "file"   "file"   "fili"   "fili"   "filii"  "fill"   "fill"  
#>  [141] "fill"   "fille"  "filler" "filler" "filler" "filli"  "fill"  
#>  [148] "fill"   "fill"   "fill"   "free"   "fir"    "fir"    "fir"   
#>  [155] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [162] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [169] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [176] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [183] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [190] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [197] "fire"   "fire"   "fire"   "fire"   "free"   "fire"   "fire"  
#>  [204] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [211] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [218] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [225] "fire"   "fire"   "fire"   "fire"   "firer"  "fire"   "fire"  
#>  [232] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [239] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [246] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [253] "fire"   "fire"   "fire"   "fire"   "fire"   "fire"   "fire"  
#>  [260] "fire"   "fir"    "fir"    "fir"    "reel"   "reeler" "flee"  
#>  [267] "fleer"  "flier"  "flier"  "free"   "fll"    "ferri"  "flrie" 
#>  [274] "le"     "free"   "lie"    "fire"   "fee"    "free"   "free"  
#>  [281] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [288] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [295] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [302] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [309] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [316] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [323] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [330] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [337] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [344] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [351] "free"   "freir"  "frier"  "frill"  "frill"  "frill"  "frill" 
#>  [358] "frill"  "free"   "life"   "fill"   "fire"   "relief" "free"  
#>  [365] "ill"    "ll"     "fire"   "fi"     "flier"  "le"     "er"    
#>  [372] "free"   "ie"     "ie"     "ier"    "ier"    "if"     "fere"  
#>  [379] "iffier" "ifree"  "ii"     "iii"    "il"     "il"     "ile"   
#>  [386] "ile"    "ill"    "ll"     "ill"    "ill"    "ill"    "ill"   
#>  [393] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [400] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [407] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [414] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [421] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [428] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [435] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [442] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [449] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [456] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [463] "ill"    "ill"    "iller"  "ill"    "ill"    "ill"    "ill"   
#>  [470] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [477] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [484] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [491] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [498] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [505] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [512] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [519] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [526] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [533] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [540] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [547] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [554] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [561] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [568] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [575] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [582] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [589] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [596] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [603] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [610] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [617] "ill"    "ill"    "ill"    "ill"    "ill"    "ill"    "ill"   
#>  [624] "ll"     "re"     "ir"     "ire"    "free"   "ll"     "file"  
#>  [631] "fee"    "eel"    "free"   "le"     "lee"    "lee"    "leef"  
#>  [638] "leer"   "lee"    "lei"    "lere"   "lie"    "lie"    "lie"   
#>  [645] "lie"    "liefer" "lier"   "lierre" "life"   "life"   "life"  
#>  [652] "life"   "life"   "life"   "life"   "life"   "life"   "life"  
#>  [659] "life"   "life"   "life"   "life"   "life"   "life"   "life"  
#>  [666] "life"   "life"   "life"   "life"   "life"   "life"   "life"  
#>  [673] "life"   "life"   "life"   "life"   "life"   "life"   "life"  
#>  [680] "life"   "life"   "life"   "life"   "life"   "life"   "life"  
#>  [687] "life"   "life"   "life"   "lifer"  "life"   "life"   "life"  
#>  [694] "life"   "life"   "life"   "life"   "life"   "life"   "life"  
#>  [701] "life"   "life"   "life"   "life"   "life"   "life"   "life"  
#>  [708] "life"   "life"   "life"   "lile"   "lill"   "eleele" "lire"  
#>  [715] "ll"     "ll"     "ller"   "le"     "fer"    "fire"   "life"  
#>  [722] "le"     "lie"    "fire"   "free"   "er"     "er"     "er"    
#>  [729] "free"   "fire"   "el"     "er"     "reeler" "fire"   "reel"  
#>  [736] "le"     "relief" "free"   "filler" "free"   "fire"   "free"  
#>  [743] "free"   "free"   "free"   "free"   "free"   "free"   "free"  
#>  [750] "fire"   "firer"  "filer"  "fire"   "firer"  "re"     "re"    
#>  [757] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [764] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [771] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [778] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [785] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [792] "re"     "re"     "re"     "re"     "reefer" "re"     "reef"  
#>  [799] "reef"   "re"     "re"     "re"     "re"     "re"     "re"    
#>  [806] "re"     "re"     "re"     "reeler" "re"     "re"     "reel"  
#>  [813] "reel"   "reel"   "re"     "re"     "re"     "re"     "reel"  
#>  [820] "reel"   "re"     "re"     "re"     "re"     "re"     "re"    
#>  [827] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [834] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [841] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [848] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [855] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [862] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [869] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [876] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [883] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [890] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [897] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [904] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [911] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [918] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [925] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [932] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [939] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [946] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [953] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [960] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [967] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [974] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [981] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [988] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#>  [995] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1002] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1009] "re"     "re"     "re"     "re"     "re"     "ref"    "ref"   
#> [1016] "refeel" "refel"  "refell" "refer"  "refile" "refill" "refire"
#> [1023] "refl"   "refl"   "reflee" "re"     "re"     "re"     "re"    
#> [1030] "refr"   "re"     "re"     "rei"    "re"     "reif"   "re"    
#> [1037] "rei"    "rel"    "rel"    "re"     "re"     "re"     "re"    
#> [1044] "re"     "re"     "relief" "relief" "relier" "re"     "free"  
#> [1051] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1058] "re"     "re"     "re"     "re"     "re"     "rere"   "re"    
#> [1065] "re"     "rere"   "re"     "re"     "re"     "re"     "re"    
#> [1072] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1079] "re"     "re"     "re"     "re"     "re"     "rere"   "re"    
#> [1086] "re"     "reree"  "rereel" "re"     "refer"  "re"     "re"    
#> [1093] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1100] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1107] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1114] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1121] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1128] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1135] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1142] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1149] "re"     "rere"   "re"     "re"     "re"     "re"     "re"    
#> [1156] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1163] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1170] "re"     "re"     "re"     "re"     "re"     "re"     "re"    
#> [1177] "re"     "re"     "re"     "re"     "re"     "re"     "rfree" 
#> [1184] "free"   "rier"   "rife"   "rifer"  "riff"   "rifle"  "rifle" 
#> [1191] "rifler" "rifle"  "rifle"  "rile"   "rill"   "rille"  "rill"  
#> [1198] "fire"   "rle"    "eel"    "fill"   "free"   "fire"   "fi"    
#> [1205] "free"   "er"     "filler" "ill"    "life"   "free"   "ll"    
#> [1212] "free"   "life"   "lifer"  "file"   "fire"   "ell"    "free"  
#> [1219] "ll"     "ll"     "free"   "fire"   "life"   "fire"   "free"  
#> [1226] "free"   "ll"     "ll"     "re"     "ll"     "ll"     "free"  
#> [1233] "life"   "reel"   "free"   "free"   "free"   "ll"     "free"  
#> [1240] "free"   "free"   "free"   "life"   "re"     "life"   "free"  
#> [1247] "le"     "free"   "free"   "free"   "ll"     "li"     "re"    
#> [1254] "er"     "eer"    "ll"     "re"     "er"     "er"     "ll"    
#> [1261] "re"     "er"     "er"     "ll"     "re"     "er"     "ll"    
#> [1268] "re"     "lie"    "ll"     "re"     "fire"   "eel"    "free"  
#> [1275] "free"

reprex package(v0.2.1)于2019-01-17创建

数据来自包含约500,000个英语单词的数据库。在此示例中,该应用给了我ilrfle。因此,我希望得到所有仅使用这些字母、且长度小于或等于6的单词。我该怎么做呢?我的问题是,它返回了一堆重复项,并且对于应用程序提供的某些字母组合也不起作用。

1 个答案:

答案 0 :(得分:1)

我不确定我是否赞成作弊-我为你的妻子加油!但这是一个有趣的问题,所以我会忽略它;-)。

这是另一种方法。将dat中每个单词内部的字母按字母顺序排序;同样,把输入集中的字母也按字母顺序排序。然后,我们可以用一个简单的正则表达式在字典中查找这样的单词:其中每个字母出现的次数都不超过输入集中该字母的个数。

library(tidyverse)

# Download the dictionary (the dwyl/english-words list) into `dat`.
dat <- read_lines(
  file = "https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"
)

# A function that returns possible words given a set of letters.  The letters
# are provided as a single string argument (e.g., "ilrfle").
# Return the dictionary words that can be spelled with a given set of letters.
#
# Arguments:
#   letters  A single string holding the available letters (e.g., "ilrfle").
#            A letter that appears k times in it may be used at most k times.
#            It is spliced into regular expressions verbatim, so it is
#            expected to contain plain letters only (metacharacters are not
#            escaped).
#   words    Character vector of candidate words.  Defaults to the global
#            dictionary `dat`, so existing one-argument calls keep working.
#
# Returns a character vector with the unique matching words, in the order in
# which they first appear in `words`; character(0) when nothing matches.
possible.words <- function(letters, words = dat) {
  stopifnot(is.character(letters), length(letters) == 1L, !is.na(letters))
  # No letters means no words.  Returning early also avoids building the
  # invalid pattern "^[]+$" below.
  if (!nzchar(letters)) {
    return(character(0))
  }
  # Filter to words that contain only letters in the list.  This step isn't
  # strictly necessary, but it gives the later steps a much smaller list to
  # process.
  right.letters <- unique(words[grepl(paste0("^[", letters, "]+$"), words)])
  # Sort the characters of every candidate word alphabetically.  vapply()
  # copes with zero candidates, unlike a `1:nrow(df)` loop.
  sorted.word <- vapply(
    strsplit(right.letters, ""),
    function(chars) paste(sort(chars), collapse = ""),
    character(1)
  )
  # Build a pattern from the sorted input letters in which every letter is
  # optional, e.g. "ilrfle" -> "^e?f?i?l?l?r?$".  A sorted word matches it
  # only if it uses each letter no more often than the input provides.
  available <- sort(unlist(strsplit(letters, "")))
  sorted.letters.regex <- paste0("^", paste0(available, "?", collapse = ""), "$")
  right.letters[grepl(sorted.letters.regex, sorted.word)]
}