strsplit中的错误(word,NULL):带有拼写检查器的非字符参数

时间:2017-04-04 13:14:26

标签: r strsplit

我尝试使用R进行拼写检查,以纠正单词或文档的拼写错误。

我尝试使用这个R代码对一个单词进行修正,它的效果非常好:

> Correct("speling", dtm = counts)
$l4
[1] "spelling"

但是当我尝试修正文档时,我收到了这个错误:

> CorrectDocument("the quick bruwn fowx jumpt ovre tha lasy dog", dtm = counts)

 Error in strsplit(word, NULL) : non-character argument
# This is a text processing function, which I
# borrowed from a CMU Data mining course professor.
strip.text <- function(txt) {
  # remove apostrophes (so "don't" -> "dont", "Jane's" -> "Janes", etc.)
  txt <- gsub("'","",txt)
  # convert to lowercase
  txt <- tolower(txt)
  # change other non-alphanumeric characters to spaces
  txt <- gsub("[^a-z0-9]"," ",txt)
  # change digits to #
  txt <- gsub("[0-9]+"," ",txt)
  # split and make one vector
  txt <- unlist(strsplit(txt," "))
  # remove empty words
  txt <- txt[txt != ""]
  return(txt)
}

# Words within 1 transposition.
Transpositions <- function(word = FALSE) {
  N <- nchar(word)
  if (N > 2) {
    out <- rep(word, N - 1)
    word <- unlist(strsplit(word, NULL))
    # Permutations of the letters
    perms <- matrix(c(1:(N - 1), 2:N), ncol = 2)
    reversed <- perms[, 2:1]
    trans.words <- matrix(rep(word, N - 1), byrow = TRUE, nrow = N - 1)
    for(i in 1:(N - 1)) {
      trans.words[i, perms[i, ]] <- trans.words[i, reversed[i, ]]
      out[i] <- paste(trans.words[i, ], collapse = "")
    }
  }
  else if (N == 2) {
    out <- paste(word[2:1], collapse = "")
  }
  else {
    out <- paste(word, collapse = "")
  }
  return(out)
}

# Single letter deletions.
Deletes <- function(word = FALSE) {
  N <- nchar(word) 
  word <- unlist(strsplit(word, NULL))
  out <- list()
  for(i in 1:N) {
    out[i] <- paste(word[-i], collapse = "")
  }
  return(out)
}

# Single-letter insertions.
Insertions <- function(word = FALSE) {
  N <- nchar(word) 
  out <- list()
  for (letter in letters) {
    out[[letter]] <- rep(word, N + 1)
    for (i in 1:(N + 1)) {
      out[[letter]][i] <- paste(substr(word, i - N, i - 1), letter, 
                                substr(word, i, N), sep = "")
    }
  }
  out <- unlist(out)
  return(out)
}

# Single-letter replacements.
Replaces <- function(word = FALSE) {
  N <- nchar(word) 
  out <- list()
  for (letter in letters) {
    out[[letter]] <- rep(word, N)
    for (i in 1:N) {
      out[[letter]][i] <- paste(substr(word, i - N, i - 1), letter, 
                                substr(word, i + 1, N + 1), sep = "")
    }
  }
  out <- unlist(out)
  return(out)
}
# All Neighbors with distance "1"
Neighbors <- function(word) {
  neighbors <- c(word, Replaces(word), Deletes(word),
                 Insertions(word), Transpositions(word))
  return(neighbors)
}

# Probability as determined by our corpus.
Probability <- function(word, dtm) {
  # Number of words, total
  N <- length(dtm)
  word.number <- which(names(dtm) == word)
  count <- dtm[word.number]
  pval <- count/N
  return(pval)
}

# Correct a single word.
Correct <- function(word, dtm) {
  neighbors <- Neighbors(word)
  # If it is a word, just return it.
  if (word %in% names(dtm)) {
    out <- word
  }
  # Otherwise, check for neighbors.
  else {
    # Which of the neighbors are known words?
    known <- which(neighbors %in% names(dtm))
    N.known <- length(known)
    # If there are no known neighbors, including the word,
    # look farther away.
    if (N.known == 0) {
      print(paste("Having a hard time matching '", word, "'...", sep = ""))
      neighbors <- unlist(lapply(neighbors, Neighbors))
    }
    # Then out non-words.
    neighbors <- neighbors[which(neighbors %in% names(dtm))]
    N <- length(neighbors)
    # If we found some neighbors, find the one with the highest
    # p-value.
    if (N > 1) {
      P <- 0*(1:N)
      for (i in 1:N) {
        P[i] <- Probability(neighbors[i], dtm)
      }
      out <- neighbors[which.max(P)]
    }
    # If no neighbors still, return the word.
    else {
      out <- word
    }
  }
  return(out)
}

# Correct an entire document.
CorrectDocument <- function(document, dtm) {
  by.word <- unlist(strsplit(document, " "))
  N <- length(by.word)
  for (i in 1:N) {
    by.word[i] <- Correct(by.word[i], dtm = dtm)
  }
  corrected <- paste(by.word, collapse = " ")
  return(corrected)
}

words <- scan("http://norvig.com/big.txt", what = character())
words <- strip.text(words)
counts <- table(words)
Correct("speling", dtm = counts)

#---correct a document 
CorrectDocument("the quick bruwn fowx jumpt ovre tha lasy dog", dtm = counts)

请问好吗?

谢谢

1 个答案:

答案 0 :(得分:1)

函数Correct有一个错误,你应该添加一个unlist,即行:

Correct <- function(word, dtm) {
  neighbors <- Neighbors(word)

应更改为:

Correct <- function(word, dtm) {
  neighbors <- unlist(Neighbors(word))

编辑:

这是一个纠正文档文件行(覆盖它)的函数:

CorrectDocumentFile <- function(file,dtm){
  # read the file lines
  textLines <- unlist(readLines(file))

  # for each line not empty or blank, correct the text
  for(i in which(!grepl("^\\s*$",textLines))){
    line <- textLines[[i]]
    textLines[i] <- CorrectDocument(line,dtm)
  }

  # overwrite the file with the correction
  writeLines(textLines, file)
}

用法:

CorrectDocumentFile(file="fileToBeCorrected.txt", dtm=counts)