从网址获取数据:
suppressMessages(library(readr))
suppressMessages(library(RCurl))
# Download the CSV as a single in-memory string.
# NOTE(review): disabling SSL verification is insecure; kept only because
# the original host required it -- confirm before any production use.
amazon_url <- getURL(
  'http://s3.amazonaws.com/assets.datacamp.com/production/course_935/datasets/500_amzn.csv',
  ssl.verifyhost = FALSE, ssl.verifypeer = FALSE
)
# read.csv(text = ...) parses the string directly and closes its own
# connection; the original textConnection() was never closed (leak).
# stringsAsFactors = FALSE keeps text columns as character vectors,
# which the qdap replace_*() cleaners downstream expect.
amazon <- read.csv(text = amazon_url, header = TRUE,
                   stringsAsFactors = FALSE)
创建 amazon_cons(提取 cons 列):
amazon_cons <- amazon$cons
基于 qdap 包构建文本清理函数:
suppressWarnings(library(qdap))
# Normalize raw text with qdap before corpus construction: expand
# abbreviations, contractions, numbers, ordinals and symbols into plain
# words, then lower-case everything. Steps run in the listed order.
qdap_clean <- function(x) {
  cleaners <- list(
    replace_abbreviation,
    replace_contraction,
    replace_number,
    replace_ordinal,
    replace_symbol,
    tolower
  )
  Reduce(function(txt, clean) clean(txt), cleaners, x)
}
基于 tm 包构建语料库清理函数:
suppressWarnings(library(tm))
# Clean a tm corpus: strip punctuation, collapse repeated whitespace,
# then drop English stopwords plus two domain-specific words.
# The three tm_map steps run in the same order as before.
tm_clean <- function(corpus) {
  drop_words <- c(stopwords("en"), "Amazon", "company")
  steps <- list(
    function(cp) tm_map(cp, removePunctuation),
    function(cp) tm_map(cp, stripWhitespace),
    function(cp) tm_map(cp, removeWords, drop_words)
  )
  Reduce(function(cp, step) step(cp), steps, corpus)
}
应用清理函数处理文本:
# Run the qdap cleaners, then guard against NA documents: qdap's
# replace_*() functions propagate NA, and NA text inside the corpus is
# what makes RWeka's NGramTokenizer throw
# java.lang.NullPointerException when the TDM is built. Filtering NA
# here resolves the reported error.
amzn_cons <- qdap_clean(amazon_cons)
amzn_cons <- amzn_cons[!is.na(amzn_cons)]
amzn_cons <- VCorpus(VectorSource(amzn_cons))
amzn_cons_corp <- tm_clean(amzn_cons)
构建自定义分词函数以提取 bigram(双词组):
suppressWarnings(library(RWeka))
# Bigram tokenizer for TermDocumentMatrix(): wraps RWeka's
# NGramTokenizer so that every token is exactly two adjacent words
# (min = 2, max = 2).
tokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
应用分词函数以获取双词组(bigram):
# Build the bigram term-document matrix from the cleaned corpus using
# the custom RWeka tokenizer defined above.
amzn_c_tdm <- TermDocumentMatrix(
  amzn_cons_corp,
  control = list(tokenize = tokenizer)
)
这会导致以下错误:
Error in .jcall("RWekaInterfaces", "[S", "tokenize", .jcast(tokenizer, :
java.lang.NullPointerException
如何解决此错误?