
时间:2015-09-09 12:52:52

标签: r dictionary



# Sample passage that is loaded into the corpus and searched for the phrases
# listed in `dict`.
# NOTE(review): `t` is also the name of base R's transpose function; the name
# is kept because the later snippets read `t`.
t <- "In order to perform operations inside the abdomen, surgeons must make an incision large enough to offer adequate visibility, provide access to the abdominal organs and allow the use of hand-held surgical instruments.  These incisions may be placed in different parts of the abdominal wall.  Depending on the size of the patient and the type of operation, the incision may be 6 to 12 inches in length.  There is a significant amount of discomfort associated with these incisions that can prolong the time spent in the hospital after surgery and can limit how quickly a patient can resume normal daily activities.  Because traditional techniques have long been used and taught to generations of surgeons, they are widely available and are considered the standard treatment to which newer techniques must be compared."


# Multi-word phrases to count in the text; handed to tm as the `dictionary`
# control option further down.
dict <- c(
  "hand-held surgical instruments",
  "intensive care unit",
  "traditional techniques"
)


# Preprocessing of data
# Build the corpus with VCorpus() instead of the Corpus() alias: since tm 0.7,
# Corpus(VectorSource(...)) yields a SimpleCorpus, and TermDocumentMatrix()
# ignores a custom `tokenize` function for that class, so the n-gram tokenizer
# applied afterwards would silently have no effect.
corpus <- VCorpus(VectorSource(t))
corpus <- tm_map(corpus, content_transformer(tolower))
# removePunctuation also strips hyphens (e.g. "hand-held" becomes "handheld"),
# which changes the tokens that dictionary entries are matched against.
corpus <- tm_map(corpus, removePunctuation)
# Re-wrap every document as a PlainTextDocument (kept from the original).
corpus <- tm_map(corpus, PlainTextDocument)

# Bigram tokenization
# Tokenizer that emits n-grams of exactly two words (min = max = 2).
BigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

# Tabulate only the phrases in `dict`, using bigrams as tokens.
# Despite its name, `dtm` holds a TermDocumentMatrix (terms in rows).
dtm <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = BigramTokenizer, dictionary = dict)
)


<<TermDocumentMatrix (terms: 3, documents: 1)>>
Non-/sparse entries: 1/2
Sparsity           : 67%
Maximal term length: 30
Weighting          : term frequency (tf)

Terms                            character(0)
hand-held surgical instruments            0
intensive care unit                       0
traditional techniques                    1


1 个答案:

答案 0 :(得分:0)



dictionary(词典):用于统计词频的字符向量。结果中不会列出该向量之外的其他词项。默认为 NULL,表示列出 doc 中的所有词项。



# Preprocessing of data
# Build the corpus with VCorpus() instead of the Corpus() alias: since tm 0.7,
# Corpus(VectorSource(...)) yields a SimpleCorpus, and TermDocumentMatrix()
# ignores a custom `tokenize` function for that class, so the bigram/trigram
# tokenizers applied afterwards would silently have no effect.
corpus <- VCorpus(VectorSource(t))
# Only lower-casing is applied; punctuation is left in place so hyphenated
# words such as "hand-held" stay intact.
corpus <- tm_map(corpus, content_transformer(tolower))
# Re-wrap every document as a PlainTextDocument (kept from the original).
corpus <- tm_map(corpus, PlainTextDocument)

# Build a tokenizer that splits text into n-grams of exactly `n` words.
# `n` is forced so each returned closure keeps its own fixed size.
make_ngram_tokenizer <- function(n) {
  force(n)
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}

# Tokenizers for two-word and three-word phrases, respectively.
BigramTokenizer <- make_ngram_tokenizer(2)
TrigramTokenizer <- make_ngram_tokenizer(3)

# Bigram counts with the dictionary's bigrams filtered out: the `stopwords`
# option receives the bigrams that BigramTokenizer produces from `dict`.
tdm_bigram_no_dict <- TermDocumentMatrix(
  corpus,
  control = list(
    stopwords = BigramTokenizer(dict),
    tokenize = BigramTokenizer
  )
)

# Bigram counts restricted to the phrases listed in `dict`.
tdm_bigram_dict <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = BigramTokenizer, dictionary = dict)
)

<<TermDocumentMatrix (terms: 3, documents: 1)>>
Non-/sparse entries: 1/2
Sparsity           : 67%
Maximal term length: 30
Weighting          : term frequency (tf)

Terms                            character(0)
  hand-held surgical instruments            0
  intensive care unit                       0
  traditional techniques                    1

# Trigram counts restricted to the phrases listed in `dict`.
tdm_trigram_dict <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = TrigramTokenizer, dictionary = dict)
)

<<TermDocumentMatrix (terms: 3, documents: 1)>>
Non-/sparse entries: 1/2
Sparsity           : 67%
Maximal term length: 30
Weighting          : term frequency (tf)

Terms                            character(0)
  hand-held surgical instruments            1
  intensive care unit                       0
  traditional techniques                    0

# Stack the three term-document matrices row-wise into a single matrix.
# rbind() can be used here because the matrices are sparse matrices; for
# extra speed, look into the slam package.
tdm_total <- rbind(
  tdm_bigram_no_dict,
  tdm_bigram_dict,
  tdm_trigram_dict
)


# Flatten the combined matrix into one row per term with its total count.
# The dense conversion is done once inside local() so no helper variable is
# left behind in the workspace.
df <- local({
  dense <- as.matrix(tdm_total)
  data.frame(
    terms = rownames(dense),
    freq = rowSums(dense),
    row.names = NULL,
    stringsAsFactors = FALSE
  )
})

# The same term can occur in more than one of the stacked matrices, so the
# frequencies are summed per term.
df <- df %>%
  group_by(terms) %>%
  summarise(sum(freq))