我在R中加载并清理语料库:
library(tm)

# Load only the plain-text documents of the working directory. Without a
# `pattern`, DirSource(".") reads every file it finds, evidently including the
# R script itself: that is where terms such as "tmmapcorp" and
# "termdocumentmatrixcorp" in the resulting matrix come from.
# The corpus is French (French stop words, TreeTagger run with lang = "fr"),
# so the language tag is "fr" rather than "lat".
myTxt <- Corpus(
  DirSource(".", pattern = "\\.txt$"),
  readerControl = list(language = "fr")
)

# First stop-word pass, run while punctuation is still present (presumably to
# catch elided forms such as l' / d'); it only matches lower-case words.
corp <- tm_map(myTxt, removeWords, stopwords("french"))
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, content_transformer(removeNumbers))
corp <- tm_map(corp, removePunctuation)
# Second pass: now that everything is lower case, capitalised stop words
# are removed as well.
corp <- tm_map(corp, removeWords, stopwords("french"))
corp <- tm_map(corp, stripWhitespace)
# inspect(corp[1]) shows the cleaned text of the first document.
tdm <- TermDocumentMatrix(corp)
使用treetagger我写了一个函数:
require(koRpus)
# Build a token/lemma dictionary by running TreeTagger on every document.
#
# Args:
#   my.df: a Corpus object obtained by loading the corpus with tm.
#
# Returns:
#   A data frame holding the distinct rows of the TreeTagger results of all
#   documents (an empty data frame when the corpus has no documents).
lemmatisation <- function(my.df) {
  print(my.df)
  # Tag the documents of the corpus that was passed in; the previous version
  # read the global `corp` here and ignored its argument.
  tagged <- lapply(seq_along(my.df), function(i) {
    lemma <- treetag(my.df[[i]][[1]], treetagger = "manual", format = "obj",
                     TT.tknz = FALSE, lang = "fr",
                     TT.options = list(path = "treetagger",
                                       preset = "fr-utf8"))
    lemma@TT.res
  })
  # Bind once instead of growing the data frame inside the loop.
  dictionnaire <- if (length(tagged) > 0L) {
    do.call(rbind, tagged)
  } else {
    data.frame()
  }
  unique(dictionnaire)
}
此时,我的 tdm
内容类似于:
Docs
Terms Urbain.txt Versele.txt
sudest 0 1
suit 0 0
suivi 0 0
sujets 0 0
supplémentaire 0 0
suzanne 0 0
symbols 0 0
tant 0 0
tdm 0 0
télévisés 0 0
tempérament 0 0
temps 1 0
termdocumentmatrixcorp 0 0
terms 0 0
terre 0 0
tête 0 0
text 0 0
textcat 0 0
the 0 1
théâtre 0 0
thème 0 0
themebw 0 0
thérapeute 0 0
thérapie 0 0
thèse 0 0
tissent 0 0
tmmapcorp 0 0
tmmapmytxt 0 0
tokyo 0 0
tôt 0 0
touchent 0 0
toujours 0 0
tournant 0 0
tous 0 0
tout 0 0
toute 0 0
toutes 0 0
traditionnelle 0 1
transformé 0 0
travail 0 0
travaillant 0 1
travaille 0 0
travaillé 0 0
travaillent 0 0
现在我想用我的词元(lemma)词典来汇总词频,例如把 travaillé、travaille、travaillant、travaillent …… 合并到同一个词元下。
在我的函数(lemmatisation)返回的结果中,我有:
# Show the dictionary rows whose lemma is "travailler".
my.lemma[my.lemma$lemma == "travailler",]
token tag lemma lttr wclass desc stop stem
665 travaillé VER:pper travailler 9 verb verb past participle NA NA
835 travaille VER:pres travailler 9 verb verb present NA NA
1369 travaillent VER:pres travailler 11 verb verb present NA NA
1713 travaillant VER:ppre travailler 11 verb verb present participle NA NA
我不知道如何进行此聚合
答案 0 :(得分:1)
你可以尝试
# Sum the per-document counts of all tokens that share a lemma.
# - as.matrix(): merge() needs a plain matrix / data frame, not the sparse
#   TermDocumentMatrix object.
# - unique(): a token tagged several times with the same lemma must be counted
#   only once.
# - [-1] drops the "Row.names" key column created by merge().
aggregate(
  . ~ lemma,
  merge(
    as.matrix(tdm),
    unique(my.lemma[, c("token", "lemma")]),
    by.x = "row.names", by.y = "token"
  )[-1],
  sum
)
应该给你类似的东西
# lemma Urbain.txt Versele.txt
# 1 travailler 0 1
# ...
答案 1 :(得分:0)
另一种方法是使用dplyr。
我将所有代码放在R函数中:lemmatization.tdm
corpus 是 tm::Corpus 类型的对象,lang 是语言参数(可用的语言请参阅 help(koRpus::kRp.POS.tags)),treetaggerfilepath 是您安装 TreeTagger 的目录(安装说明见 TreeTagger 官方网站)。
# Build a lemma-by-document count matrix from a tm corpus.
#
# Every document is tagged with TreeTagger (through koRpus::treetag), the
# resulting token/lemma pairs are joined to the ordinary term-document matrix,
# and the counts of all tokens sharing a lemma are summed.
#
# Args:
#   corpus: a corpus object created with tm.
#   lang: language code passed to treetag() both as `lang` and as the
#     TreeTagger preset (see help(koRpus::kRp.POS.tags)).
#   treetaggerfilepath: directory in which TreeTagger is installed.
#
# Returns:
#   A matrix with one row per lemma (row names are the lemmas) and one column
#   per document. Terms for which no lemma was found keep their own token as
#   row name instead of being merged into a single NA row.
lemmatization.tdm <- function(corpus, lang = "fr",
                              treetaggerfilepath = "~/Programs/treetagger/") {
  # library() rather than require(): a missing package stops here with a clear
  # message instead of failing later. The packages are still attached, as
  # before.
  library(koRpus)
  library(tm)
  library(dplyr)

  if (length(corpus) == 0L) {
    stop("`corpus` contains no documents.", call. = FALSE)
  }

  # Run TreeTagger on each document; keep the original word (token) and its
  # lemma. Results are bound once rather than grown inside a loop.
  tagged <- lapply(seq_along(corpus), function(i) {
    res <- treetag(
      corpus[[i]][[1]],
      treetagger = "manual",
      format = "obj",
      TT.tknz = FALSE,
      lang = lang,
      TT.options = list(path = treetaggerfilepath, preset = lang)
    )
    res@TT.res[, c("token", "lemma")]
  })
  dictionnaire <- do.call(rbind, tagged)
  dictionnaire <- data.frame(
    token = as.character(dictionnaire$token),
    lemma = as.character(dictionnaire$lemma),
    stringsAsFactors = FALSE
  )

  # TreeTagger sometimes gives more than one result for a token; keep the
  # first one. duplicated() does not depend on the row names being 1..n.
  dictionnaire <- dictionnaire[!duplicated(dictionnaire$token), , drop = FALSE]

  # Classic term-document matrix as a data frame with the term in a column.
  counts <- as.matrix(TermDocumentMatrix(corpus))
  tdm <- data.frame(
    token = rownames(counts),
    as.data.frame(counts, stringsAsFactors = FALSE),
    stringsAsFactors = FALSE,
    check.names = FALSE
  )

  # Attach the lemma to every term of the matrix (all terms are kept).
  tdm.lemma <- right_join(dictionnaire, tdm, by = "token")

  # Terms missing from the dictionary have no lemma after the join; fall back
  # to the term itself so that their counts are not pooled together.
  unmatched <- is.na(tdm.lemma$lemma)
  tdm.lemma$lemma[unmatched] <- tdm.lemma$token[unmatched]

  # Sum the counts per lemma (the token column is no longer needed).
  tdm.lemma1 <- tdm.lemma[, -1, drop = FALSE] %>%
    group_by(lemma) %>%
    summarise_all(sum, na.rm = TRUE) %>%
    ungroup()

  # Convert to a matrix first and label the rows afterwards: row names set on
  # a tibble are not reliably kept when it is subset.
  out <- as.matrix(tdm.lemma1[, -1, drop = FALSE])
  rownames(out) <- tdm.lemma1$lemma
  out
}
# return a tdm in matrix class.
# Total count of each term over all documents, most frequent first.
# as.matrix() leaves a plain matrix unchanged and (with tm loaded) also
# accepts a TermDocumentMatrix.
countofterm <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
numberofanalyzedwords <- sum(countofterm)
freqterm <- countofterm / numberofanalyzedwords
# freqterm is a named numeric vector, not a data frame, so filter() cannot be
# used on it; subset it directly.
freqterm[freqterm > 0.1] # terms with a relative frequency above 0.1