在webscraping

时间:2017-04-24 07:17:10

标签: r

这是我使用R进行webscraping的代码,效果很好。

require(httr)
require(rvest)
library(KoNLP)
useSejongDic()


total = c()
site = "http://movie.daum.net/moviedb/grade?
movieId=92107&type=netizen&page="

for( i in 1:10){
    url = paste0(site, i);
    mention = GET(url) %>% read_html() %>% 
    html_nodes("p.desc_review") %>% 
    html_text() %>% 
    repair_encoding();
    total =  rbind(total, mention);
}


total = gsub("\t","",total)
total = gsub("\n","",total)

text1=extractNoun(total)
noun= sapply(total,extractNoun,USE.NAMES = FALSE)
undata= unlist(noun)
write(undata,"C:\\Users\\dlgof\\DATA.txt")

但是,txt文件中有太多空格,我该如何删除它们?

enter image description here

1 个答案:

答案 0 :(得分:0)

看起来你的undata有很多" &#34 ;.添加

undata = undata[undata != " "]
在你写作之前