这是我使用R进行webscraping的代码,效果很好。
require(httr)
require(rvest)
library(KoNLP)
# Load the Sejong dictionary that KoNLP's extractNoun() relies on.
useSejongDic()

# Base URL of the netizen-review listing; the page number is appended per
# request. The literal is assembled from single-line pieces so that no newline
# character ends up inside the query string.
site <- paste0(
  "http://movie.daum.net/moviedb/grade?",
  "movieId=92107&type=netizen&page="
)

# Download one listing page and return the text of every review on it as a
# character vector (one element per <p class="desc_review"> node).
fetch_reviews <- function(page) {
  GET(paste0(site, page)) %>%
    read_html() %>%
    html_nodes("p.desc_review") %>%
    html_text() %>%
    repair_encoding()
}

# Collect pages 1-10 into one flat character vector, in page order.
# Gathering the pages in a list and flattening once avoids rbind() inside a
# loop, which recycles values when pages hold different numbers of reviews.
total <- unlist(lapply(seq_len(10), fetch_reviews), use.names = FALSE)

# Strip the tabs and newlines that surround each review in the page markup.
total <- gsub("[\t\n]", "", total)

# Extract nouns review by review; lapply() always yields a list, so the
# flattened result is a plain character vector regardless of input shape.
noun <- lapply(total, extractNoun)
undata <- unlist(noun, use.names = FALSE)

# Drop empty and whitespace-only tokens so the output file does not contain
# blank lines.
undata <- undata[nzchar(trimws(undata))]

# Write one token per line.
write(undata, "C:\\Users\\dlgof\\DATA.txt")
但是,txt文件中有太多空格,我该如何删除它们?
答案 0 :(得分:0)
看起来你的 undata 中有很多 " "(只含空格的元素)。请添加
undata = undata[undata != " "]
在调用 write() 写入文件之前。