This takes a very long time because readLines has to run inside the loop and process each link one at a time. Could you help me parallelize the readLines and rbind steps?
Please feel free to suggest any other improvements.
The loop has to go through more than 40,000 links.
Code:
library(tidyverse)
library(tidytext)
my_data2 <- c()
urls <- readLines("InputLinks.csv")
for (url in urls) {
  valid_url <- TRUE
  # Skip this link if readLines() fails (e.g. bad URL, 404, timeout)
  tryCatch({
    my_data <- readLines(str_c("https://www.google.co.in/search?q=", url, collapse = ''))
  }, error = function(e) valid_url <<- FALSE)
  # The link above is just a sample; the real call goes to my internal website
  # and looks like this: website + URL + other parameters
  if (!valid_url) {
    next
  }
  my_data <- data_frame(document = url, text = my_data)
  my_data2 <- rbind(my_data2, my_data)
}
my_data <- my_data2
# Tokenise the fetched text and count words per document
my_data1 <- my_data %>%
  unnest() %>%
  unnest_tokens(word, text, strip_numeric = TRUE) %>%
  group_by(document, word) %>%
  summarise(count = n())
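Here is the rough direction I was thinking of (an untested sketch, not a drop-in solution): wrap the fetch for one link in a function that returns either a small tibble or NULL on failure, run that function over the links in parallel, and bind everything once at the end instead of growing my_data2 with rbind on every iteration. The fetch_one helper name, the core count, and the URL template are my own placeholders; parallel::mclapply relies on forking, so on Windows parLapply with makeCluster() would be needed instead.

library(tidyverse)
library(parallel)

urls <- readLines("InputLinks.csv")

# Placeholder helper: fetch one link and return a tibble, or NULL on error
fetch_one <- function(url) {
  text <- tryCatch(
    readLines(str_c("https://www.google.co.in/search?q=", url, collapse = '')),
    error = function(e) NULL
  )
  if (is.null(text)) return(NULL)
  tibble(document = url, text = text)
}

# Fetch in parallel across the available cores (forking, Linux/macOS only)
results <- mclapply(urls, fetch_one, mc.cores = detectCores() - 1)

# Bind everything once at the end; NULL entries from failed links are dropped
my_data <- bind_rows(results)

I'm not sure whether the bigger win comes from the parallel readLines or simply from replacing the incremental rbind (which copies my_data2 on every iteration) with a single bind_rows at the end, so suggestions on either point are welcome.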