我正在1个内核中并并行运行以下功能:
简单版本似乎可以正常工作,但是在加载库(并行)时会出现错误。您应该能够立即运行以下代码:
# install.packages('stringr')
# install.packages('rvest')
library(stringr)
library(rvest)
scrape_insider_forms <- function(date) {
#### READ WEBPAGE ####
page <- paste0('https://www.secform4.com/', date ,'/selling.htm') %>% read_html()
#### PARSE FULL TABLE #####
## nodes
insider_sales_node <- html_nodes(page, "table")
## table content
data <- html_table(insider_sales_node[[2]], fill = TRUE)
#### PARSE JOB & INVESTMENT TYPE ####
## nodes
positions_node <- html_nodes(page, "span")
## txt content
positions_txt <- as.matrix(html_text(positions_node))
job_title <- as.matrix(positions_txt[!positions_txt[,1] == '(Direct)'
& !positions_txt == '(IndirectDirect)'
& !positions_txt == '(Indirect)'
& !positions_txt == '(DirectIndirect)',])
## direct / indirect investment
dir_indir <- as.matrix(positions_txt[positions_txt[,1] == '(Direct)'
| positions_txt == '(IndirectDirect)'
| positions_txt == '(Indirect)'
| positions_txt == '(DirectIndirect)',])
## remove header row
data <- data[-1,]
## Add jobs and inv type
data$FilerJob <- job_title
data$DirIndirect <- dir_indir
## set matching colnames for output rbind
if (ncol(data) == 12) colnames(data) <- c('TransactionDate', 'ReportedDateTime', 'Company',
'Symbol', 'InsiderRelationship', 'SharesTraded',
'AveragePrice', 'TotalAmount', 'SharesOwned', 'Filing',
'FilerJob', 'DirIndirect')
## store output
insider_sales_MASTER <<- rbind(insider_sales_MASTER, data)
cat('\nFinished ----- ', as.character(date), ' --- ')
}
它在非并行状态下工作良好:
## DOWNLOAD #####
## set up vector to loop/apply over
date_series <- seq(as.Date("2005-01-01"), as.Date("2005-01-10"), "days")
## set up output data frame for function
insider_sales_MASTER <- data.frame()
## loop
test <- lapply(date_series, scrape_insider_forms)
但是并行运行时出现一个奇怪的错误:
library(parallel)
cl <- makeCluster(detectCores()-1)
matrix_of_sums <- parLapply(cl, date_series, scrape_insider_forms)
错误如下:
checkForRemoteErrors(val)中的错误: 3个节点产生错误;第一个错误:找不到函数“%>%”
> sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: i386-w64-mingw32/i386 (32-bit)
Running under: Windows >= 8 x64 (build 9200)
Matrix products: default
locale:
[1] LC_COLLATE=English_Ireland.1252 LC_CTYPE=English_Ireland.1252 LC_MONETARY=English_Ireland.1252 LC_NUMERIC=C
[5] LC_TIME=English_Ireland.1252
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] devtools_1.13.6 snow_0.4-3 future.apply_1.0.1 future_1.9.0 RSelenium_1.7.4 stringr_1.3.0 rvest_0.3.2 xml2_1.2.0
[9] XML_3.98-1.11
loaded via a namespace (and not attached):
[1] Rcpp_0.12.16 magrittr_1.5 rappdirs_0.3.1 R6_2.2.2 httr_1.3.1 globals_0.12.3 caTools_1.17.1 tools_3.5.0 binman_0.1.1
[10] git2r_0.23.0 withr_2.1.2 selectr_0.4-1 semver_0.2.0 subprocess_0.8.3 digest_0.6.15 openssl_1.0.1 yaml_2.1.18 assertthat_0.2.0
[19] codetools_0.2-15 bitops_1.0-6 curl_3.2 memoise_1.1.0 wdman_0.2.4 stringi_1.1.7 compiler_3.5.0 jsonlite_1.5 listenv_0.7.0
>
谢谢。
更新:
感谢指针:
library(doParallel)
## create N-1 cores clusters
cl <- makeCluster(detectCores() - 1, # number of cores to use
type = "PSOCK")
## load the libraries inside the cluster
clusterEvalQ(cl, library(rvest))
clusterExport(cl, 'date_series')
clusterExport(cl, 'insider_sales_MASTER')
matrix_of_sums <- parLapply(cl, date_series, scrape_insider_forms)