我正在运行以下函数:
#' Compute year-over-year cosine similarities between filing texts.
#'
#' Takes the rows of the global `df` whose `filing_date_year` is
#' `year_to_process` or the year before, keeps texts longer than 10000
#' characters, cleans them with `tm`, builds a TF-IDF term-document matrix and
#' derives the document-by-document cosine matrix. The full matrix is saved as
#' an RDS file, and the entries pairing a document `<cik>_<year>` with
#' `<cik>_<year + 1>` are written to a CSV file.
#'
#' Relies on objects defined outside this function: `df`,
#' `working_directory`, `Remove_my_words_before_stem`,
#' `Remove_my_words_after_stem` and `nchar_rm`.
#'
#' @param year_to_process Year to process; it is paired with
#'   `year_to_process - 1`.
#' @return The last printed timing message (a character string), as before.
my_func <- function(year_to_process) {
  previous_year <- year_to_process - 1

  # Build the two-column (doc_id, text) frame in a single pipeline so that no
  # intermediate copy of the raw filings stays alive next to it.
  data_text <- df %>%
    filter(
      filing_date_year %in% c(year_to_process, previous_year),
      nchar(text) > 10000
    ) %>%
    mutate(
      doc_id = paste(cik, filing_date_year, sep = "_"),
      text = tolower(text)
    ) %>%
    select(doc_id, text)

  # Cleaning the text
  t1_text_processing <- Sys.time()
  docs <- VCorpus(DataframeSource(data_text))
  # The corpus now holds the text; release the data frame before the
  # memory-hungry tm_map passes run.
  rm(data_text)

  # No tolower pass here: the text was already lowercased when data_text was
  # built, so repeating it would only cost another full pass over the corpus.
  docs <- tm_map(docs, removeWords, stopwords("english"))
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removeWords, Remove_my_words_before_stem)
  docs <- tm_map(docs, stemDocument, language = "english")
  docs <- tm_map(docs, removeWords, Remove_my_words_after_stem)
  docs <- tm_map(docs, content_transformer(nchar_rm))
  docs <- tm_map(docs, stripWhitespace)
  t2_text_processing <- Sys.time()
  # Fix the unit to seconds; an auto-unit difftime pasted into a message can
  # silently switch between seconds and minutes from one year to the next.
  text_processing_time <- as.numeric(
    difftime(t2_text_processing, t1_text_processing, units = "secs")
  )

  tdm <- TermDocumentMatrix(
    docs,
    control = list(
      weighting = function(x) weightTfIdf(x, normalize = TRUE)
    )
  )
  rm(docs)
  tdm_sparse <- removeSparseTerms(tdm, .99)
  rm(tdm)

  t1_cosine_time <- Sys.time()
  # Per-document sum of squared weights, computed once. The square root of
  # its outer product is ||d_i|| * ||d_j||, the cosine denominator.
  squared_norms <- col_sums(tdm_sparse^2)
  cosine_dist_mat <- crossprod_simple_triplet_matrix(tdm_sparse) /
    sqrt(tcrossprod(squared_norms))
  t2_cosine_time <- Sys.time()
  cosine_time <- as.numeric(
    difftime(t2_cosine_time, t1_cosine_time, units = "secs")
  )
  rm(tdm_sparse, squared_norms)

  saveRDS(
    cosine_dist_mat,
    file = file.path(
      working_directory, "cosine_results", "Cosine", "mgnt_results",
      paste(year_to_process, previous_year, "cosine_matrix_mgnt.rds",
            sep = "_")
    )
  )

  # Pair every document "<id>_<year>" (column) with "<id>_<year + 1>" (row).
  cols <- colnames(cosine_dist_mat)
  rows <- rownames(cosine_dist_mat)
  colsrows <- as.data.frame(cbind(cols, rows)) %>%
    separate(cols, c("id_col", "year_col"), "_", remove = FALSE) %>%
    separate(rows, c("id_row", "year_row"), "_", remove = FALSE) %>%
    mutate(
      year_row_plus_1 = as.numeric(year_row) + 1,
      rows = paste0(id_row, "_", year_row_plus_1)
    ) %>%
    select(cols, rows)

  # Keep only the pairs whose two documents both exist in the matrix.
  ids <- colsrows[
    colsrows$cols %in% colnames(cosine_dist_mat) &
      colsrows$rows %in% rownames(cosine_dist_mat),
  ]
  # Index the matrix by (row name, column name) pairs, hence the column
  # order 2:1 of ids.
  res <- melt(cosine_dist_mat[as.matrix(ids[2:1])])

  ids_res <- as.data.frame(cbind(ids, res)) %>%
    separate(cols, c("id_col", "year_col"), "_", remove = FALSE) %>%
    separate(rows, c("id_row", "year_row"), "_", remove = FALSE) %>%
    mutate(compare_year = paste0(year_col, "_", year_row)) %>%
    setNames(c(
      "cols_mgnt", "id_col_mgnt", "year_col_mgnt", "rows_mgnt",
      "id_row_mgnt", "year_row_mgnt", "value_mgnt", "compare_year_mgnt"
    ))

  write.csv(
    ids_res,
    file = file.path(
      working_directory, "cosine_results", "final_cosine_results",
      paste(year_to_process, previous_year, "cosine_results_mgnt.csv",
            sep = "_")
    )
  )

  print(paste("Text Processing Time", text_processing_time, "for", year_to_process, "and", previous_year))
  print(paste("Cosine Matrix processing Time", cosine_time, "for", year_to_process, "and", previous_year))
}
years_in_data 的内容如下所示:
> years_in_data
[1] 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
我使用 sapply 对每个年份调用该函数:
# Call my_func once for each element of years_in_data; sapply collects the
# values the calls return.
sapply(years_in_data, my_func)
我得到了如下错误:
gsub("[[:space:]]+", " ", x) 中的错误:'Calloc' 无法分配内存(226427 个字节)
该代码在最初的几个年份上运行正常,打印出的耗时如下:
> sapply(years_in_data, text_to_cosine)
[1] "Text Processing Time 6.91588410139084 for 2005 and 2004"
[1] "Cosine Matrix processing Time 5.06124997138977 for 2005 and 2004"
[1] "Text Processing Time 12.2985820492109 for 2006 and 2005"
[1] "Cosine Matrix processing Time 26.73282289505 for 2006 and 2005"
但是在处理 2007 和 2006 这两年的数据时,我遇到了上述 Calloc 内存分配错误。我在网上查阅过类似的问题,有人认为是某个或某些程序包存在 bug,但这不太可能,因为代码在前两组年份上运行正常(到第三组年份才中断)。
对此有何建议?