gsub("[[:space:]]+", " ", x) 中的错误:'Calloc' 无法分配内存(226427 个字节)

时间:2019-06-07 21:34:03

标签: r

我正在运行以下功能:

#' Build and save document cosine-similarity results for a pair of years.
#'
#' Takes the rows of the global `df` whose `filing_date_year` equals
#' `year_to_process` or the year before it, keeps the documents whose `text`
#' is longer than 10000 characters, cleans the text with `tm`, builds a
#' TF-IDF term-document matrix and derives a document-by-document cosine
#' matrix from it. The full matrix is saved as an .rds file; the entries that
#' pair a document id with the same id one year later are written to a CSV.
#'
#' Relies on objects defined outside this function: `df`,
#' `working_directory`, `Remove_my_words_before_stem`,
#' `Remove_my_words_after_stem` and `nchar_rm`.
#'
#' Large intermediates are dropped as soon as they are no longer needed so
#' that the peak memory use of a single call stays as low as possible.
#'
#' @param year_to_process Numeric year; the year before it is processed too.
#' @return The character string returned by the final `print()` call.
my_func <- function(year_to_process) {
  previous_year <- year_to_process - 1

  # Select the two years and keep only sufficiently long documents.
  data <- df %>%
    filter(filing_date_year == year_to_process |
             filing_date_year == year_to_process - 1)
  data <- data[which(nchar(data$text) > 10000), ]
  data_text <- data %>%
    mutate(doc_id = paste(cik, filing_date_year, sep = "_"),
           text = tolower(text)) %>%
    select(doc_id, text) %>%
    setNames(c("doc_id", "text"))
  rm(data)

  # Cleaning the text
  t1_text_processing <- Sys.time()
  docs <- VCorpus(DataframeSource(data_text))
  rm(data_text)
  docs <- tm_map(docs, removeWords, stopwords("english"))
  # The text was already lower-cased in mutate() above, so a second
  # tolower pass over the whole corpus is not needed.
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removeWords, Remove_my_words_before_stem)
  docs <- tm_map(docs, stemDocument, language = "english")
  docs <- tm_map(docs, removeWords, Remove_my_words_after_stem)
  docs <- tm_map(docs, content_transformer(nchar_rm))
  docs <- tm_map(docs, stripWhitespace)
  t2_text_processing <- Sys.time()
  text_processing_time <- t2_text_processing - t1_text_processing

  tdm <- TermDocumentMatrix(docs, control = list(weighting = function(x)
    weightTfIdf(x, normalize = TRUE)))
  rm(docs)
  tdm_sparse <- removeSparseTerms(tdm, .99)
  rm(tdm)
  # Release the freed objects before the dense n x n matrices are allocated.
  invisible(gc())

  # Cosine similarity: cross-product divided by the product of document norms.
  t1_cosine_time <- Sys.time()
  doc_sq_norms <- col_sums(tdm_sparse^2)
  cosine_dist_mat <- crossprod_simple_triplet_matrix(tdm_sparse) /
    sqrt(doc_sq_norms %*% t(doc_sq_norms))
  t2_cosine_time <- Sys.time()
  cosine_time <- t2_cosine_time - t1_cosine_time
  rm(tdm_sparse, doc_sq_norms)

  saveRDS(
    cosine_dist_mat,
    file = file.path(
      working_directory, "cosine_results", "Cosine", "mgnt_results",
      paste(year_to_process, previous_year, "cosine_matrix_mgnt.rds",
            sep = "_")
    )
  )

  cols <- colnames(cosine_dist_mat)
  rows <- rownames(cosine_dist_mat)
  colsrows <- as.data.frame(cbind(cols, rows))

  # For every document "<id>_<year>" build the name "<id>_<year + 1>".
  colsrows <- colsrows %>%
    separate(cols, c("id_col", "year_col"), "_", remove = FALSE) %>%
    separate(rows, c("id_row", "year_row"), "_", remove = FALSE) %>%
    mutate(year_row_plus_1 = as.numeric(year_row) + 1,
           rows = paste0(id_row, "_", year_row_plus_1)) %>%
    select(cols, rows)

  # Keep only the pairs whose two names both exist in the matrix, then look
  # the values up by name; ids[2:1] puts the columns in (row, column) order.
  ids <- colsrows[colsrows$cols %in% colnames(cosine_dist_mat) &
                    colsrows$rows %in% rownames(cosine_dist_mat), ]
  res <- melt(cosine_dist_mat[as.matrix(ids[2:1])])
  ids_res <- as.data.frame(cbind(ids, res))

  ids_res <- ids_res %>%
    separate(cols, c("id_col", "year_col"), "_", remove = FALSE) %>%
    separate(rows, c("id_row", "year_row"), "_", remove = FALSE) %>%
    mutate(compare_year = paste0(year_col, "_", year_row)) %>%
    setNames(c("cols_mgnt", "id_col_mgnt", "year_col_mgnt", "rows_mgnt",
               "id_row_mgnt", "year_row_mgnt", "value_mgnt",
               "compare_year_mgnt"))

  write.csv(
    ids_res,
    file = file.path(
      working_directory, "cosine_results", "final_cosine_results",
      paste(year_to_process, previous_year, "cosine_results_mgnt.csv",
            sep = "_")
    )
  )

  # format() keeps the difftime units (secs/mins/...) in the message; pasting
  # the raw difftime would print a bare number with no unit.
  print(paste("Text Processing Time", format(text_processing_time),
              "for", year_to_process, "and", previous_year))
  print(paste("Cosine Matrix processing Time", format(cosine_time),
              "for", year_to_process, "and", previous_year))
}

years_in_data如下所示:

> years_in_data
 [1] 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014

我应用了sapply

# Call my_func once for each element of years_in_data; sapply collects the
# value returned by each call.
sapply(years_in_data, my_func)

我得到了错误:

  

gsub("[[:space:]]+", " ", x) 中的错误:'Calloc' 无法分配内存(226427 个字节)

该代码在开始的几年中有效,其时间打印如下:

> sapply(years_in_data, text_to_cosine)
[1] "Text Processing Time 6.91588410139084 for 2005 and 2004"
[1] "Cosine Matrix processing Time 5.06124997138977 for 2005 and 2004"
[1] "Text Processing Time 12.2985820492109 for 2006 and 2005"
[1] "Cosine Matrix processing Time 26.73282289505 for 2006 and 2005"

但是在处理 2007 和 2006 这两年时,我收到了内存 Calloc 错误。我在网上读过类似的问题,有人认为是一个或多个程序包中存在错误,但这不太可能,因为代码在前两年都能正常运行(到第 3 年才中断)。

对此有何建议?

0 个答案:

没有答案