Question

我是R的新手。我有一个包含139列和超过46.5k行的数据集。我已经测量了数据集中行之间的成对余弦相似性矩阵，其中一行将与其他行的其余行进行比较，并且在下一次迭代期间将被排除，并且该过程将继续用于数据集的其余部分。这种实现方式适用于小样本数据集，例如有500行。但是，当我尝试使用整个数据集（46k）时，它变得讨厌（我已经等了将近30个小时运行代码但没有输出）。这是我迄今为止的实施：

library(reshape2)
library(lsa)


psm_sample <- read.csv("psm_final_sample.csv")
numRows = nrow(psm_sample)


##################################

normalize <- function(x) {
  return ( (2 * ((x - min(x)) / (max(x) - min(x))) ) - 1 )
}

##################################
cat_normalize <- function(x) {

  norm <-  ( (2 * ((x - min(x)) / (max(x) - min(x))) ) - 1 )
  return (ifelse(norm < 0 , -1, 1))
}

#############################

cat_gender <- function (sex){
  sex <- as.character(sex)

  if( sex == 'M' ) {
    return (as.integer(1))
  }
  else{
    return(as.integer(2))
  }
}

##################################

cat_admsn_type <- function (type){
  type <- as.character(type)

  if( type == 'EMERGENCY' ) {
    return(as.integer(1))
  }
  else if ( type == 'URGENT'){ 
    return(as.integer(2))
  }
  else{
    return(as.integer(3))
  }
}

#############################

cat_first_icu <- function (ficu){
  type <- as.character(ficu)

  if( ficu == 'CCU' ) {
    return(as.integer(1))
  }
  else if ( ficu == 'CSRU'){ 
    return(as.integer(2))
  }
  else if ( ficu == 'MICU'){ 
    return(as.integer(3))
  }
  else if ( ficu == 'NICU'){ 
    return(as.integer(4))
  }
  else if ( ficu == 'SICU'){ 
    return(as.integer(5))
  }
  else{
    return(as.integer(6))
  }
}

##################################

cat_last_icu <- function (licu){
  type <- as.character(licu)

  if( licu == 'CCU' ) {
    return(as.integer(1))
  }
  else if ( licu == 'CSRU'){ 
    return(as.integer(2))
  }
  else if ( licu == 'MICU'){ 
    return(as.integer(3))
  }
  else if ( licu == 'NICU'){ 
    return(as.integer(4))
  }
  else if ( licu == 'SICU'){ 
    return(as.integer(5))
  }
  else{
    return(as.integer(6))
  }
}

#################################################################################

gender <- sapply(psm_sample$gender,cat_gender)
admission_type <- sapply(psm_sample$admission_type,cat_admsn_type)
first_icu_service_type <- sapply(psm_sample$first_icu_service_type,cat_first_icu)
last_icu_service_type <- sapply(psm_sample$last_icu_service_type,cat_last_icu)

################################################################################

psm_sample_cont_norm_df <- as.data.frame(lapply(psm_sample[8:138], normalize))
psm_sample_cat_df <- data.frame(gender,admission_type,first_icu_service_type,last_icu_service_type)
psm_sample_cat_norm_df <- as.data.frame(lapply(psm_sample_cat_df, cat_normalize))

psm_temp_df <- cbind.data.frame(psm_sample[1], psm_sample_cat_norm_df, psm_sample_cont_norm_df)


row.names(psm_temp_df ) <- make.names(paste0("patid.", as.character(1:nrow(psm_temp_df ))))
psm_final_df <- psm_temp_df[2:136]

###############################################################################


#mycosine <- function(x,y){
#c <- sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y)))
  #return(c)
#}

#cosinesim <- function(x) {
  # initialize similarity matrix
  #m <- matrix(NA, nrow=ncol(x),ncol=ncol(x),dimnames=list(colnames(x),colnames(x)))
  #cos <- as.data.frame(m)

  #for(i in 1:ncol(x)) {
    #for(j in i:ncol(x)) {
      #co_rate_1 <- x[which(x[,i] & x[,j]),i]
      #co_rate_2 <- x[which(x[,i] & x[,j]),j]  
      #cos[i,j]= mycosine(co_rate_1,co_rate_2)
      #cos[j,i]=cos[i,j]        
    #}
  #}
  #return(cos)
#}

cs <- lsa::cosine(t(psm_final_df))

cs_round <-round(cs,digits = 2)



#cs_norm <- as.data.frame(lapply(cs,normalize))
#print(cs_norm)
#print(cs_round)

##########################################

numCols = 3;
totalROws = (numRows * (numRows-1)) / 2;
result <- matrix(nrow = totalROws, ncol = numCols)
#result<- big.matrix( nrow = totalROws, ncol = numCols, type = "double",shared = TRUE)
#options(bigmemory.allow.dimnames=TRUE)

colnames(result) <- c("PatA","PatB","Similarity")

index = 1;
for (i in 1:nrow(cs_round)) {
  patA = rownames(cs_round)[i]
  for (j in i:ncol(cs_round)) {
    if (j > i) {
      patB = colnames(cs_round)[j]
      result[index, 1] = patA
      result[index, 2] = patB
      result[index, 3] = cs_round[i,j]

      index = index + 1;
    }
  }
}

print(result)

write.csv(result, file = "C:/cosine/output.csv", row.names = F)
#ord_result<-result[order(result[,3],decreasing=TRUE),]
#print(ord_result)

在这种情况下，我可以将数据集拆分为最高的10个子集。然后，每个数据集中将有4650行。因此，对于4650行，它仍然是一个非常大的矩阵计算，我必须等待很长时间才能输出。

我已经尝试过使用大内存，ff和矩阵包来实现这个实现，但是根据我的知识没有取得丰硕成果。

任何类型的建议或代码修改或如何有效地完成它对我都非常有帮助。

注意：我的机器有8GBDDR3 RAM和i3处理器，时钟速度为2.10GHz。我使用的是64位R studio。

整个数据集的链接（46.5 KRows - psm_final_without_null.csv）＆gt;＆gt; https://1drv.ms/u/s!AhoddsPPvdj3hVVFC-yl1tDKEfo8

样本数据集的链接（4700行 - psm_final_sample.csv）＆gt;＆gt; https://1drv.ms/u/s!AhoddsPPvdj3hVjrNTgkV0noqMk8

Answer 1

有很多空间可以优化代码/算法。仅举几例：

co_rate_1 <- x[which(x[,i] & x[,j]),i]
co_rate_2 <- x[which(x[,i] & x[,j]),j]

主要的计算负担是which函数，显然你不需要计算两次，btw which通常是一个慢函数，在计算中使用它通常不是一个好主意密集的代码。更新：我认为此处不需要which，您可以安全地将其删除。

来自cosinesim的结果矩阵是对称矩阵，这意味着您实际上只需要计算一半的元素。

你在函数中使用的for循环构成了一个“令人尴尬的并行”问题，这意味着你可以从一些简单的并行函数实现中受益，例如mclapply。

另外我相信在Rcpp中重写cosinesim会有很大的帮助。

R中非常大的矩阵计算

1 个答案: