加速循环中的代码

时间:2017-01-19 06:11:13

标签: r performance optimization

我有一个包含超过2百万条记录的数据框。由于数据安全原因,我只分享了几条记录。我希望你们能理解我的理由。

data <- data[order(data$email_address_hash),]
skip_row <- c()
data$hash_time <- rep('NA',NROW(data)) #adding new column to our data
rownames(data) <- as.character(1:NROW(data))
dput(droplevels(data))
structure(list(email_address_hash = structure(c(2L, 1L, 1L, 2L
), .Label = c("0004eca7b8bed22aaf4b320ad602505fe9fa9d26", "35c0ef2c2a804b44564fd4278a01ed25afd887f8"
), class = "factor"), open_time = structure(c(2L, 1L, 3L, 4L), .Label = c(" 04:39:24", 
" 09:57:20", " 10:39:43", " 19:00:09"), class = "factor")), .Names = c("email_address_hash", 
"open_time"), row.names = c(41107L, 47808L, 3973L, 8307L), class = "data.frame")

str(data)
'data.frame':   4 obs. of  2 variables:
 $ email_address_hash: Factor w/ 36231 levels "00012aec4ca3fa6f2f96cf97fc2a3440eacad30e",..: 7632 2 2 7632
 $ open_time         : Factor w/ 34495 levels " 00:00:03"," 00:00:07",..: 15918 5096 16971 24707
.

skip_row <- c()


data$hash_time <- rep('NA',NROW(data)) #adding new column to our data

rownames(data) <- as.character(1:NROW(data))

for(i in 1:NROW(data)){
#Skipping the email_address_hash that was already used for grouping 
  if(i %in% skip_row) next   
  hash_row_no <- c()
#trimming data so that we don't need to look into whole dataframe 


trimmed_data <- data[i:NROW(data),] 
  # Whenever we search for email_address_hash the previous one was ignored or removed from the check 

  #extracting rownames so that we can used that as rownumber inside the skip_row
  hash_row_no <-  rownames(trimmed_data[trimmed_data$email_address_hash==trimmed_data$email_address_hash[1],])

#note: - 我们知道b / w rownames和rownumber

之间的区别
#converting rownames into numeric so that we can use them as rowno      
hash_row_no <- as.numeric(hash_row_no) 


  first_no <- hash_row_no[1]
  last_no <- hash_row_no[NROW(hash_row_no)]
  skip_row <- append(skip_row,hash_row_no)

  data$hash_time[first_no] <- paste(data$open_time[first_no:last_no], collapse = "")
}

请注意我也尝试了以下方法来加快这个过程,但这似乎是无效的

hash_row_no <-  rownames(trimmed_data[trimmed_data$email_address_hash==trimmed_data$email_address_hash[1],])

将数据帧转换为data.table

setDT(data)

执行任一操作给出相似的时间

system.time(rownames(trimmed_data[trimmed_data$email_address_hash==trimmed_data$email_address_hash[1],]))

system.time(rownames(trimmed_data)[trimmed_data[["email_address_hash"]] == trimmed_data$email_address_hash[1]])

你们可以帮我加速我的代码,因为我的数据包含超过2百万条记录,并且需要超过30分钟甚至更长时间吗?

1 个答案:

答案 0 :(得分:1)

显然你想这样做:

library(data.table)
setDT(data)
data[, .(open_times = paste(open_time, collapse = "")), by = email_address_hash]
#                         email_address_hash         open_times
#1: 35c0ef2c2a804b44564fd4278a01ed25afd887f8  09:57:20 19:00:09
#2: 0004eca7b8bed22aaf4b320ad602505fe9fa9d26  04:39:24 10:39:43

或者可能这样:

data[email_address_hash == "0004eca7b8bed22aaf4b320ad602505fe9fa9d26", 
     paste(open_time, collapse = "")]
#[1] " 04:39:24 10:39:43"