我有一个包含超过2百万条记录的数据框。由于数据安全原因,我只分享了几条记录。我希望你们能理解我的理由。
data <- data[order(data$email_address_hash),]
skip_row <- c()
data$hash_time <- rep('NA',NROW(data)) #adding new column to our data
rownames(data) <- as.character(1:NROW(data))
dput(droplevels(data))
structure(list(email_address_hash = structure(c(2L, 1L, 1L, 2L
), .Label = c("0004eca7b8bed22aaf4b320ad602505fe9fa9d26", "35c0ef2c2a804b44564fd4278a01ed25afd887f8"
), class = "factor"), open_time = structure(c(2L, 1L, 3L, 4L), .Label = c(" 04:39:24",
" 09:57:20", " 10:39:43", " 19:00:09"), class = "factor")), .Names = c("email_address_hash",
"open_time"), row.names = c(41107L, 47808L, 3973L, 8307L), class = "data.frame")
str(data)
'data.frame': 4 obs. of 2 variables:
$ email_address_hash: Factor w/ 36231 levels "00012aec4ca3fa6f2f96cf97fc2a3440eacad30e",..: 7632 2 2 7632
$ open_time : Factor w/ 34495 levels " 00:00:03"," 00:00:07",..: 15918 5096 16971 24707
.
skip_row <- c()
data$hash_time <- rep('NA',NROW(data)) #adding new column to our data
rownames(data) <- as.character(1:NROW(data))
for(i in 1:NROW(data)){
#Skipping the email_address_hash that was already used for grouping
if(i %in% skip_row) next
hash_row_no <- c()
#trimming data so that we don't need to look into whole dataframe
trimmed_data <- data[i:NROW(data),]
# Whenever we search for email_address_hash the previous one was ignored or removed from the check
#extracting rownames so that we can used that as rownumber inside the skip_row
hash_row_no <- rownames(trimmed_data[trimmed_data$email_address_hash==trimmed_data$email_address_hash[1],])
#note: - 我们知道b / w rownames和rownumber
之间的区别#converting rownames into numeric so that we can use them as rowno
hash_row_no <- as.numeric(hash_row_no)
first_no <- hash_row_no[1]
last_no <- hash_row_no[NROW(hash_row_no)]
skip_row <- append(skip_row,hash_row_no)
data$hash_time[first_no] <- paste(data$open_time[first_no:last_no], collapse = "")
}
hash_row_no <- rownames(trimmed_data[trimmed_data$email_address_hash==trimmed_data$email_address_hash[1],])
setDT(data)
system.time(rownames(trimmed_data[trimmed_data$email_address_hash==trimmed_data$email_address_hash[1],]))
system.time(rownames(trimmed_data)[trimmed_data[["email_address_hash"]] == trimmed_data$email_address_hash[1]])
你们可以帮我加速我的代码,因为我的数据包含超过2百万条记录,并且需要超过30分钟甚至更长时间吗?
答案 0 :(得分:1)
显然你想这样做:
library(data.table)
setDT(data)
data[, .(open_times = paste(open_time, collapse = "")), by = email_address_hash]
# email_address_hash open_times
#1: 35c0ef2c2a804b44564fd4278a01ed25afd887f8 09:57:20 19:00:09
#2: 0004eca7b8bed22aaf4b320ad602505fe9fa9d26 04:39:24 10:39:43
或者可能这样:
data[email_address_hash == "0004eca7b8bed22aaf4b320ad602505fe9fa9d26",
paste(open_time, collapse = "")]
#[1] " 04:39:24 10:39:43"