我把一些脚本从一台运行OS X 10.6.8的旧iMac(4GB内存,2.53GHz Core 2 Duo处理器,约2008年的机型)迁移到了一台新的AWS虚拟机上,该虚拟机运行64位Windows Server 2008,拥有15GB内存。尽管硬件更好,以下代码在新的Windows机器上的运行时间却是原来的4倍。
我已经更改了一些周围的代码,但作为瓶颈的这部分代码没有变化。输入变量(`unique_dates`、`summary_data`、`in_data`)在两台计算机上的大小也相近。
在Windows机器上,每个 `date` 循环大约需要30秒,而之前在Mac上只需要5秒。
非常感谢任何有关如何提高速度的建议。
*更新*
从正在发表的评论中我得到的印象是,有一种更快的方法可以在没有循环的情况下执行此操作,这使得原始问题变得毫无意义。
我正在努力实现以下目标:我有许多公司的时间序列数据。对于每个日期,每家公司都有1个月、3个月、6个月、12个月、18个月和24个月的回报。每家公司还被分配到一个小组(fractile)。对于每个日期,我想计算每个小组的1个月、3个月、6个月等回报的平均值和中位数,
然后将结果存储在预先分配的 `summary_data` 数据框中。
实际数据包含343个日期,每个日期有500家公司,这些公司被分到25个小组(fractile)中。最慢的部分是 `ss_in_data <- subset(in_data, ...)`,它需要几秒钟,而其余部分总共只需约0.02秒。
我刚接触R不久,但在MySQL中,我可以使用 `MEAN` 和 `GROUP BY DATE, FRACTILE` 来计算这个。在R中是否有类似的方法?
再次感谢您的帮助。
下面先给出示例数据,之后是我当前的代码:
# Sample data ----
# 12 companies observed on 3 month-end dates. Every company belongs to one of
# 3 fractiles, so each fractile holds 4 companies per date.

# Input data: one row per (date, company)
date1 <- as.Date("2010-01-31", format = "%Y-%m-%d")
date2 <- as.Date("2010-02-28", format = "%Y-%m-%d")
date3 <- as.Date("2010-03-31", format = "%Y-%m-%d")
dates <- rep(c(date1, date2, date3), each = 12)
coys <- rep(
  c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"),
  times = 3
)
ret_3mth <- rep(seq(from = 0.1, by = 0.2, length.out = 12), times = 3)
ret_6mth <- rep(seq(from = 0.2, by = 0.2, length.out = 12), times = 3)
ret_12mth <- rep(seq(from = 0.3, by = 0.2, length.out = 12), times = 3)
# Fractiles cycle 1, 2, 3 across the 12 companies of every date
fractiles <- rep(c(1, 2, 3), times = 12)
in_data <- data.frame(dates, coys, ret_3mth, ret_6mth, ret_12mth, fractiles)

# Summary frame: one row per (date, fractile), results start out as NA.
# Note that `dates` and `fractiles` are deliberately re-bound here.
dates <- rep(c(date1, date2, date3), each = 3)
fractiles <- rep(c(1, 2, 3), times = 3)
mean_3mth <- mean_6mth <- mean_12mth <- rep(NA, 9)
summary <- data.frame(dates, fractiles, mean_3mth, mean_6mth, mean_12mth)

# Other variables used by the loop
unique_dates <- unique(dates)
num_fract_curr <- 3
目前的方法:
# Fill `summary_data` with per-date, per-fractile mean and median returns.
#
# For each date in `unique_dates` and each fractile 1..num_fract_curr, the
# matching rows of `in_data` are reduced to the mean and median (NAs removed)
# of every ret_<horizon> column, and the results are written to the
# mean<horizon> / median<horizon> columns of the matching `summary_data` rows.
#
# Names used here but defined outside this snippet: counter, write_line_freq,
# run_name, roce, num_dates, start_time, format.timediff(), summary_data,
# in_data, summary_data_fractile, in_data_fractile.
# NOTE(review): assumes `date_base` is a Date column of both data frames and
# that the mean*/median* columns are pre-allocated in `summary_data` --
# confirm against the code that builds them.
return_horizons <- c("1mth", "3mth", "6mth", "12mth", "18mth", "24mth")

for (date in unique_dates) {
  # `for` drops the Date class and yields the underlying number, so rebuild
  # the Date once per iteration instead of at every use.
  current_date <- as.Date(date, origin = "1970-01-01")

  # Only write to screen every x loops (as set by write_line_freq)
  if (counter %% write_line_freq == 0) {
    writeLines(paste0(
      run_name, " : Summary calcs ROCE ", roce, " Date: ", current_date,
      " ", counter, ":", num_dates, " Time: ", format.timediff(start_time)
    ))
  }
  counter <- counter + 1

  # The date masks do not depend on the fractile: compute them once per date
  # rather than once per (date, fractile) pair.
  in_on_date <- in_data[["date_base"]] == current_date
  summary_on_date <- summary_data[["date_base"]] == current_date

  for (i in seq_len(num_fract_curr)) {
    # `[[` extracts the fractile column as a plain vector; which() drops NA
    # comparisons, the same way subset() ignores them.
    in_rows <- which(in_on_date & in_data[[in_data_fractile]] == i)
    summary_rows <- which(
      summary_on_date & summary_data[[summary_data_fractile]] == i
    )

    # Nothing to aggregate, or nowhere to store it: leave the NAs in place
    if (length(in_rows) == 0 || length(summary_rows) == 0) {
      next
    }

    for (horizon in return_horizons) {
      returns <- in_data[[paste0("ret_", horizon)]][in_rows]
      summary_data[summary_rows, paste0("mean", horizon)] <-
        mean(returns, na.rm = TRUE)
      summary_data[summary_rows, paste0("median", horizon)] <-
        median(returns, na.rm = TRUE)
    }
  }
}
答案 0 :(得分:3)
我没有查明速度差异的根本原因,但原来的写法确实是在R中执行这种计算的一种荒谬方式。
我改用了@Roland建议的 `ddply`。
计算时间从12小时减少到几秒钟。
感谢大家的帮助。
新方法:
# Grouped aggregation that replaces the nested date/fractile loops.
# Splits `in_data` by `date_base` and by the column named in `fractile_name`,
# then computes the NA-removed mean and median of each return horizon for
# every group. The result has one row per group.
# NOTE(review): ddply() and summarise are presumably plyr's -- make sure plyr
# is attached (and not masked by dplyr's summarise) before running this.
summary_data <- ddply(
  .data = in_data,
  .variables = c("date_base", fractile_name),
  .fun = summarise,
  # Means per horizon
  mean_1mth = mean(ret_1mth, na.rm = TRUE),
  mean_3mth = mean(ret_3mth, na.rm = TRUE),
  mean_6mth = mean(ret_6mth, na.rm = TRUE),
  mean_12mth = mean(ret_12mth, na.rm = TRUE),
  mean_18mth = mean(ret_18mth, na.rm = TRUE),
  mean_24mth = mean(ret_24mth, na.rm = TRUE),
  # Medians per horizon
  median_1mth = median(ret_1mth, na.rm = TRUE),
  median_3mth = median(ret_3mth, na.rm = TRUE),
  median_6mth = median(ret_6mth, na.rm = TRUE),
  median_12mth = median(ret_12mth, na.rm = TRUE),
  median_18mth = median(ret_18mth, na.rm = TRUE),
  median_24mth = median(ret_24mth, na.rm = TRUE)
)