为什么在Windows Server 2008上R脚本运行速度比OSX 10.6慢?

时间:2014-01-31 12:00:16

标签: windows r performance macos

我把一些脚本从一台运行OS X 10.6.8的旧iMac(4GB内存,2.53GHz Core 2 Duo处理器,约2008年的机型)迁移到了一台运行Windows Server 2008的新AWS虚拟机(64位,15GB内存)。尽管如此,以下代码在新的Windows机器上的运行时间却是原来的4倍。

我已经更改了一些周围的代码,但作为瓶颈的这部分代码是相同的。输入变量(unique_dates、summary_data、in_data)在两台计算机上的大小也相近。

在Windows机器上,每个date的循环迭代大约需要30秒,而之前在Mac上只需5秒钟。

非常感谢任何有关如何提高速度的建议。

*更新*

从正在发表的评论中我得到的印象是,有一种更快的方法可以在没有循环的情况下执行此操作,这使得原始问题变得毫无意义。

我正在努力实现以下目标: 我有许多公司的时间序列数据。对于每个日期,每家公司都有1个月,3个月,6个月,12个月,18个月和24个月的回报。每家公司也被分配到一个小组(fractile)。对于每个日期,我想计算每个fractile的1mth,3mth,6mth等回报的平均值和中位数。

然后将结果存储在预先分配的summary_data数据帧中。

实际数据包含343个日期,每个日期有500家公司,25家公司可以满足要求。最慢的部分是ss_in_data <- subset(in_data, ...),它需要几秒钟,而其余部分总共只需约0.02秒。

我刚接触R,但在MYSQL中,我可以使用MEAN和GROUP BY DATE, FRACTILE来计算这个。在R中是否有类似的方法?

再次感谢您的帮助。

下面先给出示例数据,随后是我当前的代码:

# Sample data: 12 companies observed on 3 dates. Every company belongs to
# exactly one of 3 fractiles, so each fractile holds 4 companies.

# ---- Input data ----
date1 <- as.Date("2010-01-31")
date2 <- as.Date("2010-02-28")
date3 <- as.Date("2010-03-31")

# One row per (date, company): each date is repeated for all 12 companies.
dates <- rep(c(date1, date2, date3), each = 12)

coys <- rep(LETTERS[1:12], times = 3)
ret_3mth <- rep(seq(from = 0.1, by = 0.2, length.out = 12), times = 3)
ret_6mth <- rep(seq(from = 0.2, by = 0.2, length.out = 12), times = 3)
ret_12mth <- rep(seq(from = 0.3, by = 0.2, length.out = 12), times = 3)
# Fractiles cycle 1, 2, 3 across the companies within every date.
fractiles <- rep(c(1, 2, 3), times = 12)

in_data <- data.frame(
  dates = dates,
  coys = coys,
  ret_3mth = ret_3mth,
  ret_6mth = ret_6mth,
  ret_12mth = ret_12mth,
  fractiles = fractiles
)

# ---- Pre-allocated summary data frame ----
# One row per (date, fractile); `dates` and `fractiles` are deliberately
# overwritten here with the shorter summary-sized vectors.
dates <- rep(c(date1, date2, date3), each = 3)
fractiles <- rep(c(1, 2, 3), times = 3)
mean_3mth <- mean_6mth <- mean_12mth <- rep(NA, times = 9)
summary <- data.frame(
  dates = dates,
  fractiles = fractiles,
  mean_3mth = mean_3mth,
  mean_6mth = mean_6mth,
  mean_12mth = mean_12mth
)

# ---- Other variables ----
unique_dates <- unique(dates)
num_fract_curr <- 3

目前的方法:

# For each date and each fractile, compute the mean and median of every return
# horizon over the matching rows of `in_data` and store them in the matching
# rows of the pre-allocated `summary_data`.
#
# Uses objects defined outside this snippet: unique_dates, num_fract_curr,
# in_data, summary_data, in_data_fractile, summary_data_fractile, counter,
# write_line_freq, run_name, roce, num_dates, start_time and format.timediff().

# Suffixes shared by the ret_<h>, mean<h> and median<h> columns.
horizons <- c("1mth", "3mth", "6mth", "12mth", "18mth", "24mth")

for (date in unique_dates) {
  # `for` hands back the bare day count rather than a Date, so rebuild the
  # Date once per iteration instead of once per use.
  current_date <- as.Date(date, origin = "1970-01-01")

  # Only write to screen every x loops (as set by write_line_freq)
  if (counter %% write_line_freq == 0) {
    writeLines(paste0(
      run_name, " : Summary calcs ROCE ", roce,
      "  Date: ", current_date,
      " ", counter, ":", num_dates,
      "  Time: ", format.timediff(start_time)
    ))
  }
  counter <- counter + 1

  # The date half of both row filters is the same for every fractile, so it is
  # computed once per date.
  # NOTE(review): the original write-back used `summary_data$date`, which only
  # resolves through `$` partial matching; this assumes the column is
  # `date_base`, as in the original subset() filter - confirm.
  summary_on_date <- summary_data$date_base == current_date
  in_on_date <- in_data$date_base == current_date

  # seq_len() yields an empty loop when num_fract_curr is 0 (1:0 would not).
  for (i in seq_len(num_fract_curr)) {
    # which() drops NA comparisons, as subset() did, and keeps the row
    # assignment below from failing on NA subscripts.
    summary_rows <- which(
      summary_on_date & summary_data[[summary_data_fractile]] == i
    )
    in_rows <- which(in_on_date & in_data[[in_data_fractile]] == i)

    # Nothing to summarise for this date/fractile combination.
    if (length(in_rows) == 0) {
      next
    }

    # Write each statistic straight into summary_data; no intermediate
    # data-frame copies are built and written back.
    for (h in horizons) {
      returns <- in_data[[paste0("ret_", h)]][in_rows]
      summary_data[summary_rows, paste0("mean", h)] <-
        mean(returns, na.rm = TRUE)
      summary_data[summary_rows, paste0("median", h)] <-
        median(returns, na.rm = TRUE)
    }
  }
}

1 个答案:

答案 0 :(得分:3)

我没有查明速度差异的根本原因,但原来的做法是一种在R中执行此计算的荒谬方法。

应该改用@Roland建议的ddply。

计算时间从12小时减少到几秒钟。

感谢大家的帮助。

新方法:

 # Split in_data by date and fractile, then reduce every group to the mean and
 # median of each return horizon, ignoring NAs.
 # `fractile_name` (the name of the fractile column) is defined outside this
 # snippet; ddply()/summarise presumably come from plyr, whose library() call
 # is not visible here.
 summary_data <- ddply(
   .data = in_data,
   .variables = c("date_base", fractile_name),
   .fun = summarise,
   # Means, one per horizon.
   mean_1mth = mean(ret_1mth, na.rm = TRUE),
   mean_3mth = mean(ret_3mth, na.rm = TRUE),
   mean_6mth = mean(ret_6mth, na.rm = TRUE),
   mean_12mth = mean(ret_12mth, na.rm = TRUE),
   mean_18mth = mean(ret_18mth, na.rm = TRUE),
   mean_24mth = mean(ret_24mth, na.rm = TRUE),
   # Medians, one per horizon.
   median_1mth = median(ret_1mth, na.rm = TRUE),
   median_3mth = median(ret_3mth, na.rm = TRUE),
   median_6mth = median(ret_6mth, na.rm = TRUE),
   median_12mth = median(ret_12mth, na.rm = TRUE),
   median_18mth = median(ret_18mth, na.rm = TRUE),
   median_24mth = median(ret_24mth, na.rm = TRUE)
 )