麻烦加快算法

时间:2017-09-20 13:51:23

标签: r performance dataframe

我在R中制作了一个算法,将多个传感器读数组合在一个时间戳下。

大多数传感器读数每500毫秒读一次,但有些传感器仅报告变化。因此,我必须制作一个算法,在给定时间内获取传感器的最后已知值。

现在算法有效,但速度太快,以至于当我开始将它用于实际的20多个传感器时,它需要很长时间才能完成。我的假设是,由于我使用数据帧或我访问和移动数据的方式,它很慢。

我尝试过一次只通过每个数据帧一次,而不是为每个时间戳迭代它们。我还预先分配了数据所需的所有空间。

非常欢迎任何建议。我是R语言的新手,所以我真的不知道哪些数据类型很慢,哪些数据类型很快。

library(tidyverse)
library(tidytext)
library(stringr)
library(readr)
library(dplyr)
library(pracma)    

# take a list of dataframes as a parameter
generalise_data <- function(dataframes, timeinterval){
  if (typeof(dataframes) == "list"){
    # get the biggest and smallest datetime stamp from every dataframe
    # this will be used to calculate the size of the resulting frame ((largest time - smallest time)/1000 = dataframe rows)
    # this means one value every second

    largest_time <- 0
    smallest_time <- as.numeric(Sys.time())*1000 # everything will be smaller than the current time
    for (i in 1:length(dataframes)){
      dataframe_max <- max(dataframes[[i]]$TIMESTAMP)
      dataframe_min <- min(dataframes[[i]]$TIMESTAMP)

      if (dataframe_max > largest_time) largest_time <- dataframe_max
      if (dataframe_min < smallest_time) smallest_time <- dataframe_min
    }

    # result dataframe wil have ... rows
    result.size <- floor((largest_time - smallest_time)/timeinterval)
    sprintf("Result size: %i", result.size)

    # create a numeric array that contains the indexes of every dataframe, all set to 1
    dataframe_indexes <- numeric(length(dataframes))
    dataframe_indexes[dataframe_indexes == 0] <- 1

    # data vectors for the dataframe
    result.timestamps <- numeric(result.size)
    result <- list(result.timestamps)
    for (i in 2:(length(dataframes)+1)) result[[i]] <- numeric(result.size) # add an empty vector for every datapoint

    # use progressbar
    pb <- txtProgressBar(1, result.size, style = 3)

    # make a for loop to run through every data row of the resulting data frame (creating a row every run through)
    # every run through increase the index of dataframes until the resulting row exceeds the result rows timestamp, than go one index back
    #for (i in 1:200){
    for (i in 1:result.size){
      current_timestamp <- smallest_time + timeinterval*(i-1)
      result[[1]][i] <- current_timestamp

      for (i2 in 1:length(dataframes)){
        while (dataframes[[i2]]$TIMESTAMP[dataframe_indexes[i2]] < current_timestamp && dataframes[[i2]]$TIMESTAMP[dataframe_indexes[i2]] != max(dataframes[[i2]]$TIMESTAMP)){
            dataframe_indexes[i2] <- dataframe_indexes[i2]+1
        }

        if (dataframe_indexes[i2] > 1){
            dataframe_indexes[i2] <- dataframe_indexes[i2]-1 # take the one that's smaller
        }

        result[[i2+1]][i] <- dataframes[[i2]]$VALUE[dataframe_indexes[i2]]
      }

      setTxtProgressBar(pb, i)
    }

    close(pb)

    result.final <- data.frame(result)

    return(result.final)
  } else {
    return(NA)
  }
}

1 个答案:

答案 0 :(得分:1)

我今天通过将每个数据帧更改为矩阵来修复它。代码运行时间为9.5秒而不是70分钟。

结论:数据帧对性能非常不利。

library(tidyverse)
library(tidytext)
library(stringr)
library(readr)
library(dplyr)
library(pracma)
library(compiler)    

# take a list of dataframes as a parameter
generalise_data <- function(dataframes, timeinterval){
  time.start <- Sys.time()
  if (typeof(dataframes) == "list"){
    # store the sizes of all the dataframes
    resources.largest_size <- 0
    resources.sizes <- numeric(length(dataframes))

    for (i in 1:length(dataframes)){
      resources.sizes[i] <- length(dataframes[[i]]$VALUE)
      if (resources.sizes[i] > resources.largest_size) resources.largest_size <- resources.sizes[i]
    }

    # generate a matrix that can hold all needed dataframe values
    resources <- matrix(nrow = resources.largest_size, ncol = length(dataframes)*2)
    for (i in 1:length(dataframes)){
      j <- i*2
      resources[1:resources.sizes[i],j-1] <- dataframes[[i]]$TIMESTAMP
      resources[1:resources.sizes[i],j] <- dataframes[[i]]$VALUE
    }

    # get the biggest and smallest datetime stamp from every dataframe
    # this will be used to calculate the size of the resulting frame ((largest time - smallest time)/1000 = dataframe rows)
    # this means one value every second
    largest_time <- 0
    smallest_time <- as.numeric(Sys.time())*1000 # everything will be smaller than the current time
    for (i in 1:length(dataframes)){
      dataframe_max <- max(dataframes[[i]]$TIMESTAMP)
      dataframe_min <- min(dataframes[[i]]$TIMESTAMP)

      if (dataframe_max > largest_time) largest_time <- dataframe_max
      if (dataframe_min < smallest_time) smallest_time <- dataframe_min
    }

    # result dataframe wil have ... rows
    result.size <- floor((largest_time - smallest_time)/timeinterval)
    sprintf("Result size: %i", result.size)

    # create a numeric array that contains the indexes of every dataframe, all set to 1
    dataframe_indexes <- numeric(length(dataframes))
    dataframe_indexes[dataframe_indexes == 0] <- 1

    # data matrix for the result
    result <- matrix(data = 0, nrow = result.size, ncol = length(dataframes)+1)

    # use progressbar
    pb <- txtProgressBar(1, result.size, style = 3)

    # make a for loop to run through every data row of the resulting data frame (creating a row every run through)
    # every run through increase the index of dataframes until the resulting row exceeds the result rows timestamp, than go one index back
    #for (i in 1:200){
    for (i in 1:result.size){
      current_timestamp <- smallest_time + timeinterval*(i-1)
      result[i,1] <- current_timestamp

      for (i2 in 1:length(dataframes)){
        j <- i2*2
        while (resources[dataframe_indexes[i2],j-1] < current_timestamp && resources[dataframe_indexes[i2],j-1] != resources.sizes[i2]){
          dataframe_indexes[i2] <- dataframe_indexes[i2]+1
        }

        # at the moment the last value of the array is never selected, needs to be fixed
        if (dataframe_indexes[i2] > 1){
          dataframe_indexes[i2] <- dataframe_indexes[i2]-1 # take the one that's smaller
        }

        result[i,i2+1] <- resources[dataframe_indexes[i2], j] #dataframes[[i2]]$VALUE[dataframe_indexes[i2]]
      }

      setTxtProgressBar(pb, i)
    }

    close(pb)

    result.final <- data.frame(result)

    time.end <- Sys.time()
    print(time.end-time.start)

    return(result.final)
  } else {
    return(NA)
  }
}