如何(以矢量化方式)从包含数字数组的数据框单元格中提取单个数值

时间:2015-06-22 15:36:27

标签: r dataframe vectorization

我有一个数据框,其中包含类似下面speed_max这样的列(每个单元格是一个数字数组):

  lengthArray                    speed_max
1           4               24, 18, 24, 18
2          10 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
3           4       -999, -999, -999, -999
4           2                   -999, -999
5           2                       18, 18
6           1                         -999

对于这类列,我编写了一个函数,用于从数据框每一行的数组中提取平均值、中位数、最大值或最小值,但我觉得应该有更快的做法。这是我目前的代码:

# Question's original attempt: meant to produce one summary value per row of
# the list column `name` of the global data frame `mydata`.
#   name      - column name, looked up with mydata[[name]]
#   to_return - numeric code selecting the summary (only the 0 and 1 branches
#               are shown in the post)
# NOTE(review): `alt_max` is never defined inside this function, and `vec_list`
# is assigned but never used -- presumably `alt_max` was meant to be `vec_list`.
get_scalar <- function(name, to_return = 1)
{

  vec_list = mydata[[name]]
  alt_vector = vector(mode = "numeric", length = length(alt_max))
  i = 1
  # Depending on what the user wants, return max, min, mean or median
  # for each array, one value per row.
  if(to_return == 0){
    for(entry in alt_max){
      # NOTE(review): which() yields positions, so this stores the largest
      # index whose value is not -999, not the largest value itself.
      alt_vector[i] = max(which(alt_max[i][[1]] != -999))
      i = i + 1
    }
  }else if (to_return==1){
    for(entry in alt_max){
      # NOTE(review): likewise, this stores the smallest matching index.
      alt_vector[i] = min(which(alt_max[i][[1]] != -999))
      i = i + 1
    }
  }
  ...
  # (the `...` lines are elisions in the post) the same pattern is repeated
  # for two other cases
  ...
  # Finally coerce the results to a numeric vector; the value of this
  # assignment is what the function returns (invisibly).
  alt_vector = as.numeric(alt_vector)   

}

此函数的期望输出是一个数字向量,其中每个元素对应数据框中每一行数组的所需度量。例如,如果我运行get_scalar("speed_max", to_return = 0),按照上面粘贴的数据,我期望返回以(24, 2, NA, ...)开头的数字向量:第一行“speed_max”数组的最大值为24,第二行“speed_max”数组的最大值为2,而第三行不包含任何有效数据(-999表示缺失)。

我找不到用sapply访问每个单元格中列表的第一个成员的方法。例如,以下写法会报错:

> gg = max(mydata[[speed_max]][[1]])
Error in (function(x, i, exact) if (is.matrix(i)) as.matrix(x)[[i]] else .subset2(x,  : 
  object 'speed_max' not found

如果我尝试像这样重写,我似乎无法访问每行的单个数组。例如,此函数只打印出许多0:

# Question's second attempt, using sapply instead of an explicit loop.
#   name      - column name, looked up with mydata[[name]]
#   to_return - numeric code; only the value 1 is handled here
# NOTE(review): `alt_max` is not defined inside this function, and `vec_list`
# is assigned but never used -- presumably `alt_max` was meant to be `vec_list`.
get_scalar_sapply <- function(name, to_return = 1)
{
  vec_list = mydata[[name]]
  # vector(mode = "numeric", ...) creates a vector filled with zeros.
  alt_vector = vector(mode = "numeric", length = length(alt_max))
  if(to_return == 1){
      #alt_vector =sapply(alt_vector, function(x)  max(which(x[[1]] != -999)))
    # NOTE(review): sapply iterates over `alt_vector` (the zero-filled vector
    # created above), not over the list column, so each x is 0 -- consistent
    # with the zeros the asker reports being printed.
    alt_vector = sapply(alt_vector, function(x)  print(x[[1]]))
  }
  # The value of this assignment is what the function returns (invisibly).
  alt_vector = as.numeric(alt_vector)   

}

附录:应要求提供的dput(mydata)输出:
> dput(head(mydata))
structure(list(endo = c(20216392L, 20167990L, 20211929L, 20214641L, 
20206551L, 20178293L), lengthArray = c(4L, 10L, 4L, 2L, 2L, 1L
), sport = list(c(24, 18, 24, 18), c("2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2"), c("-999", "-999", "-999", "-999"), 
    c("-999", "-999"), c("18", "18"), "-999"), local_start_time = list(
    c(NA_real_, NA_real_, NA_real_, NA_real_), c("u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'", "u'2015-05-03T17:14:13.000Z'", 
    "u'2015-05-03T17:13:22.000Z'"), c("u'2015-02-25T10:02:10.000Z'", 
    "u'2015-02-02T22:37:34.000Z'", "u'2015-02-25T10:02:10.000Z'", 
    "u'2015-02-02T22:37:34.000Z'"), c("u'2015-02-02T18:28:23.000Z'", 
    "u'2015-02-02T18:28:23.000Z'"), c("u'2015-02-02T10:42:27.000Z'", 
    "u'2015-02-02T10:42:27.000Z'"), "u'2015-01-31T10:35:54.000Z'"), 
    distance = list(c(-999, 1.32598698139191, -999, 1.32598698139191
    ), c("-999", "-999", "-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999"), c("15.499165534973145", "-999", 
    "15.499165534973145", "-999"), c("6.071850776672363", "6.071850776672363"
    ), c("-999", "-999"), "-999"), duration = list(c(4, 1103, 
    4, 1103), c("8.0", "15.0", "8.0", "15.0", "8.0", "15.0", 
    "8.0", "15.0", "8.0", "15.0"), c("19492.0", "56.0", "19492.0", 
    "56.0"), c("1936.0", "1936.0"), c("3.0", "3.0"), "4083.49"), 
    speed_avg = list(c(-999, 4.32779069175962, -999, 4.32779069175962
    ), c("-999", "-999", "-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999"), c("2.862558789549729", "-999", "2.862558789549729", 
    "-999"), c("11.290631609514724", "11.290631609514724"), c("-999", 
    "-999"), "-999"), altitude_max = list(c(-999, 366, -999, 
    366), c("-999", "-999", "-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999"), c("335.5", "-999", "335.5", "-999"
    ), c("520.0", "520.0"), c("624.0", "624.0"), "-999"), altitude_min = list(
        c(-999, 223, -999, 223), c("-999", "-999", "-999", "-999", 
        "-999", "-999", "-999", "-999", "-999", "-999"), c("-156.0", 
        "-999", "-156.0", "-999"), c("453.0", "453.0"), c("624.0", 
        "624.0"), "-999"), speed_max = list(c(-999, 5.01253, 
    -999, 5.01253), c("-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999", "-999", "-999"), c("66.8202", "-999", 
    "66.8202", "-999"), c("19.8268", "19.8268"), c("-999", "-999"
    ), "-999"), ascent = list(c(-999, 140, -999, 140), c("-999", 
    "-999", "-999", "-999", "-999", "-999", "-999", "-999", "-999", 
    "-999"), c("-999", "-999", "-999", "-999"), c("173.0", "173.0"
    ), c("-999", "-999"), "-999"), descent = list(c(-999, 272, 
    -999, 272), c("-999", "-999", "-999", "-999", "-999", "-999", 
    "-999", "-999", "-999", "-999"), c("-999", "-999", "-999", 
    "-999"), c("174.0", "174.0"), c("-999", "-999"), "-999"), 
    title = list(c(-999, -999, -999, -999), c("-999", "-999", 
    "-999", "-999", "-999", "-999", "-999", "-999", "-999", "-999"
    ), c("-999", "-999", "-999", "-999"), c("-999", "-999"), 
        c("-999", "-999"), "-999"), num_runs = c(0L, 0L, 0L, 
    0L, 0L, 0L), percent_runs = c(0, 0, 0, 0, 0, 0)), .Names = c("endo", 
"lengthArray", "sport", "local_start_time", "distance", "duration", 
"speed_avg", "altitude_max", "altitude_min", "speed_max", "ascent", 
"descent", "title", "num_runs", "percent_runs"), row.names = c(NA, 
6L), class = "data.frame")

1 个答案:

答案 0 :(得分:1)

看起来您正在尝试从列表中的每个条目中获取摘要函数,忽略设置为-999的元素。您可以使用以下内容执行此操作:

#' Summarise each element of a list column, ignoring -999 sentinels
#'
#' Applies `FUN` to the valid values of every row of column `name` in the
#' global data frame `mydata`. Values are converted to numeric *before* the
#' sentinel comparison, so character entries such as "-999" or "-999.0" are
#' recognised as missing. `NA` entries are dropped as well, so a row holding
#' only `NA`/-999 yields `NA` instead of raising an error in `if`.
#'
#' @param name Column name (string) of a column in `mydata`.
#' @param FUN Summary function applied to the numeric vector of valid values
#'   (e.g. `max`, `min`, `mean`, `median`, `length`).
#' @return One result per row; `NA` for rows with no valid values.
get_scalar <- function(name, FUN = max) {
  sapply(mydata[[name]], function(x) {
    # Convert first: comparing a character vector with -999 would be a string
    # comparison and would miss spellings like "-999.0".
    values <- as.numeric(x)
    values <- values[!is.na(values) & values != -999]
    if (length(values) == 0) NA else FUN(values)
  })
}

请注意,我对您的函数稍作修改:传入一个实际的函数应用于每个列表元素,而不是传入代表某个函数的数字代码。这使您的函数更加灵活,因为它现在可以轻松应用任意的汇总函数。

让我们看看提供的mydata列表中的示例:

# Look at the list:
mydata$speed_max
# [[1]]
# [1] -999.00000    5.01253 -999.00000    5.01253
# 
# [[2]]
#  [1] "-999" "-999" "-999" "-999" "-999" "-999" "-999" "-999" "-999" "-999"
# 
# [[3]]
# [1] "66.8202" "-999"    "66.8202" "-999"   
# 
# [[4]]
# [1] "19.8268" "19.8268"
# 
# [[5]]
# [1] "-999" "-999"
# 
# [[6]]
# [1] "-999"

# Minimum valid (non -999) value in each row; NA where a row has none
get_scalar("speed_max", min)
# [1]  5.01253       NA 66.82020 19.82680       NA       NA

# Count of valid (non -999) values in each row (NA if none)
get_scalar("speed_max", length)
# [1]  2 NA  2  2 NA NA