Question

我有一个包含100万条以上记录的JSON文件。我编写了一个自定义函数，用于筛选出C = 1的行，并计算一个简单的差值（A = 1时的B减去A = 0时的B）。

JSON文件具有以下结构。读入为R的data.frame后，“Group”列是一个由data.frame组成的列表。

“JSON_data.txt”：

{"ID":1,"Group":[{"A":0,"B":"10.62","C":1},{"A":0,"B":"10.61","C":2},{"A":1,"B":"11.37","C":1},{"A":1,"B":"11.4","C":2}]}
{"ID":2,"Group":[{"A":0,"B":"10.65","C":1},{"A":0,"B":"10.63","C":2},{"A":1,"B":"11.31","C":1},{"A":1,"B":"11.50","C":2}]}

使用jsonlite读入后，为了便于说明，我通过重复行生成了2000条记录

library(jsonlite)
library(tidyverse)

## Stream the JSON file (one record per line) into a data.frame
json_data <- stream_in(file("JSON_data.txt"), verbose = TRUE)

# Simulate 2000 records: repeat every original row 1000 times
row_index <- rep(seq_len(nrow(json_data)), each = 1000)
multiplied <- json_data[row_index, ]

# Difference in B between the A == 1 row and the A == 0 row of one record,
# looking only at rows where C == 1.
#
# x: one element of the Group list column (anything as.data.frame() accepts)
#    with columns A, B (numbers stored as character) and C.
# Returns NA when fewer than two rows have C == 1; otherwise the result of
# subtracting two one-column data.frame slices (a data.frame, not a scalar).
get_difference <- function(x) {
  # Restrict to C == 1 and convert B to numeric, step by step
  c1_rows <- filter(as.data.frame(x), C == 1)
  c1_rows <- mutate(c1_rows, B = as.numeric(B))
  c1_rows <- select(c1_rows, A, B, C)

  # Fewer than two rows: there is no A == 0 / A == 1 pair to compare
  if (nrow(c1_rows) < 2) {
    return(NA)
  }

  # Column 2 is B (order fixed by the select() above)
  filter(c1_rows, A == 1)[2] - filter(c1_rows, A == 0)[2]
}


# Run get_difference on each record's nested data.frame (second column)
group_frames <- multiplied[, 2]
multiplied$difference <- sapply(group_frames, get_difference)

对2000条记录使用sapply大约需要一分钟！问题：如何在大型数据框上加快计算速度？

Answer 1

不确定您的预期输出是什么，但是data.table可能是加快处理速度的一个不错的起点。

library(jsonlite)
library(tidyverse)
library(data.table)

## Read JSON file / stream in (one record per line)
json_data <- stream_in(file("JSON_data.txt"), verbose = TRUE)

# simulate 2000 records by repeating each original row 1000 times
multiplied <- json_data[rep(seq_len(nrow(json_data)), each = 1000), ]

# Stack all per-record data.frames into one table. `record` is the position
# of the source data.frame in the list, i.e. the row of `multiplied`;
# unname() makes rbindlist number the items instead of using list names.
dt <- data.table::rbindlist(unname(multiplied$Group), idcol = "record")
dt <- dt[C == 1]
dt[, B := as.numeric(B)]

# Look up, per record, the B value of its first A == 0 and first A == 1 row.
# match() yields NA for a record that has no such row, so an incomplete
# record gets NA instead of pushing every later result out of alignment.
record_ids <- seq_len(nrow(multiplied))
rows_a0 <- dt[A == 0]
rows_a1 <- dt[A == 1]

# Same orientation as the question's function: B at A == 1 minus B at A == 0
multiplied$difference <-
  rows_a1$B[match(record_ids, rows_a1$record)] -
  rows_a0$B[match(record_ids, rows_a0$record)]

Answer 2

下面这个版本在我这里几乎瞬间就能运行完：

# Difference in B between the A == 1 and A == 0 rows of one record,
# restricted to rows with C == 1. Uses base R only.
#
# x: a data.frame (or list) with columns A, B and C; B may hold numbers
#    stored as character strings.
# Returns B[A == 1] - B[A == 0] (one value per pair, paired in row order),
#    or NA when there is no C == 1 pair or the two sides differ in length.
get_difference <- function(x) {
  is_c1 <- x$C == 1
  idx0 <- which(is_c1 & x$A == 0)
  idx1 <- which(is_c1 & x$A == 1)

  # No matching rows at all, or unpaired rows: nothing meaningful to subtract
  if (length(idx0) == 0L || length(idx0) != length(idx1)) {
    return(NA)
  }

  # A == 1 value minus A == 0 value, the same orientation as the dplyr
  # version in the question
  as.numeric(x$B[idx1]) - as.numeric(x$B[idx0])
}

在本身是data.frame列表的data.frame列上高效地应用自定义函数

2 个答案: