时间差不均匀的群体的滚动总和

时间:2017-01-18 12:51:47

标签: r date dplyr cumsum

这是我previously posted question的调整。这是我的数据:

<input id="pac-input" class="controls" type="text" placeholder="Search Box" name="pac-input">
<div id="map"></div>
<script>

  /**
   * Callback named in the Maps script URL (`callback=initAutocomplete`).
   * Creates the map inside the `#map` element, turns the `#pac-input` text
   * field into a places search box docked in the map's top-left corner, and
   * redraws one marker per place every time the user picks a prediction.
   */
  function initAutocomplete() {
    var mapElement = document.getElementById('map');
    var map = new google.maps.Map(mapElement, {
      center: {lat: -33.8688, lng: 151.2195},
      zoom: 13,
      mapTypeId: 'roadmap'
    });

    // Wire the text input up as a search box and hand it to the map controls.
    var input = document.getElementById('pac-input');
    var searchBox = new google.maps.places.SearchBox(input);
    map.controls[google.maps.ControlPosition.TOP_LEFT].push(input);

    // Keep the search results biased towards whatever the map currently shows.
    map.addListener('bounds_changed', function() {
      searchBox.setBounds(map.getBounds());
    });

    // Markers drawn for the most recent search.
    var markers = [];

    // Fired when the user selects a prediction; fetches the matching places.
    searchBox.addListener('places_changed', function() {
      var places = searchBox.getPlaces();
      if (places.length == 0) {
        return;
      }

      // Take the previous search's markers off the map before drawing new ones.
      for (var i = 0; i < markers.length; i++) {
        markers[i].setMap(null);
      }
      markers = [];

      // Accumulates the area that has to be visible to show every place.
      var bounds = new google.maps.LatLngBounds();

      places.forEach(function(place) {
        var geometry = place.geometry;
        if (!geometry) {
          console.log("Returned place contains no geometry");
          return;
        }

        // One marker per place, using the place's own icon scaled down.
        markers.push(new google.maps.Marker({
          map: map,
          icon: {
            url: place.icon,
            size: new google.maps.Size(71, 71),
            origin: new google.maps.Point(0, 0),
            anchor: new google.maps.Point(17, 34),
            scaledSize: new google.maps.Size(25, 25)
          },
          title: place.name,
          position: geometry.location
        }));

        // Places with a viewport contribute the whole viewport, others a point.
        if (geometry.viewport) {
          bounds.union(geometry.viewport);
        } else {
          bounds.extend(geometry.location);
        }
      });

      map.fitBounds(bounds);
    });
  }

</script>
<script src="https://maps.googleapis.com/maps/api/js?key=AIzaSyB9RRjfBs5H4kP7Pa-1SePt6FzrzmC6KX8&libraries=places&callback=initAutocomplete"
     async defer></script>

set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                 date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
                 value = round(rnorm(14, 15, 5), 1))

user_id       date value
     27 2016-01-01  15.0
     27 2016-01-03  22.4
     27 2016-01-05  13.3
     27 2016-01-07  21.9
     27 2016-01-10  20.6
     27 2016-01-14  18.6
     27 2016-01-16  16.4
     11 2016-01-01   6.8
     11 2016-01-03  21.3
     11 2016-01-05  19.8
     11 2016-01-07  22.0
     11 2016-01-10  19.4
     11 2016-01-14  17.5
     11 2016-01-16  19.3

这一次,我想计算每个user_id在指定时间段内(例如过去7天、14天)的value累积总和。理想的解决方案如下:

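user_id       date value v_minus7 v_minus14
     27 2016-01-01  15.0     15.0      15.0
     27 2016-01-03  22.4     37.4      37.4
     27 2016-01-05  13.3     50.7      50.7
     27 2016-01-07  21.9     72.6      72.6
     27 2016-01-10  20.6     78.2      93.2
     27 2016-01-14  18.6     61.1     111.8
     27 2016-01-16  16.4     55.6     113.2
     11 2016-01-01   6.8      6.8       6.8
     11 2016-01-03  21.3     28.1      28.1
     11 2016-01-05  19.8     47.9      47.9
     11 2016-01-07  22.0     69.9      69.9
     11 2016-01-10  19.4     82.5      89.3
     11 2016-01-14  17.5     58.9     106.8
     11 2016-01-16  19.3     56.2     119.3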

理想情况下,我想使用dplyr,但其他包也可以。

6 个答案:

答案 0 :(得分:6)

逻辑:首先按user_id分组,然后是date。现在,对于每个数据子集,我们使用返回逻辑向量的between()检查当前日期和7/14天之间的所有日期。

根据此逻辑向量,我对相应的value求和。

library(data.table)
# One group per (user_id, date). Inside each group, pick that user's rows from
# DF2 and total the `value` entries whose dates lie between `date - N` and
# `date`, bounds included (N = 7 and N = 14). The new columns are assigned
# with `:=`, and the trailing `[]` prints the updated table.
setDT(DF2)[, c("v_minus7", "v_minus14") := {
  same_user <- DF2$user_id == user_id
  user_dates <- DF2$date[same_user]
  user_values <- DF2$value[same_user]
  list(
    sum(user_values[between(user_dates, date - 7, date, incbounds = TRUE)]),
    sum(user_values[between(user_dates, date - 14, date, incbounds = TRUE)])
  )
}, by = c("user_id", "date")][]
 #   user_id       date value v_minus7 v_minus14
 #1:      27 2016-01-01  15.0     15.0      15.0
 #2:      27 2016-01-03  22.4     37.4      37.4
 #3:      27 2016-01-05  13.3     50.7      50.7
 #4:      27 2016-01-07  21.9     72.6      72.6
 #5:      27 2016-01-10  20.6     78.2      93.2
 #6:      27 2016-01-14  18.6     61.1     111.8
 #7:      27 2016-01-16  16.4     55.6     113.2
 #8:      11 2016-01-01   6.8      6.8       6.8
 #9:      11 2016-01-03  21.3     28.1      28.1
#10:      11 2016-01-05  19.8     47.9      47.9
#11:      11 2016-01-07  22.0     69.9      69.9
#12:      11 2016-01-10  19.4     82.5      89.3
#13:      11 2016-01-14  17.5     58.9     106.8
#14:      11 2016-01-16  19.3     56.2     119.3
# Rolling sum of `value` over a trailing window of `minus` time units
# (adapted from alexis_laz's answer).
#
# date:  observation times sorted in non-decreasing order (e.g. Dates);
#        findInterval() rejects unsorted input.
# value: numeric vector with one entry per element of `date`.
# minus: window length, expressed in the units of `date`.
#
# Returns a vector the same length as `value`: for each position, the sum of
# `value` over observations whose time lies in [date - minus, date], both
# ends included.
ff <- function(date, value, minus) {
  cs <- cumsum(value)
  # left.open = TRUE makes the intervals (date[j], date[j + 1]], so `i` counts
  # the observations strictly before the window start. An observation lying
  # exactly `minus` units back therefore stays inside the window.
  i <- findInterval(date - minus, date, left.open = TRUE)
  # Swap every non-zero count for the running total reached at that position;
  # zeros (nothing precedes the window) are left as zero.
  w <- which(i > 0L)
  i[w] <- cs[i[w]]
  cs - i
}
# Convert DF2 to a data.table, then attach both rolling sums with `:=`,
# calling ff() once per user for each window length.
setDT(DF2)
DF2[, c("v_minus7", "v_minus14") := list(ff(date, value, 7), ff(date, value, 14)),
    by = "user_id"]

答案 1 :(得分:4)

先补全缺失的日期,然后就可以使用zoo包中的rollapply:

library(dplyr)
library(zoo)

# Reproducible example data: two users observed on the same seven unevenly
# spaced dates.
set.seed(3737)
DF2 <- data.frame(
  user_id = rep(c(27, 11), each = 7),
  date = as.Date(rep(
    c("2016-01-01", "2016-01-03", "2016-01-05", "2016-01-07",
      "2016-01-10", "2016-01-14", "2016-01-16"),
    times = 2
  )),
  value = round(rnorm(14, mean = 15, sd = 5), digits = 1)
)

# Every user paired with every calendar day between the first and last date.
all_combinations <- expand.grid(
  user_id = unique(DF2$user_id),
  date = seq(from = min(DF2$date), to = max(DF2$date), by = "day")
)

# The outer merge pads each user to a daily grid (filler rows carry NA in
# `value`). On that grid a right-aligned window of 8 rows is the current day
# plus the 7 before it, and 15 rows is the current day plus the 14 before it.
# NAs are skipped when summing, and the filler rows are dropped at the end.
res <- DF2 %>%
  merge(all_combinations, by = c("user_id", "date"), all = TRUE) %>%
  group_by(user_id) %>%
  arrange(date) %>%
  mutate(
    v_minus7 = rollapplyr(
      value,
      width = 8,
      FUN = function(x) sum(x, na.rm = TRUE),
      partial = TRUE
    ),
    v_minus14 = rollapplyr(
      value,
      width = 15,
      FUN = function(x) sum(x, na.rm = TRUE),
      partial = TRUE
    )
  ) %>%
  filter(!is.na(value))

答案 2 :(得分:3)

以下是一些使用zoo包的方法。

1)定义一个函数sum_last:给定一个zoo对象,对时间落在该序列最后一天之前k天以内的值求和;再定义一个roll函数,把sum_last应用到整个序列上。然后使用ave对每个user_id应用roll,k = 7时一次,k = 14时一次。

请注意,这里用到了zoo最新版本中为rollapply引入的coredata参数,因此请确保您使用的不是早期版本。

library(zoo)

# Sum the values of series `z` whose times are later than
# (last time point - k). The comparison is strict, so an observation lying
# exactly k time units before the last one is not counted.
sum_last <- function(z, k) {
  times <- time(z)
  cutoff <- tail(times, 1) - k
  is_recent <- times > cutoff
  sum(z[is_recent])
}

# Read rows `ix` of DF2 (first column dropped) as a zoo series and run
# sum_last over right-aligned rolling windows of width k. coredata = FALSE
# passes each window to sum_last as a zoo object, so its times are available;
# the trailing `k = k` is forwarded to sum_last.
roll <- function(ix, k) {
  series <- read.zoo(DF2[ix, -1])
  rollapplyr(series, width = k, FUN = sum_last, coredata = FALSE, partial = TRUE, k = k)
}

# ave() splits the row indices by user_id and hands each user's indices to
# roll(), which returns that user's rolling sums in row order.
# seq_len() is used instead of 1:nr so that an empty DF2 produces an empty
# index vector rather than c(1, 0).
nr <- nrow(DF2)
transform(DF2,
  v_minus7 = ave(seq_len(nr), user_id, FUN = function(x) roll(x, 7)),
  v_minus14 = ave(seq_len(nr), user_id, FUN = function(x) roll(x, 14)))

2)另一种方法是将roll替换为下面显示的版本。它先把DF2[ix, -1]转换为"zoo"对象,再与一个零宽度、按天补全间隔的网格合并(缺失处填0)。然后对合并结果应用rollapplyr,最后用window把结果取回到原始的时间点上。

roll <- function(ix, k) {
  z <- read.zoo(DF2[ix, -1])
  g <- zoo(, seq(start(z), end(z), "day"))
  m <- merge(z, g, fill = 0)
  r <- rollapplyr(m, k, sum, partial = TRUE)
  window(r, time(z))
}

答案 3 :(得分:3)

这是findInterval的另一个想法,可以最大限度地减少比较和操作。首先定义一个函数来容纳忽略分组的基本部分。以下函数计算累积和,并从每个位置减去过去日期的累计和:

# Rolling sum of `value` over the trailing window [date - minus, date],
# ignoring any grouping. `date` must be sorted in non-decreasing order.
ff <- function(date, value, minus) {
  running_total <- cumsum(value)
  # With left.open = TRUE this is the number of observations strictly before
  # each window start (0 when nothing precedes the window).
  prior <- findInterval(date - minus, date, left.open = TRUE)
  # Replace each non-zero count by the running total reached at that position.
  has_prior <- prior > 0L
  prior[has_prior] <- running_total[prior[has_prior]]
  running_total - prior
}

按组应用:

# Split DF2 into one data frame per user, append the 7- and 14-unit rolling
# sums computed by ff() to each piece, then stack the pieces back together.
do.call(
  rbind,
  lapply(
    split(DF2, DF2$user_id),
    function(grp) {
      data.frame(
        grp,
        minus7 = ff(grp$date, grp$value, 7),
        minus14 = ff(grp$date, grp$value, 14)
      )
    }
  )
)
#      user_id       date value minus7 minus14
#11.8       11 2016-01-01   6.8    6.8     6.8
#11.9       11 2016-01-03  21.3   28.1    28.1
#11.10      11 2016-01-05  19.8   47.9    47.9
#11.11      11 2016-01-07  22.0   69.9    69.9
#11.12      11 2016-01-10  19.4   82.5    89.3
#11.13      11 2016-01-14  17.5   58.9   106.8
#11.14      11 2016-01-16  19.3   56.2   119.3
#27.1       27 2016-01-01  15.0   15.0    15.0
#27.2       27 2016-01-03  22.4   37.4    37.4
#27.3       27 2016-01-05  13.3   50.7    50.7
#27.4       27 2016-01-07  21.9   72.6    72.6
#27.5       27 2016-01-10  20.6   78.2    93.2
#27.6       27 2016-01-14  18.6   61.1   111.8
#27.7       27 2016-01-16  16.4   55.6   113.2

上述按组操作当然可以用任何您偏好的方法代替。

答案 4 :(得分:1)

这是一个使用dplyr和tbrf的新方案:

library(tbrf)
library(dplyr)
# Reproducible example data: two users observed on the same seven unevenly
# spaced dates.
set.seed(3737)
DF2 <- data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                  date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
                  value = round(rnorm(14, 15, 5), 1))

# Per-user time-based rolling sums over 7 and 14 days. tbr_sum() writes its
# result to a column named `sum`, which is renamed after each call.
DF2 %>%
  group_by(user_id) %>%
  tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
  arrange(user_id, date) %>%
  rename(v_minus7 = sum) %>%
  # Regroup before the second window: the grouping set above is not in effect
  # after the first tbr_sum(), and without this step the 14-day sums come out
  # pooled across users instead of per user.
  group_by(user_id) %>%
  tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
  rename(v_minus14 = sum)

生成如下tibble(注意:下表中同一日期的两个用户v_minus14完全相同,说明该列是跨用户合并计算的,而不是按user_id分别计算的):

# A tibble: 14 x 5
   user_id date       value v_minus7 v_minus14
     <dbl> <date>     <dbl>    <dbl>     <dbl>
 1      11 2016-01-01   6.8      6.8      21.8
 2      27 2016-01-01  15       15        21.8
 3      11 2016-01-03  21.3     28.1      65.5
 4      27 2016-01-03  22.4     37.4      65.5
 5      11 2016-01-05  19.8     47.9      98.6
 6      27 2016-01-05  13.3     50.7      98.6
 7      11 2016-01-07  22       69.9     142. 
 8      27 2016-01-07  21.9     72.6     142. 
 9      11 2016-01-10  19.4     82.5     182. 
10      27 2016-01-10  20.6     78.2     182. 
11      11 2016-01-14  17.5     58.9     219. 
12      27 2016-01-14  18.6     61.1     219. 
13      11 2016-01-16  19.3     56.2     232. 
14      27 2016-01-16  16.4     55.6     232. 

我怀疑这不是使用较大数据集的最快解决方案,但它在dplyr链中很好用。

答案 5 :(得分:1)

如果要计算时间/日期窗口,请尝试使用runner软件包。转到github documentation并检查Windows depending on date部分。

library(runner)
# Per-user running sums from runner::sum_run(), with window sizes 7 and 14
# passed as `k` and `date` supplied as the window index.
DF2 %>%
  group_by(user_id) %>%
  mutate(
    v_minus7 = sum_run(x = value, k = 7, idx = date),
    v_minus14 = sum_run(x = value, k = 14, idx = date)
  )

以下是基准测试:

library(data.table)
library(dplyr)
library(zoo)
library(tbrf)
# Benchmark input: two users (27 and 11), each observed on the same seven
# unevenly spaced dates, with seeded random values rounded to one decimal.
set.seed(3737)
DF2 <- data.frame(
  user_id = rep(c(27, 11), each = 7),
  date = as.Date(rep(
    c("2016-01-01", "2016-01-03", "2016-01-05", "2016-01-07",
      "2016-01-10", "2016-01-14", "2016-01-16"),
    times = 2
  )),
  value = round(rnorm(14, mean = 15, sd = 5), digits = 1)
)



# example 1
# Benchmark candidate: data.table solution. Adds v_minus7 / v_minus14 using
# one group per (user_id, date); each group sums that user's values whose
# dates lie between `date - N` and `date`, bounds included. Returns the
# updated table (the trailing `[]` makes it print when called at top level).
data_table <- function(DF2) {
  setDT(DF2)[, c("v_minus7", "v_minus14") := {
    same_user <- DF2$user_id == user_id
    user_dates <- DF2$date[same_user]
    user_values <- DF2$value[same_user]
    list(
      sum(user_values[data.table::between(user_dates, date - 7, date, incbounds = TRUE)]),
      sum(user_values[data.table::between(user_dates, date - 14, date, incbounds = TRUE)])
    )
  }, by = c("user_id", "date")][]
}


# example 2
# Benchmark candidate: dplyr + zoo solution. Pads every user to a daily grid,
# takes right-aligned rolling sums of 8 and 15 rows (the current day plus the
# 7 / 14 days before it), then drops the filler rows again.
dplyr_grid <- function(DF2) {
  day_range <- seq(min(DF2$date), max(DF2$date), by = "day")
  full_grid <- expand.grid(user_id = unique(DF2$user_id), date = day_range)

  # Filler rows introduced by the outer merge carry NA in `value`.
  padded <- merge(DF2, full_grid, by = c("user_id", "date"), all = TRUE)

  padded %>%
    group_by(user_id) %>%
    arrange(date) %>%
    mutate(
      v_minus7 = rollapplyr(
        value,
        width = 8,
        FUN = function(x) sum(x, na.rm = TRUE),
        partial = TRUE
      ),
      v_minus14 = rollapplyr(
        value,
        width = 15,
        FUN = function(x) sum(x, na.rm = TRUE),
        partial = TRUE
      )
    ) %>%
    filter(!is.na(value))
}

# example 3
# Benchmark candidate: tbrf solution. Computes per-user time-based rolling
# sums of `value` over 7 and 14 days. tbr_sum() writes its result to a column
# named `sum`, which is renamed after each call.
dplyr_tbrf <- function(DF2) {
  DF2 %>%
    group_by(user_id) %>%
    tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
    arrange(user_id, date) %>%
    rename(v_minus7 = sum) %>%
    # Regroup before the second window: the grouping set above is not in
    # effect after the first tbr_sum(), and without this step the 14-day sums
    # come out pooled across users instead of per user.
    group_by(user_id) %>%
    tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
    rename(v_minus14 = sum)
}

# example 4
# Benchmark candidate: runner solution. Adds per-user running sums from
# sum_run(), with window sizes 7 and 14 passed as `k` and `date` supplied as
# the window index.
runner <- function(DF2) {
  by_user <- group_by(DF2, user_id)
  mutate(
    by_user,
    v_minus7 = sum_run(x = value, k = 7, idx = date),
    v_minus14 = sum_run(x = value, k = 14, idx = date)
  )
}


# Time each candidate 100 times on the same input.
microbenchmark::microbenchmark(
  runner = runner(DF2),
  data.table = data_table(DF2),
  # The `dplyr` entry times the dplyr + zoo grid approach; it previously
  # called dplyr_tbrf(), so dplyr_grid() was never measured.
  dplyr = dplyr_grid(DF2),
  dplyr_tbrf = dplyr_tbrf(DF2),
  times = 100L
)

# Unit: milliseconds
#       expr       min        lq      mean    median        uq        max neval
#     runner  1.478331  1.797512  2.350416  2.083680  2.559875   9.181675   100
# data.table  5.432618  5.970619  7.107540  6.424862  7.563405  13.674661   100
#      dplyr 63.841710 73.652023 86.228112 79.861760 92.304231 256.841078   100
# dplyr_tbrf 60.582381 72.511075 90.175891 80.435700 92.865997 307.454643   100