这是我previously posted question的调整。这是我的数据:
<input id="pac-input" class="controls" type="text" placeholder="Search Box" name="pac-input">
<div id="map"></div>
<script>
function initAutocomplete() {
var map = new google.maps.Map(document.getElementById('map'), {
center: {lat: -33.8688, lng: 151.2195},
zoom: 13,
mapTypeId: 'roadmap'
});
// Create the search box and link it to the UI element.
var input = document.getElementById('pac-input');
var searchBox = new google.maps.places.SearchBox(input);
map.controls[google.maps.ControlPosition.TOP_LEFT].push(input);
// Bias the SearchBox results towards current map's viewport.
map.addListener('bounds_changed', function() {
searchBox.setBounds(map.getBounds());
});
var markers = [];
// Listen for the event fired when the user selects a prediction and retrieve
// more details for that place.
searchBox.addListener('places_changed', function() {
var places = searchBox.getPlaces();
if (places.length == 0) {
return;
}
// Clear out the old markers.
markers.forEach(function(marker) {
marker.setMap(null);
});
markers = [];
// For each place, get the icon, name and location.
var bounds = new google.maps.LatLngBounds();
places.forEach(function(place) {
if (!place.geometry) {
console.log("Returned place contains no geometry");
return;
}
var icon = {
url: place.icon,
size: new google.maps.Size(71, 71),
origin: new google.maps.Point(0, 0),
anchor: new google.maps.Point(17, 34),
scaledSize: new google.maps.Size(25, 25)
};
// Create a marker for each place.
markers.push(new google.maps.Marker({
map: map,
icon: icon,
title: place.name,
position: place.geometry.location
}));
if (place.geometry.viewport) {
// Only geocodes have viewport.
bounds.union(place.geometry.viewport);
} else {
bounds.extend(place.geometry.location);
}
});
map.fitBounds(bounds);
});
}
</script>
<script src="https://maps.googleapis.com/maps/api/js?key=AIzaSyB9RRjfBs5H4kP7Pa-1SePt6FzrzmC6KX8&libraries=places&callback=initAutocomplete"
async defer></script>
这一次,我想计算指定时间段内每个set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
value = round(rnorm(14, 15, 5), 1))
user_id date value
27 2016-01-01 15.0
27 2016-01-03 22.4
27 2016-01-05 13.3
27 2016-01-07 21.9
27 2016-01-10 20.6
27 2016-01-14 18.6
27 2016-01-16 16.4
11 2016-01-01 6.8
11 2016-01-03 21.3
11 2016-01-05 19.8
11 2016-01-07 22.0
11 2016-01-10 19.4
11 2016-01-14 17.5
11 2016-01-16 19.3
的{{1}}累积总和';例如过去7天,14天。理想的解决方案如下:
value
理想情况下,我想使用user_id
,但其他套餐也没问题。
答案 0 :(得分:6)
逻辑:首先按user_id
分组,然后是date
。现在,对于每个数据子集,我们使用返回逻辑向量的between()
检查当前日期和7/14天之间的所有日期。
根据此逻辑向量,我添加了value
列
library(data.table)
setDT(DF2)[, `:=`(v_minus7 = sum(DF2$value[DF2$user_id == user_id][between(DF2$date[DF2$user_id == user_id], date-7, date, incbounds = TRUE)]),
v_minus14 = sum(DF2$value[DF2$user_id == user_id][between(DF2$date[DF2$user_id == user_id], date-14, date, incbounds = TRUE)])),
by = c("user_id", "date")][]
# user_id date value v_minus7 v_minus14
#1: 27 2016-01-01 15.0 15.0 15.0
#2: 27 2016-01-03 22.4 37.4 37.4
#3: 27 2016-01-05 13.3 50.7 50.7
#4: 27 2016-01-07 21.9 72.6 72.6
#5: 27 2016-01-10 20.6 78.2 93.2
#6: 27 2016-01-14 18.6 61.1 111.8
#7: 27 2016-01-16 16.4 55.6 113.2
#8: 11 2016-01-01 6.8 6.8 6.8
#9: 11 2016-01-03 21.3 28.1 28.1
#10: 11 2016-01-05 19.8 47.9 47.9
#11: 11 2016-01-07 22.0 69.9 69.9
#12: 11 2016-01-10 19.4 82.5 89.3
#13: 11 2016-01-14 17.5 58.9 106.8
#14: 11 2016-01-16 19.3 56.2 119.3
# from alexis_laz answer.
ff = function(date, value, minus){
cs = cumsum(value)
i = findInterval(date - minus, date, rightmost.closed = TRUE)
w = which(as.logical(i))
i[w] = cs[i[w]]
cs - i
}
setDT(DF2)
DF2[, `:=`( v_minus7 = ff(date, value, 7),
v_minus14 = ff(date, value, 14)), by = c("user_id")]
答案 1 :(得分:4)
首先填写缺失日期后,您可以使用rollapply
中的zoo
:
library(dplyr)
library(zoo)
set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
value = round(rnorm(14, 15, 5), 1))
all_combinations <- expand.grid(user_id=unique(DF2$user_id),
date=seq(min(DF2$date), max(DF2$date), by="day"))
res <- DF2 %>%
merge(all_combinations, by=c('user_id','date'), all=TRUE) %>%
group_by(user_id) %>%
arrange(date) %>%
mutate(v_minus7=rollapply(value, width=8, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right'),
v_minus14=rollapply(value, width=15, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right')) %>%
filter(!is.na(value))
答案 2 :(得分:3)
以下是一些使用动物园的方法。
1)定义一个函数sum_last
,给定一个zoo对象获取时间在系列中最后一天k天内的值的总和,并定义{{1将函数应用于整个系列的函数。然后,对于k = 7,使用roll
将ave
应用于每个roll
,对于k = 14,使用user_id
一次。
请注意,这会使用最新版本的动物园中引入的coredata
rollapply
参数,因此请确保您没有早期版本。
library(zoo)
# compute sum of values within k time units of last time point
sum_last <- function(z, k) {
tt <- time(z)
sum(z[tt > tail(tt, 1) - k])
}
# given indexes ix run rollapplyr on read.zoo(DF2[ix, -1])
roll <- function(ix, k) {
rollapplyr(read.zoo(DF2[ix, -1]), k, sum_last, coredata = FALSE, partial = TRUE, k = k)
}
nr <- nrow(DF2)
transform(DF2,
v_minus7 = ave(1:nr, user_id, FUN = function(x) roll(x, 7)),
v_minus14 = ave(1:nr, user_id, FUN = function(x) roll(x, 14)))
2)另一种方法是将roll
替换为下面显示的版本。这会将DF2[ix, -1]
转换为"zoo"
并将其与带有填充间隙的零宽度网格合并。然后rollapply
应用于该window
,我们使用roll <- function(ix, k) {
z <- read.zoo(DF2[ix, -1])
g <- zoo(, seq(start(z), end(z), "day"))
m <- merge(z, g, fill = 0)
r <- rollapplyr(m, k, sum, partial = TRUE)
window(r, time(z))
}
将其子集化回原始时间。
batchtracing
答案 3 :(得分:3)
这是findInterval
的另一个想法,可以最大限度地减少比较和操作。首先定义一个函数来容纳忽略分组的基本部分。以下函数计算累积和,并从每个位置减去过去日期的累计和:
ff = function(date, value, minus)
{
cs = cumsum(value)
i = findInterval(date - minus, date, left.open = TRUE)
w = which(as.logical(i))
i[w] = cs[i[w]]
cs - i
}
按小组申请:
do.call(rbind,
lapply(split(DF2, DF2$user_id),
function(x) data.frame(x,
minus7 = ff(x$date, x$value, 7),
minus14 = ff(x$date, x$value, 14))))
# user_id date value minus7 minus14
#11.8 11 2016-01-01 6.8 6.8 6.8
#11.9 11 2016-01-03 21.3 28.1 28.1
#11.10 11 2016-01-05 19.8 47.9 47.9
#11.11 11 2016-01-07 22.0 69.9 69.9
#11.12 11 2016-01-10 19.4 82.5 89.3
#11.13 11 2016-01-14 17.5 58.9 106.8
#11.14 11 2016-01-16 19.3 56.2 119.3
#27.1 27 2016-01-01 15.0 15.0 15.0
#27.2 27 2016-01-03 22.4 37.4 37.4
#27.3 27 2016-01-05 13.3 50.7 50.7
#27.4 27 2016-01-07 21.9 72.6 72.6
#27.5 27 2016-01-10 20.6 78.2 93.2
#27.6 27 2016-01-14 18.6 61.1 111.8
#27.7 27 2016-01-16 16.4 55.6 113.2
上述逐组操作当然可以用任何优先方法代替。
答案 4 :(得分:1)
这是使用dplyr
和tbrf
的新选项
library(tbrf)
library(dplyr)
set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
value = round(rnorm(14, 15, 5), 1))
DF2 %>%
group_by(user_id) %>%
tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
arrange(user_id, date) %>%
rename(v_minus7 = sum) %>%
tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
rename(v_minus14 = sum)
创建小标题:
# A tibble: 14 x 5
user_id date value v_minus7 v_minus14
<dbl> <date> <dbl> <dbl> <dbl>
1 11 2016-01-01 6.8 6.8 21.8
2 27 2016-01-01 15 15 21.8
3 11 2016-01-03 21.3 28.1 65.5
4 27 2016-01-03 22.4 37.4 65.5
5 11 2016-01-05 19.8 47.9 98.6
6 27 2016-01-05 13.3 50.7 98.6
7 11 2016-01-07 22 69.9 142.
8 27 2016-01-07 21.9 72.6 142.
9 11 2016-01-10 19.4 82.5 182.
10 27 2016-01-10 20.6 78.2 182.
11 11 2016-01-14 17.5 58.9 219.
12 27 2016-01-14 18.6 61.1 219.
13 11 2016-01-16 19.3 56.2 232.
14 27 2016-01-16 16.4 55.6 232.
我怀疑这不是使用较大数据集的最快解决方案,但它在dplyr
链中很好用。
答案 5 :(得分:1)
如果要计算时间/日期窗口,请尝试使用runner软件包。转到github documentation并检查Windows depending on date
部分。
library(runner)
DF2 %>%
group_by(user_id) %>%
mutate(
v_minus7 = sum_run(value, 7, idx = date),
v_minus14 = sum_run(value, 14, idx = date)
)
这里的基准
library(data.table)
library(dplyr)
library(zoo)
library(tbrf)
set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
value = round(rnorm(14, 15, 5), 1))
# example 1
data_table <- function(DF2) {
setDT(DF2)[, `:=`(v_minus7 = sum(DF2$value[DF2$user_id == user_id][data.table::between(DF2$date[DF2$user_id == user_id], date-7, date, incbounds = TRUE)]),
v_minus14 = sum(DF2$value[DF2$user_id == user_id][data.table::between(DF2$date[DF2$user_id == user_id], date-14, date, incbounds = TRUE)])),
by = c("user_id", "date")][]
}
# example 2
dplyr_grid <- function(DF2) {
all_combinations <- expand.grid(user_id=unique(DF2$user_id),
date=seq(min(DF2$date), max(DF2$date), by="day"))
DF2 %>%
merge(all_combinations, by=c('user_id','date'), all=TRUE) %>%
group_by(user_id) %>%
arrange(date) %>%
mutate(v_minus7=rollapply(value, width=8, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right'),
v_minus14=rollapply(value, width=15, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right')) %>%
filter(!is.na(value))
}
# example 3
dplyr_tbrf <- function(DF2) {
DF2 %>%
group_by(user_id) %>%
tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
arrange(user_id, date) %>%
rename(v_minus7 = sum) %>%
tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
rename(v_minus14 = sum)
}
# example 4
runner <- function(DF2) {
DF2 %>%
group_by(user_id) %>%
mutate(
v_minus7 = sum_run(value, 7, idx = date),
v_minus14 = sum_run(value, 14, idx = date)
)
}
microbenchmark::microbenchmark(
runner = runner(DF2),
data.table = data_table(DF2),
dplyr = dplyr_tbrf(DF2),
dplyr_tbrf = dplyr_tbrf(DF2),
times = 100L
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# runner 1.478331 1.797512 2.350416 2.083680 2.559875 9.181675 100
# data.table 5.432618 5.970619 7.107540 6.424862 7.563405 13.674661 100
# dplyr 63.841710 73.652023 86.228112 79.861760 92.304231 256.841078 100
# dplyr_tbrf 60.582381 72.511075 90.175891 80.435700 92.865997 307.454643 100