我想按日期计算若干变量的总和与平均值。我有几辆车的加油数据和行驶数据。加油数据只包含少数几个日期,而行驶数据包含更多日期(这很合理:通常要行驶若干次之后才加一次油)。我的目标是以加油日期为界,得到各段行驶数据的总和/平均值。
燃料数据:
# Fuel log: one row per refuel, identified by licence plate and date.
plate <- rep(c("AB123", "AC234", "AD345"), times = c(4, 4, 2))
date <- c(
  "2017-09-08", "2017-09-11", "2017-09-13", "2017-09-20",  # AB123
  "2017-09-06", "2017-09-08", "2017-09-15", "2017-09-23",  # AC234
  "2017-09-10", "2017-09-18"                               # AD345
)
liter <- c(33, 15, 28, 40, 43, 20, 25, 50, 26, 48)
df1 <- data.frame(plate = plate, date = date, liter = liter)
驾驶数据:
# Drive log: one row per drive, with the distance driven and the accx count.
plate <- rep(c("AB123", "AC234", "AD345"), times = c(12, 12, 11))
date <- c(
  # AB123
  # NOTE(review): "2017-01-05" is out of sequence and may be a typo for
  # "2017-09-05" -- confirm against the raw data.
  "2017-09-01", "2017-01-05", "2017-09-08", "2017-09-10", "2017-09-11",
  "2017-09-12", "2017-09-13", "2017-09-16", "2017-09-17", "2017-09-20",
  "2017-09-22", "2017-09-25",
  # AC234
  "2017-09-02", "2017-09-03", "2017-09-06", "2017-09-07", "2017-09-08",
  "2017-09-09", "2017-09-13", "2017-09-15", "2017-09-17", "2017-09-20",
  "2017-09-23", "2017-09-25",
  # AD345
  "2017-09-01", "2017-09-04", "2017-09-09", "2017-09-12", "2017-09-15",
  "2017-09-18", "2017-09-19", "2017-09-20", "2017-09-23", "2017-09-27",
  "2017-09-30"
)
mileage <- c(
  50, 64, 45, 70, 58, 41, 22, 15, 90, 48, 52, 48,  # AB123
  29, 65, 70, 46, 88, 71, 40, 51, 38, 91, 74, 61,  # AC234
  41, 33, 59, 81, 72, 65, 43, 81, 20, 49, 39       # AD345
)
accx <- c(
  0, 3, 4, 0, 8, 11, 2, 5, 9, 10, 2, 22,  # AB123
  9, 6, 7, 6, 8, 1, 0, 1, 8, 1, 7, 6,     # AC234
  4, 3, 9, 11, 22, 15, 13, 1, 2, 4, 9     # AD345
)
df2 <- data.frame(plate = plate, date = date, mileage = mileage, accx = accx)
合并两个数据
# Keep every drive (df2) and attach the refuel volume (df1) where both plate
# and date match; drives without a refuel get NA in `liter`.
# The join keys go in `by`: `by.x` belongs to base merge(), not to left_join().
df.all <- dplyr::left_join(df2, df1, by = c("plate", "date"))
我想根据燃油日期获得总里程(总和)和平均accx。最终结果应如下所示:
有没有办法用 dplyr 的 mutate 得到所需的结果?另外,我只需要各车牌在加油日期那几行上的计算结果(结果中的第1、2、4、6、8等行不需要)。提前谢谢!
答案 0 :(得分:2)
可能有更优雅的方式,但这有效:
library(dplyr)

# For every refuel row, report the mileage summed and the accx averaged over
# the drives since the previous refuel of the same plate.
df.all %>%
  mutate(date = as.Date(date)) %>%
  arrange(plate, date) %>%
  group_by(plate) %>%
  # Running totals and a running drive counter within each plate.
  mutate(n = row_number(),
         run.mileage = cumsum(mileage),
         run.accx = cumsum(accx)) %>%
  # Keep only the drives on which the car was refuelled.
  filter(!is.na(liter)) %>%
  # Differences between consecutive refuel rows give the per-interval figures;
  # the first refuel of a plate is measured from zero.
  mutate(t.mileage = run.mileage - lag(run.mileage, default = 0),
         a.accx = (run.accx - lag(run.accx, default = 0)) /
           (n - lag(n, default = 0L))) %>%
  select(-n, -run.mileage, -run.accx)
# A tibble: 9 x 7
# Groups: plate [3]
# plate date mileage accx liter t.mileage a.accx
# <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 AB123 2017-09-08 45 4 33 159 2.3333333
# 2 AB123 2017-09-11 58 8 15 128 4.0000000
# 3 AB123 2017-09-13 22 2 28 63 6.5000000
# 4 AB123 2017-09-20 48 10 40 153 8.0000000
# 5 AC234 2017-09-06 70 7 43 164 7.3333333
# 6 AC234 2017-09-08 88 8 20 134 7.0000000
# 7 AC234 2017-09-15 51 1 25 162 0.6666667
# 8 AC234 2017-09-23 74 7 50 203 5.3333333
# 9 AD345 2017-09-18 65 15 48 351 10.6666667
先按车牌(plate)分组,使每辆车单独计算。然后用 cumsum 计算累计里程和累计 accx,但只在 liter 不缺失(即加油)的行上保留这些值;同时用 n 记录每辆车到该行为止的行驶次数。由于我们只关心加油那几行的信息,接着过滤掉 liter 缺失的行。最后用 lag 从每个累计值中减去上一次加油时的累计里程和累计 accx(如果没有上一次的值,即 lag 的结果为 NA,则直接使用累计值),并据此计算平均 accx。
# Fuel log: one row per refuel (plate, date, litres filled).
# Strings are kept as character so the two tables can be joined on plate/date.
df1 <-
  data.frame(
    plate = c("AB123", "AB123", "AB123", "AB123", "AC234", "AC234", "AC234",
              "AC234", "AD345", "AD345"),
    date = c("2017-09-08", "2017-09-11", "2017-09-13", "2017-09-20",
             "2017-09-06", "2017-09-08", "2017-09-15", "2017-09-23",
             "2017-09-10", "2017-09-18"),
    liter = c(33, 15, 28, 40, 43, 20, 25, 50, 26, 48),
    stringsAsFactors = FALSE
  )

# Drive log: one row per drive (plate, date, distance driven, accx count).
# NOTE(review): the second AB123 date, "2017-01-05", is out of sequence and may
# be a typo for "2017-09-05" -- confirm against the raw data.
df2 <-
  data.frame(
    plate = c("AB123", "AB123", "AB123", "AB123", "AB123", "AB123", "AB123",
              "AB123", "AB123", "AB123", "AB123", "AB123",
              "AC234", "AC234", "AC234", "AC234", "AC234", "AC234", "AC234",
              "AC234", "AC234", "AC234", "AC234", "AC234",
              "AD345", "AD345", "AD345", "AD345", "AD345", "AD345", "AD345",
              "AD345", "AD345", "AD345", "AD345"),
    date = c("2017-09-01", "2017-01-05", "2017-09-08", "2017-09-10",
             "2017-09-11", "2017-09-12", "2017-09-13", "2017-09-16",
             "2017-09-17", "2017-09-20", "2017-09-22", "2017-09-25",
             "2017-09-02", "2017-09-03", "2017-09-06", "2017-09-07",
             "2017-09-08", "2017-09-09", "2017-09-13", "2017-09-15",
             "2017-09-17", "2017-09-20", "2017-09-23", "2017-09-25",
             "2017-09-01", "2017-09-04", "2017-09-09", "2017-09-12",
             "2017-09-15", "2017-09-18", "2017-09-19", "2017-09-20",
             "2017-09-23", "2017-09-27", "2017-09-30"),
    mileage = c(50, 64, 45, 70, 58, 41, 22, 15, 90, 48, 52, 48,
                29, 65, 70, 46, 88, 71, 40, 51, 38, 91, 74, 61,
                41, 33, 59, 81, 72, 65, 43, 81, 20, 49, 39),
    accx = c(0, 3, 4, 0, 8, 11, 2, 5, 9, 10, 2, 22,
             9, 6, 7, 6, 8, 1, 0, 1, 8, 1, 7, 6,
             4, 3, 9, 11, 22, 15, 13, 1, 2, 4, 9),
    stringsAsFactors = FALSE
  )
# Every drive from df2 is kept; `liter` is filled in from df1 where plate and
# date both match and is NA on drives without a refuel.
df.all <- df2 %>%
  left_join(df1, by = c("plate", "date"))
答案 1 :(得分:1)
虽然 @kath 提供了更方便的解决方案,但这里给出一个 base R 的写法(哪怕只是因为我已经花了一些时间研究它):
# Base R version: for every refuel row, total mileage and mean accx over the
# drives since the previous refuel of the same plate.
# Assumes the rows of each plate in df.all are already in driving order.

# Rows on which the car was refuelled (liter is NA on all other rows).
refuel <- !is.na(df.all$liter)

# Interval id within each plate = number of refuels strictly before the row,
# so every interval ends on its refuel row. Counting per plate keeps the first
# interval of a plate from absorbing the trailing drives of the previous plate.
interval <- ave(as.integer(refuel), df.all$plate,
                FUN = function(x) cumsum(c(0L, x[-length(x)])))
segment <- interaction(df.all$plate, interval, drop = TRUE)

# Per-interval total mileage and mean accx, broadcast to every row of the
# interval.
seg.mileage <- ave(df.all$mileage, segment, FUN = sum)
seg.accx <- ave(df.all$accx, segment, FUN = mean)

# Report the figures on the refuel rows only; all other rows stay NA.
df.all$t.mileage <- ifelse(refuel, seg.mileage, NA)
df.all$a.accx <- ifelse(refuel, seg.accx, NA)

df.all
# plate date mileage accx liter t.mileage a.accx
# 1 AB123 2017-09-01 50 0 NA NA NA
# 2 AB123 2017-01-05 64 3 NA NA NA
# 3 AB123 2017-09-08 45 4 33 159 2.3333333
# 4 AB123 2017-09-10 70 0 NA NA NA
# 5 AB123 2017-09-11 58 8 15 128 4.0000000
# 6 AB123 2017-09-12 41 11 NA NA NA
# 7 AB123 2017-09-13 22 2 28 63 6.5000000
# 8 AB123 2017-09-16 15 5 NA NA NA
# 9 AB123 2017-09-17 90 9 NA NA NA
# 10 AB123 2017-09-20 48 10 40 153 8.0000000
# 11 AB123 2017-09-22 52 2 NA NA NA
# 12 AB123 2017-09-25 48 22 NA NA NA
# 13 AC234 2017-09-02 29 9 NA NA NA
# 14 AC234 2017-09-03 65 6 NA NA NA
# 15 AC234 2017-09-06 70 7 43 164 7.3333333
# 16 AC234 2017-09-07 46 6 NA NA NA
# 17 AC234 2017-09-08 88 8 20 134 7.0000000
# 18 AC234 2017-09-09 71 1 NA NA NA
# 19 AC234 2017-09-13 40 0 NA NA NA
# 20 AC234 2017-09-15 51 1 25 162 0.6666667
# 21 AC234 2017-09-17 38 8 NA NA NA
# 22 AC234 2017-09-20 91 1 NA NA NA
# 23 AC234 2017-09-23 74 7 50 203 5.3333333
# 24 AC234 2017-09-25 61 6 NA NA NA
# 25 AD345 2017-09-01 41 4 NA NA NA
# 26 AD345 2017-09-04 33 3 NA NA NA
# 27 AD345 2017-09-09 59 9 NA NA NA
# 28 AD345 2017-09-12 81 11 NA NA NA
# 29 AD345 2017-09-15 72 22 NA NA NA
# 30 AD345 2017-09-18 65 15 48 351 10.6666667
# 31 AD345 2017-09-19 43 13 NA NA NA
# 32 AD345 2017-09-20 81 1 NA NA NA
# 33 AD345 2017-09-23 20 2 NA NA NA
# 34 AD345 2017-09-27 49 4 NA NA NA
# 35 AD345 2017-09-30 39 9 NA NA NA
顺便说一下,必须有一种更简单的方法来生成上面第1步的因子,有谁知道怎么做?