这是我的数据框:
# dput() dump of the example data: 28 rows (4 branches x 7 months,
# January-July) with columns Branch, period and value.
# Note the stored attributes: the tibble is already grouped by Branch AND
# period (`vars`), so every group holds exactly one row (`group_sizes` are
# all 1L).
dput(test)
structure(list(Branch = c("11 Oktomvri", "11 Oktomvri", "11 Oktomvri",
"11 Oktomvri", "11 Oktomvri", "11 Oktomvri", "11 Oktomvri", "Aerodrom",
"Aerodrom", "Aerodrom", "Aerodrom", "Aerodrom", "Aerodrom", "Aerodrom",
"Aerodrom 2", "Aerodrom 2", "Aerodrom 2", "Aerodrom 2", "Aerodrom 2",
"Aerodrom 2", "Aerodrom 2", "Bitola", "Bitola", "Bitola", "Bitola",
"Bitola", "Bitola", "Bitola"), period = c("January", "February",
"March", "April", "May", "June", "July", "January", "February",
"March", "April", "May", "June", "July", "January", "February",
"March", "April", "May", "June", "July", "January", "February",
"March", "April", "May", "June", "July"), value = c(1513, 1511,
1520, 1524, 1508, 1504, 1517, 1364, 1381, 1400, 1403, 1401, 1406,
1430, 674, 687, 689, 690, 696, 705, 715, 4400, 4393, 4365, 4342,
4345, 4373, 4389)), .Names = c("Branch", "period", "value"), row.names = c(NA,
-28L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = c("Branch",
"period"), drop = TRUE, indices = list(3L, 1L, 0L, 6L, 5L, 2L,
4L, 10L, 8L, 7L, 13L, 12L, 9L, 11L, 17L, 15L, 14L, 20L, 19L,
16L, 18L, 24L, 22L, 21L, 27L, 26L, 23L, 25L), group_sizes = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
Branch = c("11 Oktomvri", "11 Oktomvri", "11 Oktomvri", "11 Oktomvri",
"11 Oktomvri", "11 Oktomvri", "11 Oktomvri", "Aerodrom",
"Aerodrom", "Aerodrom", "Aerodrom", "Aerodrom", "Aerodrom",
"Aerodrom", "Aerodrom 2", "Aerodrom 2", "Aerodrom 2", "Aerodrom 2",
"Aerodrom 2", "Aerodrom 2", "Aerodrom 2", "Bitola", "Bitola",
"Bitola", "Bitola", "Bitola", "Bitola", "Bitola"), period = c("April",
"February", "January", "July", "June", "March", "May", "April",
"February", "January", "July", "June", "March", "May", "April",
"February", "January", "July", "June", "March", "May", "April",
"February", "January", "July", "June", "March", "May")), row.names = c(NA,
-28L), class = "data.frame", vars = c("Branch", "period"), drop = TRUE, .Names = c("Branch",
"period")))
我不确定如何根据 Branch 和 period 对数据进行分组,并计算相邻期间的差值(即 period_n 的 value 减去 period_n-1 的 value)。
输出应为:
city period value diff_n_1
Bitola March 4365 -28
Bitola April 4342 -23
我的尝试:
# Asker's attempt at a period-over-period difference of `value`.
# NOTE(review): `sample2` is presumably the same data frame that is dumped as
# `test` above -- confirm.
results <- sample2 %>%
# Grouping by both Branch and period leaves one row per group (the dput
# output shows group_sizes of 1L throughout), so lag() below has no previous
# row to return inside any group.
group_by(Branch, period) %>%
arrange(Branch) %>%
# lag(value) is the previous row's value within the current group (NA for the
# first row of a group); client_diff is the current value minus that.
mutate(lagged_period = lag(value), client_diff = value - lagged_period)
我不确定如何取到上一行(上一期)的值。
有什么想法吗?
答案 0 :(得分:2)
我认为你已经很接近了,但 `group_by` 中不需要包含 `period`,因为它正是我们用 `lag` 跨越的变量。目前,你上面的代码只会产生 `NA`,因为每个定义的分组中只有一个元素(因此没有可用的上一行值)。
这应该有效:
library(dplyr)

# Month-over-month change in `value` within each Branch.
#
# Group by Branch only: `period` is the variable we lag across, so grouping by
# it as well would leave one row per group and lag() would return only NA.
sample2 %>%
  group_by(Branch) %>%
  # lag() follows row order, so put the rows in calendar order inside each
  # Branch first. `period` is a character month name; rank it through the
  # built-in `month.name` vector instead of sorting it alphabetically.
  arrange(Branch, match(period, month.name)) %>%
  mutate(
    # Previous month's value within the same Branch (NA for the first month).
    lagged_period = lag(value),
    client_diff = value - lagged_period
  ) %>%
  # Drop the grouping so later verbs do not silently operate per Branch.
  ungroup()
然后,如果你想删除计算字段为 `NA` 的行,可以将上面的结果通过管道(`%>%`)传给:
# Pipeline fragment: append it to the pipeline above with `%>%`.
# Keeps only rows whose client_diff is not NA, i.e. drops the first row of
# each group, where lag() had no previous value.
filter(!is.na(client_diff))