我有如下数据框(dput 输出):
# Example data as dput() output: a 10-row data.frame with a character ID,
# integer Value1 and Value3, numeric Value2, and Date stored as character
# strings in "dd/mm/yyyy HH:MM:SS" form (not yet parsed as date-times).
structure(list(ID = c("TTR-1", "TTR-1", "TTR-2", "TTR-2", "TTR-2",
"TTR-3", "TTR-4", "TTR-4", "TTR-4", "TTR-5"), Value1 = c(100L,
400L, 147L, 159L, 165L, 178L, 166L, 150L, 150L, 169L), Value2 = c(15,
5.05, 13, 13, 13, 7.25, 11, 13, 15, 19), Value3 = c(25L, 25L,
18L, 32L, 32L, 14L, 45L, 57L, 60L, 22L), Date = c("26/08/2017 06:08:12",
"26/08/2017 15:45:18", "01/09/2017 12:04:16", "04/09/2017 15:02:47",
"04/09/2017 18:22:15", "12/09/2017 19:07:17", "15/09/2017 21:19:19",
"15/09/2017 22:12:47", "15/09/2017 23:15:18", "20/10/2017 09:37:14"
)), class = "data.frame", row.names = c(NA, -10L))
我想将其分为三个不同的类别。
按 ID 统计各 Value 列的更改次数(按日期先后,从起始日期到结束日期逐条比较)。
ID Value1 Value2 Value3
TTR-1 1 1 0
TTR-2 2 0 1
TTR-3 0 0 0
TTR-4 1 2 2
TTR-5 0 0 0
按 Date 统计各 Value 列的更改次数(即该日期内各个 ID 各自发生的更改次数)。
Date T U C Value1 Value2 Value3
26/08/2017 2 1 1 1 0 1
01/09/2017 1 1 0 0 0 0
04/09/2017 2 0 1 2 0 1
12/09/2017 1 1 0 0 0 0
15/09/2017 3 1 1 1 2 1
20/10/2017 1 1 0 0 0 0
按月统计各 Value 列的更改次数。
Month T U C Value1 Value2 Value3
Aug-17 1 1 1 1 0 1
Sep-17 7 3 4 3 2 2
Oct-17 1 1 0 0 0 0
答案 0 :(得分:1)
我们可以按 ID 分组,对各 Value 列计算 n_distinct(不同值的个数),然后减去 1。
1)按ID
library(dplyr)

# Changes per ID, computed as (number of distinct values - 1) for every
# column whose name starts with "Value".
# A formula lambda replaces funs(), which is deprecated since dplyr 0.8.0.
# Note: a value that changes and later reverts (A -> B -> A) adds only one
# distinct value, so it is counted as a single change by this approach.
df1 %>%
  group_by(ID) %>%
  summarise_at(vars(starts_with("Value")), ~ n_distinct(.x) - 1)
# A tibble: 5 x 4
# ID Value1 Value2 Value3
# <chr> <dbl> <dbl> <dbl>
#1 TTR-1 1 1 0
#2 TTR-2 2 0 1
#3 TTR-3 0 0 0
#4 TTR-4 1 2 2
#5 TTR-5 0 0 0
2)按日期
类似地,只需更改 group_by 的分组变量即可完成此操作。
library(lubridate)

# Changes per calendar day: dmy_hms() parses the "dd/mm/yyyy HH:MM:SS"
# strings and as.Date() drops the time of day so rows group by date.
# A formula lambda replaces funs(), which is deprecated since dplyr 0.8.0.
df1 %>%
  group_by(Date = as.Date(dmy_hms(Date))) %>%
  summarise_at(vars(starts_with("Value")), ~ n_distinct(.x) - 1)
# A tibble: 6 x 4
# Date Value1 Value2 Value3
# <date> <dbl> <dbl> <dbl>
#1 2017-08-26 1 1 0
#2 2017-09-01 0 0 0
#3 2017-09-04 1 0 0
#4 2017-09-12 0 0 0
#5 2017-09-15 1 2 2
#6 2017-10-20 0 0 0
3)按月份
library(zoo)

# Changes per month, in two stages:
#   1. within each (ID, Month) pair, count distinct values - 1 per Value column;
#   2. add those per-ID counts up for each Month.
# as.yearmon() turns the "dd/mm/yyyy HH:MM:SS" strings into year-month values.
# A formula lambda replaces funs(), which is deprecated since dplyr 0.8.0.
df1 %>%
  group_by(ID, Month = as.yearmon(Date, "%d/%m/%Y %H:%M:%S")) %>%
  summarise_at(vars(starts_with("Value")), ~ n_distinct(.x) - 1) %>%
  group_by(Month) %>%
  summarise_at(vars(starts_with("Value")), sum)
# A tibble: 3 x 4
# Month Value1 Value2 Value3
# <S3: yearmon> <dbl> <dbl> <dbl>
#1 Aug 2017 1 1 0
#2 Sep 2017 3 2 3
#3 Oct 2017 0 0 0
答案 1 :(得分:1)
使用dplyr
:
通过ID
library(dplyr)

# Add two grouping keys derived from the "dd/mm/yyyy HH:MM:SS" strings:
#   d - the calendar date (the time of day is ignored by the format string)
#   m - a month-year label such as "08-2017"
# The month key is built from the already-parsed `d` column. The earlier
# version called format.Date(Date, "%m-%d") on the raw character column,
# which is a month-day format applied to unparsed text; it yielded labels
# such as "08-20" and could not tell the same month of different years apart.
df <- df %>%
  mutate(
    d = as.Date(Date, "%d/%m/%Y"),
    m = format(d, "%m-%Y")
  )
# Changes per ID: count the transitions between consecutive rows whose value
# differs (diff() != 0). This relies on the rows being in chronological order
# within each ID.
# The Value columns are selected by name rather than by position (2:4), so
# the result does not depend on column order or on how grouping columns are
# counted when resolving positions.
df %>%
  group_by(ID) %>%
  summarize_at(vars(starts_with("Value")), ~ sum(diff(.x) != 0))
# # A tibble: 5 x 4
# ID Value1 Value2 Value3
# <chr> <int> <int> <int>
# 1 TTR-1 1 1 0
# 2 TTR-2 2 0 1
# 3 TTR-3 0 0 0
# 4 TTR-4 1 2 2
# 5 TTR-5 0 0 0
按日期
# Changes per date, in two stages:
#   1. within each (ID, d) pair, count transitions between consecutive rows;
#   2. add those per-ID counts up for each date d.
# The Value columns are selected by name rather than by position (2:4 and
# 3:5), so neither stage depends on column order or on how grouping columns
# are counted when resolving positions.
df %>%
  group_by(ID, d) %>%
  summarize_at(vars(starts_with("Value")), ~ sum(diff(.x) != 0)) %>%
  group_by(d) %>%
  summarize_at(vars(starts_with("Value")), sum)
# # A tibble: 6 x 4
# d Value1 Value2 Value3
# <date> <int> <int> <int>
# 1 2017-08-26 1 1 0
# 2 2017-09-01 0 0 0
# 3 2017-09-04 1 0 0
# 4 2017-09-12 0 0 0
# 5 2017-09-15 1 2 2
# 6 2017-10-20 0 0 0
按月
# Changes per month key m, in two stages:
#   1. within each (ID, m) pair, count transitions between consecutive rows;
#   2. add those per-ID counts up for each m.
# The Value columns are selected by name rather than by position (2:4 and
# 3:5), so neither stage depends on column order or on how grouping columns
# are counted when resolving positions.
df %>%
  group_by(ID, m) %>%
  summarize_at(vars(starts_with("Value")), ~ sum(diff(.x) != 0)) %>%
  group_by(m) %>%
  summarize_at(vars(starts_with("Value")), sum)
# # A tibble: 3 x 4
# m Value1 Value2 Value3
# <chr> <int> <int> <int>
# 1 08-20 1 1 0
# 2 09-20 3 2 3
# 3 10-20 0 0 0
答案 2 :(得分:0)
这是一种有趣的方法:以 OP 的数据作为数据框 DF,用 base R 按问题描述中指定的变量计算唯一变化次数:
# Parse the "dd/mm/yyyy HH:MM:SS" strings into Date values and derive a
# "mm-YYYY" month label from them.
DF <- within(DF, {
  Date <- as.Date(strptime(Date, "%d/%m/%Y %H:%M:%S"))
  Month <- format(Date, "%m-%Y")
})

cols <- c("Value1", "Value2", "Value3")
namesVector <- c("Date", "ID", "Month")
formulaChar <- paste0("column ~ ", namesVector)

# For each grouping variable, aggregate every Value column to
# (number of unique values - 1) per group, merge the three per-column results
# on the grouping variable, print the table, and store it as DFbyDate,
# DFbyID or DFbyMonth.
for (i in seq_along(namesVector)) {
  temp <- Reduce(
    function(x, y) merge(x, y, by = namesVector[i], all = TRUE),
    lapply(
      DF[, cols],
      function(column) {
        # The formula is passed positionally: the formula method's first
        # argument is named `formula` before R 4.2.0 and `x` from 4.2.0 on,
        # so naming it is not portable. `column` is not a column of DF; it is
        # resolved through the formula's environment (this function's frame).
        aggregate(
          as.formula(formulaChar[i]),
          data = DF,
          FUN = function(g) length(unique(g)) - 1
        )
      }
    )
  )
  # merge() suffixes the duplicated "column" names; restore the real ones.
  names(temp) <- c(namesVector[i], cols)
  print(temp)
  assign(paste0("DFby", namesVector[i]), temp)
}

# Drop the helper objects, leaving only DF and the DFby* results.
rm(cols, namesVector, formulaChar, i, temp)