我正在寻找一种方法,在id
组内,计算数据数据TF
中tbl
中唯一出现的价值转移。
我想在TF
和1
或0
和o
之间1
更改时向前和向后计算。计数将存储在一个新的变量PM##
中,以便PM##
s在TF
中保存每个唯一的班次,包括加号和减号。下面的MWE导致下午7点的结果,但我的生产数据可以有15个或更多班次。如果TF
之间的NA
值没有变化,我想将其标记为0
。
此问题类似于a question I previously asked,但关于TF
独立的最后一部分是新的。 Uwe和Psidom都使用data.table
here并使用tidyverse
here为初始问题提供了优雅答案。 after conferencing with Uwe,我发布了我的问题的略微修改版本。
如果此问题违反了任何SO政策,请告知我们,我将很乐意重新打开我的初步问题,或者将此问题附加到这个问题上。
用最小工作示例来说明我的问题。我有这样的数据,
# install.packages(c("tidyverse"), dependencies = TRUE)
library(tibble)
tbl <- tibble(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)),
TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1, NA, 1, 0, 1, 0, 1, NA, 0L, NA, 0L,
0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))
tbl %>% print(n=18)
#> # A tibble: 40 x 2
#> id TF
#> <int> <dbl>
#> 1 10 NA
#> 2 10 NA
#> 3 10 0
#> 4 10 NA
#> 5 10 0
#> 6 10 NA
#> 7 10 1
#> 8 10 1
#> 9 10 1
#> 10 10 1
#> 11 10 1
#> 12 10 NA
#> 13 10 1
#> 14 10 0
#> 15 10 1
#> 16 10 0
#> 17 10 1
#> 18 0 NA
#> # ... with 22 more rows
tblPM <- structure(list(id = c(10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1,
NA, 1, 0, 1, 0, 1, NA, 0, NA, 0, 0, 1, 1, 1, 0, 0,
NA, NA, 0, NA, 0, 0, 0, 1, 1, 1, 0, NA, 1), PM01 = c(NA,
NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -3L,
-2L, -1L, 1L, 2L, 3L, NA, NA, NA), PM02 = c(NA, NA, NA, NA, 0L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -2L,
-1L, 1L, 2L, 3L, NA, NA, NA, NA, NA, NA, NA, NA, NA, -3L, -2L,
-1L, 1L, NA, NA), PM03 = c(NA, NA, NA, NA, NA, NA, 0L, 0L, 0L,
0L, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -3L, -2L,
-1L, 1L, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0L), PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
-1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), PM05 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), PM06 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), PM07 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), .Names = c("id", "TF", "PM01", "PM02", "PM03", "PM04", "PM05",
"PM06", "PM07"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -40L
))
tblPM %>% print(n=18)
#> # A tibble: 40 x 9
#> id TF PM01 PM02 PM03 PM04 PM05 PM06 PM07
#> <int> <dbl> <int> <int> <int> <int> <int> <int> <int>
#> 1 10 NA NA NA NA NA NA NA NA
#> 2 10 NA NA NA NA NA NA NA NA
#> 3 10 0 0 NA NA NA NA NA NA
#> 4 10 NA NA NA NA NA NA NA NA
#> 5 10 0 NA 0 NA NA NA NA NA
#> 6 10 NA NA NA NA NA NA NA NA
#> 7 10 1 NA NA 0 NA NA NA NA
#> 8 10 1 NA NA 0 NA NA NA NA
#> 9 10 1 NA NA 0 NA NA NA NA
#> 10 10 1 NA NA 0 NA NA NA NA
#> 11 10 1 NA NA 0 NA NA NA NA
#> 12 10 NA NA NA NA NA NA NA NA
#> 13 10 1 NA NA NA -1 NA NA NA
#> 14 10 0 NA NA NA 1 -1 NA NA
#> 15 10 1 NA NA NA NA 1 -1 NA
#> 16 10 0 NA NA NA NA NA 1 -1
#> 17 10 1 NA NA NA NA NA NA 1
#> 18 0 NA NA NA NA NA NA NA NA
#> # ... with 22 more rows
identical([some solution], tblPM)
#> [1] TRUE
更新w / microbenchmark
2018-01-24 14:20:18Z ,
感谢Fierr和Chris花时间梳理逻辑并提交答案。启发了我的this setup我计算了一个小功能的微基准测试比较。我把Fierr s answer into the function
tidyverse_Fierr()and Chris' answer into
dt_Chris()`(如果有人想要确切的功能请告诉我,我会在这里添加。
经过一些小的调整后,当它们与tblPM
匹配时,它们都是相同的,即
identical(tblPM, tidyverse_Fierr(tbl))
#> [1] TRUE
identical(tblPM, dt_Chris(tbl))
#> [1] TRUE
现在快速微基准测试,
df_test <- bind_rows(rep(list(tbl), 111))
microbenchmark::microbenchmark(tidyverse_Fierr(df_test), dt_Chris(df_test), times = 3*1)
#> Unit: milliseconds
#> expr min mean median uq max neval cld
#> tidyverse_Fierr(df_test) 19503.366 20171.268 20080.99 20505.219 20929.4489 3 b
#> dt_Chris(df_test) 199.165 233.924 203.72 251.304 298.8887 3 a
有趣的是,在这个kinda similar comparison中,tidy_method的速度更快。
答案 0 :(得分:2)
这是一个脚本方法 - 考虑到每种情况的定制处理量(TF = NA,uniqueN(TF)= 1,uniqueN(TF)= 2,我认为这可能比dplyr链更清晰应该相当快,因为它是基于所有data.table。打开如何改进的建议!
随着所需PM列数量的增加,这会自动扩展 - 正如我在下面评论的那样,我建议删除列中的0前缀,因为可能会出现10 ^ 2..n的情况会碰到PM001的列。
library(data.table)
tbl3 <- data.table(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)),
TF = c(NA, NA, 0L, NA, 0L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 0L, 1L, 0L, 1L, NA, 0L, NA, 0L,
0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))
# create index to untimately join back to
tbl3[, row_idx := .I]
# all transformations on a replicated data.table
tbl3_tmp <- copy(tbl3)
# identify where the NA breaks occur - this splits each id into subgroups (id_group)
tbl3_tmp[, P_TF := shift(TF, 1, "lag", fill = NA), by = .(id)]
tbl3_tmp[, TF_break := is.na(TF) | is.na(P_TF)]
tbl3_tmp[, id_group := cumsum(TF_break), by = .(id)]
tbl3_tmp[, `:=`(TF_break = NULL, P_TF = NULL)] # above can be consolidated to one line which would make this line unneccesary - expanded for easier understanding
tbl3_tmp <- tbl3_tmp[!is.na(TF)] # NA rows can be safely ignored now - these will be all NA, and will be handled with the left join below
# find where subpatterns exist (runs of 0..1 or 1..0)
tbl3_tmp[, subpattern_break := TF != shift(TF, 1, "lag", fill = NA), by = .(id, id_group)]
tbl3_tmp[, subbreaks := sum(subpattern_break, na.rm = TRUE), by = .(id, id_group)] # if there are no breaks, we need to treat separately
# two cases: zero subbreaks and multiple subbreaks.
tbl3_zeros <- tbl3_tmp[subbreaks == 0]
tbl3_nonzeros <- tbl3_tmp[subbreaks > 0]
# for 1+ subbreaks, we need to double the rows - this allows us to easily create the PM_field both "forwards" and "backwards"
tbl3_nonzeros[is.na(subpattern_break), subpattern_break := TRUE]
tbl3_nonzeros[, subbreak_index := cumsum(subpattern_break), by = .(id, id_group)]
tbl3_nonzeros <- rbindlist(list(tbl3_nonzeros,tbl3_nonzeros), idcol = "base") # double the row
tbl3_nonzeros[base == 1 & subbreak_index %% 2 == 1, subbreak_index := subbreak_index + 1L] # round to nearest even
tbl3_nonzeros[base == 2 & subbreak_index %% 2 == 0, subbreak_index := subbreak_index + 1L] # round to nearest odd
# this creates an index when the subbreak starts - allows us to sequence PM properly
tbl3_nonzeros[,subbreak_start := min(row_idx), by = .(id, id_group, subbreak_index)]
# exclude the ends if there is only one unique TF value - might be able to get this to one line
tbl3_nonzeros[, TF_count := uniqueN(TF), by = .(id, id_group, subbreak_index)]
tbl3_nonzeros <- tbl3_nonzeros[TF_count > 1]
# create a 1..N column, subtract the index where the break occurs ,then add 1 to all 0+ values.
tbl3_nonzeros[,PM_field := 1:.N, by = .(id, id_group, subbreak_index)]
tbl3_nonzeros[, PM_field := PM_field - PM_field[which(diff(TF)!=0)[1]+1], by = .(id, id_group, subbreak_index)]
tbl3_nonzeros[PM_field >= 0, PM_field := PM_field + 1L] # base 1 after the break
# create subbreaks for zero groups
tbl3_zeros[,subbreak_start := min(row_idx), by = .(id, id_group)]
# bring zero and non zero case together
tbl3_zeros <- tbl3_zeros[, .(id, id_group, subbreak_start,row_idx = row_idx, PM_field = 0L)]
tbl3_nonzeros <- tbl3_nonzeros[,.(id, id_group, subbreak_start, row_idx, PM_field)]
tbl3_tmp <- rbindlist(list(tbl3_zeros, tbl3_nonzeros))
# Create header
tbl3_tmp <- tbl3_tmp[order(subbreak_start, PM_field)]
tbl3_tmp[, PM_header := paste0("PM0",cumsum(c(1,diff(subbreak_start)!=0)),sep = ""), by = .(id)] # I would remove 0 in PM0 here (kept for identical check)- inefficient to check if this will be 1, 2, 3 etc digits This could also be solved with; `paste0("PM", sprintf("%02d", cumsum(c(1, diff(subbreak_start) != 0))))`
# long to wide
tbl3_tmp <- dcast(tbl3_tmp, row_idx ~ PM_header, value.var = "PM_field", fun.aggregate = sum, fill = NA)
# merge back to initial dataframe
tblPM_frombase <- merge(tbl3, tbl3_tmp, by = "row_idx", all.x = TRUE)[, row_idx := NULL]
identical(tblPM, tblPM_frombase)
[1] TRUE
答案 1 :(得分:1)
喜欢挑战,以揭示这一逻辑。该方法基于tidyverse。欢迎更多关于整理它的建议!
library(data.table)
library(purrr)
library(dplyr)
library(tibble)
tbl <- tibble(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)),
TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1, NA, 1, 0, 1, 0, 1, NA, 0L, NA, 0L,
0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))
tbl <- mutate(tbl, rn = 1:n())
lookup_table <- tbl %>%
group_by(id) %>%
mutate(rl = rleid(TF)) %>%
group_by(id, rl, TF) %>%
summarise(n=n()) %>%
group_by(id) %>%
mutate(lag = lag(TF, order_by=id),
lead = lead(TF, order_by=id),
test = ifelse(is.na(lag) & is.na(lead), 1, 0)) %>%
select(id, rl, test)
tmp <- tbl %>%
group_by(id) %>%
mutate(rl = rleid(TF),
rl_nona = ifelse(is.na(TF), NA, rleid(rl)),
rl_nona = match(rl_nona, unique(na.omit(rl_nona)))) %>% # Re-indexing
left_join(lookup_table, by = c("id" = "id", "rl" = "rl")) %>%
mutate(TF_new = ifelse(test == 1, NA, TF),
rl_gap = ifelse(is.na(TF_new), NA, rleid(TF_new)),
rl_gap = match(rl_gap, unique(na.omit(rl_gap))), # Re-indexing
up_pos = ifelse(min(rl_gap, na.rm=TRUE)==rl_gap, NA, rl_gap),
down_pos = ifelse(max(rl_gap, na.rm=TRUE)==rl_gap, NA, rl_gap)) %>%
group_by(id, rl_gap) %>%
mutate(up = ifelse(is.na(up_pos), 0, seq_len(n())),
down = ifelse(is.na(down_pos), 0, -rev(seq_len(n())))) %>%
group_by(id) %>%
mutate(zero_pos = ifelse(test == 1 & rl_nona > max(rl_gap, na.rm = TRUE), rl_nona - 1, rl_nona)) # Correct placement of zeroes
up <- dcast(tmp, rn ~ rl_nona, value.var = 'up' , fill = 0)
down <- dcast(tmp, rn ~ rl_nona, value.var = 'down', fill = 0)
res <- (down[, 2:max(tmp$rl_nona, na.rm=TRUE)] + up[, 3:(max(tmp$rl_nona, na.rm=TRUE)+1)]) %>%
mutate_all(funs(replace(., which(.==0), NA))) %>%
bind_cols(rn = tmp$rn, test = tmp$test, zero_pos = tmp$zero_pos) %>%
right_join(tbl, by = "rn") %>%
mutate(`PM01` = ifelse(test == 1 & zero_pos == 1, 0, `1`)) %>%
mutate(`PM02` = ifelse(test == 1 & zero_pos == 2, 0, `2`)) %>%
mutate(`PM03` = ifelse(test == 1 & zero_pos == 3, 0, `3`)) %>%
mutate(`PM04` = ifelse(test == 1 & zero_pos == 4, 0, `4`)) %>%
mutate(`PM05` = ifelse(test == 1 & zero_pos == 5, 0, `5`)) %>%
mutate(`PM06` = ifelse(test == 1 & zero_pos == 6, 0, `6`)) %>%
mutate(`PM07` = ifelse(test == 1 & zero_pos == 7, 0, `7`)) %>%
select(id, TF, everything(), -rn, -test, -zero_pos, -c(1:7)) %>%
mutate_if(is.numeric, as.integer) %>%
as.tibble()
identical(tblPM, res)