我需要统计每个组在整个数据集中是第几次(成段)出现,并据此对其编号。以下是示例数据。例如,Group1 第一次出现时,在旁边的一列中将其编号为 1;如果之后它再次出现,则编号为 2,依此类推……对每个组都这样处理。为了更清楚地说明,请参阅下面的示例数据和预期输出。
样本数据:
Group
Group1
Group1
Group1
Group1
Group1
Group1
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group2
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group1
Group3
Group3
Group3
Group3
Group3
Group3
Group3
Group3
Group3
Group3
预期输出:
Group No of times
Group1 1
Group1 1
Group1 1
Group1 1
Group1 1
Group1 1
Group2 1
Group2 1
Group2 1
Group2 1
Group2 1
Group2 1
Group2 1
Group2 1
Group1 2
Group1 2
Group1 2
Group1 2
Group1 2
Group1 2
Group1 2
Group1 2
Group2 2
Group2 2
Group2 2
Group2 2
Group2 2
Group2 2
Group2 2
Group2 2
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group1 3
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
Group3 1
答案 0 :(得分:3)
使用data.table中的rleid的一种方法是创建一个new列,为Group列中的每次更改提供不同的值。然后我们按Group进行group_by,对于new值的每次更改,我们都使用cumsum递增计数。
library(dplyr)
df %>%
  mutate(new = data.table::rleid(Group)) %>%
  group_by(Group) %>%
  mutate(no_of_times = cumsum(c(1, diff(new) != 0))) %>%
  select(-new)
答案 1 :(得分:1)
与我的相比,Ronak的答案很棒,但这还是我想出的dplyr / tidyr解决方案。这个想法是:
library(dplyr)
library(tidyr)
# Number every consecutive run of a Group by how many runs of that same Group
# have appeared so far (1 for its first run, 2 for its second, ...).
df_clustered <-
  df %>%
  # Remember the original position of every row.
  mutate(rownum = row_number()) %>%
  # Detect runs inside each Group separately, so a row is never compared with
  # the last row of a *different* Group (which could happen to sit directly
  # above it in the original data and hide the start of a run).
  group_by(Group) %>%
  # A row starts a new run when the previous row of the same Group is not the
  # row directly above it. Row numbers are >= 1, so default = -1L guarantees
  # that the first row of every Group counts as a run start.
  mutate(
    cluster_number = cumsum(rownum - lag(rownum, default = -1L) != 1L)
  ) %>%
  ungroup() %>%
  select(-rownum)
head(df_clustered, 20)
#> # A tibble: 20 x 3
#> Group Value cluster_number
#> <chr> <dbl> <int>
#> 1 Group1 1 1
#> 2 Group1 2 1
#> 3 Group1 1 1
#> 4 Group1 1.3 1
#> 5 Group1 1.2 1
#> 6 Group1 1 1
#> 7 Group2 7 1
#> 8 Group2 6 1
#> 9 Group2 2 1
#> 10 Group2 1 1
#> 11 Group2 25 1
#> 12 Group2 23 1
#> 13 Group2 24 1
#> 14 Group2 25 1
#> 15 Group1 24 2
#> 16 Group1 23 2
#> 17 Group1 26 2
#> 18 Group1 23 2
#> 19 Group1 17 2
#> 20 Group1 11 2
答案 2 :(得分:1)
这是一个纯 data.table 解决方案。它基于 rle() 和 rep():
library(data.table)
# Sample data: five alternating runs of Group1 / Group2 (36 rows in total).
DT <- data.table(
  Group = rep(
    c("Group1", "Group2", "Group1", "Group2", "Group1"),
    times = c(6L, 8L, 8L, 8L, 6L)
  ),
  Value = c(
    1, 2, 1, 1.3, 1.2, 1, 7, 6, 2, 1, 25, 23, 24, 25, 24, 23, 26, 23,
    17, 11, 2, 1, 1, 2, 2.3, 1, 3, 4, 1, 1, 2, 25, 26, 11, 17, 16
  ),
  stringsAsFactors = FALSE
)

# Run-length encode Group: one row per consecutive run, with columns
# `lengths` (run size) and `values` (the group label of the run).
lengthEncoding <- as.data.table(unclass(rle(DT[["Group"]])))

# Within each distinct label, number its runs in order of appearance.
lengthEncoding[, group_count := seq_len(.N), by = values]

# Expand the per-run counter back to one entry per original row.
DT[, ("No of times") := rep(
  lengthEncoding[["group_count"]],
  times = lengthEncoding[["lengths"]]
)]
print(DT)
顺便说一句,该解决方案比公认的答案要快:
编辑:添加了 @chinsoon12 漂亮的单行代码(one-liner),实至名归!
library(microbenchmark)
library(data.table)
library(dplyr)
# Benchmark of three ways to number the consecutive runs of each Group.
# Sample data: 36 rows with a Group label and a numeric Value.
df <- data.frame(stringsAsFactors=FALSE,
Group = c("Group1", "Group1", "Group1", "Group1", "Group1", "Group1",
"Group2", "Group2", "Group2", "Group2", "Group2", "Group2",
"Group2", "Group2", "Group1", "Group1", "Group1", "Group1",
"Group1", "Group1", "Group1", "Group1", "Group2", "Group2",
"Group2", "Group2", "Group2", "Group2", "Group2", "Group2",
"Group1", "Group1", "Group1", "Group1", "Group1", "Group1"),
Value = c(1, 2, 1, 1.3, 1.2, 1, 7, 6, 2, 1, 25, 23, 24, 25, 24, 23, 26, 23,
17, 11, 2, 1, 1, 2, 2.3, 1, 3, 4, 1, 1, 2, 25, 26, 11, 17, 16)
)
# data.table version of the same data, used by the two data.table expressions.
DT <- data.table(df)
# Each named argument is one candidate expression to be timed.
results <- microbenchmark(
# dplyr pipeline on the data.frame: rleid() assigns an id to every run, then
# within each Group the counter goes up whenever that id changes.
RonakShah = {
df %>%
mutate(new = rleid(Group)) %>%
group_by(Group) %>%
mutate(no_of_times = cumsum(c(1,diff(new) != 0))) %>%
select(-new)
},
# rle() + rep(): number the runs per distinct value, then expand the per-run
# counter back to row level. `:=` writes the column into DT by reference, so
# DT keeps the "No of times" column after the first evaluation.
ismirsehregal = {
lengthEncoding <- rle(DT$Group)
setDT(lengthEncoding)[, group_count := seq_len(.N), by="values"]
DT[, "No of times" := rep(lengthEncoding$group_count, lengthEncoding$lengths)]
},
# One-liner: within each Group, a gap larger than 1 between consecutive row
# indices (.I) marks the start of a new run; cumsum() counts those gaps.
# Also adds its column (numtimes) to DT by reference.
chinsoon12 = {DT[, numtimes := 1L + c(0L, cumsum(diff(.I) > 1L)), by=.(Group)]}
)
# Show the timing summary table and plot the timings.
print(results)
plot(results)
expr min lq mean median uq max neval cld
RonakShah 3.980914 4.253103 4.898788 4.500009 5.063746 8.021481 100 c
ismirsehregal 1.494078 1.653283 1.937947 1.828487 2.023246 5.678442 100 b
chinsoon12 1.050436 1.239666 1.469426 1.440154 1.646369 2.572168 100 a