# Sample viewing log: one entry per movie watched, with the viewer's name and
# the month ("m-YYYY") in which it was watched.
moviewatched <- c(
  "Comedy", "Horror", "Comedy", "Horror", "Drama", "Comedy", "Drama"
)
name <- c(rep("john", 5), rep("kate", 2))
time <- c(
  "1-2018", "1-2018", "1-2018", "2-2018", "2-2018", "1-2018", "2-2018"
)

# Column names are spelled out; they match the vector names above.
df <- data.frame(moviewatched = moviewatched, name = name, time = time)


     name time   movietypewatched 
     john 1-2018       2
     john 2-2018       3
     kate 1-2018       1
     kate 2-2018       2


使用dplyr的解决方案。我们可以基于moviewatched和name删除重复的行,按name和time对唯一的moviewatched进行计数,然后使用cumsum计算累计总数。df2是最终输出。


# Cumulative number of distinct genres each person has watched, per month.
# Expects `df` to have the columns moviewatched, name and time.
# NOTE(review): relies on rows already being in chronological order within each
# name -- distinct() keeps the first row it meets for a (moviewatched, name)
# pair and cumsum() runs in row order. Confirm, or sort first.
df2 <- df %>%
  # keep one row per (genre, person): the first time that person saw the genre
  distinct(moviewatched, name, .keep_all = TRUE) %>%
  group_by(name, time) %>%
  # number of genres this person saw for the first time in this month
  summarise(movietypewatched = n_distinct(moviewatched)) %>%
  # summarise() drops the `time` grouping level, so the running total
  # restarts for each name
  mutate(movietypewatched = cumsum(movietypewatched)) %>%
  # the chain previously ended in a dangling `%>%`; finish it and drop groups
  ungroup()
# # A tibble: 4 x 3
#   name  time   movietypewatched
#   <fct> <fct>             <int>
# 1 john  1-2018                2
# 2 john  2-2018                3
# 3 kate  1-2018                1
# 4 kate  2-2018                2



# data.table version: cumulative number of distinct genres per person and month.
# `.()` and `:=` inside `[` only work on a data.table, so a data.table copy of
# `df` is used; this works whether `df` is a data.frame or a data.table.
# NOTE(review): relies on rows being in chronological order within each name,
# because unique() keeps the first row per (moviewatched, name) -- confirm.
df2 <- unique(as.data.table(df), by = c("moviewatched", "name"))[
  # genres first seen by each person in each month
  , .(movietypewatched = uniqueN(moviewatched)), by = .(name, time)][
    # running total within each person
    , movietypewatched := cumsum(movietypewatched), by = name]
#    name   time movietypewatched
# 1: john 1-2018                2
# 2: john 2-2018                3
# 3: kate 1-2018                1
# 4: kate 2-2018                2

First convert the time data to a class that establishes order, e.g. with lubridate::myd with truncated = 1. Then arrange the rows so they are in chronological order and, grouped by name, use purrr::accumulate to build a list of the unique values seen so far in moviewatched; calling lengths on that list returns the number of movie types seen up to each point. Finally, aggregate by month with max to get the cumulative number of types for each month.


# Sample data as a tibble, with `time` parsed to a Date so that months sort
# chronologically. The strings carry only month and year; `truncated = 1` lets
# myd() parse them without the day part.
df <- tibble(
  moviewatched = c(
    "Comedy", "Horror", "Comedy", "Horror", "Drama", "Comedy", "Drama"
  ),
  name = c("john", "john", "john", "john", "john", "kate", "kate"),
  time = lubridate::myd(
    c("1-2018", "1-2018", "1-2018", "2-2018", "2-2018", "1-2018", "2-2018"),
    truncated = 1
  )
)

# Running count of distinct genres per person, reported once per month.
df %>%
  # chronological order within each person, so the accumulation below walks
  # the rows in time order
  arrange(name, time) %>%
  group_by(name) %>%
  mutate(
    n_types = lengths(
      # grow the set of genres seen so far, one row at a time
      accumulate(moviewatched, ~ unique(c(.x, .y)))
    )
  ) %>%
  group_by(name, time) %>%
  # the largest running count within a month is that month's cumulative total
  summarise(n_types = max(n_types))
#> # A tibble: 4 x 3
#> # Groups:   name [2]
#>   name  time       n_types
#>   <chr> <date>       <dbl>
#> 1 john  2018-01-01       2
#> 2 john  2018-02-01       3
#> 3 kate  2018-01-01       1
#> 4 kate  2018-02-01       2

Make a table of first dates watched; count by month; and take the cumulative sum:


# NOTE(review): assumes `df` is already a data.table whose `time` column holds
# "m-YYYY" values -- confirm before running.

# Turn "m-YYYY" into a real date (first day of the month)
df[, d := as.IDate(paste0(time, "-01"), format = "%m-%Y-%d")]

# First month in which each person watched each genre
fw <- df[, .(d = min(d)), by = c("name", "moviewatched")]

# Number of genres seen for the first time, per person and month
nm <- fw[, .(N = .N), keyby = c("name", "d")]

# Running total of new genres within each person
nm[, cN := cumsum(N), by = "name"]

   name          d N cN
1: john 2018-01-01 2  2
2: john 2018-02-01 1  3
3: kate 2018-01-01 1  1
4: kate 2018-02-01 1  2

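You need to convert time to a real date class first; otherwise min() compares the raw "m-YYYY" strings (or fails outright on a factor), so the result will be incorrect and/or broken.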

There are two aggregation steps here, but the code should be fast thanks to optimization in data.table (see ?GForce).

Using data.table:

# Cumulative number of distinct genres per person and month, computed in place
# with data.table. Expects `df` to have the columns moviewatched, name, time.
# NOTE(review): relies on rows being in chronological order within each name,
# since "first occurrence" and cumsum() both follow row order -- confirm.

# Drop exact duplicate rows (same genre, person and month).
df <- unique(df)

# Number the occurrences of each genre per person (1 = first time seen).
setDT(df)[, movietypewatched := seq_len(.N), by = c("moviewatched", "name")]

# Keep only the first occurrence. Filtering out just `== 2` would leave a
# third or later repeat of a genre in place and count it as new.
df <- df[movietypewatched == 1L, ]

# Reuse the column for the number of new genres per person and month, then
# drop the genre column so each (name, time) pair collapses to one row.
df[, movietypewatched := .N, by = c("name", "time")][, moviewatched := NULL]
df <- unique(df)

# Running total of new genres within each person.
df[, movietypewatched := cumsum(movietypewatched), by = name]

   name   time movietypewatched
1: john 1-2018                2
2: john 2-2018                3
3: kate 1-2018                1
4: kate 2-2018                2

  • 您需要按name, time排列数据框以累积值。
  • 您可以使用lag()来获取先前的值。由于每个name的第一个条目都没有先前的值,因此它将得到NA
  • 使用n_distinct()计算唯一类型时,您将需要删除NA。



# Sample viewing log with rows deliberately out of order: one entry per movie
# watched, with the viewer's name and the month ("m-YYYY").
moviewatched <- c(
  "Comedy", "Horror", "Comedy", "Horror", "Drama", "Comedy", "Drama"
)
name <- c(rep("john", 4), rep("kate", 2), "john")
time <- c(
  "1-2018", "1-2018", "2-2018", "2-2018", "1-2018", "2-2018", "1-2018"
)

# Column names are spelled out; they match the vector names above.
df <- data.frame(moviewatched = moviewatched, name = name, time = time)

# For every person and month, collect the genres watched that month (`genre`),
# the distinct genres watched up to and including that month (`genre_all`),
# and how many there are (`genre_count`).
# NOTE(review): `time` is sorted as stored ("m-YYYY" text or factor), not as a
# date; confirm that this ordering is chronological for the real data.
# NOTE(review): the printed `df_final` further down appears to come from
# differently ordered input than the vectors above (john has three genres in
# 1-2018 here) -- re-run to refresh it.
df_final <- df %>%
  arrange(name, time) %>%
  group_by(name, time) %>%
  # one row per (name, time); the genres of that month go into a list-column
  nest(.key = "genre") %>%
  group_by(name) %>%
  # Stack each month's genres onto everything seen in earlier months.
  # Combining only with lag(genre) looks back a single month, so a genre seen
  # two or more months earlier would drop out of the count again.
  mutate(
    genre_all = Reduce(rbind, genre, accumulate = TRUE) %>% map(unique)
  ) %>%
  ungroup() %>%
  # genre_all holds distinct rows only, so its row count is the genre count
  mutate(genre_count = map_int(genre_all, nrow))


> df_final
# A tibble: 4 x 5
  name  time   genre            genre_all        genre_count
  <fct> <fct>  <list>           <list>                 <int>
1 john  1-2018 <tibble [3 x 1]> <tibble [3 x 1]>           2
2 john  2-2018 <tibble [2 x 1]> <tibble [3 x 1]>           3
3 kate  1-2018 <tibble [1 x 1]> <tibble [2 x 1]>           1
4 kate  2-2018 <tibble [1 x 1]> <tibble [2 x 1]>           2