根据条件填充多列的行

时间:2019-08-26 07:27:46

标签: r dataframe

我的数据框如下:

    Date        User    Report    Position    View
1   2019-01-01  B   report_03   Sales_Manager 1
2   2019-01-01  C   report_04   Sales_Manager 1
3   2019-01-01  C   report_04   Sales_Manager 1
4   2019-01-02  B   report_03   Sales_Manager 1
5   2019-01-02  C   report_05   Sales_Manager 1
6   2019-01-02  D   report_06   Sales_Rep     1
7   2019-01-02  D   report_06   Sales_Rep     1
8   2019-01-03  A   report_03   CEO           1
9   2019-01-03  C   report_04   Sales_Manager 1
10  2019-01-03  D   report_06   Sales_Rep     1
11  2019-01-04  A   report_01   CEO           1
12  2019-01-04  C   report_04   Sales_Manager 1
13  2019-01-04  C   report_04   Sales_Manager 1
14  2019-01-04  C   report_05   Sales_Manager 1
15  2019-01-04  D   report_05   Sales_Rep     1
16  2019-01-04  D   report_06   Sales_Rep     1
17  2019-01-05  A   report_01   CEO           1
18  2019-01-05  B   report_04   Sales_Manager 1
19  2019-01-05  B   report_04   Sales_Manager 1
20  2019-01-05  C   report_04   Sales_Manager 1

行表示仪表板上某些报告的用户登录信息。根据位置,用户可以访问不同的报告。 CEO(用户A)有权访问report_01,report_02和report_03;销售经理(用户B和C)有权访问report_03,report_04和report_05;销售代表可以访问report_05和report_06。

用户应该每天使用他们有权访问的每个报告,但有些日子他们根本不使用某些报告,而在另一些日子里又会多次登录。我想为用户当天未登录的报告补充相应的行,并在“View”列中填入 0。

因此数据框应如下所示:

    Date        User    Report    Position      View
1    2019-01-01  A   report_01     CEO           0
2    2019-01-01  A   report_02     CEO           0
3    2019-01-01  A   report_03     CEO           0 
4    2019-01-01  B   report_03     Sales_Manager 1
5    2019-01-01  B   report_04     Sales_Manager 0
6    2019-01-01  B   report_05     Sales_Manager 0
7    2019-01-01  C   report_03     Sales_Manager 0
8    2019-01-01  C   report_04     Sales_Manager 1
9    2019-01-01  C   report_04     Sales_Manager 1
10   2019-01-01  C   report_05     Sales_Manager 0
11   2019-01-01  D   report_05     Sales_Rep     0
12   2019-01-01  D   report_06     Sales_Rep     0
13   2019-01-02  A   report_01     CEO           0
14   2019-01-02  A   report_02     CEO           0
15   2019-01-02  A   report_03     CEO           0
16   2019-01-02  B   report_03     Sales_Manager 1
17   2019-01-02  B   report_04     Sales_Manager 0
18   2019-01-02  B   report_05     Sales_Manager 0
.
.
.

dput 输出:

structure(list(Date = structure(c(17897, 17897, 17897, 17898, 
17898, 17898, 17898, 17899, 17899, 17899, 17900, 17900, 17900, 
17900, 17900, 17900, 17901, 17901, 17901, 17901, 17901, 17901, 
17902, 17902, 17902, 17902, 17902, 17903, 17903, 17903, 17904, 
17904, 17904, 17904, 17904, 17904, 17904, 17904, 17905, 17905, 
17905, 17905, 17905, 17906, 17906, 17906, 17906, 17906, 17907, 
17907, 17907, 17907, 17907, 17908, 17908, 17908, 17908, 17908, 
17909, 17909, 17909, 17909, 17910, 17910, 17910, 17911, 17911, 
17911, 17911, 17911, 17912, 17912, 17912, 17912, 17913, 17914, 
17914, 17914, 17914, 17914, 17915, 17915, 17915, 17915, 17916, 
17916, 17916, 17916, 17917, 17917, 17917, 17918, 17918, 17918, 
17918, 17919, 17919, 17919, 17919, 17919, 17920, 17920, 17920, 
17921, 17921, 17921, 17921, 17922, 17922, 17923, 17923, 17923, 
17923, 17923, 17924, 17924, 17924, 17924, 17924, 17925, 17925, 
17925, 17925, 17926, 17926, 17926, 17927, 17927, 17927, 17927
), class = "Date"), User = structure(c(2L, 3L, 3L, 2L, 3L, 4L, 
4L, 1L, 3L, 4L, 1L, 3L, 3L, 3L, 4L, 4L, 1L, 2L, 2L, 3L, 3L, 4L, 
1L, 1L, 1L, 3L, 4L, 2L, 3L, 3L, 1L, 1L, 2L, 3L, 3L, 3L, 4L, 4L, 
1L, 2L, 3L, 4L, 4L, 3L, 3L, 4L, 4L, 4L, 3L, 3L, 4L, 4L, 4L, 1L, 
1L, 3L, 3L, 4L, 1L, 1L, 4L, 4L, 1L, 1L, 4L, 1L, 2L, 3L, 4L, 4L, 
1L, 3L, 4L, 4L, 1L, 1L, 2L, 3L, 3L, 4L, 1L, 3L, 4L, 4L, 4L, 4L, 
4L, 4L, 1L, 1L, 4L, 1L, 2L, 4L, 4L, 1L, 1L, 3L, 4L, 4L, 1L, 3L, 
4L, 1L, 3L, 3L, 4L, 3L, 4L, 2L, 3L, 3L, 3L, 4L, 3L, 3L, 4L, 4L, 
4L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 3L, 4L), .Label = c("A", 
"B", "C", "D"), class = "factor"), report = structure(c(3L, 4L, 
4L, 3L, 5L, 6L, 6L, 3L, 4L, 6L, 1L, 4L, 4L, 5L, 5L, 6L, 1L, 4L, 
4L, 4L, 5L, 6L, 3L, 3L, 3L, 4L, 5L, 3L, 4L, 4L, 1L, 2L, 4L, 4L, 
5L, 5L, 6L, 6L, 2L, 3L, 5L, 5L, 6L, 4L, 5L, 6L, 6L, 6L, 4L, 5L, 
5L, 6L, 6L, 1L, 2L, 4L, 5L, 6L, 1L, 3L, 6L, 6L, 1L, 1L, 6L, 2L, 
4L, 5L, 6L, 6L, 3L, 4L, 6L, 6L, 1L, 2L, 3L, 4L, 5L, 5L, 2L, 4L, 
5L, 6L, 6L, 6L, 6L, 6L, 1L, 2L, 6L, 2L, 3L, 6L, 6L, 1L, 3L, 5L, 
5L, 5L, 2L, 5L, 5L, 2L, 4L, 5L, 5L, 5L, 6L, 3L, 4L, 4L, 5L, 6L, 
5L, 5L, 5L, 6L, 6L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 3L, 4L, 4L, 6L
), .Label = c("report_01", "report_02", "report_03", "report_04", 
"report_05", "report_06"), class = "factor"), Position = c("Sales_Manager", 
"Sales_Manager", "Sales_Manager", "Sales_Manager", "Sales_Manager", 
"Sales_Rep", "Sales_Rep", "CEO", "Sales_Manager", "Sales_Rep", 
"CEO", "Sales_Manager", "Sales_Manager", "Sales_Manager", "Sales_Rep", 
"Sales_Rep", "CEO", "Sales_Manager", "Sales_Manager", "Sales_Manager", 
"Sales_Manager", "Sales_Rep", "CEO", "CEO", "CEO", "Sales_Manager", 
"Sales_Rep", "Sales_Manager", "Sales_Manager", "Sales_Manager", 
"CEO", "CEO", "Sales_Manager", "Sales_Manager", "Sales_Manager", 
"Sales_Manager", "Sales_Rep", "Sales_Rep", "CEO", "Sales_Manager", 
"Sales_Manager", "Sales_Rep", "Sales_Rep", "Sales_Manager", "Sales_Manager", 
"Sales_Rep", "Sales_Rep", "Sales_Rep", "Sales_Manager", "Sales_Manager", 
"Sales_Rep", "Sales_Rep", "Sales_Rep", "CEO", "CEO", "Sales_Manager", 
"Sales_Manager", "Sales_Rep", "CEO", "CEO", "Sales_Rep", "Sales_Rep", 
"CEO", "CEO", "Sales_Rep", "CEO", "Sales_Manager", "Sales_Manager", 
"Sales_Rep", "Sales_Rep", "CEO", "Sales_Manager", "Sales_Rep", 
"Sales_Rep", "CEO", "CEO", "Sales_Manager", "Sales_Manager", 
"Sales_Manager", "Sales_Rep", "CEO", "Sales_Manager", "Sales_Rep", 
"Sales_Rep", "Sales_Rep", "Sales_Rep", "Sales_Rep", "Sales_Rep", 
"CEO", "CEO", "Sales_Rep", "CEO", "Sales_Manager", "Sales_Rep", 
"Sales_Rep", "CEO", "CEO", "Sales_Manager", "Sales_Rep", "Sales_Rep", 
"CEO", "Sales_Manager", "Sales_Rep", "CEO", "Sales_Manager", 
"Sales_Manager", "Sales_Rep", "Sales_Manager", "Sales_Rep", "Sales_Manager", 
"Sales_Manager", "Sales_Manager", "Sales_Manager", "Sales_Rep", 
"Sales_Manager", "Sales_Manager", "Sales_Rep", "Sales_Rep", "Sales_Rep", 
"Sales_Manager", "Sales_Rep", "Sales_Rep", "Sales_Rep", "Sales_Rep", 
"Sales_Rep", "Sales_Rep", "Sales_Manager", "Sales_Manager", "Sales_Manager", 
"Sales_Rep"), View = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -130L
), class = "data.frame")

2 个答案:

答案 0 :(得分:2)

根据您的需要,您可能需要用第二行(被注释掉的那一行)代码替换第一行。例如,如果要得到 2019 年 1 月整月的数据框,而期间某一天(例如假期)没有任何人查看任何报告,那么第一行代码不会为这一天生成值为 0 的行,而第二行会。

还请注意,代码不会检查用户名与职位之间是否一一对应。如果同一个用户名对应多个职位,结果中会把它们当作不同的用户处理。

# Expand the log to every Date x (User, Report, Position) combination and
# give the combinations that have no recorded view a View of 0.

# Dates that occur in the data, kept as a one-column data.frame
# (drop = FALSE) so the cross join below carries a column named "Date".
unique.date <- unique(df[, "Date", drop = FALSE])
# Alternative: cover a whole calendar range, including days on which nobody
# viewed anything. Wrapped in data.frame() so the column is named "Date";
# a bare Date vector would be merged in as a column called "x".
# unique.date <- data.frame(
#   Date = seq(from = as.Date("2019-01-01"), to = as.Date("2019-01-31"), by = "1 day")
# )
unique.usr.rpt <- unique(df[, c("User", "Report", "Position")])

# The two inputs share no column names, so merge() returns their
# Cartesian product.
unique.df <- merge(unique.date, unique.usr.rpt)

# Full outer join back onto the log; combinations without a log row get
# View = NA, which is then replaced by 0.
result <- merge(
  unique.df, df,
  by = c("Date", "User", "Report", "Position"),
  all = TRUE
)
result[is.na(result$View), "View"] <- 0

编辑:如果各职位的报告访问权限是固定的,我们可以在开头添加几行定义来显式指定它们,其余步骤基本相同。完整的可运行代码如下:

# Reports each position is entitled to see.
access <- list(
  "CEO" = c("report_01", "report_02", "report_03"),
  "Sales_Manager" = c("report_03", "report_04", "report_05"),
  "Sales_Rep" = c("report_05", "report_06")
)

# Long-format lookup table: one row per (Position, Report) pair.
access.df <- do.call(
  rbind,
  lapply(names(access), function(x) {
    data.frame(Position = x, Report = access[[x]])
  })
)

# Attach the entitled reports to every (User, Position) seen in the data.
unique.usr <- unique(df[, c("User", "Position")])
unique.usr.rpt <- merge(unique.usr, access.df, by = c("Position"), all = TRUE)

# What follows is same as before
# Dates that occur in the data, kept as a one-column data.frame
# (drop = FALSE) so the cross join below carries a column named "Date".
unique.date <- unique(df[, "Date", drop = FALSE])
# Alternative: cover a whole calendar range, including days on which nobody
# viewed anything. Wrapped in data.frame() so the column is named "Date";
# a bare Date vector would be merged in as a column called "x".
# unique.date <- data.frame(
#   Date = seq(from = as.Date("2019-01-01"), to = as.Date("2019-01-31"), by = "1 day")
# )

# The two inputs share no column names, so merge() returns their
# Cartesian product.
unique.df <- merge(unique.date, unique.usr.rpt)

# Full outer join back onto the log; combinations without a log row get
# View = NA, which is then replaced by 0.
result <- merge(
  unique.df, df,
  by = c("Date", "User", "Report", "Position"),
  all = TRUE
)
result[is.na(result$View), "View"] <- 0

现在您将获得所需的行:

> head(result[result$User=="B" & result$Report=="report_05",])
         Date User    Report      Position View
6  2019-01-01    B report_05 Sales_Manager    0
18 2019-01-02    B report_05 Sales_Manager    0
30 2019-01-03    B report_05 Sales_Manager    0
41 2019-01-04    B report_05 Sales_Manager    0
54 2019-01-05    B report_05 Sales_Manager    0
67 2019-01-06    B report_05 Sales_Manager    0
> head(result[result$User=="C" & result$Report=="report_03",])
         Date User    Report      Position View
7  2019-01-01    C report_03 Sales_Manager    0
19 2019-01-02    C report_03 Sales_Manager    0
31 2019-01-03    C report_03 Sales_Manager    0
42 2019-01-04    C report_03 Sales_Manager    0
55 2019-01-05    C report_03 Sales_Manager    0
68 2019-01-06    C report_03 Sales_Manager    0

答案 1 :(得分:1)

这是一种基于 tidyr::complete 的解决方案;我们还使用 map_if 补充数据集中从未出现过的组合,例如用户 B 与 report_05。

library(dplyr)
library(purrr)
library(tidyr)

# Every Date crossed with the (User, report, Position) triples that occur in
# the data; combinations without a log row get View = 0.
ndf <- df %>%
  complete(Date, nesting(User, report, Position), fill = list(View = 0))

# One row per (Position, report) pair observed anywhere in the data.
posrep_df <- df %>%
  group_by(Position) %>%
  summarise(report = paste(unique(report), collapse = ",")) %>%
  separate_rows(report, sep = "\\,")

# Per Date/User group: if the group holds fewer distinct reports than its
# position is expected to have (3 for CEO / Sales_Manager, 2 for Sales_Rep),
# append the missing reports with View = 0.
ndf %>%
  mutate_if(is.factor, as.character) %>%
  split(list(.$Date, .$User)) %>%
  map_if(
    # Test only the first Position of the group: `&&` needs a length-one
    # condition (a longer vector is an error in R >= 4.3).
    ~ (.x[["Position"]][1] %in% c("CEO", "Sales_Manager") &&
         n_distinct(.x[["report"]]) < 3) ||
      (.x[["Position"]][1] %in% c("Sales_Rep") &&
         n_distinct(.x[["report"]]) < 2),
    ~ bind_rows(
      .x,
      anti_join(
        posrep_df %>% filter(Position == .x$Position[1]),
        .x,
        by = "report"
      ) %>%
        mutate(Date = .x$Date[1], User = .x$User[1], View = 0)
    )
  ) %>%
  bind_rows() %>%
  arrange(Date, User)

# A tibble: 369 x 5
   Date       User  report    Position       View
  <date>     <chr> <chr>     <chr>         <dbl>
1 2019-01-01 A     report_01 CEO               0
2 2019-01-01 A     report_02 CEO               0
3 2019-01-01 A     report_03 CEO               0
4 2019-01-01 B     report_03 Sales_Manager     1
5 2019-01-01 B     report_04 Sales_Manager     0
6 2019-01-01 B     report_05 Sales_Manager     0
7 2019-01-01 C     report_04 Sales_Manager     1
8 2019-01-01 C     report_04 Sales_Manager     1
9 2019-01-01 C     report_05 Sales_Manager     0
10 2019-01-01 C     report_03 Sales_Manager     0
# ... with 359 more rows

我们可以编写一个自定义函数来减少map_if内的括号和计算量

#' Append the reports missing from one group of rows
#'
#' Looks up, in the global `posrep_df`, the reports listed for the Position
#' of the first row of `x`, and adds a `View = 0` row for each of those
#' reports that `x` does not already contain.
#'
#' @param x A data frame with columns `Position`, `Date`, `User` and
#'   `report`. Only the first row's `Position`, `Date` and `User` are read,
#'   so `x` is presumably meant to hold a single Date/User group.
#' @return `x` with the missing report rows bound underneath it.
combine_fun <- function(x) {
  missing_rows <- posrep_df %>%
    filter(Position == x$Position[1]) %>%
    mutate(Date = x$Date[1], User = x$User[1], View = 0) %>%
    anti_join(x, by = "report")

  bind_rows(x, missing_rows)
}

# Small example for exploring `combine_fun`: the rows of user B on 2019-01-01
df_test <- filter(ndf, Date == "2019-01-01", User == "B")
# Rows before applying combine_fun
df_test
# A tibble: 2 x 5
  Date       User  report    Position       View
  <date>     <fct> <fct>     <chr>         <dbl>
1 2019-01-01 B     report_03 Sales_Manager     1
2 2019-01-01 B     report_04 Sales_Manager     0
# After combine_fun: the printed result gains a report_05 row with View = 0
combine_fun(df_test)
# A tibble: 3 x 5
   Date       User  report    Position       View
  <date>     <fct> <chr>     <chr>         <dbl>
1 2019-01-01 B     report_03 Sales_Manager     1
2 2019-01-01 B     report_04 Sales_Manager     0
3 2019-01-01 B     report_05 Sales_Manager     0

使用 combine_fun 重写上面的 map_if 调用:

# Same pipeline as before, with the per-group completion delegated to
# combine_fun(). Groups that already hold the expected number of distinct
# reports (3 for CEO / Sales_Manager, 2 for Sales_Rep) pass through as is.
ndf %>%
  mutate_if(is.factor, as.character) %>%
  split(list(.$Date, .$User)) %>%
  map_if(
    # Test only the first Position of the group: `&&` needs a length-one
    # condition (a longer vector is an error in R >= 4.3).
    .p = ~ (.x[["Position"]][1] %in% c("CEO", "Sales_Manager") &&
              n_distinct(.x[["report"]]) < 3) ||
      (.x[["Position"]][1] %in% c("Sales_Rep") &&
         n_distinct(.x[["report"]]) < 2),
    .f = ~ combine_fun(.x)
  ) %>%
  bind_rows() %>%
  arrange(Date, User)