Question

我有一个看起来像这样的数据框：

DF_1>

T_id  D1             D2                   Num     type    type_2     fig
xt-1  2017-05-01     2017-03-25 12:11:45  10      A       X          25.20
xt-2  2017-05-01     2017-03-25 21:05:25  20      A       Y          20.15
xt-3  2017-05-01     2017-03-25 08:10:55  25      B       X          15.11
xt-4  2017-05-03     2017-03-25 07:19:35  30      B       Y          22.56
xt-5  2017-05-03     2017-03-25 13:12:56  45      C       Z          35.45
xt-6  2017-05-03     2017-03-25 18:14:44  20      D       Z          27.21
xt-7  2017-04-06     2017-03-25 19:21:35  15      A       Z          23.20
xt-8  2017-04-06     2017-03-25 21:11:15  40      C       X          21.40
xt-9  2017-04-08     2017-02-25 22:25:04  20      A       A          27.50
xt-10 2017-04-06     2017-02-25 16:04:08  30      A       Y          32.20
xt-11 2017-04-05     2017-02-25 18:15:25  20      C       Z          30.20
xt-12 2017-04-01     2017-01-25 19:22:25  50      A       Z          33.15
xt-13 2017-04-02     2017-01-25 23:19:05  15      A       A          30.12
xt-14 2017-03-03     2017-01-25 14:25:09  15      D       Y          31.25
xt-15 2017-03-10     2017-01-25 23:25:36  40      A       X          25.45

从上面的数据帧我想要下面提到的两个矩阵：

1. Date (Last Three Date from `sys.date()`)

    D1    count  sum  mean_num total_sum count_A sum_A count_other sum_other mean_fig   mean_TAT

    2017-05-03 3 95  31.66     6         0       0     3           95        28.40
    2017-05-02 0 0   0         3         0       0     0           0         0.00
    2017-05-01 3 55  18.33     3         2       30    1           25        20.15

计算mean_TAT：先计算每行的D2 - D1，再按同一日期的count对该日的差值求平均。
total_sum将从该月的第一天开始累计。
count_A和sum_A基于type作为A的特定日期。
count_other和sum_other适用于type不是A的人。

2.基于月份（根据数据框架的最后三个月）

对于基于月份的格式，格式将相同，仅计算为月份。

每个月有5个额外的行和2列，其中前3个是基于特定月份的计数的前3个type_2。
increase_%将相对于上个月计算（例如，如果2017年5月的count为50，而2017年4月为100，那么将为-50％），其他5行同样以上个月的count和sum为基础。
每个月的第四行固定为type_2等于“A”的值。
第五个Other将不是上面提到的那些type_2。
Total行将对count和sum列求和，而mean列则取平均值。

似乎我无法正确解释，希望数据框架可以理解矩阵。

期待一些帮助。

Answer 1

这里已经是第一部分：

library(lubridate)
library(dplyr)

# Daily roll-up of df1: one row per D1 with counts, sums and means, plus a
# running count of records within each year-month, newest day first.
df2 <- df1 %>%
  # ym packs year and month into one number (year * 100 + month)
  mutate(ym = 100 * year(D1) + month(D1)) %>%
  arrange(D1) %>%
  group_by(D1, ym) %>%
  summarise(
    count       = n(),
    sum         = sum(Num),
    mean_num    = mean(Num),
    # logical masks are multiplied in so non-matching rows contribute 0
    count_A     = sum(type == "A"),
    sum_A       = sum((type == "A") * Num),
    count_other = sum(type != "A"),
    sum_other   = sum((type != "A") * Num),
    mean_fig    = mean(fig),
    mean_TAT    = mean(D2 - D1)
  ) %>%
  # regroup by month so the running count restarts for every ym
  group_by(ym) %>%
  mutate(total_sum = cumsum(count)) %>%
  ungroup() %>%
  arrange(desc(D1)) %>%
  select(
    D1, count, sum, mean_num, total_sum,
    count_A, sum_A, count_other, sum_other,
    mean_fig, mean_TAT
  )


# # A tibble: 9 x 11
# D1 count   sum mean_num total_sum count_A sum_A count_other sum_other mean_fig       mean_TAT
# <date> <int> <int>    <dbl>     <int>   <int> <int>       <int>     <int>    <dbl>         <time>
# 1 2017-05-03     3    95 31.66667         6       0     0           3        95 28.40667 -39.00000 days
# 2 2017-05-01     3    55 18.33333         3       2    30           1        25 20.15333 -37.00000 days
# 3 2017-04-08     1    20 20.00000         7       1    20           0         0 27.50000 -42.00000 days
# 4 2017-04-06     3    85 28.33333         6       2    45           1        40 25.60000 -21.33333 days
# 5 2017-04-05     1    20 20.00000         3       0     0           1        20 30.20000 -39.00000 days
# 6 2017-04-02     1    15 15.00000         2       1    15           0         0 30.12000 -67.00000 days
# 7 2017-04-01     1    50 50.00000         1       1    50           0         0 33.15000 -66.00000 days
# 8 2017-03-10     1    40 40.00000         2       1    40           0         0 25.45000 -44.00000 days
# 9 2017-03-03     1    15 15.00000         1       0     0           1        15 31.25000 -37.00000 days

数据

# Sample input data: one record per row. D2 is quoted because it contains a
# space between the date and the time.
df1 <- read.table(text = "T_id D1 D2 Num type type_2 fig
xt-1 2017-05-01 '2017-03-25 12:11:45' 10 A X 25.20
xt-2 2017-05-01 '2017-03-25 21:05:25' 20 A Y 20.15
xt-3 2017-05-01 '2017-03-25 08:10:55' 25 B X 15.11
xt-4 2017-05-03 '2017-03-25 07:19:35' 30 B Y 22.56
xt-5 2017-05-03 '2017-03-25 13:12:56' 45 C Z 35.45
xt-6 2017-05-03 '2017-03-25 18:14:44' 20 D Z 27.21
xt-7 2017-04-06 '2017-03-25 19:21:35' 15 A Z 23.20
xt-8 2017-04-06 '2017-03-25 21:11:15' 40 C W 21.40
xt-9 2017-04-08 '2017-02-25 22:25:04' 20 A Q 27.50
xt-10 2017-04-06 '2017-02-25 16:04:08' 30 A W 32.20
xt-11 2017-04-05 '2017-02-25 18:15:25' 20 C V 30.20
xt-12 2017-04-01 '2017-01-25 19:22:25' 50 A Z 33.15
xt-13 2017-04-02 '2017-01-25 23:19:05' 15 A Z 30.12
xt-14 2017-03-03 '2017-01-25 14:25:09' 15 D Y 31.25
xt-15 2017-03-10 '2017-01-25 23:25:36' 40 A X 25.45",
  header = TRUE, stringsAsFactors = FALSE)

# Both columns become Date; the time-of-day part of D2 is dropped by the
# date-only format string.
df1$D1 <- as.Date(df1$D1, "%Y-%m-%d")
df1$D2 <- as.Date(df1$D2, "%Y-%m-%d")

# Expected result for the first (daily) table, as given in the question.
# header = TRUE is required here: the first line holds the column names and
# has the same number of fields as the data rows, so read.table would
# otherwise treat it as data.
expected_output <- read.table(text = "D1 count sum mean_num total_sum count_A sum_A count_other sum_other mean_fig
2017-05-03 3 95 31.66 6 0 0 3 95 28.40
2017-05-02 0 0 0 3 0 0 0 0 0.00
2017-05-01 3 55 18.33 3 2 30 1 25 20.15",
  header = TRUE, stringsAsFactors = FALSE)

第2部分的一些提示：

如果不重新处理你的问题，我就无法创造奇迹（在这里提供准确的可重复输出是必要的）。但是，希望这是一种接近的方式：

# Metrics shared by every monthly table below; expects a (grouped) data frame
# that has the columns Num, type, fig, D1 and D2.
summarise_metrics <- function(grouped) {
  grouped %>%
    summarize(count = n(),
              sum = sum(Num),
              mean_num = mean(Num),
              count_A = sum(type == 'A'),
              sum_A = sum(Num * (type == 'A')),
              count_other = sum(type != 'A'),
              sum_other = sum(Num * (type != 'A')),
              mean_fig = mean(fig),
              mean_TAT = mean(D2 - D1))
}

# Puts the columns into the common order so the tables can be row-bound.
select_metrics <- function(tbl) {
  tbl %>%
    select(ym, type_2, count, sum, mean_num, count_A, sum_A,
           count_other, sum_other, mean_fig, mean_TAT)
}

# One row per month; type_2 is reused to carry the month label (e.g. "May-17")
# so it lines up with the per-type_2 rows below.
df_month <- df1 %>%
  mutate(ym = year(D1) * 100 + month(D1)) %>%
  arrange(D1) %>%
  group_by(ym) %>%
  summarise_metrics() %>%
  mutate(type_2 = paste0(month.abb[ym %% 100], "-", ym %/% 100 - 2000)) %>%
  select_metrics()

# Up to three most frequent type_2 values per month, excluding "A".
df_top3 <- df1 %>%
  filter(type_2 != "A") %>%
  mutate(ym = year(D1) * 100 + month(D1)) %>%
  arrange(desc(ym)) %>%
  group_by(ym, type_2) %>%
  summarise_metrics() %>%
  group_by(ym) %>%
  arrange(desc(count)) %>%
  slice(1:3) %>%
  ungroup %>%
  select_metrics()

# The fixed type_2 == "A" row for each month.
df_A <- df1 %>%
  filter(type_2 == "A") %>%
  mutate(ym = year(D1) * 100 + month(D1)) %>%
  arrange(desc(ym)) %>%
  group_by(ym, type_2) %>%
  summarise_metrics() %>%
  select_metrics()

# Everything not already covered by the top-3 or "A" rows, pooled as "Other".
df_other <- df1 %>%
  mutate(ym = year(D1) * 100 + month(D1)) %>%
  anti_join(bind_rows(df_top3, df_A), by = c("ym", "type_2")) %>%
  mutate(type_2 = "Other") %>%
  arrange(desc(ym)) %>%
  group_by(ym, type_2) %>%
  summarise_metrics() %>%
  select_metrics()
# it's empty with your example data

# Stack the month rows with their detail rows, order by month, and expose the
# label column as Month.
bind_rows(df_month, df_top3, df_A, df_other) %>%
  arrange(ym) %>%
  select(-ym) %>%
  rename(Month = type_2)

Answer 2

扩展以前的工作，这里是Table2的代码（有2个例外）：

library(dplyr)
library(lubridate)
library(magrittr)

# Monthly summary per type_2, with the percentage change of count and sum
# against the previous row of the same type_2, newest month first.
table2 <- df1 %>%
  mutate(
    ym = year(D1) * 100 + month(D1),
    monthLabel = paste0(year(D1), "-", month(D1, label = TRUE))
  ) %>%
  group_by(type_2, ym, monthLabel) %>%
  summarize(count = n(),
            sum = sum(Num),
            mean_num = round(mean(Num), 1),
            count_A = sum(type == 'A'),
            sum_A = sum(Num * (type == 'A')),
            count_other = sum(type != 'A'),
            sum_other = sum(Num * (type != 'A')),
            mean_fig = round(mean(fig)),
            mean_TAT = round(mean(D2 - D1))) %>%
  ungroup() %>%
  # number of records in the whole month, repeated on every type_2 row
  group_by(ym, monthLabel) %>%
  mutate(total_sum = sum(count)) %>%
  ungroup() %>%
  group_by(type_2) %>%
  arrange(ym) %>%
  mutate(
    # NOTE(review): lag() takes the previous row of this type_2, which is the
    # previous calendar month only if the type_2 occurs in every month.
    priorC = lag(count),
    priorS = lag(sum),
    # Percent change is measured against the PRIOR value (50 vs 100 -> -50%);
    # it is NA when there is no prior row or the prior value is 0.
    countIncrease = if_else(is.na(priorC) | priorC == 0,
                            NA_real_,
                            round(100 * (count - priorC) / priorC)),
    sumIncrease = if_else(is.na(priorS) | priorS == 0,
                          NA_real_,
                          round(100 * (sum - priorS) / priorS))
  ) %>%
  select(-priorC, -priorS) %>%
  ungroup() %>%
  arrange(desc(ym), desc(count)) %>%
  select(monthLabel, type_2, count, countIncrease, sum, sumIncrease,
         mean_num, total_sum, count_A, sum_A, count_other, sum_other,
         mean_fig, mean_TAT)

遗漏/例外：

没有行A，因为A在样本数据中绝不是Type_2。
每月显示所有Type_2，因为每个月的样本数据不超过4个。

如何在R

2 个答案: