使用基于当前年份的动态名称系列复制列

时间:2018-06-07 13:04:51

标签: r function dplyr data.table

我期待:

  • 写一个函数或
  • 使用data.table或
  • 使用dplyr mutate_cond或
  • 使用purr map函数

复制此功能:

If year = current
    columns(7,8,9) = column(6)
Else
    If year = current + 1
        columns(8,9,10) = column(7)
    Else
        If year = current + 2
            columns(9,10,11) = column(8)
        Else
            If year = current + 3
                columns(10,11,12) = column9)
            End If
        End If
    End If
End If

到目前为止,我已经能够使用以下不整洁的代码创建静态解决方案:

tbl.scholar1<-tbl.scholar1%>%mutate_cond(cohort == currentAY, ay_1819=ay_1718, ay_1920=ay_1718, ay_2021=ay_1718)
tbl.scholar1<-tbl.scholar1%>%mutate_cond(cohort == currentAY+1, ay_1920=ay_1819, ay_2021=ay_1819, ay_2122=ay_1819)
tbl.scholar1<-tbl.scholar1%>%mutate_cond(cohort == currentAY+2, ay_2021=ay_1920, ay_2122=ay_1920, ay_2223=ay_1920)
tbl.scholar1<-tbl.scholar1%>%mutate_cond(cohort == currentAY+3, ay_2122=ay_2021, ay_2223=ay_2021, ay_2324=ay_2021)

在一些黑客攻击后,我写了一个函数,它将当前年份和列名称作为输入:

tbl.scholar1<-dup.DF(tbl.scholar1, currentYR, "ay_1718", "ay_2324")

功能代码如下所示

dup.DF <- function(df1, currAY, name1, name2) {

  df1%>%mutate_cond(cohort == currAY, UQ(rlang::sym(name2)) :=  UQ(rlang::sym(name1)))              #This works!!!!

}

所以我知道有一个更优雅的解决方案,使用data.table,purrr:map或dplyr将动态变量作为向量或列表,这样我就不必重复我的函数n次迭代带有for循环。

The input looks like this....
    SYSDATE     ID           name           cohort fundCode ay_1718 ay_1819 ay_1920  ay_2021  ay_2122  ay_2223  ay_2324  ay_2425
0005-11-20  000000000   "last0, first"       1718    316001    1         0     0         0        0         0        0       0
0005-11-20  000000000   "last0, first"       1718    316001    0         1     0         0        0         0        0       0
0005-11-20  000000000   "last0, first"       1718    316001    0         0     1         0        0         0        0       0
0005-11-20  000000000   "last0, first"       1718    316001    0         0     0         1        0         0        0       0

我的预期输出是......

    SYSDATE     ID           name           cohort fundCode ay_1718 ay_1819 ay_1920  ay_2021  ay_2122  ay_2223  ay_2324  ay_2425
0005-11-20  000000000   "last0, first"       1718    316001    1         1     1         1        0         0        0       0
0005-11-20  000000000   "last0, first"       1718    316001    0         1     1         1        1         0        0       0
0005-11-20  000000000   "last0, first"       1718    316001    0         0     1         1        1         1        0       0
0005-11-20  000000000   "last0, first"       1718    316001    0         0     0         1        1         1        1       0

2 个答案:

答案 0 :(得分:0)

更新答案 - 在获得OP对要求的澄清后,我不得不改变方法。

st_pos <- 6                         #concerned column's start position in the given dataframe
df_bkp <- df                        #data backup

#rename concerned columns as "ay_1718", "ay_1819" etc
names(df)[st_pos:ncol(df)] <- paste("ay", paste0(as.numeric(substr(min(df$year), 1, 2)) + 0:(ncol(df) - st_pos),
                                                 as.numeric(substr(min(df$year), 3, 4)) + 0:(ncol(df) - st_pos)), 
                                    sep="_")

#copy "year" column's value to the ensuing three columns
cols <- names(df)[st_pos:ncol(df)]  #renamed columns
mapply(function(x, y) 
  df[df$year == x & df$ID == y, which(grepl(x, cols)) + (st_pos-1):(st_pos+2)] <<- 
    df[df$year == x & df$ID == y, which(grepl(x, cols)) + (st_pos-1)],
  df$year, df$ID)

给出了

> df
     SYSDATE ID         name year fundCode ay_1718 ay_1819 ay_1920 ay_2021 ay_2122 ay_2223 ay_2324 ay_2425
1 0005-11-20  0 last0, first 1718   316001     700     700     700     700       0       0       0       0
2 0005-11-20  1 last1, first 1819   316002       0      60      60      60      60       0       0       0
3 0005-11-20  2 last2, first 1920   316003       0       0      50      50      50      50       0       0
4 0005-11-20  3 last3, first 2021   316004       0       0       0     400     400     400     400       0


示例数据(注意:我已将Y1, Y2 etc的值1略微修改为其他值以用于说明目的)

df <- structure(list(SYSDATE = c("0005-11-20", "0005-11-20", "0005-11-20", 
"0005-11-20"), ID = 0:3, name = c("last0, first", "last1, first", 
"last2, first", "last3, first"), year = c(1718L, 1819L, 1920L, 
2021L), fundCode = 316001:316004, Y1 = c(700L, 0L, 0L, 0L), Y2 = c(0L, 
60L, 0L, 0L), Y3 = c(0L, 0L, 50L, 0L), Y4 = c(0L, 0L, 0L, 400L
), Y5 = c(0L, 0L, 0L, 0L), Y6 = c(0L, 0L, 0L, 0L), Y7 = c(0L, 
0L, 0L, 0L), Y8 = c(0L, 0L, 0L, 0L)), .Names = c("SYSDATE", "ID", 
"name", "year", "fundCode", "Y1", "Y2", "Y3", "Y4", "Y5", "Y6", 
"Y7", "Y8"), class = "data.frame", row.names = c(NA, -4L))

#     SYSDATE ID         name year fundCode  Y1 Y2 Y3  Y4 Y5 Y6 Y7 Y8
#1 0005-11-20  0 last0, first 1718   316001 700  0  0   0  0  0  0  0
#2 0005-11-20  1 last1, first 1819   316002   0 60  0   0  0  0  0  0
#3 0005-11-20  2 last2, first 1920   316003   0  0 50   0  0  0  0  0
#4 0005-11-20  3 last3, first 2021   316004   0  0  0 400  0  0  0  0

答案 1 :(得分:0)

这里有两种方法可以将数据从宽格式整形为长格式。

1。 melt() / dcast()

library(data.table)
long <- melt(setDT(inp)[, rn := .I], measure.vars = patterns("ay_"))
long[order(rn, variable), value := replace(value, which(value == 1L)[1L] + 1:3, 1L), by = rn]
dcast(long, rn + ... ~ variable)
   rn    SYSDATE ID         name cohort fundCode ay_1718 ay_1819 ay_1920 ay_2021 ay_2122 ay_2223 ay_2324 ay_2425
1:  1 0005-11-20  0 last0, first   1718   316001       1       1       1       1       0       0       0       0
2:  2 0005-11-20  0 last0, first   1718   316001       0       1       1       1       1       0       0       0
3:  3 0005-11-20  0 last0, first   1718   316001       0       0       1       1       1       1       0       0
4:  4 0005-11-20  0 last0, first   1718   316001       0       0       0       1       1       1       1       0

2。 gather() / spread()

library(tidyr)
library(dplyr)
inp %>% 
  group_by(rn = row_number()) %>% 
  gather(, , starts_with("ay_")) %>% 
  mutate(value = replace(value, which(value == 1L)[1L] + 1:3, 1L)) %>% 
  spread(key, value)
# A tibble: 4 x 14
# Groups:   rn [4]
  SYSDATE       ID name         cohort fundCode    rn ay_1718 ay_1819 ay_1920 ay_2021 ay_2122 ay_2223 ay_2324 ay_2425
  <chr>      <int> <chr>         <int>    <int> <int>   <int>   <int>   <int>   <int>   <int>   <int>   <int>   <int>
1 0005-11-20     0 last0, first   1718   316001     1       1       1       1       1       0       0       0       0
2 0005-11-20     0 last0, first   1718   316001     2       0       1       1       1       1       0       0       0
3 0005-11-20     0 last0, first   1718   316001     3       0       0       1       1       1       1       0       0
4 0005-11-20     0 last0, first   1718   316001     4       0       0       0       1       1       1       1       0

数据

inp <- fread('
SYSDATE     ID           name           cohort fundCode ay_1718 ay_1819 ay_1920  ay_2021  ay_2122  ay_2223  ay_2324  ay_2425
0005-11-20  000000000   "last0, first"       1718    316001    1         0     0         0        0         0        0       0
0005-11-20  000000000   "last0, first"       1718    316001    0         1     0         0        0         0        0       0
0005-11-20  000000000   "last0, first"       1718    316001    0         0     1         0        0         0        0       0
0005-11-20  000000000   "last0, first"       1718    316001    0         0     0         1        0         0        0       0')