我想在数据框中为同时满足两个条件的行添加一条重复行。
原始数据框如下:
ID_NO SSN DOB STATUS NEW_VALUE OLD_VALUE ADDRESS ZIP CITY COUNTRY
123 7687 1/1/91 0 NA NA xyz 45 nyc usa
456 85723 2/1/91 -1 NA NA uii 34 fll usa
789 8783 3/1/93 2 NA NA oii 56 bos usa
987 18267 9/1/99 -4 NA NA qww 67 sfo usa
765 9238 10/1/00 3 NA NA jhk 87 lax usa
我想做的是:对每一条 STATUS > 0 且 NEW_VALUE 为 NA 的行添加一条重复行,并把重复行中 OLD_VALUE 的值改为 'approved'。
我尝试使用 dplyr 中的 bind_rows 和 mutate,如下所示:
# Insert an "approved" copy directly after every row that has STATUS > 0 and a
# missing NEW_VALUE, keeping the original row order.
df <- df %>%
  # Remember each row's original position so the copies can be slotted in later.
  mutate(rownum = row_number()) %>%
  bind_rows(
    .,
    # Refer to the columns of the piped data (not `df$...`), so the condition
    # is always evaluated on the same rows that are being filtered.
    filter(., STATUS > 0 & is.na(NEW_VALUE)) %>%
      mutate(
        OLD_VALUE = "approved",
        # A half-step offset sorts the copy right after its original row.
        rownum = rownum + 0.5
      )
  ) %>%
  arrange(rownum) %>%
  select(-rownum)
这段代码得到了我预期的结果(如下),但我想知道这种做法是否正确,是否还有其他替代方法?
ID_NO SSN DOB STATUS NEW_VALUE OLD_VALUE ADDRESS ZIP CITY COUNTRY
123 7687 1/1/91 0 NA NA xyz 45 nyc usa
456 85723 2/1/91 -1 NA NA uii 34 fll usa
789 8783 3/1/93 2 NA NA oii 56 bos usa
789 8783 3/1/93 2 NA approved oii 56 bos usa
987 18267 9/1/99 -4 NA NA qww 67 sfo usa
765 9238 10/1/00 3 NA NA jhk 87 lax usa
765 9238 10/1/00 3 NA approved jhk 87 lax usa
答案 0 :(得分:1)
这与您的尝试类似,但我会这样写:
library(dplyr)

# Parse DOB into a Date so that rows can be sorted chronologically.
# NOTE(review): the format string reads the values as day/month/year -- confirm
# that this matches how the data was recorded.
df$DOB <- as.Date(df$DOB, format = "%d/%m/%y")

# Build the "approved" copies of the qualifying rows, append them to the
# original data, and sort by DOB so each copy lands next to its original.
df %>%
  filter(STATUS > 0, is.na(NEW_VALUE)) %>%
  mutate(OLD_VALUE = "approved") %>%
  bind_rows(df, .) %>%
  arrange(DOB)
# ID_NO SSN DOB STATUS NEW_VALUE OLD_VALUE ADDRESS ZIP CITY COUNTRY
#1 123 7687 1991-01-01 0 NA <NA> xyz 45 nyc usa
#2 456 85723 1991-01-02 -1 NA <NA> uii 34 fll usa
#3 789 8783 1993-01-03 2 NA <NA> oii 56 bos usa
#4 789 8783 1993-01-03 2 NA approved oii 56 bos usa
#5 987 18267 1999-01-09 -4 NA <NA> qww 67 sfo usa
#6 765 9238 2000-01-10 3 NA <NA> jhk 87 lax usa
#7 765 9238 2000-01-10 3 NA approved jhk 87 lax usa
在 base R 中可以写为
# Base R: append an "approved" copy of every row with STATUS > 0 and a missing
# NEW_VALUE, then sort by DOB (assumes DOB has already been converted to Date).

# which() drops rows whose condition is NA, matching what subset() does.
approved <- df[which(df$STATUS > 0 & is.na(df$NEW_VALUE)), , drop = FALSE]
# Sizing the value to nrow(approved) keeps this valid when no row qualifies;
# transform() with a scalar value fails on a zero-row data frame.
approved$OLD_VALUE <- rep("approved", nrow(approved))
df1 <- rbind(df, approved)
# order() is stable, so each copy stays after its original row.
df1[order(df1$DOB), ]
答案 1 :(得分:1)
无需先 filter 再 bind_rows 也可以做到这一点:把逻辑条件加 1 作为重复次数传给 uncount,即可将满足条件的行各复制一次(作用类似于 rep)。
library(tidyverse)
# Duplicate each row that has STATUS > 0 and a missing NEW_VALUE, and mark only
# the added copy as "approved".
df %>%
  # The logical condition + 1 is the repeat count: 2 for qualifying rows and
  # 1 otherwise. `.id` records the copy number (1 = original, 2 = added copy),
  # so the copy is identified without assuming that ID_NO is unique.
  uncount((STATUS > 0 & is.na(NEW_VALUE)) + 1, .id = ".copy") %>%
  # Only the added copy is changed; existing OLD_VALUE entries are kept instead
  # of being overwritten with NA.
  mutate(
    OLD_VALUE = if_else(.copy == 2L, "approved", as.character(OLD_VALUE))
  ) %>%
  select(-.copy)
# ID_NO SSN DOB STATUS NEW_VALUE OLD_VALUE ADDRESS ZIP CITY COUNTRY
#1 123 7687 1/1/91 0 NA <NA> xyz 45 nyc usa
#2 456 85723 2/1/91 -1 NA <NA> uii 34 fll usa
#3 789 8783 3/1/93 2 NA <NA> oii 56 bos usa
#4 789 8783 3/1/93 2 NA approved oii 56 bos usa
#5 987 18267 9/1/99 -4 NA <NA> qww 67 sfo usa
#6 765 9238 10/1/00 3 NA <NA> jhk 87 lax usa
#7 765 9238 10/1/00 3 NA approved jhk 87 lax usa
base R 中的类似做法
# Row index in which every row with STATUS > 0 and a missing NEW_VALUE appears
# twice (the logical condition + 1 is the repeat count).
i1 <- with(df, rep(seq_len(nrow(df)), (STATUS > 0 & is.na(NEW_VALUE)) + 1))
df1 <- df[i1, ]
# The second occurrence of an index is the added copy. Testing the index
# rather than ID_NO keeps this correct when ID_NO values are not unique.
df1$OLD_VALUE[duplicated(i1)] <- "approved"
# Example data used by the snippets: five records, with NEW_VALUE and
# OLD_VALUE entirely missing (logical NA).
df <- data.frame(
  ID_NO = c(123L, 456L, 789L, 987L, 765L),
  SSN = c(7687L, 85723L, 8783L, 18267L, 9238L),
  DOB = c("1/1/91", "2/1/91", "3/1/93", "9/1/99", "10/1/00"),
  STATUS = c(0L, -1L, 2L, -4L, 3L),
  NEW_VALUE = rep(NA, 5),
  OLD_VALUE = rep(NA, 5),
  ADDRESS = c("xyz", "uii", "oii", "qww", "jhk"),
  ZIP = c(45L, 34L, 56L, 67L, 87L),
  CITY = c("nyc", "fll", "bos", "sfo", "lax"),
  COUNTRY = rep("usa", 5),
  # Keep the text columns as character vectors on every R version.
  stringsAsFactors = FALSE
)
答案 2 :(得分:0)
我们还可以先按 ID_NO 分组,再逐组添加行(使用 dplyr 的 group_map()/group_modify() 系列函数)。
使用 @akrun 的数据:
library(tidyverse)
# Per ID_NO group, append an "approved" copy of every row that has STATUS > 0
# and a missing NEW_VALUE.
df %>%
  group_by(ID_NO) %>%
  # group_modify() returns a grouped data frame; group_map() returns a plain
  # list in dplyr >= 0.8.1, which ungroup() cannot handle.
  # Filtering inside the group, rather than testing with `if`, also works when
  # a group holds more than one row.
  group_modify(~ bind_rows(
    .x,
    .x %>%
      filter(STATUS > 0 & is.na(NEW_VALUE)) %>%
      mutate(OLD_VALUE = "approved")
  )) %>%
  ungroup()
#> # A tibble: 7 x 10
#> ID_NO SSN DOB STATUS NEW_VALUE OLD_VALUE ADDRESS ZIP CITY COUNTRY
#> <int> <int> <chr> <int> <lgl> <chr> <chr> <int> <chr> <chr>
#> 1 123 7687 1/1/91 0 NA <NA> xyz 45 nyc usa
#> 2 456 85723 2/1/91 -1 NA <NA> uii 34 fll usa
#> 3 765 9238 10/1/~ 3 NA <NA> jhk 87 lax usa
#> 4 765 9238 10/1/~ 3 NA approved jhk 87 lax usa
#> 5 789 8783 3/1/93 2 NA <NA> oii 56 bos usa
#> 6 789 8783 3/1/93 2 NA approved oii 56 bos usa
#> 7 987 18267 9/1/99 -4 NA <NA> qww 67 sfo usa
下面的写法在原理上也非常接近
# Same idea with a nested data frame: one list element per ID_NO, each extended
# with "approved" copies of its qualifying rows, then flattened again.
df %>%
  # Name the list-column explicitly; `nest(-ID_NO)` is deprecated syntax.
  nest(data = -ID_NO) %>%
  mutate(
    # Filtering inside each element avoids map_if()'s requirement for a single
    # TRUE/FALSE predicate, so elements with several rows work too.
    data = map(data, ~ bind_rows(
      .x,
      .x %>%
        filter(STATUS > 0 & is.na(NEW_VALUE)) %>%
        mutate(OLD_VALUE = "approved")
    ))
  ) %>%
  # Name the column to unnest; calling unnest() without `cols` is deprecated.
  unnest(cols = data)
还有一个 base R 版本:先通过行索引取子集来复制行,然后把新增的重复行标记为 'approved'
# Build a row index that lists each qualifying row (STATUS > 0 and missing
# NEW_VALUE) twice. seq_len() stays correct for a zero-row data frame, where
# seq(nrow(df)) would yield c(1, 0).
i <- with(df, rep(seq_len(nrow(df)), 1 + (STATUS > 0 & is.na(NEW_VALUE))))
df2 <- df[i, ]
# A repeated index marks the added copy. Checking the index rather than ID_NO
# avoids mislabelling rows when ID_NO values are not unique.
df2[duplicated(i), "OLD_VALUE"] <- "approved"
df2