复制行并在R中创建新变量

时间:2019-09-08 00:27:43

标签: r dataframe

我想根据每个Category2在Category3和Category4中是否为Y来创建新变量Category6,数据如下。即:如果Category3 == Y,则Category6取值为Cat3;如果Category4 == Y,则Category6取值为Cat4。

由于Orange和Grape的Category3和Category4都为Y,所以我想复制该行,使该行中的所有其他内容保持不变:

拥有:

Category1 Category2 Category3  Category4  Category5 
Phase 1   Apple     Y          N          5
Phase 1   Berry     N          Y          3
Phase 2   Orange    Y          Y          4 
Phase 2   Grape     Y          Y          2

想要

Category1 Category2 Category3  Category4  Category5 Category6 
Phase 1   Apple     Y          N          5         Cat3
Phase 1   Berry     N          Y          3         Cat4
Phase 2   Orange    Y          Y          4         Cat3
Phase 2   Orange    Y          Y          4         Cat4
Phase 2   Grape     Y          Y          2         Cat3
Phase 2   Grape     Y          Y          2         Cat4

下面这个示例与我的问题非常相似,但由于我还需要保留其他变量,我没能让它正确地套用到我的数据上。有什么办法可以保留这些变量吗?

Duplicating Rows by creating unique columns

复制:

# Example data from the question: Y/N flags live in Category3 and Category4,
# and Category5 is stored as text.
dat <- data.frame(
  Category1 = rep(c("Phase 1", "Phase 2"), each = 2),
  Category2 = c("Apple", "Berry", "Orange", "Grape"),
  Category3 = c("Y", "N", "Y", "Y"),
  Category4 = c("N", "Y", "Y", "Y"),
  Category5 = c("5", "3", "4", "2")
)

5 个答案:

答案 0 :(得分:3)

这是一种base R(不依赖额外包)的方法。dat取自Tyler的答案。

# Flag columns to inspect; their names are used as the Category6 labels.
nm <- c("Category3", "Category4")
# (row, col) positions of every "Y" cell inside the flag columns.
ind <- which(dat[nm] == "Y", arr.ind = TRUE)
# Sort by row, then by column, so copies of the same row stay adjacent.
# drop = FALSE keeps `ind` a matrix even when only one cell matches; without
# it the single row collapses to a vector and `ind[, 1]` fails.
ind <- ind[order(ind[, 1], ind[, 2]), , drop = FALSE]
# Repeat each source row once per matching flag and tag it with the column name.
transform(dat[ind[, 1], ], Category6 = nm[ind[, 2]])
#    Category1 Category2 Category3 Category4 Category5 Category6
#1     Phase 1     Apple         Y         N         5 Category3
#2     Phase 1     Berry         N         Y         3 Category4
#3     Phase 2    Orange         Y         Y         4 Category3
#3.1   Phase 2    Orange         Y         Y         4 Category4
#4     Phase 2     Grape         Y         Y         2 Category3
#4.1   Phase 2     Grape         Y         Y         2 Category4

答案 1 :(得分:2)

使用tidyverse时,可以借助case_when()函数来完成此操作,如下所示:

library(tidyverse)

# Example data: two fruits per phase, Y/N flags in Category3 and Category4,
# Category5 kept as text.
dat <- data.frame(
  Category1 = paste("Phase", c(1, 1, 2, 2)),
  Category2 = c("Apple", "Berry", "Orange", "Grape"),
  Category3 = c("Y", "N", "Y", "Y"),
  Category4 = c("N", "Y", "Y", "Y"),
  Category5 = as.character(c(5, 3, 4, 2))
)

# Label each row with the first flag column that holds "Y": Category3 is
# tested before Category4, so rows with both flags set receive "Cat3" here.
dat <- mutate(
  dat,
  Category6 = case_when(
    Category3 == "Y" ~ "Cat3",
    Category4 == "Y" ~ "Cat4"
  )
)

# Next duplicate rows where Category3 and Category4 are both "Y".
# Those rows already carry Category6 == "Cat3", so each copy is labelled
# "Cat4" and appended after the existing rows.
# The subset is vectorised instead of growing `dat2` with rbind() inside a
# loop; which() skips NA comparisons, and a zero-row `dat` simply yields no
# copies (a `1:nrow(dat)` loop would iterate over c(1, 0) in that case).
both_y <- which(dat$Category3 == "Y" & dat$Category4 == "Y")
extra_rows <- dat[both_y, , drop = FALSE]
# rep() keeps the assignment valid when there are no rows to copy.
extra_rows$Category6 <- rep("Cat4", nrow(extra_rows))
dat2 <- rbind(dat, extra_rows)

输出:

   Category1 Category2 Category3 Category4 Category5 Category6
     Phase 1     Apple         Y         N         5      Cat3
     Phase 1     Berry         N         Y         3      Cat4
     Phase 2    Orange         Y         Y         4      Cat3
     Phase 2     Grape         Y         Y         2      Cat3
     Phase 2    Orange         Y         Y         4      Cat4
     Phase 2     Grape         Y         Y         2      Cat4

答案 2 :(得分:1)

这是将apply与dplyr、tidyr结合使用的一种方法-

# Build Category6 as a list column holding, for every row, the names of the
# flag columns (columns 3 and 4) whose value is "Y", then expand it to one row
# per name.
# lapply() always returns a list. apply() would simplify to a matrix or plain
# vector whenever every row matches the same number of flags, which breaks the
# list column. which() also ignores NA flags instead of yielding NA labels.
dat %>%
  mutate(
    Category6 = lapply(
      seq_len(nrow(.)),
      function(i) names(.[3:4])[which(unlist(.[i, 3:4]) == "Y")]
    )
  ) %>%
  unnest(Category6)

  Category1 Category2 Category3 Category4 Category5 Category6
1   Phase 1     Apple         Y         N         5 Category3
2   Phase 1     Berry         N         Y         3 Category4
3   Phase 2    Orange         Y         Y         4 Category3
4   Phase 2    Orange         Y         Y         4 Category4
5   Phase 2     Grape         Y         Y         2 Category3
6   Phase 2     Grape         Y         Y         2 Category4

答案 3 :(得分:1)

这是使用tidyverse的一种方案

library(dplyr)
library(tidyr)
library(stringr)
# Reshape the two flag columns to long form, collapse the matching short
# labels per Category2 group, return to wide form, then split the collapsed
# labels into one row each.
# NOTE(review): grouping is by Category2 alone, so this presumably relies on
# every Category2 value occurring only once in `dat` - confirm for real data.
dat %>%
   # key = source column name, val = the cell value from that column
   gather(key, val, Category3:Category4) %>%
   group_by(Category2) %>% 
   # the regex keeps the first three characters and drops the non-digits that
   # follow ("Category3" -> "Cat3"); toString() joins the labels whose val is "Y"
   mutate(Category6 = toString(str_replace(key, "(^.{3})\\D+", 
             "\\1")[val == "Y"])) %>% 
   # back to one Category3/Category4 column pair per row
   spread(key, val) %>%
   # split the joined label string into one row per label
   separate_rows(Category6) %>% 
   # original column order first, then the new Category6
   select(names(dat), Category6)
# A tibble: 6 x 6
# Groups:   Category2 [4]
#  Category1 Category2 Category3 Category4 Category5 Category6
#* <fct>     <fct>     <chr>     <chr>     <fct>     <chr>    
#1 Phase 1   Apple     Y         N         5         Cat3     
#2 Phase 1   Berry     N         Y         3         Cat4     
#3 Phase 2   Grape     Y         Y         2         Cat3     
#4 Phase 2   Grape     Y         Y         2         Cat4     
#5 Phase 2   Orange    Y         Y         4         Cat3     
#6 Phase 2   Orange    Y         Y         4         Cat4     

或者,另一个选择是使用pmap和unnest

library(purrr)
# For every row, collect the names of the flag columns equal to "Y", shorten
# them ("Category3" -> "Cat3") and store them in a list column; unnest() then
# yields one output row per label.
dat %>%
  mutate(
    Category6 = pmap(
      select(., Category3:Category4),
      function(...) {
        flags <- c(...)
        hits <- names(which(flags == "Y"))
        str_replace(hits, "^(...)\\D+", "\\1")
      }
    )
  ) %>%
  unnest(Category6)

数据

# Example data with every column stored as character (no factors).
dat <- data.frame(
  Category1 = c("Phase 1", "Phase 1", "Phase 2", "Phase 2"),
  Category2 = c("Apple", "Berry", "Orange", "Grape"),
  Category3 = c("Y", "N", "Y", "Y"),
  Category4 = c("N", "Y", "Y", "Y"),
  Category5 = c("5", "3", "4", "2"),
  stringsAsFactors = FALSE
)

答案 4 :(得分:1)

这里使用gather和inner_join来得到笛卡尔积。

library(tidyr)
library(dplyr)

# Stack Category3/Category4 into key/value pairs, then keep only the pairs
# present in the lookup table (value "Y"); each match carries its short label.
# The join keys are spelled out so that an unrelated column sharing a name
# with the lookup table can never silently become part of the join.
half_result <- dat %>%
  gather(key, value, c("Category3", "Category4")) %>%
  inner_join(
    tibble(
      value = "Y",
      key = c("Category3", "Category4"),
      lookup = c("Cat3", "Cat4")
    ),
    by = c("key", "value")
  ) %>%
  select(Category1, Category2, Category5, Category6 = lookup) %>%
  arrange(Category1, Category2)

half_result

> half_result
  Category1 Category2 Category5 Category6
1   Phase 1     Apple         5      Cat3
2   Phase 1     Berry         3      Cat4
3   Phase 2     Grape         2      Cat3
4   Phase 2     Grape         2      Cat4
5   Phase 2    Orange         4      Cat3
6   Phase 2    Orange         4      Cat4

# Rebuild the Y/N flag columns that were dropped when half_result was built:
# within each Category1/Category2 group a flag is "Y" when the matching label
# occurs in Category6. The first column is named Category3 (capital C) so it
# matches the column name used in the input data.
half_result %>%
  group_by(Category1, Category2) %>%
  mutate(
    Category3 = ifelse(any(Category6 == "Cat3"), "Y", "N"),
    Category4 = ifelse(any(Category6 == "Cat4"), "Y", "N")
  ) %>%
  ungroup()

# A tibble: 6 x 6
  Category1 Category2 Category5 Category6 Category3 Category4
  <fct>     <fct>     <fct>     <chr>     <chr>     <chr>    
1 Phase 1   Apple     5         Cat3      Y         N        
2 Phase 1   Berry     3         Cat4      N         Y        
3 Phase 2   Grape     2         Cat3      Y         Y        
4 Phase 2   Grape     2         Cat4      Y         Y        
5 Phase 2   Orange    4         Cat3      Y         Y        
6 Phase 2   Orange    4         Cat4      Y         Y