通过创建唯一列来复制行

时间:2018-02-06 22:02:03

标签: r dataframe

有一种情况

one    two    three    type 
x      x               chocolate
x                      vanilla 
x      x               strawberry

如果我想根据列中的'x'进行复制,例如:

one    two    three    type 
x                      chocolate
       x               chocolate
x                      vanilla 
x                      strawberry
       x               strawberry

这里的目标是每行中有一个'x',因此如果每行中有多个'x',则复制整行,并为每个副本保留该行的唯一'x'。

重现

# Reproducible example: three flag columns that hold either "x" or an empty
# string, plus a `type` label column.
dat <- data.frame(
  one = rep("x", 3),
  two = c("x", "", "x"),
  three = character(3),
  type = c("chocolate", "vanilla", "strawberry")
)

7 个答案:

答案 0 :(得分:4)

使用tidyr :: gather

与MKR和Anant的解决方案一脉相承。

library(tidyverse)
# Give every "x" a row of its own while keeping the original row order.
dat %>%
  # remember the original row position so the order can be restored later
  mutate(i=row_number())         %>%
  # long format: every column except type and i is stacked into Key/Val
  gather("Key", "Val", -type,-i) %>%
  # number the long rows; NOTE(review): presumably this id is what keeps
  # spread() below from collapsing rows of the same type/i back together
  rowid_to_column                %>%
  # back to wide format, cells without a value are filled with ""
  spread(Key, Val,fill = "")     %>%
  # keep only the rows that contain at least one "x"
  `[`(rowSums(.=="x")>0,)        %>%
  # restore the original row order
  arrange(i)                     %>%
  # keep the four data columns, in this order
  select(type,one,two,three)

#         type one two three
# 1  chocolate   x          
# 2  chocolate       x      
# 3    vanilla   x          
# 4 strawberry   x          
# 5 strawberry       x

不过,您也许对下面这个更简短的版本就已经满意了(只有当列three中至少有一个值时才会显示该列,而且行的顺序会被打乱):

# Shorter variant: one output row per non-blank cell. Flag columns that hold
# no value at all (here: three) disappear from the result, and the rows come
# out grouped by flag column instead of in the original order.
dat %>%
  # long format: one row per (type, flag column) pair
  gather("Key", "Val", -type) %>%
  # Drop blank cells here instead of calling na_if("") on the whole data
  # frame first: dplyr >= 1.1.0 rejects na_if() on a data frame. Missing
  # values are dropped by the same condition, as na.rm = TRUE did before.
  filter(Val != "") %>%
  # a unique id per remaining cell keeps spread() from merging rows
  rowid_to_column() %>%
  spread(Key, Val, fill = "") %>%
  # the helper id is no longer needed
  select(-rowid)

#          type one two
# 1   chocolate   x    
# 2     vanilla   x    
# 3  strawberry   x    
# 4   chocolate       x
# 5  strawberry       x

绑定数据框

另一种方法是将数据分成不同的数据框然后绑定它们(看起来像@Acccumulation在他的回答中试图解释的那样):

# Build one sub-frame per flag column, holding only the rows where that column
# is "x", then stack the sub-frames.
dat %>%
  # `.` is passed explicitly as an extra argument, so inside the lambda .x is
  # the column index (1 to 3) and .y is the whole data frame; column 4 is
  # `type` in the example data and is kept next to the flag column
  map(1:3,~filter(.y[c(.x,4)],.y[.x]=="x"),.) %>%
  bind_rows                  %>%
  # convert every column with as.character
  modify(as.character)       %>%
  # NOTE(review): presumably the NAs come from columns a sub-frame lacks
  # once the frames are stacked; they are replaced by ""
  `[<-`(is.na(.),value="")   %>%
  select(one,two,three,type)     

#   one two three       type
# 1   x            chocolate
# 2   x              vanilla
# 3   x           strawberry
# 4       x        chocolate
# 6       x       strawberry

使用合并

# Merge-based variant: pair every flag column with `type`, merge the pairs on
# `type`, and keep the combinations that contain exactly one "x".
# The former leading na_if("") step is gone: dplyr >= 1.1.0 rejects na_if()
# on a data frame, and the step was redundant because every NA is turned back
# into "" further down. Assumes the columns of dat are character vectors.
dat %>%
  # append a copy of the rows that only carries `type` (flags become NA), so
  # each type also offers a "blank" choice for every flag column
  bind_rows(.["type"])     %>%
  # `.` is passed explicitly: .x is the column index, .y the data frame;
  # each element is a two-column frame (flag column, type)
  map(1:3,~.y[c(.x,4)],.)  %>%
  # full merge on the shared `type` column yields all combinations per type
  reduce(merge,all=TRUE)   %>%
  `[<-`(is.na(.),value="") %>%
  # keep only the combinations with a single "x", then drop repeats
  `[`(rowSums(.=="x")==1,) %>%
  distinct

#         type one two three
# 1  chocolate   x          
# 2  chocolate       x      
# 3 strawberry   x          
# 4 strawberry       x      
# 5    vanilla   x  

使用unnest

# Replace every flag cell by a length-3 character vector that holds the cell's
# value at the position of its own column, then expand and filter.
dat %>%
  imap_dfc(~{
    # position of the current column (.y is its name) within dat
    i <- match(.y,names(dat))
    # flag columns: each cell becomes character(3) with the cell value written
    # to slot i; the type column is passed through unchanged
    if(.y != "type") map(.x,~`[<-`(character(3),i,.x)) else .x}) %>% 
  # NOTE(review): presumably unnest expands the list columns in parallel, so
  # each original row becomes three rows with at most one filled flag; this
  # depends on the tidyr version in use (no columns are named) - confirm
  unnest %>%
  # keep the rows that contain exactly one "x"
  `[`(rowSums(.=="x")==1,) 

# # A tibble: 5 x 4
#         type   one   two three
#        <chr> <chr> <chr> <chr>
# 1  chocolate     x            
# 2  chocolate           x      
# 3    vanilla     x            
# 4 strawberry     x            
# 5 strawberry           x

使用diag

# For each row, put the three "is this cell an x" comparisons on the diagonal
# of a matrix so that every flag ends up on a row of its own.
dat %>%
  rowwise %>%
  # per row: diag() is applied to the comparison vector, the result is turned
  # into a data frame named one/two/three and stored in the list column `cols`
  transmute(type,cols = list(setNames(data.frame(diag(c(one,two,three)=="x")),c('one','two','three')))) %>%
  # NOTE(review): unnest is called without naming columns; presumably it
  # expands `cols` into three rows per original row - confirm for the tidyr
  # version in use
  unnest %>%
  # columns 2:4 (presumably one, two, three) are used as an index: value + 1
  # picks '' or 'x'
  modify_at(2:4, ~c('','x')[.x+1]) %>%
  # keep the rows that contain exactly one "x"
  `[`(rowSums(.=="x")==1,)

#   # A tibble: 5 x 4
#         type   one   two three
#        <chr> <chr> <chr> <chr>
# 1  chocolate     x            
# 2  chocolate           x      
# 3    vanilla     x            
# 4 strawberry     x            
# 5 strawberry           x

编辑 以在评论中提供OP的请求

我们采用第一个解决方案并对源数据使用right_join以确保所有行都在那里,然后用空字符串替换NAs。我们还清理了命令的第一行,因为我们不再需要i来安排。

# Same reshaping as the first solution, followed by a join back onto the
# source `type` column and a step that blanks any NA left after the join.
# NOTE(review): dat2 is not defined in this snippet - presumably the
# question's data plus at least one row without any "x" (the sample output
# ends with a `hazelnut` row); confirm.
dat2 %>%
  gather("Key", "Val", -type)    %>%
  rowid_to_column                %>%
  spread(Key, Val,fill = "")     %>%
  # keep only the rows that contain at least one "x"
  `[`(rowSums(.=="x")>0,)        %>%
  # no `by` is given, so the join key is left to right_join's default
  right_join(dat2["type"])       %>%
  # NOTE(review): presumably the NAs belong to types that lost all their
  # rows in the filter above; they are replaced by ""
  `[<-`(is.na(.),value="")       %>%
  select(type,one,two,three)

#         type one two three
# 1  chocolate   x          
# 2  chocolate       x      
# 3    vanilla   x          
# 4 strawberry   x          
# 5 strawberry       x      
# 6   hazelnut         

答案 1 :(得分:1)

您可以使用tidyr::gather来完成您需要的内容,这里是一个快速示例,其中空字符串被转换为NA:

library(tidyverse)

# Build the example data, reshape it to long format, turn blank cells into NA
# and drop them, then rebuild the three flag columns so that every remaining
# row carries exactly one "x".
dat <- tibble(
  one = c("x", "x", "x"),
  two = c("x", "", "x"),
  three = c("", "", ""),
  type = c("chocolate", "vanilla", "strawberry")
) %>%
  # long format: one row per (type, flag column) pair
  gather("number", "value", one:three) %>%
  # Convert blanks to NA on the value column only. Calling na_if("") on the
  # whole data frame is rejected by dplyr >= 1.1.0, whereas na_if() on a
  # character vector works in every version.
  mutate(value = na_if(value, "")) %>%
  filter(!is.na(value)) %>%
  # `number` names the column the surviving "x" came from
  mutate(
    one = ifelse(number == "one", "x", ""),
    two = ifelse(number == "two", "x", ""),
    three = ifelse(number == "three", "x", "")
  ) %>%
  select(one:three, type) %>%
  # note: this sorts alphabetically by type, not in the original row order
  arrange(type)

dat
# A tibble: 5 x 4
  one   two   three type      
  <chr> <chr> <chr> <chr>     
1 x     ""    ""    chocolate 
2 ""    x     ""    chocolate 
3 x     ""    ""    strawberry
4 ""    x     ""    strawberry
5 x     ""    ""    vanilla     

答案 2 :(得分:1)

使用gather然后spread可以实现一个解决方案。步骤是:

#data
df <- read.table(text = "one    two    three    type 
x      x       ''        chocolate
x      ''       ''         vanilla 
x      x       ''        strawberry", header = TRUE, stringsAsFactors = FALSE)

df
#  one two three       type
#1   x   x    NA  chocolate
#2   x        NA    vanilla
#3   x   x    NA strawberry
library(tidyverse)    
# Long format: one row per (type, flag column) pair
df1 <- gather(df, "Key", "Val", -type)

#         type   Key  Val
#1  chocolate 1   one    x
#2    vanilla 2   one    x
#3 strawberry 3   one    x
#4  chocolate 4   two    x
#5    vanilla 5   two     
#6 strawberry 6   two    x
#7  chocolate 7 three <NA>
#8    vanilla 8 three <NA>
#9 strawberry 9 three <NA>

# Make type unique before using spread, so that spread() keeps one output row
# per long row instead of folding rows of the same type back together
df1$type <- paste(df1$type, seq_len(nrow(df1)))

df2 <- spread(df1, Key, Val)

# Remove the numeric suffix that was appended to type above. Only the trailing
# " <number>" is stripped, so types that themselves contain spaces survive.
df2$type <- sub("\\s\\d+$", "", df2$type, perl = TRUE)

# Keep the rows where either of one, two or three is 'x'
# (rows for which the condition is NA are dropped by filter() as well)
df_final <- df2 %>% filter(one == "x" | two == "x" | three == "x")

# Replace the remaining NA cells with blank strings
df_final[is.na(df_final)] <- ""

df_final
#        type one three two
#1  chocolate   x          
#2  chocolate             x
#3 strawberry   x          
#4 strawberry             x
#5    vanilla   x          

答案 3 :(得分:1)

感谢您提出这个有趣的问题。

这是一个使用base R的解决方案。思路是:根据行的ID拆分数据框,统计每行中x的数量,按该数量创建一个对角线元素全为1的矩阵,将矩阵转换为数据框并把1替换为x、把0替换为"",再整理各个数据框,最后把它们合并在一起。

此解决方案适用于x的任意数量的列,但它假定type列是唯一与x没有任何关系的列。如果您有多个列,例如type列,则可能需要根据需要修改代码。

# Create a row ID so that every original row can be processed on its own
dat$ID <- seq_len(nrow(dat))

# Split the data frame into one single-row data frame per ID
dat_list <- split(dat, f = dat$ID)

# Expand each single-row data frame into one row per "x".
# - Only the flag columns (everything except ID and type) are inspected, and
#   the match is exact, so a value that merely contains the letter x (in type
#   or anywhere else) is not mistaken for a flag.
# - A row without any "x" is kept once with all flags blank instead of
#   raising an error.
# - The flag columns always come back in their original order.
dat_list2 <- lapply(dat_list, function(x) {
  flag_cols <- setdiff(names(x), c("ID", "type"))
  # as.character() also covers factor columns; %in% treats NA as "no x"
  has_x <- vapply(
    x[flag_cols],
    function(col) as.character(col) %in% "x",
    logical(1)
  )
  cols <- flag_cols[has_x]      # columns that hold an "x"
  cols_n <- flag_cols[!has_x]   # columns that do not

  if (length(cols) == 0) {
    # Nothing to split: a single row with every flag blank
    out <- as.data.frame(
      matrix("", nrow = 1, ncol = length(flag_cols),
             dimnames = list(NULL, flag_cols)),
      stringsAsFactors = FALSE
    )
  } else {
    # Identity matrix: row k carries the "x" of the k-th flagged column
    m <- diag(length(cols))
    out <- as.data.frame(ifelse(m == 1, "x", ""), stringsAsFactors = FALSE)
    colnames(out) <- cols
    if (length(cols_n) >= 1) {
      out[, cols_n] <- ""       # Add the flag columns without an "x"
    }
    out <- out[flag_cols]       # Restore the original column order
  }
  out$type <- x$type            # Add the type column
  out
})

# Combine all data frames in dat_list2
dat2 <- do.call(rbind, dat_list2)
# Re-assign the row names
rownames(dat2) <- seq_len(nrow(dat2))

dat2
#   one two three       type
# 1   x            chocolate
# 2       x        chocolate
# 3   x              vanilla
# 4   x           strawberry
# 5       x       strawberry

答案 4 :(得分:0)

使用data.table

library(data.table)
# Convert df to a data.table by reference
setDT(df)
# Row numbers that carry an "x" in both one and two; which() drops the
# comparisons that evaluate to NA
both <- df[, which(one == "x" & two == "x")]
# Copy of those rows; the copies will carry the "x" of column two
df1 <- df[both]
# The originals keep only the "x" of column one. Restricting the update to
# the duplicated rows leaves a row that has its only "x" in column two
# untouched (blanking the whole column would erase that "x").
df[both, two := ""]
df1[, one := ""]
df <- rbind(df, df1)
df

输出:

     one two three       type
1:   x        NA  chocolate
2:   x        NA    vanilla
3:   x        NA strawberry
4:       x    NA  chocolate
5:       x    NA strawberry

答案 5 :(得分:0)

这是使用tidyverse工具的解决方案。您可能需要调整以包含全部7个列名,但希望它仍然可以工作。关键思路是先使用gather,添加唯一的rowid,然后使用spread;由于此时每一行(通过type和rowid)都与其余行不同,行数得以保持不变。其余的只是将空字符串转换为NA,然后删除全为NA的多余行。

NA

答案 6 :(得分:0)

我假设只要后面的列中有x,one列中就一定有x。即如果one中没有x,那么two中也不会有x。否则,我实际上并不确定你期望的结果是什么。

因此,假设您的数据框名为df,只需找到列two和three中的x,把这些行复制出来,同时清除其他列的内容。

# Assumes every row that has an "x" in two or three also has one in one: the
# original rows keep only their `one` mark, and the marks found in two / three
# are moved to freshly built rows.

# %in% instead of == so that NA cells count as "no x"; with ==, a single NA
# makes sum() return NA and rep() below fail
twoMatches <- df$two %in% "x"     # Find all the twos with xs
threeMatches <- df$three %in% "x" # Find all the threes with xs
df$two[twoMatches] <- ""          # Clear the two field where there were twos
df$three[threeMatches] <- ""      # Clear the three field where there were threes

numTwos <- sum(twoMatches)        # Number of twos
numThrees <- sum(threeMatches)    # Number of threes

# Generate our frame of twos
twoFrame <- data.frame(
  one = rep("", numTwos),
  two = rep("x", numTwos),
  three = rep("", numTwos),
  type = df$type[twoMatches],
  stringsAsFactors = FALSE
)

# Generate our frame of threes
threeFrame <- data.frame(
  one = rep("", numThrees),
  two = rep("", numThrees),
  three = rep("x", numThrees),
  type = df$type[threeMatches],
  stringsAsFactors = FALSE
)

# Bind them all together
outdf <- dplyr::bind_rows(list(df, twoFrame, threeFrame))

# Sort them into something more meaningful
outdf <- outdf[order(outdf$type, outdf$one, outdf$two, outdf$three, decreasing = TRUE), ]

输出:

enter image description here