从长格式转变为宽格式

时间:2015-05-22 09:15:04

标签: r reshape

例如

customer_code    items
1                sugar
1                salt       
2                sugar      
2                accessories
3                salt

期望的输出

customer_code   item   item2       item3
1              sugar   salt     
2              sugar             accessories
3                      salt

4 个答案:

答案 0 :(得分:5)

你可以在这里做一个简单的dcast

library(reshape2)
# Cast to wide format: one row per customer and one column per distinct
# item, with each column named "items_<item>".
dcast(
  data = df,
  formula = customer_code ~ paste("items", items, sep = "_"),
  value.var = "items"
)
#   customer_code items_accessories items_salt items_sugar
# 1             1              <NA>       salt       sugar
# 2             2       accessories       <NA>       sugar
# 3             3              <NA>       salt        <NA>

或者更接近您想要的输出

library(data.table)
# Convert df to a data.table in place.
setDT(df)
# Tag every row with "items<k>", where k is the running group number of
# its item (.GRP), so the wide columns get generic names.
df[, indx := paste0("items", .GRP), by = items]
dcast(
  df,
  customer_code ~ indx,
  value.var = "items"
)
#    customer_code items1 items2      items3
# 1:             1  sugar   salt          NA
# 2:             2  sugar     NA accessories
# 3:             3     NA   salt          NA

答案 1 :(得分:3)

您可以使用tidyr中的spread
library(dplyr)
library(tidyr)
# Label each distinct item "items1", "items2", ... in order of first
# appearance, then spread those labels into columns. Customers that lack
# an item get an empty string instead of NA (fill = "").
df %>%
  mutate(
    var = factor(
      items,
      levels = unique(items),
      labels = paste0("items", seq_len(n_distinct(items)))
    )
  ) %>%
  spread(var, items, fill = "")
#  customer_code items1 items2      items3
#1             1  sugar   salt            
#2             2  sugar        accessories
#3             3          salt        

答案 2 :(得分:2)

您可以尝试使用函数reshape。

要为每个不同的项目各生成一列:

# Base-R reshape to wide format: one "items.<item>" column per distinct
# item, keyed by customer_code.
new_df <- reshape(
  df,
  idvar = "customer_code",
  timevar = "items",
  v.names = "items",
  direction = "wide"
)
new_df
#  customer_code items.sugar items.salt items.accessories
#1             1       sugar       salt              <NA>
#3             2       sugar       <NA>       accessories
#5             3        <NA>       salt              <NA>

您可以使用 colnames(new_df)[-1] <- paste0("item", 1:(ncol(new_df)-1)) 更改列名称。

另一个选项:如果您希望列数等于单个客户所拥有的最大项目数:

# Reshape each customer separately, so the number of item columns equals
# the largest number of items held by any single customer.
# `[[1]]` extracts the first column as a plain vector.
df_split <- split(df, df[[1]])
df_split <- lapply(
  df_split,
  reshape,
  idvar = "customer_code",
  timevar = "items",
  v.names = "items",
  direction = "wide"
)
# Column count of the widest per-customer frame (id column included).
# vapply guarantees an integer vector regardless of the input.
max_item <- max(vapply(df_split, ncol, integer(1)))
df_split <- lapply(df_split, function(customer_df) {
  n_missing <- max_item - ncol(customer_df)
  if (n_missing > 0) {
    # Pad narrower frames with NA columns so all frames share one width
    # and can be row-bound below.
    customer_df <- cbind(customer_df, matrix(NA, ncol = n_missing))
  }
  # Give the item columns uniform positional names: item1, item2, ...
  colnames(customer_df)[-1] <- paste0("item", seq_len(max_item - 1L))
  customer_df
})
new_df <- do.call("rbind", df_split)
new_df
#  customer_code item1       item2
#1             1 sugar        salt
#2             2 sugar accessories
#3             3  salt        <NA>

答案 3 :(得分:1)

dplyr,尤其是tidyr可以解决此类问题。这段代码可以解决问题。

# library() fails immediately if a package is missing; require() would
# only return FALSE and let the pipeline fail later with a confusing error.
library(tidyr)
library(dplyr)
# Spread the items into one column per distinct item (named after the
# item), then drop the grouping so later verbs are not applied per customer.
df_wide <- df %>%
  group_by(customer_code) %>%
  spread(items, items) %>%
  ungroup()
#   customer_code accessories salt sugar
# 1             1          NA salt sugar
# 2             2 accessories   NA sugar
# 3             3          NA salt    NA

如有必要,可以这样更改列名:

# Rename every column except the first to item1, item2, ...
# seq_len() avoids the 1:0 pitfall of `1:(n - 1)`.
names(df_wide)[-1] <- paste0("item", seq_len(ncol(df_wide) - 1L))
#   customer_code       item1 item2 item3
# 1             1          NA  salt sugar
# 2             2 accessories    NA sugar
# 3             3          NA  salt    NA

另外可能会建议这种形式的输出(可能很方便):

# Presence/absence table: TRUE where the customer has the item, FALSE
# otherwise. TRUE/FALSE are spelled out because T and F can be reassigned.
df %>%
  mutate(present = TRUE) %>%
  spread(items, present, fill = FALSE)
#   customer_code accessories  salt sugar
# 1             1       FALSE  TRUE  TRUE
# 2             2        TRUE FALSE  TRUE
# 3             3       FALSE  TRUE FALSE