将购物清单分为多列

时间:2019-11-12 15:58:19

标签: r tidyr

我有一个类似这样的购物清单数据:

# Example data: one row per shopper, with item names and quantities fused
# together in a single string.
df <- data.frame(
  id = 1:5,
  item = c(
    "apple2milk5", "milk1", "juice3apple5", "egg10juice1", "egg8milk2"
  ),
  stringsAsFactors = FALSE
)

#   id         item
# 1  1  apple2milk5
# 2  2        milk1
# 3  3 juice3apple5
# 4  4  egg10juice1
# 5  5    egg8milk2

我想将变量item拆分成多列,并记录每种商品后面的数量。我遇到的问题是每个人购买的商品都不相同,因此我无法使用tidyr::separate()或其他类似的函数来解决。我期望的结果是:

#   id apple milk  juice egg  
# 1  1 2     5     NA    NA   
# 2  2 NA    1     NA    NA   
# 3  3 5     NA    3     NA   
# 4  4 NA    NA    1     10   
# 5  5 NA    2     NA    8    

注:市场上的商品类别未知。所以不要以为只有4种商品。

感谢您的帮助!

10 个答案:

答案 0 :(得分:2)

可能类似这样,并且应该适用于任何项目/数量。 它只是假设数量紧随项目。

让我们使用自定义函数提取项目和数量:

# Split one purchase string such as "apple2milk5" into item/quantity rows.
#
# Args:
#   w: A purchase string in which every item name is immediately followed by
#      its quantity. A one-column data frame holding such a string (as
#      produced by tidyr::nest()) is flattened to its character content.
#
# Returns:
#   A data.frame with one row per item and two character columns, `item`
#   and `quantity`.
my_fun <- function(w) {
  txt <- as.character(unlist(w, use.names = FALSE))

  # Pull out the non-digit runs and the digit runs directly. Splitting on the
  # delimiter instead leaves an empty piece wherever the string starts or
  # ends with that delimiter, which then has to be filtered out.
  items <- as.character(unlist(regmatches(txt, gregexpr("\\D+", txt))))
  quantities <- as.character(unlist(regmatches(txt, gregexpr("\\d+", txt))))

  # data.frame() silently recycles a shorter column, which would pair an
  # item with the wrong quantity; fail loudly on malformed input instead.
  if (length(items) != length(quantities)) {
    stop(
      "every item must be followed by a quantity: ",
      paste(txt, collapse = ", "),
      call. = FALSE
    )
  }

  data.frame(item = items, quantity = quantities, stringsAsFactors = FALSE)
}

示例:

my_fun("apple2milk5")
# Returns a data frame with one row per item found in the string:
#    item quantity
# 1 apple        2
# 2  milk        5

现在我们可以使用nest和map,将该函数应用于每个id:

library(dplyr)
library(tidyr) # unnest() lives in tidyr, not dplyr

# Parse every purchase string into its own item/quantity data frame, then
# expand those data frames into one row per (id, item) pair.
df_result <- df %>%
  mutate(res = purrr::map(item, my_fun)) %>%
  # Drop the raw string: the parsed data frames bring their own `item` column.
  select(-item) %>%
  unnest(res)

df_result
# # A tibble: 9 x 3
# id item  quantity
# <int> <chr> <chr>   
# 1     1 apple 2       
# 2     1 milk  5       
# 3     2 milk  1       
# 4     3 juice 3       
# 5     3 apple 5       
# 6     4 egg   10      
# 7     4 juice 1       
# 8     5 egg   8       
# 9     5 milk  2 

现在我们可以使用dcast()(很可能spread也可以使用):

# data.table's dcast() is written for data.table input, so convert the tibble
# first. Each distinct item becomes a column holding that id's quantity.
data.table::dcast(
  data.table::as.data.table(df_result),
  id ~ item,
  value.var = "quantity"
)

#     id apple  egg juice milk
#   1  1     2 <NA>  <NA>    5
#   2  2  <NA> <NA>  <NA>    1
#   3  3     5 <NA>     3 <NA>
#   4  4  <NA>   10     1 <NA>
#   5  5  <NA>    8  <NA>    2

数据:

# Example data: one row per shopper, with item names and quantities fused
# together in a single string.
df <- data.frame(
  id = 1:5,
  item = c(
    "apple2milk5", "milk1", "juice3apple5", "egg10juice1", "egg8milk2"
  ),
  stringsAsFactors = FALSE
)

答案 1 :(得分:2)

# Split each purchase string at every letter/digit boundary, which yields
# alternating tokens: name, quantity, name, quantity, ...
# Each element of `tmp` is a numeric vector of quantities named by item.
tmp <- lapply(
  strsplit(df$item, "(?<=\\d)(?=\\D)|(?<=\\D)(?=\\d)", perl = TRUE),
  function(x) {
    # An odd token count means some item has no quantity (or vice versa).
    if (length(x) %% 2L != 0L) {
      stop("every item must be followed by a quantity", call. = FALSE)
    }
    is_name <- seq_along(x) %% 2L == 1L
    setNames(as.numeric(x[!is_name]), x[is_name])
  }
)

# All item names seen in any row, in order of first appearance.
nm <- unique(unlist(lapply(tmp, names)))

# Index every row by the full name set (missing items become NA) and bind the
# aligned rows next to the original data.
cbind(df, do.call(rbind, lapply(tmp, function(x) setNames(x[nm], nm))))
#  id         item apple milk juice egg
#1  1  apple2milk5     2    5    NA  NA
#2  2        milk1    NA    1    NA  NA
#3  3 juice3apple5     5   NA     3  NA
#4  4  egg10juice1    NA   NA     1  10
#5  5    egg8milk2    NA    2    NA   8

答案 2 :(得分:2)

我刚想出一个tidyverse解决方案。使用str_extract_all()提取数量,并用商品名称为其命名。然后用reduce(bind_rows)把各行合并,得到预期的结果。

library(tidyverse)

df$item %>%
  # One named character vector per purchase: the names are the items and the
  # values are the quantities.
  map(function(purchase) {
    quantities <- str_extract_all(purchase, "\\d+")[[1]]
    goods <- str_extract_all(purchase, "\\D+")[[1]]
    set_names(quantities, goods)
  }) %>%
  # Stack the vectors as rows; items missing from a purchase become NA.
  reduce(bind_rows) %>%
  mutate_all(as.numeric) %>%
  bind_cols(df, .)

#   id         item apple milk juice egg
# 1  1  apple2milk5     2    5    NA  NA
# 2  2        milk1    NA    1    NA  NA
# 3  3 juice3apple5     5   NA     3  NA
# 4  4  egg10juice1    NA   NA     1  10
# 5  5    egg8milk2    NA    2    NA   8

答案 3 :(得分:2)

在每个数字子字符串之前放置一个空格,并在其后放置一个换行符。然后使用read.table读取该数据,并用unnest将其展开。最后使用pivot_wider从长格式转换为宽格式。

library(dplyr)
library(tidyr)

df %>%
  # Put a space before and a newline after every run of digits, so that
  # "apple2milk5" becomes the two-line text "apple 2\nmilk 5\n".
  mutate(item = gsub("(\\d+)", " \\1\n", item)) %>%
  # Work row by row so read.table() parses one purchase string at a time.
  rowwise %>%
  # Parse the text into a small data frame (read.table's default column names
  # are V1 and V2) and store it in a list-column.
  mutate(item = list(read.table(text = item, as.is = TRUE))) %>%
  ungroup %>%
  # Expand the list-column into one row per purchased item.
  unnest(item) %>%
  # Long to wide: one column per item name (V1), filled with quantities (V2).
  pivot_wider(names_from = "V1", values_from = "V2")

给予:

# A tibble: 5 x 5
     id apple  milk juice   egg
  <int> <int> <int> <int> <int>
1     1     2     5    NA    NA
2     2    NA     1    NA    NA
3     3     5    NA     3    NA
4     4    NA    NA     1    10
5     5    NA     2    NA     8

变化

这是上述代码的变体,它省去了unnest。我们把每个数字串替换为:一个空格、该数字串、另一个空格、id和一个换行符,然后用read.table读入。注意在read.table之前使用的是%$%而不是%>%。最后使用pivot_wider从长格式转换为宽格式。

library(dplyr)
library(magrittr)
library(tidyr)

df %>%
  # Work row by row so that `id` is a single value when it is pasted into the
  # replacement string below.
  rowwise %>%
  # Replace every run of digits with " <digits> <id> \n", turning each item
  # into its own "name quantity id" line.
  mutate(item = gsub("(\\d+)", paste(" \\1", id, "\n"), item)) %$%
  # %$% exposes the columns of the left-hand side, so `item` here is the
  # rewritten column; read.table() parses all of its lines into one table.
  read.table(text = item, as.is = TRUE, col.names = c("nm", "no", "id")) %>%
  # read.table() returns a plain data frame, so there is no grouping left to
  # remove at this point.
  ungroup %>%
  # Long to wide: one column per item name, filled with the quantities.
  pivot_wider(names_from = "nm", values_from = "no")

答案 4 :(得分:2)

主要使用基础R,并借助stringr和data.table中的少量函数:

library(stringr)
library(data.table)
cbind(
  id = df$id,
  rbindlist(
    lapply(df$item, function(purchase) {
      # Quantities are the digit runs; item names are what is left after
      # splitting on those runs.
      counts <- str_extract_all(purchase, "[0-9]+")[[1]]
      goods <- strsplit(purchase, "[0-9]+")[[1]]
      as.list(setNames(counts, goods))
    }),
    # Purchases contain different items, so fill the missing columns with NA.
    fill = TRUE
  )
)

   id apple milk juice  egg
1:  1     2    5  <NA> <NA>
2:  2  <NA>    1  <NA> <NA>
3:  3     5 <NA>     3 <NA>
4:  4  <NA> <NA>     1   10
5:  5  <NA>    2  <NA>    8

答案 5 :(得分:1)

这是基于R和stringr的简单解决方案:

# Collect the distinct item names: everything between runs of digits.
goods <- unique(unlist(stringr::str_split(df$item, pattern = "[0-9]+")))
goods <- goods[goods != ""]

# Build one numeric column per item. lapply() over a named vector always
# returns a named list, whereas sapply() collapses to a plain vector when
# `df` has a single row.
counts <- lapply(
  stats::setNames(goods, goods),
  function(x) {
    # A name either starts the string or directly follows a quantity;
    # anchoring on that keeps "apple" from matching inside "pineapple".
    hit <- stringr::str_extract(
      df$item,
      pattern = paste0("(^|[0-9])", x, "[0-9]+")
    )
    # Keep only the trailing quantity; rows without this item stay NA.
    as.numeric(stringr::str_extract(hit, pattern = "[0-9]+$"))
  }
)

# Start from the original id column so it keeps its type instead of being
# coerced to character by a matrix cbind().
df <- data.frame(id = df$id, counts, check.names = FALSE)

输出

id apple milk juice egg
1  1     2    5    NA  NA
2  2    NA    1    NA  NA
3  3     5   NA     3  NA
4  4    NA   NA     1   10
5  5    NA    2    NA   8

答案 6 :(得分:1)

您可以尝试

library(tidyverse)
library(stringi)
df %>%
  mutate(
    # Blank out the digits to leave the item names ...
    item2 = gsub("[0-9]", " ", item),
    # ... and blank out the lowercase letters to leave the quantities.
    item3 = gsub("[a-z]", " ", item)
  ) %>%
  # Pull the words out of each string and re-join them with commas so both
  # columns can be split into rows in parallel.
  mutate_at(
    vars(item2, item3),
    function(column) {
      map(stringi::stri_extract_all_words(column), paste, collapse = ",")
    }
  ) %>%
  separate_rows(item2, item3, sep = ",") %>%
  spread(item2, item3)
  id         item apple  egg juice milk
1  1  apple2milk5     2 <NA>  <NA>    5
2  2        milk1  <NA> <NA>  <NA>    1
3  3 juice3apple5     5 <NA>     3 <NA>
4  4  egg10juice1  <NA>   10     1 <NA>
5  5    egg8milk2  <NA>    8  <NA>    2

答案 7 :(得分:1)

# Insert a comma after every run of digits that is followed by a non-digit
# (a positive look-ahead assertion), so that "apple2milk5" becomes
# "apple2,milk5".
library(dplyr)
library(tidyr)
df %>% mutate(item=gsub('(\\d+(?=\\D))','\\1,' ,item, perl = TRUE)) %>% 
       # One row per "name + quantity" piece.
       separate_rows(item, sep = ",") %>% 
       # Split each piece into the item name (prod) and its quantity (quan).
       extract(item, into = c('prod','quan'), '(\\D+)(\\d+)') %>% 
       # Long to wide; items a shopper did not buy are filled with 0, not NA.
       spread(prod, quan, fill=0)

  id apple egg juice milk
1  1     2   0     0    5
2  2     0   0     0    1
3  3     5   0     3    0
4  4     0  10     1    0
5  5     0   8     0    2

答案 8 :(得分:1)

我将再添加一个答案。它与@ASuliman略有不同,但使用了一些较新的tidyr和一些可爱的正则表达式,使其变得更加简单。

regex技巧在于:模式"(?<=\\d)\\B(?=[a-z])"匹配数字与其后字母之间的空位置(\\B表示非单词边界),因此可以把每个形如"apple5"的条目拆成单独的一行。然后将字母提取到item列,将数字提取到count列。使用新的pivot_wider代替spread,可以在整形的同时把这些计数转换为数值。

library(dplyr)
library(tidyr)

df %>%
  # Split at the zero-width position between a digit and the following
  # lowercase letter, giving one row per "name + quantity" piece.
  separate_rows(item, sep = "(?<=\\d)\\B(?=[a-z])") %>%
  # Capture the letters into `item` and the trailing digits into `count`.
  extract(item, into = c("item", "count"), regex = "^([a-z]+)(\\d+)$") %>%
  # Long to wide: one column per item; values_fn converts the counts to
  # numeric while reshaping.
  pivot_wider(names_from = item, values_from = count, values_fn = list(count = as.numeric))
#> # A tibble: 5 x 5
#>      id apple  milk juice   egg
#>   <int> <dbl> <dbl> <dbl> <dbl>
#> 1     1     2     5    NA    NA
#> 2     2    NA     1    NA    NA
#> 3     3     5    NA     3    NA
#> 4     4    NA    NA     1    10
#> 5     5    NA     2    NA     8

答案 9 :(得分:0)

更清洁的data.table解决方案,其中包含stringr的输入:

library(data.table)
library(stringr)

# `df` is a plain data.frame, so convert it before using data.table's
# `[i, j, by]` syntax.
# NOTE: `[[1]]` reads only the first string in each group, so this assumes
# every id appears on exactly one row.
as.data.table(df)[,
  .(it_count = str_extract_all(item, "[0-9]+")[[1]],
    it_name = str_extract_all(item, "[^0-9]+")[[1]]),
  by = id
][, dcast(.SD, id ~ it_name, value.var = "it_count")]

   id apple  egg juice milk
1:  1     2 <NA>  <NA>    5
2:  2  <NA> <NA>  <NA>    1
3:  3     5 <NA>     3 <NA>
4:  4  <NA>   10     1 <NA>
5:  5  <NA>    8  <NA>    2