我有一个类似这样的购物清单数据:
df <- data.frame(id = 1:5, item = c("apple2milk5", "milk1", "juice3apple5", "egg10juice1", "egg8milk2"), stringsAsFactors = F)
# id item
# 1 1 apple2milk5
# 2 2 milk1
# 3 3 juice3apple5
# 4 4 egg10juice1
# 5 5 egg8milk2
我想将变量item
分成多列,并记录货物后面的编号。我遇到的问题是每个人购买的商品都不相同,因此我无法使用tidyr::separate()
或其他类似功能来解决它。我期望的是:
# id apple milk juice egg
# 1 1 2 5 NA NA
# 2 2 NA 1 NA NA
# 3 3 5 NA 3 NA
# 4 4 NA NA 1 10
# 5 5 NA 2 NA 8
注:市场上的商品类别未知。所以不要以为只有4种商品。
感谢您的帮助!
答案 0 :(得分:2)
可能类似这样,并且应该适用于任何项目/数量。 它只是假设数量紧随项目。
让我们使用自定义函数提取项目和数量:
my_fun <- function(w) {
items <- stringr::str_split(w, "\\d+", simplify = T)
items <- items[items!=""] # dont now why but you get en empty spot each time
quantities <- stringr::str_split(w, "\\D+", simplify = T)
quantities <- quantities[quantities!=""]
d <- data.frame(item = items, quantity=quantities, stringsAsFactors = F)
return(d)
}
示例:
my_fun("apple2milk5")
# gives:
# item quantity
# 1 apple 2
# 2 milk 5
现在我们可以使用nest
和map
将函数应用于每个id:
library(dplyr)
df_result <- df %>%
nest(item) %>%
mutate(res = purrr::map(data, ~my_fun(.x))) %>%
unnest(res)
df_results
# # A tibble: 9 x 3
# id item quantity
# <int> <chr> <chr>
# 1 1 apple 2
# 2 1 milk 5
# 3 2 milk 1
# 4 3 juice 3
# 5 3 apple 5
# 6 4 egg 10
# 7 4 juice 1
# 8 5 egg 8
# 9 5 milk 2
现在我们可以使用dcast()
(很可能spread
也可以使用):
data.table::dcast(df_result, id~item, value.var="quantity")
# id apple egg juice milk
# 1 1 2 <NA> <NA> 5
# 2 2 <NA> <NA> <NA> 1
# 3 3 5 <NA> 3 <NA>
# 4 4 <NA> 10 1 <NA>
# 5 5 <NA> 8 <NA> 2
数据:
df <- data.frame(id = 1:5, item = c("apple2milk5", "milk1", "juice3apple5", "egg10juice1", "egg8milk2"), stringsAsFactors = F)
答案 1 :(得分:2)
tmp = lapply(strsplit(df$item, "(?<=\\d)(?=\\D)|(?<=\\D)(?=\\d)", perl = TRUE),
function(x) {
d = split(x, 0:1)
setNames(as.numeric(d[[2]]), d[[1]])
})
nm = unique(unlist(lapply(tmp, names)))
cbind(df, do.call(rbind, lapply(tmp, function(x) setNames(x[nm], nm))))
# id item apple milk juice egg
#1 1 apple2milk5 2 5 NA NA
#2 2 milk1 NA 1 NA NA
#3 3 juice3apple5 5 NA 3 NA
#4 4 egg10juice1 NA NA 1 10
#5 5 egg8milk2 NA 2 NA 8
答案 2 :(得分:2)
我刚想出一个tidyverse
解决方案。使用str_extract()
提取数量并将其名称设置为产品名称。然后reduce(bind_rows)
产生预期的结果。
library(tidyverse)
df$item %>%
map(~ set_names(str_extract_all(., "\\d+")[[1]], str_extract_all(., "\\D+")[[1]])) %>%
reduce(bind_rows) %>%
mutate_all(as.numeric) %>%
bind_cols(df, .)
# id item apple milk juice egg
# 1 1 apple2milk5 2 5 NA NA
# 2 2 milk1 NA 1 NA NA
# 3 3 juice3apple5 5 NA 3 NA
# 4 4 egg10juice1 NA NA 1 10
# 5 5 egg8milk2 NA 2 NA 8
答案 3 :(得分:2)
在每个数字子字符串之前放置一个空格,并在其后放置一个换行符。然后使用read.table
和unnest it
读取该数据。最后使用pivot_wider
从长格式转换为宽格式。
library(dplyr)
library(tidyr)
df %>%
mutate(item = gsub("(\\d+)", " \\1\n", item)) %>%
rowwise %>%
mutate(item = list(read.table(text = item, as.is = TRUE))) %>%
ungroup %>%
unnest(item) %>%
pivot_wider(names_from = "V1", values_from = "V2")
给予:
# A tibble: 5 x 5
id apple milk juice egg
<int> <int> <int> <int> <int>
1 1 2 5 NA NA
2 2 NA 1 NA NA
3 3 5 NA 3 NA
4 4 NA NA 1 10
5 5 NA 2 NA 8
这是上述代码的变体,它消除了unnest
。我们用空格,该字符串,另一个空格,id
和换行符替换每个数字字符串。然后使用read.table
读入。注意在%$%
之前使用%>%
而不是read.table
。最后使用pivot_wider
从长格式转换为宽格式。
library(dplyr)
library(magrittr)
library(tidyr)
df %>%
rowwise %>%
mutate(item = gsub("(\\d+)", paste(" \\1", id, "\n"), item)) %$%
read.table(text = item, as.is = TRUE, col.names = c("nm", "no", "id")) %>%
ungroup %>%
pivot_wider(names_from = "nm", values_from = "no")
答案 4 :(得分:2)
大多数以R为基础的R,其中包含stringr
和data.table
的一些输入:
library(stringr)
library(data.table)
cbind(
id = df$id,
rbindlist(
lapply(df$item, function(x) as.list(setNames(str_extract_all(x, "[0-9]+")[[1]], strsplit(x, "[0-9]+")[[1]]))),
fill = TRUE
)
)
id apple milk juice egg
1: 1 2 5 <NA> <NA>
2: 2 <NA> 1 <NA> <NA>
3: 3 5 <NA> 3 <NA>
4: 4 <NA> <NA> 1 10
5: 5 <NA> 2 <NA> 8
答案 5 :(得分:1)
这是基于R和stringr
的简单解决方案:
goods <- unique(unlist(stringr::str_split(df$item, pattern = "[0-9]")))
goods <- goods[goods != ""]
df <- cbind(df$id, sapply(goods,
function(x) stringr::str_extract(df$item, pattern = paste0(x,"[0-9]*"))))
df <- as.data.frame(df)
df[-1] <- lapply(df[-1], function(x) as.numeric(stringr::str_extract(x, pattern = "[0-9]*$")))
names(df)[1] <- "id"
输出
id apple milk juice egg
1 1 2 5 NA NA
2 2 NA 1 NA NA
3 3 5 NA 3 NA
4 4 NA NA 1 10
5 5 NA 2 NA 8
答案 6 :(得分:1)
您可以尝试
library(tidyverse)
library(stringi)
df %>%
mutate(item2 =gsub("[0-9]", " ", df$item)) %>%
mutate(item3 =gsub("[a-z]", " ", df$item)) %>%
mutate_at(vars(item2, item3), ~stringi::stri_extract_all_words(.) %>% map(paste, collapse=",")) %>%
separate_rows(item2, item3, sep = ",") %>%
spread(item2, item3)
id item apple egg juice milk
1 1 apple2milk5 2 <NA> <NA> 5
2 2 milk1 <NA> <NA> <NA> 1
3 3 juice3apple5 5 <NA> 3 <NA>
4 4 egg10juice1 <NA> 10 1 <NA>
5 5 egg8milk2 <NA> 8 <NA> 2
答案 7 :(得分:1)
#replace any digit followed by a character "positive look-ahead assertion" by the digit plus a comma
library(dplyr)
library(tidyr)
df %>% mutate(item=gsub('(\\d+(?=\\D))','\\1,' ,item, perl = TRUE)) %>%
separate_rows(item, sep = ",") %>%
extract(item, into = c('prod','quan'), '(\\D+)(\\d+)') %>%
spread(prod, quan, fill=0)
id apple egg juice milk
1 1 2 0 0 5
2 2 0 0 0 1
3 3 5 0 3 0
4 4 0 10 1 0
5 5 0 8 0 2
答案 8 :(得分:1)
我将再添加一个答案。它与@ASuliman略有不同,但使用了一些较新的tidyr
和一些可爱的正则表达式,使其变得更加简单。
regex技巧是模式"(?<=\\d)\\B(?=[a-z])"
将匹配数字和字母之间的无边界(即空位置),从而使您可以为每种"apple5"
类型的条目创建行。将字母提取到项目列中,将数字提取到计数列中。使用新的pivot_wider
代替spread
,您可以在整形时将这些计数转换为数值。
library(dplyr)
library(tidyr)
df %>%
separate_rows(item, sep = "(?<=\\d)\\B(?=[a-z])") %>%
extract(item, into = c("item", "count"), regex = "^([a-z]+)(\\d+)$") %>%
pivot_wider(names_from = item, values_from = count, values_fn = list(count = as.numeric))
#> # A tibble: 5 x 5
#> id apple milk juice egg
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 5 NA NA
#> 2 2 NA 1 NA NA
#> 3 3 5 NA 3 NA
#> 4 4 NA NA 1 10
#> 5 5 NA 2 NA 8
答案 9 :(得分:0)
更清洁的data.table
解决方案,其中包含stringr
的输入:
df[,
.(it_count = str_extract_all(item, "[0-9]+")[[1]],
it_name = str_extract_all(item, "[^0-9]+")[[1]]),
by = id
][, dcast(.SD, id ~ it_name, value.var = "it_count")]
id apple egg juice milk
1: 1 2 <NA> <NA> 5
2: 2 <NA> <NA> <NA> 1
3: 3 5 <NA> 3 <NA>
4: 4 <NA> 10 1 <NA>
5: 5 <NA> 8 <NA> 2