Question

我有这样的杂货篮数据：

 # Sample basket data: one row per basket (v1..v4), one item per cell.
 # Baskets with fewer than four items are padded with "" at the end of the row.
 rbind(v1 = c("fruit", "semi-finished bread", "margarine", "ready soups"), 
       v2 = c("fruit", "yogurt", "coffee",""),
       v3 = c("whole milk","","",""), 
       v4 = c("fruit", "yogurt", "cream cheese", "meat spreads"))
...
   [,1]         [,2]                  [,3]           [,4]          
v1 "fruit"      "semi-finished bread" "margarine"    "ready soups" 
v2 "fruit"      "yogurt"              "coffee"       ""            
v3 "whole milk" ""                    ""             ""            
v4 "fruit"      "yogurt"              "cream cheese" "meat spreads"
...

如何将该表转换为每个项目占一列的表格，并用0/1表示该购物篮是否包含此项目？

fruit;semi-finished bread;margarine;ready soups;yogurt;coffee;whole milk;cream cheese ;meat spreads
1;1;1;1;0;0;0;0;0
1;0;0;0;1;1;0;0;0
0;0;0;0;0;0;1;0;0
1;0;0;0;1;0;0;1;1
...

*编辑：原始数据的难点在于它是无序的（水果有时在第一列，有时在第二列，依此类推）。空"插槽"（即当物品数量 < 每个篮子的最大物品数量时）位于行的末尾。

首先需要确定列（即数据集中出现的所有项目），然后标出每个篮子中包含的项目。总共可能有1000多个项目和100万行，因此需要尽量减少手动工作。

Answer 1

使用数据

JSON

以下函数接受由篮子项目组成的字符向量，并创建一个关联矩阵

lblDate.text = [[dict valueForKey:@"data"] valueForKey:@"created"]

结果是

# Basket data: one string per basket, items separated by ";".
# Some baskets end with a trailing ";" and one item ("cream cheese ")
# carries a trailing space.
x <- c(
  "fruit;semi-finished bread;margarine;ready soups",
  "fruit;yogurt;coffee;",
  "whole milk;",
  "fruit;yogurt;cream cheese ;meat spreads"
)

该方法不使用循环，因此可以很好地扩展。

有1000个项目和100万行时，矩阵将很大（十亿个元素）并且非常稀疏。此时更合理的做法是使用稀疏矩阵，并直接使用行索引（ridx）和列索引（cidx），而不是构造完整的矩阵。

密集矩阵版本的函数：

baskets2incidence <- function(baskets) {
    ## process to computable elements
    baskets <- strsplit(baskets, " *; *") # each basket a vector of items
    items <- unique(unlist(baskets))

    ## pre-allocate the result matrix
    m <- matrix(0, length(baskets), length(items), dimnames=list(NULL, items))

    ## row and column index of basket items
    ridx <- rep(seq_along(baskets), lengths(baskets))
    cidx <- match(unlist(baskets), items)

    ## update the result matrix -- a 2-column matrix can serve as index
    m[matrix(c(ridx, cidx), ncol=2)] <- 1
    m
}

其结果：

> baskets2incidence(x)
     fruit semi-finished bread margarine ready soups yogurt coffee whole milk
[1,]     1                   1         1           1      0      0          0
[2,]     1                   0         0           0      1      1          0
[3,]     0                   0         0           0      0      0          1
[4,]     1                   0         0           0      1      0          0
     cream cheese meat spreads
[1,]            0            0
[2,]            0            0
[3,]            0            0
[4,]            1            1

ridx

更新的功能是

cidx

，结果是

## Fragment of the sparse-matrix variant; `baskets`, `items`, `ridx` and
## `cidx` are not defined here and must already exist.
library(Matrix)
## Pre-allocate an all-zero sparse matrix: one row per basket, one named
## column per item.
m <- Matrix(0, length(baskets), length(items), dimnames=list(NULL, items),
                sparse=TRUE)

## A two-column (row, column) index matrix sets every listed cell to 1.
m[matrix(c(ridx, cidx), ncol=2)] <- 1

对于样本数据，我建议将其读入一个篮子列表而不是数据框，例如，

## Build a sparse basket-by-item incidence matrix.
##
## baskets: character vector, one string per basket, items separated by ";"
##          (spaces around the separator are ignored).
## Returns: a sparse numeric matrix (Matrix package) with one row per basket
##          and one named column per distinct item, in order of first
##          appearance; a cell is 1 when the basket contains the item, else 0.
##
## Empty slots (e.g. from "a;;b" or a trailing ";"), NA baskets and
## whitespace-only items are ignored, so they never become a column.
baskets2incidence <- function(baskets) {
    ## process to computable elements: each basket a vector of items
    baskets <- strsplit(baskets, " *; *")

    ## trim stray whitespace at the string ends, drop empty / NA slots and
    ## de-duplicate within a basket (duplicate (i, j) pairs would otherwise
    ## be summed by sparseMatrix)
    baskets <- lapply(baskets, function(basket) {
        basket <- trimws(basket)
        unique(basket[!is.na(basket) & nzchar(basket)])
    })
    items <- unique(unlist(baskets))

    ## row and column index of basket items
    ridx <- rep(seq_along(baskets), lengths(baskets))
    cidx <- match(unlist(baskets), items)

    ## construct the sparse matrix directly from its non-zero coordinates
    ## instead of assigning cell by cell into an empty sparse matrix
    Matrix::sparseMatrix(i = ridx, j = cidx, x = rep(1, length(ridx)),
                         dims = c(length(baskets), length(items)),
                         dimnames = list(NULL, items))
}

Answer 2

我们可以使用 qdapTools 包中的 mtabulate

library(qdapTools)
# t(df1) puts each basket in its own column before tabulating; `!!` turns the
# resulting counts into logicals and the unary `+` turns those into 0/1.
# NOTE(review): the "V5" column in the output below appears to correspond to
# the empty-string padding ("") in df1 -- confirm and drop it if unwanted.
+(!!mtabulate(as.data.frame(t(df1))))
#  fruit margarine ready soups semi-finished bread V5 coffee yogurt whole milk
#V1     1         1           1                   1  0      0      0          0
#V2     1         0           0                   0  1      1      1          0
#V3     0         0           0                   0  1      0      0          1
#V4     1         0           0                   0  0      0      1          0
#   cream cheese meat spreads
#V1            0            0
#V2            0            0
#V3            0            0
#V4            1            1

如果数据是 @nicola 的帖子中示例所示的 vector

# Split each basket string on ";" and tabulate the resulting list of item
# vectors (`x` is the character vector of basket strings defined elsewhere).
mtabulate(strsplit(x, ';'))

数据

 # Basket data as a data frame: one row per basket, columns v1..v4 hold the
 # item slots, with "" marking an unused slot.
 df1 <- data.frame(
   v1 = c("fruit", "fruit", "whole milk", "fruit"),
   v2 = c("semi-finished bread", "yogurt", "", "yogurt"),
   v3 = c("margarine", "coffee", "", "cream cheese"),
   v4 = c("ready soups", "", "", "meat spreads"),
   stringsAsFactors = FALSE
 )

Answer 3

您可以尝试：

# Split every basket string into its items on the literal ";" separator.
elements <- strsplit(x, split = ";", fixed = TRUE)
# Columns are the distinct items, in order of first appearance.
columns <- unique(unlist(elements))
# One 0/1 indicator row per basket, stacked into a matrix.
res <- do.call(
  rbind,
  lapply(elements, function(basket) {
    as.integer(is.element(columns, basket))
  })
)
colnames(res) <- columns
# Print `res` as semicolon-separated text on the console (pass a file name to
# write.csv2 to save it instead).
write.csv2(res, row.names = FALSE, quote = FALSE)
#fruit;semi-finished bread;margarine;ready soups;yogurt;coffee;whole milk;cream cheese ;meat spreads
#1;1;1;1;0;0;0;0;0
#1;0;0;0;1;1;0;0;0
#0;0;0;0;0;0;1;0;0
#1;0;0;0;1;0;0;1;1

数据

# Basket data: one string per basket, items separated by ";".
x <- c(
  "fruit;semi-finished bread;margarine;ready soups",
  "fruit;yogurt;coffee;",
  "whole milk;",
  "fruit;yogurt;cream cheese ;meat spreads"
)

Answer 4

我确实喜欢mtabulate的简洁，但我已经拼凑出了下面这个方案，所以：

txt <- "fruit;semi-finished bread;margarine;ready soups
fruit;yogurt;coffee;
whole milk;
fruit;yogurt;cream cheese ;meat spreads"

# One character vector of items per basket (one basket per line of `txt`).
# Empty fields, such as the one a stray ";" would create, are dropped so that
# "" never becomes an item of its own.
baskets <- strsplit(strsplit(txt, "\n", fixed = TRUE)[[1]], ";", fixed = TRUE)
baskets <- lapply(baskets, function(items) items[nzchar(items)])

# Distinct items in order of first appearance; these are the output columns.
foods <- unique(unlist(baskets))

# Header line followed by one "0;1;..." indicator line per basket, built in a
# single pass instead of growing `out` inside a loop.
out <- c(
  paste0(foods, collapse = ";"),
  vapply(
    baskets,
    function(items) paste(as.numeric(foods %in% items), collapse = ";"),
    character(1)
  )
)
out
####---
# [1] "fruit;semi-finished bread;margarine;ready soups;yogurt;coffee;whole milk;cream cheese ;meat spreads"
# [2] "1;1;1;1;0;0;0;0;0"
# [3] "1;0;0;0;1;1;0;0;0"
# [4] "0;0;0;0;0;0;1;0;0"
# [5] "1;0;0;0;1;0;0;1;1"

R中矩阵的不等长向量

4 个答案:

数据