Question

我有一个这样的数据集，其中有一个变量（"item"）包含以逗号分隔的代码：

id  item
1    102, 103,401,
2    108,102,301
3    103, 108 , 405, 505, 708

对于每个id，我想得到每个单独项目的频率，如下所示：

id  102  103   104   108  301 401 ...
1    1    1                    1
2    1                 1    1
3         1            1

我该怎么做？

Answer 1

我们可以使用qdapTools包中的mtabulate执行此操作：

library(qdapTools)
# Split each row's item string on commas (swallowing the spaces around them),
# tabulate the resulting codes per row, and put the id column in front.
cbind(
  dat["id"],
  mtabulate(strsplit(dat[["item"]], "\\s*,\\s*"))
)
#  id 102 103 108 301 401 405 505 708
#1  1   1   1   0   0   1   0   0   0
#2  2   1   0   1   1   0   0   0   0
#3  3   0   1   1   0   0   1   1   1

注意：数据来自@thelatemail的回答。

或其他选项（如果我们需要sparseMatrix）

library(Matrix)
# Split the 'item' column into a list with one character vector of codes
# per row; the regex also swallows any spaces around the commas.
lst <- strsplit(dat$item, "\\s*,\\s*")
# Sorted unique codes across all rows; these define the matrix columns.
Un1 <- sort(unique(unlist(lst)))
# Build the sparse count matrix.
# Row indices are the *positions* of the records (1..n), not the id values,
# so ids that are non-consecutive, unsorted or non-numeric still land on the
# right row; the ids are only used as row names. `dims` pins the shape so a
# record without any code still gets its (all-zero) row. Repeated (row, code)
# pairs are summed by sparseMatrix(), which yields per-row frequencies.
sM <- sparseMatrix(
  i = rep(seq_along(lst), lengths(lst)),
  j = match(unlist(lst), Un1),
  x = 1,
  dims = c(length(lst), length(Un1)),
  dimnames = list(as.character(dat$id), Un1)
)
sM
#    3 x 8 sparse Matrix of class "dgCMatrix"
#   102 103 108 301 401 405 505 708
#1   1   1   .   .   1   .   .   .
#2   1   .   1   1   .   .   .   .
#3   .   1   1   .   .   1   1   1

可以用as.matrix包装，将其转换为普通的matrix：

# Convert the sparse matrix to an ordinary dense matrix; the "." placeholders
# of the sparse print-out show up as explicit zeros.
as.matrix(sM)
#   102 103 108 301 401 405 505 708
#1   1   1   0   0   1   0   0   0
#2   1   0   1   1   0   0   0   0
#3   0   1   1   0   0   1   1   1

Answer 2

先用strsplit拆分，再用factor指定全部水平（levels），以确保每一行的计数都包含所有的列。

# Split every item string into its individual codes (spaces around the
# commas are consumed by the regex).
codes_by_row <- strsplit(dat[["item"]], "\\s*,\\s*")
# Sorted set of all codes seen anywhere; these become the output columns.
all_codes <- sort(unique(unlist(codes_by_row)))

# Tabulate one row's codes against the full level set, so codes that are
# absent from the row are still reported (with a count of 0).
count_codes <- function(codes) {
  table(factor(codes, levels = all_codes))
}

# vapply() yields one column per row of `dat`, hence the transpose.
counts <- t(
  vapply(
    codes_by_row,
    count_codes,
    FUN.VALUE = numeric(length(all_codes))
  )
)
cbind(dat["id"], counts)

#  id 102 103 108 301 401 405 505 708
#1  1   1   1   0   0   1   0   0   0
#2  2   1   0   1   1   0   0   0   0
#3  3   0   1   1   0   0   1   1   1

使用的数据：

# Example data: one record per line, fields separated by ";" so the commas
# inside `item` stay part of the value. `item` is kept as character.
dat <- read.table(
  text = c(
    "id;item",
    "1;102, 103,401,",
    "2;108,102,301",
    "3;103, 108 , 405, 505, 708"
  ),
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)

Answer 3

您可以使用strsplit函数执行此操作。以下是我的解决方案：

library(data.table)
id <- 1:3
item <- c("102, 103,401",  "108,102,301", "103, 108 , 405, 505, 708")
dataT <- data.table(id, item)

# Split each row into its own vector of codes, strip the blanks, and drop
# empty/NA tokens (e.g. the one a stray ",," would otherwise produce).
codesByRow <- lapply(
  strsplit(dataT$item, split = ",", fixed = TRUE),
  function(codes) {
    codes <- gsub(" ", "", codes, fixed = TRUE)
    codes[!is.na(codes) & nzchar(codes)]
  }
)

# Distinct codes in order of first appearance; one output column per code.
reqCol <- unique(unlist(codesByRow))
reqColNames <- paste0("Col_", reqCol)

# Add one count column per code.
# Codes are compared as whole tokens: a regex/substring search on the raw
# string would wrongly report code "10" as present in a row holding "102".
# Each column holds the number of occurrences in the row (the requested
# frequency), which is still 0/1 when a code appears at most once per row.
# set() adds the column by reference without building code from strings.
for (i in seq_along(reqCol)) {
  counts <- vapply(
    codesByRow,
    function(codes) sum(codes == reqCol[i]),
    FUN.VALUE = numeric(1)
  )
  set(dataT, j = reqColNames[i], value = counts)
}

我使用了data.table而不是data.frame，因为与data.frame相比，data.table非常快。

R将数据拆分为频率

3 个答案: