Question

最小例子

我有：input<-data.frame(id=c(1,1,1,2,2,2),A=as.factor(c(1,1,2,2,1,3)),B=as.factor(c(0,1,1,1,0,0)))

我想：output<-data.frame(id=c(1,2), A1=c(2/3,1/3), A2=c(1/3,1/3), A3=c(0/3,1/3), B0=c(1/3,2/3), B1=c(2/3,1/3))

动机

我有一个带有分类数据的数据框。我想把它变成一个数据框，其中包含每个类别的比例计数。在“输出”数据框中，我希望每个变量类别组合（A1，A2等）都有一列。每行给出“id”的相对计数。例如，id = 1在“input”中有三个条目，在变量“A”下有两次类别1。列“A1”应该在该行中显示2/3。除以3，因为id = 1在“input”中有三个条目。

我的开始

relative_counts <- function(input) {
  # Per-id relative frequencies of the levels of `A` (1, 2, 3) and `B` (0, 1).
  #
  # Args:
  #   input: data frame with an `id` column and categorical columns `A`, `B`.
  # Returns:
  #   A data frame with one row per distinct id and the columns
  #   id, A1, A2, A3, B0, B1. Each share is the number of the id's rows that
  #   carry the level, divided by the id's total number of rows.

  # unique(), not duplicated(): duplicated() flags only the repeated
  # occurrences, so it skips single-row ids and revisits ids with > 2 rows.
  ids <- unique(input$id)

  # Preallocate one row per id. Filling a zero-row data frame element by
  # element (`output$A1[count] <- ...`) is rejected by R, because the
  # replacement column would be longer than the data frame.
  shares <- numeric(length(ids))
  output <- data.frame(
    id = ids,
    A1 = shares, A2 = shares, A3 = shares,
    B0 = shares, B1 = shares
  )

  for (k in seq_along(ids)) {
    # Row subsetting needs the trailing comma, and the key column is `id`.
    rows <- input[input$id == ids[k], , drop = FALSE]
    m <- nrow(rows)

    # sum() counts the TRUE matches; NROW() of a logical vector would only
    # return its length, i.e. always m.
    output$A1[k] <- sum(rows$A == 1) / m
    output$A2[k] <- sum(rows$A == 2) / m
    output$A3[k] <- sum(rows$A == 3) / m
    output$B0[k] <- sum(rows$B == 0) / m
    output$B1[k] <- sum(rows$B == 1) / m
  }

  output
}

有什么问题？ - 它太丑了。给定像apply和aggregate这样的函数或像plyr这样的包，应该有更好的（即更短的）解决这个问题的方法。

R不接受空列的输出初始化。
输出的列名不是自动创建的，而是手动创建的。

谢谢！如果我的问题不明确，请告诉我。这是我的第一个问题。

Answer 1

此表达式为每个非ID列（此处为2:3）创建一个表：

# One id-by-level table of proportions per non-id column (columns 2 and 3).
individuals <- lapply(2:3, function(col_idx) {
  var_name <- colnames(input)[col_idx]

  # Cross-tabulate id (column 1) against this column, then scale every id's
  # row so that it sums to 1.
  props <- prop.table(table(input[, c(1, col_idx)]), margin = 1)

  # Prefix each level with the variable name, e.g. "A1" or "B0".
  colnames(props) <- paste0(var_name, colnames(props))

  props
})

individuals
## [[1]]
##    A
## id         A1        A2        A3
##   1 0.6666667 0.3333333 0.0000000
##   2 0.3333333 0.3333333 0.3333333
## 
## [[2]]
##    B
## id         B0        B1
##   1 0.3333333 0.6666667
##   2 0.6666667 0.3333333

现在用 cbind 将它们合并在一起：

# Bind the per-variable tables side by side; the ids are kept as row names
# rather than as a column.
do.call(cbind, individuals)
##          A1        A2        A3        B0        B1
## 1 0.6666667 0.3333333 0.0000000 0.3333333 0.6666667
## 2 0.3333333 0.3333333 0.3333333 0.6666667 0.3333333

id列不存在，但行名称可用于此目的。

Answer 2

这不是一个完整的答案，但应该能帮你起步（之后还需要用 reshape[2] 再做一点重塑）：

# Number of rows per id. A `count(df, "id")` call that yields a `freq` column
# matches plyr::count -- assumes plyr is attached (TODO confirm).
ct <- count(input, "id")

# id-by-level count tables for A (column 2) and B (column 3).
tab_a <- table(input[, c(1, 2)])
tab_b <- table(input[, c(1, 3)])

# Divide each id's row by that id's own row count. The divisor has one entry
# per table row and is recycled down every column. Using the count of one
# fixed id for the whole table is only right when all ids have equally many
# rows.
A <- data.frame(tab_a / ct$freq[match(rownames(tab_a), ct$id)])
B <- data.frame(tab_b / ct$freq[match(rownames(tab_b), ct$id)])

print(A)
  id A      Freq
1  1 1 0.6666667
2  2 1 0.3333333
3  1 2 0.3333333
4  2 2 0.3333333
5  1 3 0.0000000
6  2 3 0.3333333

print(B)
  id B      Freq
1  1 0 0.3333333
2  2 0 0.6666667
3  1 1 0.6666667
4  2 1 0.3333333

Answer 3

以下是可能的解决方案：

library(reshape2)
library(qdap)

# Melt to long format, build a flat contingency table, and take row-wise
# proportions (margin = 1) so that the shares of one variable sum to 1 within
# each id. Without a margin, prop.table() divides by the grand total, which
# yields 2/12 instead of the wanted 2/3.
x <- prop.table(ftable(melt(input, id = "id")), margin = 1)

# Presumably fuses columns 2:3 (variable, value) into the single
# `variablevalue` column used in the dcast() formula below -- see qdap docs.
x2 <- colpaste2df(data.frame(x), 2:3, keep.orig = FALSE, sep = "",
                  name.sep = "")
x3 <- dcast(x2, id ~ variablevalue, value.var = "Freq")

# Keep id plus every column that is not all zero, i.e. drop variable/level
# combinations that never occur in the data.
x3[, c(TRUE, colSums(x3[, -1]) != 0)]

##   id        A1        A2        A3        B0        B1
## 1  1 0.6666667 0.3333333 0.0000000 0.3333333 0.6666667
## 2  2 0.3333333 0.3333333 0.3333333 0.6666667 0.3333333

Answer 4

可以看作是一个数据透视表（或两个数据透视表）：

>install.packages('reshape')
>library(reshape)
> # Number of rows per id (count() yielding a `freq` column, as in plyr).
> ct <- count(input, "id")
> # One pivot table per variable: rows are ids, columns are the levels.
> DF1 <- cast(input, id ~ A, value='B')
> DF2 <- cast(input, id ~ B, value="A")
> # Per-id divisor aligned to the pivot rows. Dividing by ct[1,]$freq or
> # ct[2,]$freq would apply one id's count to every row, which is only right
> # when all ids have equally many rows.
> n_by_id <- ct$freq[match(DF1$id, ct$id)]
> DF3 <- cbind(DF1$id, DF1[names(DF1)!='id']/n_by_id, DF2[names(DF2)!='id']/n_by_id)
> names(DF3) <- c('id', paste('A', names(DF1)[-1], sep=''), paste('B', names(DF2)[-1], sep=''))
> DF3
  id        A1        A2        A3        B0        B1
1  1 0.6666667 0.3333333 0.0000000 0.3333333 0.6666667
2  2 0.3333333 0.3333333 0.3333333 0.6666667 0.3333333

Answer 5

我想这就是你想要的结果。可以按自己的喜好再添加行名或列名。

 # Level-by-id contingency tables: levels in rows, ids in columns.
 tbB <- with(input, table(B, id))
 tbA <- with(input, table(A, id))
 # Normalise within each id (margin = 2, the id columns) so that every id's
 # shares sum to 1, then transpose to get one row per id. Dividing by
 # rowSums() would normalise within each level across ids instead.
 cbind(t(prop.table(tbA, margin = 2)), t(prop.table(tbB, margin = 2)))
 ##           1         2         3         0         1
 ## 1 0.6666667 0.3333333 0.0000000 0.3333333 0.6666667
 ## 2 0.3333333 0.3333333 0.3333333 0.6666667 0.3333333

如何使用R将分类数据转换为相对计数

5 个答案: