我的数据包含 3 列:- 分段(Segment) - 类别(Category) - 产品编号(Product number)
如何创建一个交叉表(“分段”作为列,“类别”作为行),用来统计每个单元格中“产品编号”的唯一值个数(示例如下)?
SEG1 SEG2 SEG3
CAT1 X
CAT2
CAT3
X - 来自CAT1和SEG1的唯一值的数量
示例数据:
# Sample data from the question: one category, segment and product id per
# record (10 records in total).
CAT <- c(
  "CAT1", "CAT3", "CAT3", "CAT1", "CAT2",
  "CAT3", "CAT3", "CAT3", "CAT3", "CAT2"
)
SEG <- c(
  "SEG1", "SEG3", "SEG3", "SEG2", "SEG2",
  "SEG2", "SEG3", "SEG3", "SEG2", "SEG2"
)
PRODUCT <- c("a", "a", "a", "a", "d", "e", "b", "c", "a", "a")
# cbind() on character vectors produces a character matrix (not a data
# frame) with one column per vector.
data <- cbind(CAT = CAT, SEG = SEG, PRODUCT = PRODUCT)
提前非常感谢!祝好,鲍尔泰克
答案 0 :(得分:2)
您可以先去掉重复的行,再对数据做交叉计数,这样就能确保每个产品编号在同一单元格中只被统计一次:
# Keep only the first occurrence of each (CAT, SEG, PRODUCT) row, so that a
# product contributes at most once to any category/segment cell, then
# cross-tabulate the remaining rows.
nodup <- which(!duplicated(data))
deduped <- data[nodup, , drop = FALSE]
table(deduped[, "CAT"], deduped[, "SEG"])
SEG1 SEG2 SEG3
CAT1 1 1 0
CAT2 0 2 0
CAT3 0 2 3
答案 1 :(得分:0)
# Build a 10-row example data frame with three id values (repeated 5, 3 and
# 2 times), a year per row, and a randomly sampled month. The seed is set
# first so the sampled months are repeatable.
> set.seed(1)
> mydf <- data.frame(
+ Values = rep(c("111", "222", "333"), times = c(5, 3, 2)),
+ Year = c(rep(c("1999", "2000"), times = c(3, 2)),
+ "1999", "1999", "2000", "2000", "2000"),
+ Month = sample(c("Jan", "Feb", "Mar"), 10, replace = TRUE)
+ )
> mydf
Values Year Month
1 111 1999 Jan
2 111 1999 Feb
3 111 1999 Feb
4 111 2000 Mar
5 111 2000 Jan
6 222 1999 Mar
7 222 1999 Mar
8 222 2000 Feb
9 333 2000 Feb
10 333 2000 Jan
# For every (Values, Year) combination, count how many distinct Month values
# occur. Combinations that have no rows come back as NA (333/1999 below).
> with(mydf, tapply(Month, list(Values, Year), FUN = function(x) length(unique(x))))
1999 2000
111 2 2
222 1 1
333 NA 2
>
对于你的例子:
> data
CAT SEG PRODUCT
1 CAT1 SEG1 a
2 CAT3 SEG3 a
3 CAT3 SEG3 a
4 CAT1 SEG2 a
5 CAT2 SEG2 d
6 CAT3 SEG2 e
7 CAT3 SEG3 b
8 CAT3 SEG3 c
9 CAT3 SEG2 a
10 CAT2 SEG2 a
# Same approach on the question's data: number of distinct PRODUCT values per
# (CAT, SEG) cell, with NA for combinations that never occur.
# NOTE(review): with() needs `data` to be a data frame, as printed above; the
# character matrix built with cbind() in the question would presumably have
# to be converted first (e.g. as.data.frame(data)).
> with(data, tapply(PRODUCT, list(CAT, SEG), FUN = function(x) length(unique(x))))
SEG1 SEG2 SEG3
CAT1 1 1 NA
CAT2 NA 2 NA
CAT3 NA 2 3
答案 2 :(得分:0)
library(plyr)
library(reshape)
# Make sure `data` is a data frame so its columns can be referenced by name.
data <- data.frame(data)
# One row per (CAT, SEG) pair, holding the number of distinct products.
a <- ddply(data, .(CAT, SEG), summarize, unq = length(unique(PRODUCT)))
# Reshape to wide form: categories as rows, segments as columns.
b <- cast(a, CAT ~ SEG, fun.aggregate = mean)
这将在唯一值计数 = 0 的位置生成 NaN。
答案 3 :(得分:0)
如果您使用的是 data.table,可以显著加快对较大数据框的操作。例如:
library(data.table)
library(reshape)
# Convert the input to a data.table so the grouped `[` syntax below applies.
DF <- data.table(DF)
# Count the distinct product numbers within each (Segment, Category) group.
# The column is referenced by its bare name rather than DF$Product_Number:
# inside `j` the bare name is evaluated per `by` group, whereas the DF$ form
# would always see the whole column and give every group the same count.
DF_agg <- DF[,
  list(count_prod = length(unique(Product_Number))),
  by = c("Segment", "Category")
]
# Spread the per-group counts into a wide Segment x Category table. Each cell
# holds a single count, so `sum` just passes it through. Use
# Category ~ Segment instead if categories should be the rows.
DF_agg <- cast(DF_agg, Segment ~ Category, sum)
答案 4 :(得分:0)
使用 dplyr 和 tidyr 包的快速解决方案。
library(dplyr)
library(tidyr)
# Sample data from the question, this time assembled as a data frame with
# one row per record.
CAT <- c(
  "CAT1", "CAT3", "CAT3", "CAT1", "CAT2",
  "CAT3", "CAT3", "CAT3", "CAT3", "CAT2"
)
SEG <- c(
  "SEG1", "SEG3", "SEG3", "SEG2", "SEG2",
  "SEG2", "SEG3", "SEG3", "SEG2", "SEG2"
)
PRODUCT <- c("a", "a", "a", "a", "d", "e", "b", "c", "a", "a")
data <- data.frame(CAT = CAT, SEG = SEG, PRODUCT = PRODUCT)
# Elegant solution with pipes (%>%)
# Count the distinct products per (CAT, SEG) pair, then pivot so that
# categories are rows and segments are columns, which is the layout the
# question asks for. `.groups = "drop"` returns an ungrouped result, and
# `values_fill = 0L` reports pairs that never occur as 0 instead of NA.
data %>%
  group_by(CAT, SEG) %>%
  summarize(uni.prod = n_distinct(PRODUCT), .groups = "drop") %>%
  pivot_wider(
    names_from = SEG,
    values_from = uni.prod,
    values_fill = 0L
  )

# Solution without pipes
# Same three steps, with each intermediate result stored in a variable.
groups <- group_by(data, CAT, SEG)
s <- summarize(groups, uni.prod = n_distinct(PRODUCT), .groups = "drop")
pivot_wider(s, names_from = SEG, values_from = uni.prod, values_fill = 0L)
答案 5 :(得分:0)
如果您的数据在数据框中,这里还有几个建议。也可以使用 dplyr::n_distinct 代替下面定义的 n_unique。
# Rebuild `data` as a data frame in which the grouping variables are factors
# and the value variable is character.
data <- data.frame(
  CAT = factor(CAT),
  SEG = factor(SEG),
  PRODUCT = as.character(PRODUCT)
)
#' Count the distinct values in `x`.
#'
#' @param x A vector.
#' @return The number of unique elements of `x`, as an integer.
n_unique <- function(x) {
  distinct_values <- unique(x)
  length(distinct_values)
}
使用基础 R
# Step 1: collapse to one row per (CAT, SEG) pair; after this the PRODUCT
# column holds the number of distinct products for that pair.
dat_counts <- aggregate(
  PRODUCT ~ CAT + SEG,
  data = data,
  FUN = n_unique
)
# Step 2: lay those counts out as a CAT x SEG contingency table.
xtabs(
  PRODUCT ~ CAT + SEG,
  data = dat_counts
)
使用 tables 包。注意,分组变量(CAT 和 SEG)需要是因子,值变量(PRODUCT)需要是字符型变量。
# Formula layout: CAT on the row side, SEG on the column side, with n_unique
# applied to PRODUCT.
tables::tabular(CAT ~ SEG * PRODUCT * n_unique, data = data)
这些方法适用于小型数据集;对于大型数据集,其他方法会更快。