我有以下数据集
#mydata
Factors Transactions
a,c 2
b 0
c 0
d,a 0
a 1
a 0
b 1
我想计算那些有交易的因素。例如,我们有两次" a"与交易。我可以编写一个代码,分别给出每个变量的理想结果。以下是" a"。
nrow (subset (mydata,mydata$Transaction > 0 & length(mydata[grep("a", mydata$Factors),] )> 0))
但是我有太多的变量而且不想重复所有这些变量的代码。我认为应该有一种方法来编写代码来给我所有变量的结果。我希望得到以下结论:
#Output
a 2
b 1
c 1
d 0
答案 0 :(得分:3)
等效data.table
选项:
library(data.table)
setDT(df)[, .(Factors = unlist(strsplit(as.character(Factors), ","))),
by = Transactions][,.(Transactions = sum(Transactions > 0)), by = Factors]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0
答案 1 :(得分:3)
您可以使用Factor
列的唯一值作为级别创建表格。请将df
视为您的数据集。
s <- strsplit(as.character(df$Factors), ",", fixed = TRUE)
table(factor(unlist(s[df$Transactions > 0]), levels = unique(unlist(s))))
#
# a c b d
# 2 1 1 0
用as.data.frame()
换行数据帧输出。
with(df, {
s <- strsplit(as.character(Factors), ",", fixed = TRUE)
f <- factor(unlist(s[Transactions > 0]), levels = unique(unlist(s)))
as.data.frame(table(Factors = f))
})
# Factors Freq
# 1 a 2
# 2 c 1
# 3 b 1
# 4 d 0
答案 2 :(得分:2)
使用tidyverse包,假设您的数据是字符串/因子和数字,
library(tidyr)
library(dplyr)
# separate factors with two elements
df %>% separate_rows(Factors) %>%
# set grouping for aggregation
group_by(Factors) %>%
# for each group, count how many transactions are greater than 0
summarise(Transactions = sum(Transactions > 0))
## # A tibble: 4 x 2
## Factors Transactions
## <chr> <int>
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
您也可以使用dplyr
来避免使用xtabs
,但需要进行一些清理才能达到相同的安排:
library(tidyr)
df %>% separate_rows(Factors) %>%
xtabs(Transactions > 0 ~ Factors, data = .) %>%
as.data.frame() %>%
setNames(names(df))
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
完全基础R等价物:
df2 <- do.call(rbind,
Map(function(f, t){data.frame(Factors = strsplit(as.character(f), ',')[[1]],
Transactions = t)},
df$Factors, df$Transactions))
df3 <- as.data.frame(xtabs(Transactions > 0 ~ Factors, data = df2))
names(df3) <- names(df)
df3
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
答案 3 :(得分:0)
我们可以使用cSplit
中的splitstackshape
将'因素'拆分为'长'格式,并按'因素'分组,我们得到逻辑列的sum
('Transactions&gt; 0`)。
library(splitstackshape)
cSplit(df1, "Factors", ",", "long")[, .(Transactions=sum(Transactions > 0)),.(Factors)]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0
或使用base R
with(df1, table(factor(unlist(strsplit(Factors[Transactions>0], ",")),
levels = letters[1:4]) ))
# a b c d
# 2 1 1 0
df1 <- structure(list(Factors = c("a,c", "b", "c", "d,a", "a", "a",
"b"), Transactions = c(2L, 0L, 0L, 0L, 1L, 0L, 1L)), .Names = c("Factors",
"Transactions"), class = "data.frame", row.names = c(NA, -7L))