我有一个数据框df
:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
1. userID is factor variable
2. Score is numeric
3. All the 'Task_' features are factor variables with possible values 'Hard', 'Easy', 'Match' or NA
我想为每个userID
创建新的列,其中包含Task_
功能的每种可能状态的出现次数。对于上述玩具示例,所需的输出将是在df
的末尾添加的三个新列,如下所示:
userID Hard Match Easy
3108 0 0 4
3207 1 2 1
3350 3 0 1
3961 2 0 1
4021 1 0 2
更新: 该问题不是重复的问题,原始问题的相关部分已移至: R How to counting the factors in ordered sequence
答案 0 :(得分:4)
您可以在map* 或*apply 函数中,将数据框df 与每个可能的取值('Hard'、'Match'、'Easy')逐一进行比较,计算所得布尔矩阵的行和,然后将输出与原始数据框合并。(原答案的代码示例在此处缺失;具体写法可参考下文答案 4 中的 base R 示例。)
答案 1 :(得分:3)
library(data.table)
# Parse the whitespace-delimited toy table into a data.table.
DT <- fread(text = "userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
")
# Reshape every Task_ column to long form: one row per (userID, task) pair.
DT.melt <- melt(
  DT,
  id.vars = "userID",
  measure.vars = patterns(task = "^Task_")
)
# Cast back to wide form, counting how often each state occurs per userID.
dcast(DT.melt, userID ~ value, fun.aggregate = length)
# userID NA Easy Hard Match
# 1: 3108 0 4 0 0
# 2: 3207 0 1 1 2
# 3: 3350 0 1 3 0
# 4: 3961 1 1 2 0
# 5: 4021 1 2 1 0
答案 2 :(得分:2)
可以通过按行使用apply
来获得对第一部分的答案,并可以使用table
来计算每一行中因子水平的出现情况
# States to count; fixing the levels keeps a column for states a row lacks.
task_levels <- c("Easy", "Hard", "Match")
# Tabulate one row of task values against the fixed set of levels.
count_levels <- function(row_values) {
  table(factor(row_values, levels = task_levels))
}
# Drop userID and Score (columns 1 and 2), count per row, then transpose so
# that rows correspond to users and columns to states.
level_counts <- t(apply(df[-c(1, 2)], 1, count_levels))
cbind(df[1], level_counts)
# userID Easy Hard Match
#1 3108 4 0 0
#2 3207 1 1 2
#3 3350 1 3 0
#4 3961 1 2 0
#5 4021 2 1 0
在tidyverse 中,我们可以先将数据转换为长格式并删除值为NA 的行,再用count 统计每个userID 与value 组合的出现次数,最后把数据转换回宽格式。
library(dplyr)
library(tidyr)
# Long form: one row per (userID, task) pair, dropping missing task values.
task_long <- pivot_longer(
  df,
  cols = starts_with("Task"),
  values_drop_na = TRUE
)
# Count how often each state occurs for each userID.
state_counts <- count(task_long, userID, value)
# Back to wide form: one column per state, 0 where a state never occurred.
pivot_wider(
  state_counts,
  names_from = value,
  values_from = n,
  values_fill = list(n = 0)
)
数据
# Toy data: one row per user, a numeric score, and four task columns stored
# as factors. Each factor only carries the levels listed for that column.
df <- data.frame(
  userID = c(3108L, 3207L, 3350L, 3961L, 4021L),
  Score = c(-8, 3, 5.78, 10, 10),
  Task_Alpha = factor(
    c("Easy", "Hard", "Hard", "Easy", "Easy"),
    levels = c("Easy", "Hard")
  ),
  Task_Beta = factor(
    c("Easy", "Easy", "Easy", NA, "Easy"),
    levels = "Easy"
  ),
  Task_Charlie = factor(
    c("Easy", "Match", "Hard", "Hard", NA),
    levels = c("Easy", "Hard", "Match")
  ),
  Task_Delta = factor(
    c("Easy", "Match", "Hard", "Hard", "Hard"),
    levels = c("Easy", "Hard", "Match")
  )
)
答案 3 :(得分:2)
另一个使用Rfast::rowTabulate
# Possible task states. Each value's position in `v` is the integer code that
# gets tabulated; NA is listed last so missing values get their own count.
v <- c('Hard', 'Match', 'Easy', NA)
# Add one count column per entry of `v` to DT by reference.
DT[, (v) := as.data.table(
  Rfast::rowTabulate(
    # Recode the task columns to positions in `v`, keeping one row per user.
    matrix(match(as.matrix(.SD), v), nrow = .N),
    # Pin the number of output columns to length(v). By default rowTabulate()
    # sizes its output from the largest code present, so data containing no NA
    # would yield fewer columns than there are names in `v`.
    max_number = length(v)
  )
), .SDcols = Task_Alpha:Task_Delta]
输出:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Match Easy NA
1: 3108 -8.00 Easy Easy Easy Easy 0 0 4 0
2: 3207 3.00 Hard Easy Match Match 1 2 1 0
3: 3350 5.78 Hard Easy Hard Hard 3 0 1 0
4: 3961 10.00 Easy <NA> Hard Hard 2 0 1 1
5: 4021 10.00 Easy Easy <NA> Hard 1 0 2 1
来自Wimpel的数据:
library(data.table)
# Toy table, one element per line: a header followed by five users.
toy_lines <- c(
  "userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta",
  "3108 -8.00 Easy Easy Easy Easy",
  "3207 3.00 Hard Easy Match Match",
  "3350 5.78 Hard Easy Hard Hard",
  "3961 10.00 Easy NA Hard Hard",
  "4021 10.00 Easy Easy NA Hard"
)
# Parse the lines into a data.table.
DT <- fread(text = toy_lines)
如果能知道这种方法在实际数据集上的运行速度,以及实际数据集是否很大,会很有意思。
编辑:添加了计时
library(data.table)
# Fix the RNG seed so the simulated benchmark data is reproducible.
set.seed(0L)
# Number of simulated users.
nr <- 1e6
# States to sample from, including NA for missing answers.
v <- c('Hard', 'Match', 'Easy', NA)
# Four task columns, each drawn independently with replacement from `v`.
DT <- data.table(
  userID = seq_len(nr),
  Task_Alpha = sample(v, size = nr, replace = TRUE),
  Task_Beta = sample(v, size = nr, replace = TRUE),
  Task_Charlie = sample(v, size = nr, replace = TRUE),
  Task_Delta = sample(v, size = nr, replace = TRUE)
)
# Plain data.frame copy for the base R method.
df <- as.data.frame(DT)
# Base R method: tabulate each row of the global `df` (minus its first column)
# against the fixed levels Easy/Hard/Match. Returns a matrix with one row per
# row of `df` and one column per level.
mtd0 <- function() {
  lvls <- c("Easy", "Hard", "Match")
  tabulate_row <- function(x) {
    table(factor(x, levels = lvls))
  }
  # apply() yields one column per input row, so transpose afterwards.
  per_row <- apply(df[-1L], 1L, tabulate_row)
  t(per_row)
}
# data.table method: melt the Task_ columns of the global `DT` to long form,
# then cast back to wide form counting occurrences of each value per userID.
mtd1 <- function() {
  long_dt <- melt(
    DT,
    id.vars = "userID",
    measure.vars = patterns(task = "^Task_")
  )
  dcast(long_dt, userID ~ value, fun.aggregate = length)
}
# Rfast method: recode the task columns of the global `DT` to their positions
# in the global `v`, then tabulate those integer codes row by row.
mtd2 <- function() {
  DT[, Rfast::rowTabulate(
    matrix(match(as.matrix(.SD), v), nrow = .N),
    # Pin the number of output columns to length(v). By default rowTabulate()
    # sizes its output from the largest code present, so the result would lose
    # columns whenever the last entries of `v` do not occur in the data.
    max_number = length(v)
  ), .SDcols = Task_Alpha:Task_Delta]
}
# Time the three methods. check = FALSE because their results are not
# identical objects (different classes and column counts).
bench::mark(
  mtd0(),
  mtd1(),
  mtd2(),
  check = FALSE
)
时间:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd0() 54.7s 54.7s 0.0183 137MB 1.70 1 93 54.7s <int[,3] [1,000,000 x 3]> <df[,3] [107,168 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd1() 2.4s 2.4s 0.417 398MB 0.833 1 2 2.4s <df[,5] [1,000,000 x 5]> <df[,3] [12,517 x 3]> <bch:tm> <tibble [1 x 3]>
3 mtd2() 252.8ms 264.4ms 3.78 107MB 3.78 2 2 528.7ms <int[,4] [1,000,000 x 4]> <df[,3] [6,509 x 3]> <bch:tm> <tibble [2 x 3]>
答案 4 :(得分:1)
如果您使用的是base R
,则以下内容可能会对您有所帮助:
# Append one count column per task state ('Hard', 'Match', 'Easy') to df.
states <- c('Hard', 'Match', 'Easy')
# Build a named list of per-row counts instead of relying on sapply(): sapply()
# simplifies to a plain vector when df has a single row, which as.data.frame()
# would then turn into one 3-row column rather than three 1-row columns.
state_counts <- lapply(
  setNames(states, states),
  # df == v is NA wherever a value is missing; na.rm = TRUE skips those cells.
  function(v) rowSums(df == v, na.rm = TRUE)
)
df <- cbind(df, as.data.frame(state_counts))
输出:
> df
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Match Easy
1 3108 -8.00 Easy Easy Easy Easy 0 0 4
2 3207 3.00 Hard Easy Match Match 1 2 1
3 3350 5.78 Hard Easy Hard Hard 3 0 1
4 3961 10.00 Easy <NA> Hard Hard 2 0 1
5 4021 10.00 Easy Easy <NA> Hard 1 0 2