我对R来说还是比较新的,如果我的问题看起来太基础,请提前道歉。
我的问题如下:
我有一个包含多个因子变量的数据集,它们具有相同的类别。我需要找到这个类别,这个类别最常出现在因子变量的每个观察中。在关系的情况下,可以选择任意值,尽管如果我可以对其进行更多控制则会很好。
我的数据集包含一百多个因素。但是,结构是这样的:
id <- 1:3
var1 <- c("red","yellow","green")
var2 <- c("red","yellow","green")
var3 <- c("yellow","orange","green")
var4 <- c("orange","green","yellow")
df <- data.frame(cbind(id, var1, var2, var3, var4))
> df
id var1 var2 var3 var4
1 1 red red yellow orange
2 2 yellow yellow orange green
3 3 green green green yellow
解决方案应该是数据框中的变量,例如var5,它包含每行的最常见类别。它可以是因子或数字向量(如果数据需要首先转换为数字向量)
在这种情况下,我想有这个解决方案:
> df$var5
[1] "red" "yellow" "green"
任何建议将不胜感激!提前谢谢!
答案 0 :(得分:15)
类似的东西:
apply(df,1,function(x) names(which.max(table(x))))
[1] "red" "yellow" "green"
如果存在平局,则.max取第一个最大值。来自 which.max帮助页面:
确定位置,即(第一个)的索引 数字向量的最小值或最大值。
前:
var4 <- c("yellow","green","yellow")
df <- data.frame(cbind(id, var1, var2, var3, var4))
> df
id var1 var2 var3 var4
1 1 red red yellow yellow
2 2 yellow yellow orange green
3 3 green green green yellow
apply(df,1,function(x) names(which.max(table(x))))
[1] "red" "yellow" "green"
答案 1 :(得分:1)
如果您的数据非常大,您可能需要考虑使用data.table
包。
# Generate the data
nrow <- 10^5
id <- 1:nrow
colors <- c("red","yellow","green")
var1 <- sample(colors, nrow, replace = TRUE)
var2 <- sample(colors, nrow, replace = TRUE)
var3 <- sample(colors, nrow, replace = TRUE)
var4 <- sample(colors, nrow, replace = TRUE)
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
Chargaff的解决方案很简单,在某些情况下效果很好。使用data.table
可以获得较小的性能提升(约20%)。
df <- data.frame(cbind(id, var1, var2, var3, var4))
system.time(apply(df, 1, Mode))
# user system elapsed
# 1.242 0.018 1.264
library(data.table)
dt <- data.table(cbind(id, var1, var2, var3, var4))
system.time(melt(dt, measure = patterns('var'))[, Mode(value1), by = id])
# user system elapsed
# 1.020 0.012 1.034
答案 2 :(得分:1)
对于内部软件包,我制作了一个rowMode
函数,您可以在其中选择处理关系和缺失值的方法:
rowMode <- function(x, ties = NULL, include.na = FALSE) {
# input checks data
if ( !(is.matrix(x) | is.data.frame(x)) ) {
stop("Your data is not a matrix or a data.frame.")
}
# input checks ties method
if ( !is.null(ties) && !(ties %in% c("random", "first", "last")) ) {
stop("Your ties method is not one of 'random', 'first' or 'last'.")
}
# set ties method to 'random' if not specified
if ( is.null(ties) ) ties <- "random"
# create row frequency table
rft <- table(c(row(x)), unlist(x), useNA = c("no","ifany")[1L + include.na])
# get the mode for each row
colnames(rft)[max.col(rft, ties.method = ties)]
}
几个可能的输出(基于不同的参数选项):
> rowMode(DF[,-1]) [1] "B" "E" "B" "E" "B" "C" "B" "E" "A" "E" > rowMode(DF[,-1], ties = "first") [1] "B" "B" "B" "A" "B" "C" "B" "E" "A" "E" > rowMode(DF[,-1], ties = "first", include.na = TRUE) [1] "B" NA "B" NA "B" "C" "B" "E" "A" "E" > rowMode(DF[,-1], ties = "last", include.na = TRUE) [1] "B" NA NA NA "B" "C" "B" "E" "D" "E" > rowMode(DF[,-1], ties = "last") [1] "B" "C" "B" "E" "B" "C" "B" "E" "D" "E"
使用的数据:
set.seed(2020)
DF <- data.frame(id = 1:10, matrix(sample(c(LETTERS[1:5], NA_character_), 60, TRUE), ncol = 6))
答案 3 :(得分:0)
这是另一个基本的R选项:
tab <- table(data.frame(as.vector(row(df[,-1L])), unlist(df[,-1L])))
colnames(tab)[max.col(tab, "first")]
或另一种data.table
方法:
melt(as.data.table(df), id.vars="id")[
order(id, value), ri := rowid(rleid(value))][,
value[which.max(ri)], id]$V1
计时代码:
library(data.table)
set.seed(0L)
nr <- 1e5L
nc <- 4L
DF <- data.frame(id=1L:nr, as.data.frame(matrix(sample(letters, nr*nc, TRUE), ncol=nc)))
DT <- as.data.table(DF)
mtd0 <- function(df) apply(df,1,function(x) names(which.max(table(x))))
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
mtd_dt <- function(dt) melt(dt, id.vars="id")[, Mode(value), id]$V1
mtd_dt2 <- function(dt) melt(dt, id.vars="id")[
order(id, value), ri := rowid(rleid(value))][,
value[which.max(ri)], id]$V1
mtd2 <- function(df) {
tab <- table(data.frame(as.vector(row(df[,-1L])), unlist(df[,-1L])))
colnames(tab)[max.col(tab, "first")]
}
df = data.frame(id = 1:3,
var1 = c("red","yellow","green"),
var2 = c("red","yellow","green"),
var3 = c("yellow","orange","green"),
var4 = c("orange","green","yellow"))
a0 <- mtd0(df)
identical(a0, mtd_dt(as.data.table(df)))
#[1] TRUE
identical(a0, mtd2(df))
#[1] TRUE
identical(a0, mtd_dt2(as.data.table(df)))
#[1] TRUE
microbenchmark::microbenchmark(times=1L, mtd0(DF), mtd_dt(DT), mtd_dt2(DT), mtd2(DF))
时间:
Unit: milliseconds
expr min lq mean median uq max neval
mtd0(DF) 10083.9941 10083.9941 10083.9941 10083.9941 10083.9941 10083.9941 1
mtd_dt(DT) 1056.2319 1056.2319 1056.2319 1056.2319 1056.2319 1056.2319 1
mtd_dt2(DT) 168.6183 168.6183 168.6183 168.6183 168.6183 168.6183 1
mtd2(DF) 519.2030 519.2030 519.2030 519.2030 519.2030 519.2030 1