我正在处理一个数据集,其中源名称由变量前面的2个字母缩写指定。因此,源AA中的所有变量都以AA_var1
开头,而源bb的变量bb_variable_name_2
。实际上有很多来源和很多变量名称,但我只留下2作为一个最小的例子。
我想为任何行创建一个均值变量,其中源的数量,即该行上的数据不是NA的唯一前缀的数量,大于1.如果只有一个源,我希望那个总变量是NA。
因此,例如,我的数据如下所示:
> head(df)
AA_var1 AA_var2 myid bb_meow bb_A_v1
1 NA NA 123456 10 12
2 NA 10 194200 12 NA
3 12 10 132200 NA NA
4 12 NA 132201 NA 12
5 NA NA 132202 NA NA
6 12 13 132203 14 NA
我想要以下内容:
> head(df)
AA_var1 AA_var2 myid bb_meow bb_A_v1 rowMeanIfDiverseData
1 NA NA 123456 10 12 NA #has only bb
2 NA 10 194200 12 NA 11 #has AA and bb
3 12 10 132200 NA NA NA #has only AA
4 12 NA 132201 NA 12 12 #has AA and bb
5 NA NA 132202 NA NA NA #has neither
6 12 13 132203 14 NA 13 #has AA and bb
通常,我只是使用rowMeans()
来做这种事情。但是,仅选择变量名称遵循约定/行级别的行的附加子集使我在项目级别和我习惯的一般应用级别语句之间感到困惑。
我可以在数据帧级别获取前缀:
mynames <- names(df[!names(df) %in% c("myid")])
tmp <- str_extract(mynames, perl("[A-Za-z]{2}(?=_)"))
uniq <- unique(tmp[!is.na(tmp)])
所以,
> uniq
[1] "AA" "bb"
所以,我可以把它作为一个我可以应用于df的函数:
multiSource <- function(x){
nm = names(x[!names(x) %in% badnames]) # exclude c("myid")
tmp <- str_extract(nm, perl("[A-Za-z]{2}(?=_)")) # get prefixes
uniq <- unique(tmp[!is.na(tmp)]) # ensure unique and not NA
if (length(uniq) > 1){
return(T)
} else {
return(F)
}
}
但这显然很混乱,仍然获得数据集级别,即:
> lapply(df,multiSource)
$AA_var1
[1] FALSE
$AA_var2
[1] FALSE
$bb_meow
[1] FALSE
$bb_A_v1
[1] FALSE
和...
> apply(df,MARGIN=1,FUN=multiSource)
为所有人提供TRUE。
我不想说...
df$rowMean <- rowMeans(df, na.rm=T)
# so, in this case
rowMeansIfTest <- function(X,test) {
# is this row muliSource True?
# if yes, return(rowMeans(X))
# else return(NA)
}
df$rowMeanIfDiverseData <- rowMeansIfTest(df, test=multiSource)
但我不清楚如何在没有某种for循环的情况下做到这一点。
答案 0 :(得分:3)
此处的策略是将数据框按列拆分为变量组,并为每行标识是否存在非NA值。然后我们检查rowsums
以确保至少有两个变量具有行的非NA值,如果是,则使用cbind
添加这些值的平均值。
这将推广到任意数量的列,只要它们以AA_varXXX格式命名,并且只要不在该格式中的唯一列是myid
。如果不是严格的话,很容易修复,但这些是现在编写的代码的限制。
df.dat <- df[!names(df) == "myid"]
diverse.rows <- rowSums(
sapply(
split.default(df.dat, gsub("^([A-Z]{2})_var.*", "\\1", names(df.dat))),
function(x) apply(x, 1, function(y) any(!is.na(y)))
) ) > 1
cbind(df, div.mean=ifelse(diverse.rows, rowMeans(df.dat, na.rm=T), NA))
产地:
AA_var1 AA_var2 myid BB_var3 BB_var4 div.mean
1 NA NA 123456 10 12 NA
2 NA 10 194200 12 NA 11
3 12 10 132200 NA NA NA
4 12 NA 132201 NA 12 12
5 NA NA 132202 NA NA NA
6 12 13 132203 14 NA 13
答案 1 :(得分:1)
这个解决方案对我来说似乎有点费解,所以可能有更好的方法,但它应该适合你。
# Here's your data:
df <- data.frame(AA_var1 = c(NA,NA,12,12,NA,12),
AA_var2 = c(NA,10,10,NA,NA,13),
BB_var3 = c(10,12,NA,NA,NA,14),
BB_var4 = c(12,NA,NA,12,NA,NA))
# calculate rowMeans for each subset of variables
a <- rowMeans(df[,grepl('AA',names(df))], na.rm=TRUE)
b <- rowMeans(df[,grepl('BB',names(df))], na.rm=TRUE)
# count non-missing values for each subset of variables
a2 <- rowSums(!is.na(df[,grepl('AA',names(df))]), na.rm=TRUE)
b2 <- rowSums(!is.na(df[,grepl('BB',names(df))]), na.rm=TRUE)
# calculate means:
rowSums(cbind(a*a2,b*b2)) /
rowSums(!is.na(df[,grepl('[AA]|[BB]',names(df))]), na.rm=TRUE)
结果:
> df$rowMeanIfDiverseData <- rowSums(cbind(a*a2,b*b2)) /
+ rowSums(!is.na(df[,grepl('[AA]|[BB]',names(df))]), na.rm=TRUE)
> df
AA_var1 AA_var2 BB_var3 BB_var4 rowMeanIfDiverseData
1 NA NA 10 12 NaN
2 NA 10 12 NA 11
3 12 10 NA NA NaN
4 12 NA NA 12 12
5 NA NA NA NA NaN
6 12 13 14 NA 13
稍微清理以完全符合您的预期输出:
> df$rowMeanIfDiverseData[is.nan(df$rowMeanIfDiverseData)] <- NA
> df
AA_var1 AA_var2 BB_var3 BB_var4 rowMeanIfDiverseData
1 NA NA 10 12 NA
2 NA 10 12 NA 11
3 12 10 NA NA NA
4 12 NA NA 12 12
5 NA NA NA NA NA
6 12 13 14 NA 13
答案 2 :(得分:1)
我的尝试,有点啰嗦......
dat<-data.frame(AA_var1=c(NA,NA,12,12,NA,12),
AA_var2=c(NA,10,10,NA,NA,13),
myid=1:6,
BB_var3=c(10,12,NA,NA,NA,14),
BB_var4=c(12,NA,NA,12,NA,NA))
#what columns are associated with variables used in our mean
varcols<-grep("*var[1-9]",names(dat),value=T)
#which rows have the requisite diversification of non-nulls
#i assume these columns will start with capitals and folloowed by underscore
meanrow<-apply(!is.na(dat[,varcols]),1,function(x){n<-varcols[x]
1<length(unique(regmatches(n,regexpr("[A-Z]+_",n))))
})
#do the row mean for all
dat$meanval<-rowMeans(dat[,varcols],na.rm=T)
#null out for those without diversification (i.e. !meanrow)
dat[!meanrow,"meanval"]<-NA
答案 3 :(得分:1)
我认为一些答案使得这看起来更复杂。这样就可以了:
df$means = ifelse(rowSums(!is.na(df[, grep('AA_var', names(df))])) &
rowSums(!is.na(df[, grep('BB_var', names(df))])),
rowMeans(df[, grep('_var', names(df))], na.rm = T), NA)
# AA_var1 AA_var2 myid BB_var3 BB_var4 means
#1 NA NA 123456 10 12 NA
#2 NA 10 194200 12 NA 11
#3 12 10 132200 NA NA NA
#4 12 NA 132201 NA 12 12
#5 NA NA 132202 NA NA NA
#6 12 13 132203 14 NA 13
以下是给出评论的上述概括,假设有唯一的id(如果不是,则创建一个唯一的索引):
library(data.table)
library(reshape2)
dt = data.table(df)
setkey(dt, myid) # not strictly necessary, but makes life easier
# find the conditional
cond = melt(dt, id.var = 'myid')[,
sum(!is.na(value)), by = list(myid, sub('_var.*', '', variable))][,
all(V1 != 0), keyby = myid]$V1
# fill in the means (could also do a join, but will rely on ordering instead)
dt[cond, means := rowMeans(.SD, na.rm = T), .SDcols = grep('_var', names(dt))]
dt
# AA_var1 AA_var2 myid BB_var3 BB_var4 means
#1: NA NA 123456 10 12 NA
#2: 12 10 132200 NA NA NA
#3: 12 NA 132201 NA 12 12
#4: NA NA 132202 NA NA NA
#5: 12 13 132203 14 NA 13
#6: NA 10 194200 12 NA 11
答案 4 :(得分:0)
fun <- function(x) {
MEAN <- mean(c(x[1], x[2], x[4], x[5]), na.rm=TRUE)
CHECK <- sum(!is.na(c(x[1], x[2]))) > 0 & sum(!is.na(c(x[4], x[5])) > 0)
MEAN * ifelse(CHECK, 1, NaN)
}
df$rowMeanIfDiverseData <- apply(df, 1, fun)
df
AA_var1 AA_var2 myid BB_var3 BB_var4 rowMeanIfDiverseData
1 NA NA 123456 10 12 NaN
2 NA 10 194200 12 NA 11
3 12 10 132200 NA NA NaN
4 12 NA 132201 NA 12 12
5 NA NA 132202 NA NA NaN
6 12 13 132203 14 NA 13