目标是根据已知类别的数据为没有它们的行创建类别(cat
),这些数据用于构建data
2和4的cat
范围,见表myrange
。
在我的策略中,在myrange
df中生成变体的类别范围后,我使用了mapply。转换为数据帧后,我在列或行中得到mapply
的结果。为了将结果附加到原始数据帧,我创建了两个if
语句,以便始终在列中获取结果,我想知道是否有更好的方法来处理它。将新列附加到原始数据框后,将创建一个新的完整列(无NA)类别。
目标是使用data.table或dplyr简化函数以避免mapply,只需根据mylogicm
数据框和逻辑中的数据附加myrange
中的新列。在mapply
函数中。
text<-(" cat data
1 NA 1.50
5 2 1.08
6 2 1.10
7 2 1.30
9 2 1.40
10 2 1.41
11 2 1.45
12 2 1.50
13 2 1.56
14 2 1.56
20 4 2.55
21 4 2.71")
mydf1<-read.table(text = text, stringsAsFactors = F)
mydf2<-mydf1[which(mydf1$cat %in% 4==F),]
library(dplyr)
rangefunction<-function(df,colgroup,varcount){
#this creates ranges (cats) based on the observations
myrange<-as.data.frame(df %>% group_by_(colgroup) %>%
summarise_(MinValue = lazyeval::interp(~min(var), var=as.name(varcount)),
MaxValue = lazyeval::interp(~max(var), var=as.name(varcount))
) )
# Handling of NA categories
myrangeb<-myrange[complete.cases(myrange),]
if(nrow(myrangeb)==0){
myrange[is.na(myrange[,colgroup]),][,colgroup]<-2
}
myrange<-myrange[complete.cases(myrange),]
# Assigning 0 as MinValue
if(nrow(myrange[myrange[,colgroup]==2,])>0){
myrange[myrange[,colgroup]==2,]$MinValue<-0}
# Checking range overlap
if(nrow(myrange)>1){
for (i in 1:(nrow(myrange)-1) ) {#cat
if(myrange$MaxValue[i]>myrange$MinValue[i+1]){
print(paste("warning",i,"max > min i+1") )
}
}
}
# this is intended to evaluate the values in the data column of dfs to assign
# them to the ranges created in the previous step. This is useful for data with
#NA in the cat column (not shown).
mylogicm<-as.data.frame(mapply(function(x) {x >= myrange$MinValue & x<= myrange$MaxValue}, x=df[,varcount] ) )
# transpose to homogenize because
# mylogicm # for mydf1 in several columns while for in mydf2 several rows
if(nrow(df)==nrow(mylogicm))
{
colnames(mylogicm) <- myrange[,colgroup]
}
if(nrow(df)==ncol(mylogicm))
{
mylogicm<-t(mylogicm)
colnames(mylogicm) <- myrange[,colgroup]
}
# transform logical columns to columns with category names in them
mylogicm<-as.data.frame(mylogicm)
mylogicm[] <- lapply( mylogicm, factor)
col_names <- names(mylogicm)
mylogicm[col_names] <- lapply(mylogicm[col_names] , factor)
mylogicm[] <- lapply(mylogicm, function(x) levels(x)[x])
w <- which(mylogicm == "TRUE", arr.ind = TRUE)
mylogicm[w] <- names(mylogicm)[w[,"col"]]
mylogicm <- suppressWarnings(as.data.frame(sapply( mylogicm, as.numeric ) ) )
# merge calculating a mean of categories, when several present
df$inferredcategory<-lapply(split(mylogicm, seq(nrow(mylogicm))), function(x) mean(unlist(x), na.rm=T) )
# give a category in a new column only for rows with cat == NA
df$oldandnew<-df[,colgroup]
df[which(is.na(df[,colgroup])),]$oldandnew<-df[which(is.na(df[,colgroup])),]$inferredcategory
return(df)
}
df<-mydf1
df<-mydf2
colgroup<-"cat"
varcount<-"data"
rangefunction(df,colgroup,varcount)
cat data inferredcategory oldandnew
1 NA 1.50 2 2
5 2 1.08 2 2
6 2 1.10 2 2
7 2 1.30 2 2
9 2 1.40 2 2
10 2 1.41 2 2
11 2 1.45 2 2
12 2 1.50 2 2
13 2 1.56 2 2
14 2 1.56 2 2
20 4 2.55 4 4
21 4 2.71 4 4