根据R

时间:2017-08-24 20:45:59

标签: r dplyr data.table mapply

目标是根据已知类别的数据为没有它们的行创建类别(cat),这些数据用于构建data 2和4的cat范围,见表myrange

在我的策略中,在myrange df中生成变体的类别范围后,我使用了mapply。转换为数据帧后,我在列或行中得到mapply的结果。为了将结果附加到原始数据帧,我创建了两个if语句,以便始终在列中获取结果,我想知道是否有更好的方法来处理它。将新列附加到原始数据框后,将创建一个新的完整列(无NA)类别。

目标是使用data.table或dplyr简化函数以避免mapply,只需根据mylogicm数据框和逻辑中的数据附加myrange中的新列。在mapply函数中。

text<-(" cat data
       1  NA 1.50
       5   2 1.08
       6   2 1.10
       7   2 1.30
       9   2 1.40
       10  2 1.41
       11  2 1.45
       12  2 1.50
       13  2 1.56
       14  2 1.56
       20  4 2.55
       21  4 2.71")

mydf1<-read.table(text = text, stringsAsFactors = F)
mydf2<-mydf1[which(mydf1$cat %in% 4==F),]

library(dplyr)

rangefunction<-function(df,colgroup,varcount){
#this creates ranges (cats) based on the observations
  myrange<-as.data.frame(df %>% group_by_(colgroup) %>% 
                           summarise_(MinValue = lazyeval::interp(~min(var), var=as.name(varcount)),
                                      MaxValue = lazyeval::interp(~max(var), var=as.name(varcount))
                           ) )
# Handling of NA categories
  myrangeb<-myrange[complete.cases(myrange),]
  if(nrow(myrangeb)==0){
    myrange[is.na(myrange[,colgroup]),][,colgroup]<-2
  }
  myrange<-myrange[complete.cases(myrange),]
# Assigning 0 as MinValue
    if(nrow(myrange[myrange[,colgroup]==2,])>0){
    myrange[myrange[,colgroup]==2,]$MinValue<-0}
# Checking range overlap
    if(nrow(myrange)>1){
    for (i in 1:(nrow(myrange)-1) ) {#cat
      if(myrange$MaxValue[i]>myrange$MinValue[i+1]){
        print(paste("warning",i,"max > min i+1") )
      }
    }
    }
# this is intended to evaluate the values in the data column of dfs to assign
# them to the ranges created in the previous step. This is useful for data with 
#NA in the cat column (not shown).
  mylogicm<-as.data.frame(mapply(function(x) {x >= myrange$MinValue & x<= myrange$MaxValue}, x=df[,varcount] ) ) 
# transpose to homogenize because
# mylogicm # for mydf1 in several columns while for in mydf2 several rows
  if(nrow(df)==nrow(mylogicm))
  {
    colnames(mylogicm) <- myrange[,colgroup]
  }
  if(nrow(df)==ncol(mylogicm))
  {
    mylogicm<-t(mylogicm)
    colnames(mylogicm) <- myrange[,colgroup]
  }
# transform logical columns to columns with category names in them
  mylogicm<-as.data.frame(mylogicm)
  mylogicm[] <- lapply( mylogicm, factor) 
  col_names <- names(mylogicm)
  mylogicm[col_names] <- lapply(mylogicm[col_names] , factor)
  mylogicm[] <- lapply(mylogicm, function(x) levels(x)[x])
  w <- which(mylogicm == "TRUE", arr.ind = TRUE)
  mylogicm[w] <- names(mylogicm)[w[,"col"]]
  mylogicm <- suppressWarnings(as.data.frame(sapply( mylogicm, as.numeric ) ) )
# merge calculating a mean of categories, when several present
  df$inferredcategory<-lapply(split(mylogicm, seq(nrow(mylogicm))), function(x)  mean(unlist(x), na.rm=T) )
# give a category in a new column only for rows with cat == NA
  df$oldandnew<-df[,colgroup]
  df[which(is.na(df[,colgroup])),]$oldandnew<-df[which(is.na(df[,colgroup])),]$inferredcategory
  return(df)
}

df<-mydf1
df<-mydf2

colgroup<-"cat"
varcount<-"data"

rangefunction(df,colgroup,varcount)
   cat data inferredcategory oldandnew
1   NA 1.50                2         2
5    2 1.08                2         2
6    2 1.10                2         2
7    2 1.30                2         2
9    2 1.40                2         2
10   2 1.41                2         2
11   2 1.45                2         2
12   2 1.50                2         2
13   2 1.56                2         2
14   2 1.56                2         2
20   4 2.55                4         4
21   4 2.71                4         4

0 个答案:

没有答案