在不对大多数类进行欠采样的情况下,过度采样多类数据集

时间:2017-11-30 15:50:40

标签: r

此问题已解决,请参阅Smote for oversampling multiclass dataset

虽然我有一个小型数据集,但我想对少数类进行过采样,而不对大多数类进行欠采样以使其更适合我的模型。我的想法是将smote调整为多类过采样,而不是在下面详细介绍大多数类。代码运行良好但是如何在过采样后添加条件使所有类等于多数类? 我目前的结果如下:setosa versicolor virginica         48 25 44 这是我的代码

#NewSMOTE algorithm to balance a multiclass dataset without undersampling majority class(iris as example)
#get an unbalanced iris dataset 
set.seed(9850)
gp <-runif(nrow(iris))
iris<-iris[order(gp),]
s<-sample(150,71)
data<-iris[s,]
#sort classes in decreasing order
tab<-sort(table(data$Species),decreasing=TRUE) 
#majoritycases extrction
selMaj <- which(data$Species==names(tab[1]))
#takes data with minority classes in order to oversample it
data1<-data[data$Species!=names(tab[1]),]
#versicolor and viriginica are minority classes
Species <-factor(data1$Species)
dataminExs<-data.frame()
newExs1 <-data.frame()
class <- unique(Species)
for(i in 1:length(class)){
  # the column where the target variable is
  tgt <- which(names(data1) == "Species")
  #getting the minority class[i]
  minCl <- as.character(class[i])
   # get the cases of the minority class
  minExs <- which(data1[,tgt] == minCl)
  #concatenate all minority classes instances
   dataminExs<- rbind(dataminExs, data1[minExs,])
  print(nrow(dataminExs))
   #calculate percentage of oversampling
 perc.over= as.integer(length(selMaj)/length(minExs))*100
  # generate synthetic cases from these minExs
  newExs <- smote.exs(data1[minExs,],ncol(data1),perc.over,5)
    #all synthetic minority examples created
    newExs1<-rbind(newExs1, newExs)
  print(newExs1 )
alldatamin <-rbind(newExs1, dataminExs)
print(nrow(alldatamin))
     }
# the final data set (the undersample+the rare cases+the smoted exs)
newdataset <- rbind(data[selMaj,],alldatamin)

#function to oversample minority class
smote.exs <- function(data,tgt,N,k)
  # INPUTS:
  # data are the rare cases (the minority "class" cases)
  # tgt is the name of the target variable
  # N is the percentage of over-sampling to carry out;
  # and k is the number of nearest neighours to use for the generation
  # OUTPUTS:
  # The result of the function is a (N/100)*T set of generated
  # examples with rare values on the target
{
  nomatr <- c()
  T <- matrix(nrow=dim(data)[1],ncol=dim(data)[2]-1)
  for(col in seq.int(dim(T)[2]))
    if (class(data[,col]) %in% c('factor','character')) {
      T[,col] <- as.integer(data[,col])
      nomatr <- c(nomatr,col)
    } else T[,col] <- data[,col]

  if (N < 100) { # only a percentage of the T cases will be SMOTEd
    nT <- NROW(T)
    idx <- sample(1:nT,as.integer((N/100)*nT))
    T <- T[idx,]
    N <- 100
  }

  p <- dim(T)[2]
  nT <- dim(T)[1]

  ranges <- apply(T,2,max)-apply(T,2,min)

  nexs <-  as.integer(N/100) # this is the number of artificial exs generated
  # for each member of T
  new <- matrix(nrow=nexs*nT,ncol=p)    # the new cases

  for(i in 1:nT) {

    # the k NNs of case T[i,]
    xd <- scale(T,T[i,],ranges)
    for(a in nomatr) xd[,a] <- xd[,a]==0
    dd <- drop(xd^2 %*% rep(1, ncol(xd)))
    kNNs <- order(dd)[2:(k+1)]

    for(n in 1:nexs) {
      # select randomly one of the k NNs
      neig <- sample(1:k,1)

      ex <- vector(length=ncol(T))

      # the attribute values of the generated case
      difs <- T[kNNs[neig],]-T[i,]
      new[(i-1)*nexs+n,] <- T[i,]+runif(1)*difs
      for(a in nomatr)
        new[(i-1)*nexs+n,a] <- c(T[kNNs[neig],a],T[i,a])[1+round(runif(1),0)]

    }
  }
  newCases <- data.frame(new)
  for(a in nomatr)
    newCases[,a] <- factor(newCases[,a],levels=1:nlevels(data[,a]),labels=levels(data[,a]))

  newCases[,tgt] <- factor(rep(data[1,tgt],nrow(newCases)),levels=levels(data[,tgt]))
  colnames(newCases) <- colnames(data)
  newCases
}

0 个答案:

没有答案