Smote用于过采样多类数据集

时间:2017-12-05 21:33:56

标签: r

我正在努力调整chawla et al 2002的Smote技术,在以下链接https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume16/chawla02a-html/node6.html中进行描述 对于只有一个级别类的数据集是不可能的,我认为将我的数据集拆分为数据以进行过采样,称为Dataover包含所有类的过采样,同样用于欠采样和创建一个名为dataunder的数据,因为我想实现另一个名为SCUT的方法({ {3}}只有一个类别的数据。这个想法是实现过采样的smote,但是还没有采样,如下所示:

library("lattice", lib.loc="C:/Program Files/R/R-3.4.2/library")
library(grid)
library(DMwR)
library(caret)
set.seed(1234)
#claculating the mean number of instances of all classes 
m<-36
Dataunder <- NULL
Dataover <- NULL
balancedataover<- NULL
#Split D into 6 datasets with each class
class <- unique(data$CoronaryEvent)
for(i in 1:length(unique(data$CoronaryEvent))){
  class_i <- factor(as.factor(class[i]))
  D<-data[ which(data$CoronaryEvent==as.character(class_i)),]
  D$CoronaryEvent<-as.factor(class_i)
  str(D)
  print(unique(D$CoronaryEvent))
  print(nrow(D))
if (nrow(D)>36){
  #creating a dataset for undersampling
Dataunder <-rbind(Dataunder,D)
  print(nrow(Dataunder))
}
else {
  #creating a dataset for oversampling
      Dataover <- rbind(Dataover, D)
  print( nrow(Dataover))
    }

    i=i+1
}
CoronaryEvent1 <-Dataover$CoronaryEvent
CoronaryEvent2 <-Dataunder$CoronaryEvent
class <- unique(CoronaryEvent)
for(i in 2:length(unique(CoronaryEvent1))-1){
  for(j in (i+1):length(unique(CoronaryEvent2))){
    print(paste(i,j,sep=","))
    print(paste(class[i],class[j],sep=","))
    #selecting subset of training set and testing set where CoronaryEvent equal to class i and class j
    class_i <- factor(as.factor(class[i]))
    class_j <- factor(as.factor(class[j]))

    a<-Dataover[match(as.character(Dataover$CoronaryEvent), class_i, nomatch = FALSE), ]
    a$CoronaryEvent<-class_i
    b<-Dataunder[match(as.character(Dataunder$CoronaryEvent), class_j, nomatch = FALSE), ]
    b$CoronaryEvent<-class_j
    data2 <- rbind(a, b)
   #oversampling 
    m<-36
    perc = as.integer((m/n)*100)
    print(perc)
    newdata <- SMOTE(CoronaryEvent ~ .,  data2, perc.over = perc)
    balancedataover<-rbind(balancedataover,newdata)
  }
  i=i+1
  }

我想通过使用smote技术创建合成示例并将过采样数据放在balancedataover中来对数据中的所有类进行过采样。  我收到了这个错误 $<-.data.frame*tmp*,“CoronaryEvent”,值= 1L)出错:   替换有1行,数据有0

1 个答案:

答案 0 :(得分:0)

我正在尝试采用chawla等2002的Smote技术,在以下链接中进行了介绍:https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume16/chawla02a-html/node6.html 对于虹膜这样的多类数据集,我下面的代码中描述的当前解决方案是根据DMwR软件包中存在的SMOTE函数的代码改编而成的,并且效果很好。

#NewSMOTE algorithm to balance a multiclass dataset without undersampling majority class(iris as example)
#get an unbalanced iris dataset 
set.seed(9850)
gp <-runif(nrow(iris))
iris<-iris[order(gp),]
s<-sample(150,71)
data<-iris[s,]
#sort classes in decreasing order
tab<-sort(table(data$Species),decreasing=TRUE) 
#majoritycases extrction
selMaj <- which(data$Species==names(tab[1]))
#takes data with minority classes in order to oversample it
data1<-data[data$Species!=names(tab[1]),]
#versicolor and viriginica are minority classes
Species <-factor(data1$Species)
dataminExs<-data.frame()
newExs1 <-data.frame()
class <- unique(Species)
for(i in 1:length(class)){
  # the column where the target variable is
  tgt <- which(names(data1) == "Species")
  #getting the minority class[i]
  minCl <- as.character(class[i])
   # get the cases of the minority class
  minExs <- which(data1[,tgt] == minCl)
  #concatenate all minority classes instances
   dataminExs<- rbind(dataminExs, data1[minExs,])
  print(nrow(dataminExs))
   #calculate percentage of oversampling
 perc.over= as.integer(length(selMaj)/length(minExs))*100
  # generate synthetic cases from these minExs
  newExs <- smote.exs(data1[minExs,],ncol(data1),perc.over,5)
    #all synthetic minority examples created
  print(nrow(newExs))
    newExs1<-rbind(newExs1, newExs)
  print(newExs1 )
#alldatamin <-rbind(newExs1, dataminExs)
     }
# the final data set (the undersample+the rare cases+the smoted exs)
newdataset <- rbind(data[selMaj,],newExs1)

#function to oversample minority class
smote.exs <- function(data,tgt,N,k)
  # INPUTS:
  # data are the rare cases (the minority "class" cases)
  # tgt is the name of the target variable
  # N is the percentage of over-sampling to carry out;
  # and k is the number of nearest neighours to use for the generation
  # OUTPUTS:
  # The result of the function is a (N/100)*T set of generated
  # examples with rare values on the target
{
  nomatr <- c()
  T <- matrix(nrow=dim(data)[1],ncol=dim(data)[2]-1)
  for(col in seq.int(dim(T)[2]))
    if (class(data[,col]) %in% c('factor','character')) {
      T[,col] <- as.integer(data[,col])
      nomatr <- c(nomatr,col)
    } else T[,col] <- data[,col]

  if (N < 100) { # only a percentage of the T cases will be SMOTEd
    nT <- NROW(T)
    idx <- sample(1:nT,as.integer((N/100)*nT))
    T <- T[idx,]
    N <- 100
  }

  p <- dim(T)[2]
  nT <- dim(T)[1]

  ranges <- apply(T,2,max)-apply(T,2,min)

  nexs <-  as.integer(N/100) # this is the number of artificial exs generated
  # for each member of T
  new <- matrix(nrow=nexs*nT,ncol=p)    # the new cases

  for(i in 1:nT) {

    # the k NNs of case T[i,]
    xd <- scale(T,T[i,],ranges)
    for(a in nomatr) xd[,a] <- xd[,a]==0
    dd <- drop(xd^2 %*% rep(1, ncol(xd)))
    kNNs <- order(dd)[2:(k+1)]

    for(n in 1:nexs) {
      # select randomly one of the k NNs
      neig <- sample(1:k,1)

      ex <- vector(length=ncol(T))

      # the attribute values of the generated case
      difs <- T[kNNs[neig],]-T[i,]
      new[(i-1)*nexs+n,] <- T[i,]+runif(1)*difs
      for(a in nomatr)
        new[(i-1)*nexs+n,a] <- c(T[kNNs[neig],a],T[i,a])[1+round(runif(1),0)]

    }
  }
  newCases <- data.frame(new)
  for(a in nomatr)
    newCases[,a] <- factor(newCases[,a],levels=1:nlevels(data[,a]),labels=levels(data[,a]))

  newCases[,tgt] <- factor(rep(data[1,tgt],nrow(newCases)),levels=levels(data[,tgt]))
  colnames(newCases) <- colnames(data)
  newCases
}