我正在努力调整chawla et al 2002的Smote技术,在以下链接https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume16/chawla02a-html/node6.html中进行描述 对于只有一个级别类的数据集是不可能的,我认为将我的数据集拆分为数据以进行过采样,称为Dataover包含所有类的过采样,同样用于欠采样和创建一个名为dataunder的数据,因为我想实现另一个名为SCUT的方法({ {3}}只有一个类别的数据。这个想法是实现过采样的smote,但是还没有采样,如下所示:
library("lattice", lib.loc="C:/Program Files/R/R-3.4.2/library")
library(grid)
library(DMwR)
library(caret)
set.seed(1234)
#claculating the mean number of instances of all classes
m<-36
Dataunder <- NULL
Dataover <- NULL
balancedataover<- NULL
#Split D into 6 datasets with each class
class <- unique(data$CoronaryEvent)
for(i in 1:length(unique(data$CoronaryEvent))){
class_i <- factor(as.factor(class[i]))
D<-data[ which(data$CoronaryEvent==as.character(class_i)),]
D$CoronaryEvent<-as.factor(class_i)
str(D)
print(unique(D$CoronaryEvent))
print(nrow(D))
if (nrow(D)>36){
#creating a dataset for undersampling
Dataunder <-rbind(Dataunder,D)
print(nrow(Dataunder))
}
else {
#creating a dataset for oversampling
Dataover <- rbind(Dataover, D)
print( nrow(Dataover))
}
i=i+1
}
CoronaryEvent1 <-Dataover$CoronaryEvent
CoronaryEvent2 <-Dataunder$CoronaryEvent
class <- unique(CoronaryEvent)
for(i in 2:length(unique(CoronaryEvent1))-1){
for(j in (i+1):length(unique(CoronaryEvent2))){
print(paste(i,j,sep=","))
print(paste(class[i],class[j],sep=","))
#selecting subset of training set and testing set where CoronaryEvent equal to class i and class j
class_i <- factor(as.factor(class[i]))
class_j <- factor(as.factor(class[j]))
a<-Dataover[match(as.character(Dataover$CoronaryEvent), class_i, nomatch = FALSE), ]
a$CoronaryEvent<-class_i
b<-Dataunder[match(as.character(Dataunder$CoronaryEvent), class_j, nomatch = FALSE), ]
b$CoronaryEvent<-class_j
data2 <- rbind(a, b)
#oversampling
m<-36
perc = as.integer((m/n)*100)
print(perc)
newdata <- SMOTE(CoronaryEvent ~ ., data2, perc.over = perc)
balancedataover<-rbind(balancedataover,newdata)
}
i=i+1
}
我想通过使用smote技术创建合成示例并将过采样数据放在balancedataover中来对数据中的所有类进行过采样。
我收到了这个错误
$<-.data.frame
(*tmp*
,“CoronaryEvent”,值= 1L)出错:
替换有1行,数据有0
答案 0 :(得分:0)
我正在尝试采用chawla等2002的Smote技术,在以下链接中进行了介绍:https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume16/chawla02a-html/node6.html 对于虹膜这样的多类数据集,我下面的代码中描述的当前解决方案是根据DMwR软件包中存在的SMOTE函数的代码改编而成的,并且效果很好。
#NewSMOTE algorithm to balance a multiclass dataset without undersampling majority class(iris as example)
#get an unbalanced iris dataset
set.seed(9850)
gp <-runif(nrow(iris))
iris<-iris[order(gp),]
s<-sample(150,71)
data<-iris[s,]
#sort classes in decreasing order
tab<-sort(table(data$Species),decreasing=TRUE)
#majoritycases extrction
selMaj <- which(data$Species==names(tab[1]))
#takes data with minority classes in order to oversample it
data1<-data[data$Species!=names(tab[1]),]
#versicolor and viriginica are minority classes
Species <-factor(data1$Species)
dataminExs<-data.frame()
newExs1 <-data.frame()
class <- unique(Species)
for(i in 1:length(class)){
# the column where the target variable is
tgt <- which(names(data1) == "Species")
#getting the minority class[i]
minCl <- as.character(class[i])
# get the cases of the minority class
minExs <- which(data1[,tgt] == minCl)
#concatenate all minority classes instances
dataminExs<- rbind(dataminExs, data1[minExs,])
print(nrow(dataminExs))
#calculate percentage of oversampling
perc.over= as.integer(length(selMaj)/length(minExs))*100
# generate synthetic cases from these minExs
newExs <- smote.exs(data1[minExs,],ncol(data1),perc.over,5)
#all synthetic minority examples created
print(nrow(newExs))
newExs1<-rbind(newExs1, newExs)
print(newExs1 )
#alldatamin <-rbind(newExs1, dataminExs)
}
# the final data set (the undersample+the rare cases+the smoted exs)
newdataset <- rbind(data[selMaj,],newExs1)
#function to oversample minority class
smote.exs <- function(data,tgt,N,k)
# INPUTS:
# data are the rare cases (the minority "class" cases)
# tgt is the name of the target variable
# N is the percentage of over-sampling to carry out;
# and k is the number of nearest neighours to use for the generation
# OUTPUTS:
# The result of the function is a (N/100)*T set of generated
# examples with rare values on the target
{
nomatr <- c()
T <- matrix(nrow=dim(data)[1],ncol=dim(data)[2]-1)
for(col in seq.int(dim(T)[2]))
if (class(data[,col]) %in% c('factor','character')) {
T[,col] <- as.integer(data[,col])
nomatr <- c(nomatr,col)
} else T[,col] <- data[,col]
if (N < 100) { # only a percentage of the T cases will be SMOTEd
nT <- NROW(T)
idx <- sample(1:nT,as.integer((N/100)*nT))
T <- T[idx,]
N <- 100
}
p <- dim(T)[2]
nT <- dim(T)[1]
ranges <- apply(T,2,max)-apply(T,2,min)
nexs <- as.integer(N/100) # this is the number of artificial exs generated
# for each member of T
new <- matrix(nrow=nexs*nT,ncol=p) # the new cases
for(i in 1:nT) {
# the k NNs of case T[i,]
xd <- scale(T,T[i,],ranges)
for(a in nomatr) xd[,a] <- xd[,a]==0
dd <- drop(xd^2 %*% rep(1, ncol(xd)))
kNNs <- order(dd)[2:(k+1)]
for(n in 1:nexs) {
# select randomly one of the k NNs
neig <- sample(1:k,1)
ex <- vector(length=ncol(T))
# the attribute values of the generated case
difs <- T[kNNs[neig],]-T[i,]
new[(i-1)*nexs+n,] <- T[i,]+runif(1)*difs
for(a in nomatr)
new[(i-1)*nexs+n,a] <- c(T[kNNs[neig],a],T[i,a])[1+round(runif(1),0)]
}
}
newCases <- data.frame(new)
for(a in nomatr)
newCases[,a] <- factor(newCases[,a],levels=1:nlevels(data[,a]),labels=levels(data[,a]))
newCases[,tgt] <- factor(rep(data[1,tgt],nrow(newCases)),levels=levels(data[,tgt]))
colnames(newCases) <- colnames(data)
newCases
}