通过随机样本增加加权平均值

时间:2017-04-28 10:56:30

标签: r performance function loops

我编写了代码,以便将点随机添加到数字变量,以便将加权平均分数增加10%,将新分数存储在变量S1中。

这是通过计算需要添加的总点数来增加平均值10%来完成的。下一步是随机选择,直到响应的加权和等于目标 - 但不添加分数已经为10的点,以便不通过比例上的最大值。最后一步是选择目标上方或下方的总和是否最接近并选择此样本以添加点。

代码工作正常,但效率不高。我是一个R新手并且已经阅读过应该尽可能避免循环,但是无法找到替代方案。是否有可能做我正在尝试的事情,但更有效率?

#Create random data    
library(stats)
    set.seed(21821)
    ncust <- 1000
    cust.df <- data.frame(cust.id=as.factor(c(1:ncust)))
    wtvar <- rnorm(ncust, mean=1, sd=0.2)
    V1 <- floor(rnorm(ncust, mean=7.5, sd=3))
    V1[V1 > 10] <- 10
    V1[V1 < 1] <- 1
    cust.df$V1 <- V1
    cust.df$wtvar <- wtvar

#Function to determine sample required   
    random.sample <- function(x) {
    (pctadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1)/sum(cust.df$V1[!cust.df$V1==10]*cust.df$wtvar[!cust.df$V1==10])) #percentage of resps (weighted) who need points added to make 10% increase
    (numadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1)) #sum of weights needed to make 10% increase

    wgttot <- vector(mode="numeric", length=0)
    idtot <- vector(mode="numeric", length=0)
    id.ref <- cust.df$cust.id[!cust.df$V1==10]

      repeat {
        preidtot <- idtot
        prewgttot <- wgttot
        (t.id <- as.numeric(sample(id.ref, 1)))
        (t.wgt <- cust.df$wtvar[cust.df$cust.id==t.id])
        id.ref <- id.ref[!id.ref==t.id]
        wgttot <- c(wgttot,t.wgt)
        idtot <- c(idtot, t.id)
        if (sum(wgttot) > numadd) break
      }
      prediff <- numadd - sum(prewgttot)
      postdiff <- sum(wgttot) - numadd
      if (prediff < postdiff) {
        x <- preidtot
      } else {
        x <- idtot
        }
      return(x)
    }

tempids <- random.sample()

#Apply sample rule
cust.df$S1 = ifelse(cust.df$cust.id %in% tempids, cust.df$V1 + 1, cust.df$V1)

#Check ~10% increase achieved
weighted.mean(cust.df$V1,cust.df$wtvar)
weighted.mean(cust.df$S1,cust.df$wtvar)

1 个答案:

答案 0 :(得分:7)

random.sample是您的第一个版本,random.sample1是无循环版本,random.sample1与random.sample类似,但结果却不同。您可以检查代码以查看如何使用random.sample1的结果。而且由于您的定义,所需的样本并不是唯一的,因此加权和的结果也不同,但都大约增加了10%。

#Create random data    
library(stats)
set.seed(21821)
ncust <- 1000
cust.df <- data.frame(cust.id=as.factor(c(1:ncust)))
wtvar <- rnorm(ncust, mean=1, sd=0.2)
V1 <- floor(rnorm(ncust, mean=7.5, sd=3))
V1[V1 > 10] <- 10
V1[V1 < 1] <- 1
cust.df$V1 <- V1
cust.df$wtvar <- wtvar

#Function to determine sample required   
random.sample <- function() {
    (pctadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1)/sum(cust.df$V1[!cust.df$V1==10]*cust.df$wtvar[!cust.df$V1==10])) #percentage of resps (weighted) who need points added to make 10% increase
    (numadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1)) #sum of weights needed to make 10% increase

    wgttot <- vector(mode="numeric", length=0)
    idtot <- vector(mode="numeric", length=0)
    id.ref <- cust.df$cust.id[!cust.df$V1==10]

    repeat {
        preidtot <- idtot
        prewgttot <- wgttot
        (t.id <- as.numeric(sample(id.ref, 1)))
        (t.wgt <- cust.df$wtvar[cust.df$cust.id==t.id])
        id.ref <- id.ref[!id.ref==t.id]
        wgttot <- c(wgttot,t.wgt)
        idtot <- c(idtot, t.id)
        if (sum(wgttot) > numadd) break
    }
    prediff <- numadd - sum(prewgttot)
    postdiff <- sum(wgttot) - numadd
    if (prediff < postdiff) {
        x <- preidtot
    } else {
        x <- idtot
    }
    return(x)
}

random.sample1 <- function() {
    numadd <- sum(cust.df$V1 * cust.df$wtvar) * 0.1 #sum of weights needed to make 10% increase
    id.ref <- which(cust.df$V1 != 10)
    pos <- sample(id.ref, length(id.ref))
    t.wgt <- cust.df$wtvar[pos]
    sumwgttot <- cumsum(t.wgt)
    return(pos[1:which.min(abs(sumwgttot - numadd))])
}

system.time(tempids <- random.sample())
## On my computer, it uses about 0.200s to finish the calculation.
system.time(tempids1 <- random.sample1())
## On my computer, the without loop version uses about 0.000s.

#Apply sample rule
cust.df$S1 = ifelse(cust.df$cust.id %in% tempids, cust.df$V1 + 1, cust.df$V1)
## Note that the usage of tempids1 is different, this usage is more 
## effective than the original one.
cust.df$S2 = cust.df$V1
cust.df$S2[tempids1] = cust.df$V1[tempids1] + 1

#Check ~10% increase achieved
weighted.mean(cust.df$V1,cust.df$wtvar)
weighted.mean(cust.df$S1,cust.df$wtvar)
weighted.mean(cust.df$S2,cust.df$wtvar)