是否有高效的R-package来处理以下问题:
我有一组数值观测值(N 为数千个),取值范围从负一百万到正一百万。给定一个目标值和一个舍入精度,是否存在一个权重取 -1(减去)/ 0(忽略)/ 1(加上)的线性组合,使其总和在舍入误差范围内等于目标值,并且能同时给出这些权重?
答案 0 :(得分:0)
以下是我根据您的情况修改过的遗传算法;有关算法的解释,请参阅我的另一个回答(my answer there)。当然,可能有办法用更少的代码解决您的问题,但我手头已经有这个现成的解决方案,而且改动起来很简单。所需的输入是一个 data.frame,其中包含 value 列和 weights 列,weights 列可以全为零:
value weights
1 45 0
2 33 0
3 47 0
4 65 0
5 12 0
6 43 0
7 5 0
... ... ...
然后,算法将从集合 c(-1,0,1) 中为每一行选取权重,使得
abs(target_value - sum(final_solution$value*final_solution$weights))
最小。
肯定还有改进的余地。例如,权重目前是完全随机设置的,因此初始解的加权总和的期望值始终为 0。如果 target_value 非常大,最好让权重取 1 的概率高于取 -1 的概率,以便更快地收敛到最优解。
对于下面这种情况(100000 个对象,目标值为 12000),它的效果似乎很好,能在几分之一秒内找到最优解:
代码:
### PARAMETERS -------------------------------------------
# Number of candidate solutions kept in the population
n_population <- 100
# Maximum number of iterations of the main loop
n_iterations <- 100
# Number of offspring created by merging two parents in every iteration
n_offspring_per_iter <- 80
# Fraction of columns to change from the default solution while creating
# initial solutions (NOTE(review): not referenced by the visible script)
frac_perm_init <- 0.25
# Stop when the best score has not improved for this many iterations.
# With the value equal to n_iterations the check can never trigger.
early_stopping_rounds <- 100
### SAMPLE DATA -------------------------------------------------
# Number of rows (candidate values) in the sample input
n_objects <- 100000
# One row per object: `value` is a uniform draw from [0, 100] rounded to a
# whole number, `weights` starts at 0 for every row
datain <- data.frame(value = round(runif(n_objects, 0, 100)), weights = 0)
# The weighted sum of `value` should end up as close as possible to this
target_value <- 12000
### ALL OUR PREDEFINED FUNCTIONS ----------------------------------
# Score a solution
# The score is the absolute difference between target_value and the weighted
# sum of the solution (sum of value * weights). Lower is better; 0 means the
# target is hit exactly.
score_solution <- function(solution, target_value)
{
  weighted_sum <- sum(solution$value * solution$weights)
  distance <- target_value - weighted_sum
  abs(distance)
}
# Merge solutions
# Builds a child from two parents: each row keeps the weight of solution1
# when a uniform draw exceeds 0.5 and takes the weight of solution2 otherwise.
# All other columns come from solution1.
merge_solutions <- function(solution1, solution2)
{
  keep_first <- runif(nrow(solution1), 0, 1) > 0.5
  solution1$weights <- ifelse(keep_first, solution1$weights, solution2$weights)
  solution1
}
# Randomize solution
# Returns a copy of `solution` whose weights column is replaced by
# independent draws from c(-1, 0, 1), one per row.
randomize_solution <- function(solution)
{
  # TRUE rather than T: T is an ordinary variable that can be reassigned,
  # which would silently switch to sampling without replacement
  solution$weights <- sample(c(-1, 0, 1), nrow(solution), replace = TRUE)
  solution
}
# Sort a population by score
# population: list whose elements each carry a single numeric `score` entry.
# Returns the same elements ordered from the lowest score to the highest.
sort_pop <- function(population)
{
  # vapply guarantees a numeric vector (also for an empty population), so
  # order() never receives the list that sapply returns for empty input
  scores <- vapply(population, function(member) member[['score']], numeric(1))
  population[order(scores, decreasing = FALSE)]
}
# Return the scores of a population
# population: list whose elements each carry a single numeric `score` entry.
# Returns a numeric vector with one score per element, in population order.
pop_scores <- function(population)
{
  # vapply keeps the result numeric even for an empty population, where
  # sapply would return an empty list instead
  vapply(population, function(member) member[['score']], numeric(1))
}
### RUN SCRIPT -------------------------------
# Genetic search: keep a population of weight assignments, re-randomize a few
# members per iteration, merge pairs of members into offspring and keep the
# n_population best-scoring members.

# starting score (score of the input exactly as supplied)
starting_score <- score_solution(datain, target_value)
print(paste0('Starting score: ', starting_score))

# Create initial population: each member is a randomized copy of the input
# stored together with its score
population <- vector('list', n_population)
for (i in seq_len(n_population)) {
  solution <- randomize_solution(datain)
  score <- score_solution(solution, target_value)
  population[[i]] <- list('solution' = solution, 'score' = score)
}
population <- sort_pop(population)

# Element 1 holds the starting score, element i + 1 the best score after
# iteration i. Preallocated; trimmed to the recorded part after the loop.
score_per_iteration <- c(starting_score, rep(NA_real_, n_iterations))
n_completed <- 0

# Number of members that are re-randomized in every iteration
n_perturbations <- 10

# Run the algorithm
for (i in seq_len(n_iterations)) {
  # cat() so the newlines are emitted instead of printed as literal "\n"
  cat('\n---- Iteration', i, ' -----\n', sep = '')

  # create some random perturbations in the population
  # Index 1 (the best member after sorting) is never replaced, so the best
  # score cannot get worse from one iteration to the next.
  if (n_population > 1) {
    for (j in seq_len(n_perturbations)) {
      # 1 + sample.int() draws from 2..n_population; sample(2:n_population, 1)
      # would draw from 1:2 when n_population == 2 and could hit the best
      sol_to_change <- 1 + sample.int(n_population - 1, 1)
      new_solution <- randomize_solution(population[[sol_to_change]][['solution']])
      new_score <- score_solution(new_solution, target_value)
      population[[sol_to_change]] <- list('solution' = new_solution, 'score' = new_score)
    }
  }

  # Create offspring, first determine which solutions to combine
  # Selection weight per member: scores rescaled so the best member gets 1.2
  # and the worst 0.2 (the + 0.2 keeps every member selectable)
  probs <- vapply(population, function(x) x[['score']], numeric(1))
  if (max(probs) == min(probs)) {
    # All scores are equal, so the rescaling below would divide by zero.
    # Leave the loop instead of raising an error so the best solution found
    # so far is still plotted and returned.
    warning('No diversity in population left', call. = FALSE)
    break
  }
  probs <- 1 - (probs - min(probs)) / (max(probs) - min(probs)) + 0.2

  # create combinations: two distinct parents per offspring
  solutions_to_combine <- lapply(seq_len(n_offspring_per_iter), function(y) {
    sample(seq_along(population), 2, prob = probs)
  })
  offspring <- lapply(solutions_to_combine, function(parents) {
    child <- merge_solutions(population[[parents[1]]][['solution']],
                             population[[parents[2]]][['solution']])
    list('solution' = child, 'score' = score_solution(child, target_value))
  })

  # Keep only the n_population best of parents plus offspring
  population <- sort_pop(c(population, offspring))
  population <- population[seq_len(n_population)]

  # [['score']] extracts the number; ['score'] would return a one-element
  # list and turn score_per_iteration into a list
  best_score <- population[[1]][['score']]
  print(paste0('Best score:', best_score))
  score_per_iteration[i + 1] <- best_score
  n_completed <- i

  # Early stopping: compare the current best with the best recorded
  # early_stopping_rounds iterations ago. The best score never increases, so
  # ">=" means "no improvement" and avoids an exact floating point comparison.
  if (early_stopping_rounds > 0 && i > early_stopping_rounds &&
      best_score >= score_per_iteration[i + 1 - early_stopping_rounds]) {
    message("Score not improved in the past ", early_stopping_rounds, " rounds. Halting algorithm.")
    break
  }
}

# Drop the unused preallocated slots when the loop ended early
score_per_iteration <- score_per_iteration[seq_len(n_completed + 1)]

# Extract the result before plotting so it is available even if the plot fails
final_solution <- population[[1]][['solution']]
plot(x = seq(0, length(score_per_iteration) - 1), y = score_per_iteration, xlab = 'iteration', ylab = 'score')