我正在用多种算法实现多臂老虎机(multi-armed bandit)问题。我遇到的问题是:在 5 个臂、时间范围(horizon)为 2000 的情况下,epsilon 取 0.95 的 epsilon-greedy 表现优于 UCB。我知道,当时间范围与臂的数量相当时,epsilon-greedy 确实会表现得更好。但在我的设置中,臂的数量远小于时间范围,因此 UCB 的表现应该更好。有人知道为什么会这样吗?下面附上我的 UCB 实现。
else if(algorithm.compare("UCB") == 0){
// UCB branch: folds the outcome of the previous pull into the per-arm
// statistics, then returns the index of the arm to pull next.
// `pulls`, `pulled_arm`, `reward`, `numArms` and the four arrays are declared
// outside this fragment.
if(pulls == 0){
// First call: allocate the per-arm arrays and zero them.
// NOTE(review): no matching delete[] is visible in this fragment - confirm
// the arrays are released elsewhere.
armpullfrequency = new int[numArms];
armRewards = new float[numArms];
armmean = new double[numArms];
UCB = new double[numArms];
for(int i=0; i<numArms; i++){
armpullfrequency[i] = 0;
armRewards[i] = 0.0;
armmean[i] = (double)0;
UCB[i] = (double)0;
}
}
else{
// Later calls: count the previous pull and accumulate its reward.
armpullfrequency[pulled_arm] = armpullfrequency[pulled_arm] + 1;
armRewards[pulled_arm] = armRewards[pulled_arm] + reward;
}
int selected_arm = 0;
//int randint = (rand() % 100);
// Warm-up: while pulls <= 6, return the first arm that has never been pulled.
// NOTE(review): the bound 6 is hard-coded; presumably chosen for the 5-arm
// setup - confirm it should not depend on numArms.
if(pulls<=6){
for(int i=0;i<numArms;i++){
if(armpullfrequency[i]==0){
selected_arm = i;
return selected_arm;
}
}
}
// Refresh each arm's mean from its pull count and accumulated reward.
// eval_mean is defined elsewhere; presumably it returns prize / freq - verify.
for(int i=0;i<numArms;i++){
int freq = armpullfrequency[i];
float prize = armRewards[i];
double mean = eval_mean(freq, prize);
armmean[i] = mean;
}
// Compute each arm's UCB score from its mean, its pull count and the total
// number of pulls so far.
for(int i=0; i<numArms;i++){
int freq = armpullfrequency[i];
double mean = armmean[i];
double UCBval = UCBUpdate(mean, freq, pulls);
UCB[i] = UCBval;
}
// Select the arm using the result of LargestElementIndex over the UCB scores.
selected_arm = LargestElementIndex(UCB, numArms);
return(selected_arm);
我的 UCBUpdate 和 LargestElementIndex 函数如下:
// Returns the index of the largest element of arr.
//
// arr:  values to scan (for example, one UCB score per arm).
// size: number of elements in arr.
//
// Returns the index of the first maximal element, or -1 when size <= 0 or
// every element is NaN.
int LargestElementIndex(double arr[], int size){
    int best_idx = -1;
    for (int i = 0; i < size; i++) {
        // NaN compares unequal to itself; skip it so it can never be selected.
        if (arr[i] != arr[i]) {
            continue;
        }
        // Compare as double (no integer truncation) and remember the position,
        // not the value. Strict '>' keeps the earliest index on ties.
        if (best_idx < 0 || arr[i] > arr[best_idx]) {
            best_idx = i;
        }
    }
    return best_idx;
}
int UCBUpdate(double mean, int freq, int pulls){
double result = mean + sqrt((double)2.0 *(log(pulls))/(double)freq);
return result;
}
UCB 的结果为:maxMean 0.5805,numTotalPulls 2000,累积奖励 716.308,遗憾(regret)= 444.692
Epsilon-greedy 的结果为:maxMean 0.5805,numTotalPulls 2000,累积奖励 823.948,遗憾(regret)= 337.052
答案 0 :(得分:1)
我怀疑错误在于以下代码:
// Quoted as the suspected defect: despite its name, this function does not
// return an index.
int LargestElementIndex(double arr[], int size){
// `max` is an int, so every value stored in it loses its fractional part.
int max = 0;
for(int i=0;i<size; i++){
if(arr[i]>max){
// Stores the element's (truncated) value; the position i is never recorded.
max = arr[i];
}
}
// Returns the truncated maximum value rather than the index of that element.
return max;
}
这段代码返回的并不是 UCB 值最大的臂的索引(而这很可能才是您想要的),而只是数组中最大的 UCB 值本身,并且该值还被强制转换成了 int。可以按如下方式修复:
// Returns the index of the largest element of arr.
//
// arr:  values to scan (for example, one UCB score per arm).
// size: number of elements in arr.
//
// Returns the index of the first maximal element, or -1 when size <= 0 or
// every element is NaN.
int LargestElementIndex(double arr[], int size){
    int max_idx = -1;
    for (int i = 0; i < size; i++) {
        // NaN compares unequal to itself; skip it so it can never be selected.
        if (arr[i] != arr[i]) {
            continue;
        }
        // Seed the running maximum from the data rather than from a fixed
        // sentinel, so arrays whose values are all very negative still yield
        // a valid index. Strict '>' keeps the earliest index on ties.
        if (max_idx < 0 || arr[i] > arr[max_idx]) {
            max_idx = i;
        }
    }
    return max_idx;
}